diff --git a/.appveyor.yml b/.appveyor.yml index f2cd428b4..9720dc2dd 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -26,7 +26,7 @@ install: - set MSYSTEM=MINGW64 - set PATH=C:/msys64/usr/bin;C:/msys64/mingw64/bin;%PATH% - set MINGWPREFIX=x86_64-w64-mingw32 - - "sh -lc \"pacman -S --noconfirm --needed base-devel mingw-w64-x86_64-toolchain mingw-w64-x86_64-zlib mingw-w64-x86_64-bzip2 mingw-w64-x86_64-xz mingw-w64-x86_64-curl\"" + - "sh -lc \"pacman -S --noconfirm --needed base-devel mingw-w64-x86_64-toolchain mingw-w64-x86_64-autotools mingw-w64-x86_64-zlib mingw-w64-x86_64-bzip2 mingw-w64-x86_64-xz mingw-w64-x86_64-curl\"" # The user may have e.g. jkbonfield/bcftools branch FOO and an associated # jkbonfield/htslib branch FOO. If so use that related htslib, obtained by diff --git a/.cirrus.yml b/.cirrus.yml index a3e46bbe8..e9c44d427 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -109,11 +109,11 @@ ubuntu_task: << : *TEST -# CentOS -centos_task: - name: centos-gcc +# Rocky Linux +rockylinux_task: + name: rockylinux-gcc container: - image: centos:latest + image: rockylinux:latest cpu: 2 memory: 1G diff --git a/INSTALL b/INSTALL index 2bd036a62..c4af072a8 100644 --- a/INSTALL +++ b/INSTALL @@ -218,16 +218,22 @@ Note: libcurl4-openssl-dev can be used as an alternative to libcurl4-gnutls-dev. RedHat / CentOS --------------- +Note: To install gsl-devel, it may be necessary to enable the "crb" repository. +dnf --enablerepo=crb install gsl-devel + sudo yum install autoconf automake make gcc perl-Data-Dumper zlib-devel bzip2 bzip2-devel xz-devel curl-devel openssl-devel gsl-devel perl-ExtUtils-Embed +Note: On some versions, Perl FindBin will need to be installed to make the tests work. +sudo yum install perl-FindBin + Alpine Linux ------------ Note: To install gsl-dev, it may be necessary to enable the "community" repository in /etc/apk/repositories. 
-sudo apk update # Ensure the package list is up to date -sudo apk add autoconf automake make gcc musl-dev perl bash zlib-dev bzip2-dev xz-dev curl-dev libressl-dev gsl-dev perl-dev +doas apk update # Ensure the package list is up to date +doas apk add autoconf automake make gcc musl-dev perl bash zlib-dev bzip2-dev xz-dev curl-dev libressl-dev gsl-dev perl-dev OpenSUSE -------- @@ -240,4 +246,23 @@ MacOS, assuming Xcode is installed: xz gsl (optional) +Windows MSYS2/MINGW64 +--------------------- + +The configure script must be used as without it the compilation will +likely fail. + +Follow MSYS2 installation instructions at +https://www.msys2.org/wiki/MSYS2-installation/ + +Then relaunch to MSYS2 shell using the "MSYS2 MinGW x64" executable. +Once in that environment (check $MSYSTEM equals "MINGW64") install the +compilers using pacman -S and the following package list: + +base-devel mingw-w64-x86_64-toolchain +mingw-w64-x86_64-libdeflate mingw-w64-x86_64-zlib mingw-w64-x86_64-bzip2 +mingw-w64-x86_64-xz mingw-w64-x86_64-curl mingw-w64-x86_64-autotools +mingw-w64-x86_64-tools-git + +(The last is only needed for building libraries compatible with MSVC.) 
diff --git a/Makefile b/Makefile index b8430a95c..c221be2a4 100644 --- a/Makefile +++ b/Makefile @@ -38,7 +38,7 @@ OBJS = main.o vcfindex.o tabix.o \ vcfstats.o vcfisec.o vcfmerge.o vcfquery.o vcffilter.o filter.o vcfsom.o \ vcfnorm.o vcfgtcheck.o vcfview.o vcfannotate.o vcfroh.o vcfconcat.o \ vcfcall.o mcall.o vcmp.o gvcf.o reheader.o convert.o vcfconvert.o tsv2vcf.o \ - vcfcnv.o HMM.o consensus.o ploidy.o bin.o hclust.o version.o \ + vcfcnv.o vcfhead.o HMM.o consensus.o ploidy.o bin.o hclust.o version.o \ regidx.o smpl_ilist.o csq.o vcfbuf.o \ mpileup.o bam2bcf.o bam2bcf_indel.o bam_sample.o \ vcfsort.o cols.o extsort.o dist.o abuf.o \ @@ -104,7 +104,7 @@ endif include config.mk -PACKAGE_VERSION = 1.14 +PACKAGE_VERSION = 1.15 # If building from a Git repository, replace $(PACKAGE_VERSION) with the Git # description of the working tree: either a release tag with the same value @@ -217,7 +217,7 @@ bcftools: $(OBJS) $(HTSLIB) plugins: $(PLUGINS) -bcftools_h = bcftools.h $(htslib_hts_defs_h) $(htslib_vcf_h) +bcftools_h = bcftools.h $(htslib_hts_defs_h) $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) call_h = call.h $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) vcmp.h variantkey_h = variantkey.h hex.h convert_h = convert.h $(htslib_vcf_h) @@ -240,15 +240,16 @@ vcfplugin.o: vcfplugin.c config.h $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) vcfcall.o: vcfcall.c $(htslib_vcf_h) $(htslib_kfunc_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_str2int_h) $(bcftools_h) $(call_h) $(prob1_h) $(ploidy_h) $(gvcf_h) regidx.h $(vcfbuf_h) vcfconcat.o: vcfconcat.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_thread_pool_h) $(bcftools_h) vcfconvert.o: vcfconvert.c $(htslib_faidx_h) $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_kseq_h) $(bcftools_h) $(filter_h) $(convert_h) $(tsv2vcf_h) -vcffilter.o: vcffilter.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) 
$(bcftools_h) $(filter_h) rbuf.h +vcffilter.o: vcffilter.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) rbuf.h regidx.h vcfgtcheck.o: vcfgtcheck.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_kbitset_h) $(htslib_hts_os_h) $(bcftools_h) extsort.h vcfindex.o: vcfindex.c $(htslib_vcf_h) $(htslib_tbx_h) $(htslib_kstring_h) $(htslib_bgzf_h) $(bcftools_h) vcfisec.o: vcfisec.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcftools_h) $(filter_h) vcfmerge.o: vcfmerge.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) regidx.h $(bcftools_h) vcmp.h $(htslib_khash_h) vcfnorm.o: vcfnorm.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_faidx_h) $(htslib_khash_str2int_h) $(bcftools_h) rbuf.h abuf.h -vcfquery.o: vcfquery.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_str2int_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(convert_h) +vcfquery.o: vcfquery.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_str2int_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(convert_h) $(smpl_ilist_h) vcfroh.o: vcfroh.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(bcftools_h) HMM.h $(smpl_ilist_h) $(filter_h) vcfcnv.o: vcfcnv.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(htslib_khash_str2int_h) $(bcftools_h) HMM.h rbuf.h +vcfhead.o: vcfhead.c $(htslib_kstring_h) $(htslib_vcf_h) $(bcftools_h) vcfsom.o: vcfsom.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcftools_h) vcfsort.o: vcfsort.c $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_hts_os_h) kheap.h $(bcftools_h) vcfstats.o: vcfstats.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) $(bcftools_h) $(filter_h) bin.h dist.h diff --git a/NEWS b/NEWS index 7ea362c3f..173ea986c 
100644 --- a/NEWS +++ b/NEWS @@ -1,10 +1,106 @@ +## Release 1.15 (21st February 2022) + + +* New `bcftools head` subcommand for conveniently displaying the headers + of a VCF or BCF file. Without any options, this is equivalent to + `bcftools view --header-only --no-version` but more succinct and memorable. + +* The `-T, --targets-file` option had the following bug originating in HTSlib code: + when an uncompressed file with multiple columns CHR,POS,REF was provided, the + REF would be interpreted as 0 gigabases (#1598) + +Changes affecting specific commands: + +* bcftools annotate + + - In addition to `--rename-annots`, which requires a file with name mappings, + it is now possible to do the same on the command line `-c NEW_TAG:=OLD_TAG` + + - Add new option --min-overlap which allows to specify the minimum required + overlap of intersecting regions + + - Allow to transfer ALT from VCF with or without replacement using + bcftools annotate -a annots.vcf.gz -c ALT file.vcf.gz + bcftools annotate -a annots.vcf.gz -c +ALT file.vcf.gz + +* bcftools convert + + - Revamp of `--gensample`, `--hapsample` and `--haplegendsample` family of options + which includes the following changes: + + - New `--3N6` option to output/input the new version of the .gen file format, + see https://www.cog-genomics.org/plink/2.0/formats#gen + + - Deprecate the `--chrom` option in favor of `--3N6`. A simple `cut` command + can be used to convert from the new 3*M+6 column format to the format printed + with `--chrom` (`cut -d' ' -f1,3-`). + + - The CHROM:POS_REF_ALT IDs which are used to detect strand swaps are required + and must appear either in the "SNP ID" column or the "rsID" column. The column + is autodetected for `--gensample2vcf`, can be the first or the second for + `--hapsample2vcf` (depending on whether the `--vcf-ids` option is given), must be + the first for `--haplegendsample2vcf`. 
+ +* bcftools csq + + - Allow GFF files with phase column unset + +* bcftools filter + + - New `--mask`, `--mask-file` and `--mask-overlap` options to soft filter + variants in regions (#1635) + +* bcftools +fixref + + - The `-m id` option now works also for non-dbSNP ids, i.e. not just `rsINT` + + - New `-m flip-all` mode for flipping all sites, including ambiguous A/T and C/G sites + +* bcftools isec + + - Prevent segfault on sites filtered with -i/-e in all files (#1632) + +* bcftools mpileup + + - More flexible read filtering using the options + --ls, --skip-all-set .. skip reads with all of the FLAG bits set + --ns, --skip-any-set .. skip reads with any of the FLAG bits set + --lu, --skip-all-unset .. skip reads with all of the FLAG bits unset + --nu, --skip-any-unset .. skip reads with any of the FLAG bits unset + + The existing synonymous options will continue to function but their use + is discouraged + --rf, --incl-flags STR|INT Required flags: skip reads with mask bits unset + --ff, --excl-flags STR|INT Filter flags: skip reads with mask bits set + +* bcftools query + + - Make the `--samples` and `--samples-file` options work also in the `--list-samples` + mode. Add a new `--force-samples` option which allows to proceed even when some of + the requested samples are not present in the VCF (#1631) + +* bcftools +setGT + + - Fix a bug in `-t q -e EXPR` logic applied on FORMAT fields, sites with all + samples failing the expression EXPR were incorrectly skipped. 
This problem + affected only the use of `-e` logic, not the `-i` expressions (#1607) + +* bcftools sort + + - make use of the TMPDIR environment variable when defined + +* bcftools +trio-dnm2 + + - The --use-NAIVE mode now also adds the de novo allele in FORMAT/VA + + ## Release 1.14 (22nd October 2021) Changes affecting the whole of bcftools, or multiple commands: * New `--regions-overlap` and `--targets-overlap` options which address - a long-standing design problem with subsetting VCF files by region. + a long-standing design problem with subsetting VCF files by region. BCFtools recognize two sets of options, one for streaming (`-t/-T`) and one for index-gumping (`-r/-R`). They behave differently, the first includes only records with POS coordinate within the regions, the other @@ -32,11 +128,11 @@ Changes affecting specific commands: by using `-c INFO/END`. - add a new '.' modifier to control wheter missing values should be carried - over from a tab-delimited file or not. For example: + over from a tab-delimited file or not. For example: -c TAG .. adds TAG if the source value is not missing. If TAG exists in the target file, it will be overwritten - + -c .TAG .. adds TAG even if the source value is missing. This can overwrite non-missing values with a missing value and can create empty VCF fields (`TAG=.`) @@ -165,7 +261,7 @@ Changes affecting specific commands: * bcftools +fill-tags: - Generalization and better support for custom functions that allow - adding new INFO tags based on arbitrary `-i, --include` type of + adding new INFO tags based on arbitrary `-i, --include` type of expressions. 
For example, to calculate a missing INFO/DP annotation from FORMAT/AD, it is possible to use: @@ -229,7 +325,7 @@ Changes affecting specific commands: - Atomization of AD and QS tags now correctly updates occurrences of duplicate alleles within different haplotypes - + - Fix a bug in atomization of Number=A,R tags * bcftools reheader: @@ -241,7 +337,7 @@ Changes affecting specific commands: - A wider range of genotypes can be set by the plugin by allowing specifying custom genotypes. For example, to force a heterozygous genotype it is now possible to use expressions like: - + c:'m|M' c:0/1 c:0 @@ -253,7 +349,7 @@ Changes affecting specific commands: - Better handling of ambiguous keys such as INFO/AF and CSQ/AD. The `-p, --annot-prefix` option is now applied before doing anything else which allows its use with `-f, --format` and `-c, --columns` options. - + - Some consequence field names may not constitute a valid tag name, such as "pos(1-based)". Newly field names are trimmed to exclude brackets. @@ -383,7 +479,7 @@ Changes affecting specific commands: * bcftools csq: - - Fix a bug wich caused incorrect FORMAT/BCSQ formatting at sites with too + - Fix a bug wich caused incorrect FORMAT/BCSQ formatting at sites with too many per-sample consequences - Fix a bug which incorrectly handled the --ncsq parameter and could clash diff --git a/bam_sample.c b/bam_sample.c index a6da9432f..d8c10b8b3 100644 --- a/bam_sample.c +++ b/bam_sample.c @@ -1,7 +1,7 @@ /* bam_sample.c -- group data by sample. Copyright (C) 2010, 2011 Broad Institute. - Copyright (C) 2013, 2016-2018 Genome Research Ltd. + Copyright (C) 2013, 2016-2022 Genome Research Ltd. 
Author: Heng Li , Petr Danecek @@ -281,7 +281,7 @@ int bam_smpl_add_samples(bam_smpl_t *bsmpl, char *list, int is_file) int i, nsamples = 0; char **samples = hts_readlist(list, is_file, &nsamples); - if ( !nsamples ) return 0; + if ( !samples || !nsamples ) return 0; kstring_t ori = {0,0,0}; kstring_t ren = {0,0,0}; @@ -328,7 +328,7 @@ int bam_smpl_add_readgroups(bam_smpl_t *bsmpl, char *list, int is_file) int i, nrows = 0; char **rows = hts_readlist(list, is_file, &nrows); - if ( !nrows ) return 0; + if ( !rows || !nrows ) return 0; kstring_t fld1 = {0,0,0}; kstring_t fld2 = {0,0,0}; diff --git a/bcftools.h b/bcftools.h index b188e9805..a915802a8 100644 --- a/bcftools.h +++ b/bcftools.h @@ -1,6 +1,6 @@ /* bcftools.h -- utility function declarations. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2022 Genome Research Ltd. Author: Petr Danecek @@ -28,6 +28,7 @@ THE SOFTWARE. */ #include #include #include +#include #include #define FT_TAB_TEXT 0 // custom tab-delimited text file @@ -50,9 +51,11 @@ void error_errno(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd); const char *hts_bcf_wmode(int file_type); -const char *hts_bcf_wmode2(int file_type, char *fname); -void set_wmode(char dst[8], int file_type, char *fname, int compression_level); // clevel: 0-9 with or zb type, -1 unset +const char *hts_bcf_wmode2(int file_type, const char *fname); +void set_wmode(char dst[8], int file_type, const char *fname, int compression_level); // clevel: 0-9 with or zb type, -1 unset char *init_tmp_prefix(const char *prefix); +int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq); +int parse_overlap_option(const char *arg); void *smalloc(size_t size); // safe malloc diff --git a/bin.c b/bin.c index a4817cf45..645012ec8 100644 --- a/bin.c +++ b/bin.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2016 Genome Research Ltd. 
+ Copyright (c) 2016-2022 Genome Research Ltd. Author: Petr Danecek @@ -43,6 +43,7 @@ bin_t *bin_init(const char *list_def, float min, float max) int is_file = strchr(list_def,',') ? 0 : 1; int i, nlist; char **list = hts_readlist(list_def, is_file, &nlist); + if ( !list ) error("Error: failed to read %s\n",list_def); bin->nbins = nlist; bin->bins = (float*) malloc(sizeof(float)*nlist); for (i=0; irlen > args->fa_buf.l - idx ) { rec->rlen = args->fa_buf.l - idx; - alen = strlen(alt_allele); - if ( alen > rec->rlen ) + if ( alt_allele[0]!='<' ) { - alt_allele[rec->rlen] = 0; - fprintf(stderr,"Warning: trimming variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); + alen = strlen(alt_allele); + if ( alen > rec->rlen ) + { + fprintf(stderr,"Warning: trimming variant \"%s\" starting at %s:%"PRId64"\n", alt_allele,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); + alt_allele[rec->rlen] = 0; + } } } if ( idx>=args->fa_buf.l ) @@ -749,7 +752,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) // TODO: symbolic deletions probably need more work above with PICK_SHORT|PICK_LONG if ( strcasecmp(alt_allele,"") && strcasecmp(alt_allele,"<*>") && strcasecmp(alt_allele,"") ) - error("Symbolic alleles other than , <*> or are currently not supported, e.g. %s at %s:%"PRId64".\n" + error("Symbolic alleles other than , <*> or are currently not supported, e.g. 
\"%s\" at %s:%"PRId64".\n" "Please use filtering expressions to exclude such sites, for example by running with: -e 'ALT~\"<.*>\"'\n", alt_allele,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); if ( !strcasecmp(alt_allele,"") ) diff --git a/csq.c b/csq.c index 6217987e5..de0d7a9bb 100644 --- a/csq.c +++ b/csq.c @@ -331,6 +331,7 @@ const char *csq_strings[] = #define GF_UTR5 ((1<<(GF_coding_bit+1))+4) // GF_MAX = (1<<30)-1, see hap_node_t +#define CDS_PHASE_UNKN 3 typedef struct _tscript_t tscript_t; typedef struct { @@ -340,7 +341,7 @@ typedef struct // update hap_node_t.sbeg in hap_init, could be calculated on the fly) uint32_t len; // exon length uint32_t icds:30, // exon index within the transcript - phase:2; // offset of the CDS + phase:2; // offset of the CDS: 0,1,2 or 3 for unknown } gf_cds_t; typedef struct @@ -517,7 +518,7 @@ typedef struct uint32_t end; uint32_t trid; uint32_t strand:1; // STRAND_REV,STRAND_FWD - uint32_t phase:2; // 0, 1 or 2 + uint32_t phase:2; // 0, 1, 2, or 3 for unknown uint32_t iseq:29; } ftr_t; @@ -1051,7 +1052,7 @@ int gff_parse(args_t *args, char *line, ftr_t *ftr) if ( *ss == '0' ) ftr->phase = 0; else if ( *ss == '1' ) ftr->phase = 1; else if ( *ss == '2' ) ftr->phase = 2; - else if ( *ss == '.' ) ftr->phase = 0; // exons do not have phase + else if ( *ss == '.' 
) ftr->phase = CDS_PHASE_UNKN; // exons and even CDS in some GFFs do not have phase else { if ( args->verbosity > 0 ) fprintf(stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; } ss += 2; @@ -1132,6 +1133,7 @@ void tscript_init_cds(args_t *args) // Sort CDS in all transcripts, set offsets, check their phase, length, create index (idx_cds) khint_t k; + int warn_phase_unkn = 0; for (k=0; kid2tr); k++) { if ( !kh_exist(aux->id2tr, k) ) continue; @@ -1151,28 +1153,38 @@ void tscript_init_cds(args_t *args) int i, len = 0; if ( tr->strand==STRAND_FWD ) { - if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME; - tr->cds[0]->beg += tr->cds[0]->phase; - tr->cds[0]->len -= tr->cds[0]->phase; - tr->cds[0]->phase = 0; + if ( tr->cds[0]->phase != CDS_PHASE_UNKN ) + { + if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME; + tr->cds[0]->beg += tr->cds[0]->phase; + tr->cds[0]->len -= tr->cds[0]->phase; + tr->cds[0]->phase = 0; + } // sanity check phase; the phase number in gff tells us how many bases to skip in this // feature to reach the first base of the next codon int tscript_ok = 1; for (i=0; incds; i++) { + if ( tr->cds[i]->phase == CDS_PHASE_UNKN ) + { + warn_phase_unkn = 1; + len += tr->cds[i]->len; + continue; + } int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0; - if ( phase!=len%3) + if ( phase!=len%3 ) { if ( args->force ) { if ( args->verbosity > 0 ) - fprintf(stderr,"Warning: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); + fprintf(stderr,"Warning: the GFF has inconsistent phase column in transcript %s, skipping. CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n", + args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); tscript_ok = 0; break; } error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). 
Use the --force option to proceed anyway (at your own risk).\n", - args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); + args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); } len += tr->cds[i]->len; } @@ -1180,33 +1192,43 @@ void tscript_init_cds(args_t *args) } else { - // Check that the phase is not bigger than CDS length. Curiously, this can really happen, - // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141 - // todo: the same for the fwd strand - i = tr->ncds - 1; - int phase = tr->cds[i]->phase; - if ( phase ) tr->trim |= TRIM_5PRIME; - while ( i>=0 && phase > tr->cds[i]->len ) + if ( tr->cds[tr->ncds-1]->phase != CDS_PHASE_UNKN ) { - phase -= tr->cds[i]->len; + // Check that the phase is not bigger than CDS length. Curiously, this can really happen, + // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141 + // todo: the same for the fwd strand + i = tr->ncds - 1; + int phase = tr->cds[i]->phase; + if ( phase ) tr->trim |= TRIM_5PRIME; + while ( i>=0 && phase > tr->cds[i]->len ) + { + phase -= tr->cds[i]->len; + tr->cds[i]->phase = 0; + tr->cds[i]->len = 0; + i--; + } + tr->cds[i]->len -= tr->cds[i]->phase; tr->cds[i]->phase = 0; - tr->cds[i]->len = 0; - i--; } - tr->cds[i]->len -= tr->cds[i]->phase; - tr->cds[i]->phase = 0; // sanity check phase int tscript_ok = 1; for (i=tr->ncds-1; i>=0; i--) { + if ( tr->cds[i]->phase == CDS_PHASE_UNKN ) + { + warn_phase_unkn = 1; + len += tr->cds[i]->len; + continue; + } int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0; if ( phase!=len%3) { if ( args->force ) { if ( args->verbosity > 0 ) - fprintf(stderr,"Warning: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); + fprintf(stderr,"Warning: the GFF has inconsistent phase column in transcript %s, skipping. 
CDS pos=%d: phase!=len%%3 (phase=%d, len=%d)\n", + args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); tscript_ok = 0; break; } @@ -1282,6 +1304,8 @@ void tscript_init_cds(args_t *args) regidx_push(args->idx_cds, chr_beg,chr_end, tr->cds[i]->beg,tr->cds[i]->beg+tr->cds[i]->len-1, &tr->cds[i]); } } + if ( warn_phase_unkn && args->verbosity > 0 ) + fprintf(stderr,"Warning: encountered CDS with phase column unset, could not verify reading frame\n"); } void regidx_free_gf(void *payload) { free(*((gf_cds_t**)payload)); } @@ -4316,16 +4340,12 @@ int main_csq(int argc, char *argv[]) case 't': targets_list = optarg; break; case 'T': targets_list = optarg; targets_is_file = 1; break; case 4 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 5 : - if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + targets_overlap = parse_overlap_option(optarg); + if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 'h': case '?': error("%s",usage()); diff --git a/doc/bcftools.1 b/doc/bcftools.1 index 058f39ad3..a90ef17d9 100644 --- a/doc/bcftools.1 +++ b/doc/bcftools.1 @@ -2,12 +2,12 @@ .\" Title: bcftools .\" Author: [see the "AUTHOR(S)" section] .\" Generator: Asciidoctor 2.0.16.dev -.\" Date: 2021-10-22 +.\" Date: 2022-02-21 .\" Manual: \ \& .\" Source: \ \& .\" Language: English .\" -.TH "BCFTOOLS" "1" "2021-10-22" "\ \&" "\ \&" +.TH "BCFTOOLS" "1" "2022-02-21" "\ \&" "\ \&" .ie \n(.g .ds Aq \(aq .el .ds Aq ' .ss \n[.ss] 0 @@ 
-51,7 +51,7 @@ standard input (stdin) and outputs to the standard output (stdout). Several commands can thus be combined with Unix pipes. .SS "VERSION" .sp -This manual page was last updated \fB2021\-10\-22\fP and refers to bcftools git version \fB1.14\fP. +This manual page was last updated \fB2022\-02\-21\fP and refers to bcftools git version \fB1.15\fP. .SS "BCF1" .sp The BCF1 format output by versions of samtools <= 0.1.19 is \fBnot\fP @@ -187,6 +187,17 @@ list of available options, run \fBbcftools\fP \fICOMMAND\fP without arguments. . sp -1 . IP \(bu 2.3 .\} +\fBhead\fP .. view VCF/BCF file headers +.RE +.sp +.RS 4 +.ie n \{\ +\h'-04'\(bu\h'+03'\c +.\} +.el \{\ +. sp -1 +. IP \(bu 2.3 +.\} \fBindex\fP .. index VCF/BCF .RE .sp @@ -463,21 +474,23 @@ This option requires indexed VCF/BCF files. Note that \fB\-R\fP cannot be used in combination with \fB\-r\fP. .RE .sp -\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +\fB\-\-regions\-overlap\fP \fIpos\fP|\fIrecord\fP|\fIvariant\fP|\fI0\fP|\fI1\fP|\fI2\fP .RS 4 This option controls how overlapping records are determined: -set to \fB0\fP if the VCF record has to have POS inside a region +set to \fBpos\fP or \fB0\fP if the VCF record has to have POS inside a region (this corresponds to the default behavior of \fB\-t/\-T\fP); -set to \fB1\fP if also overlapping records with POS outside a region -should be included (this is the default behavior of \fB\-r/\-R\fP); or set -to \fB2\fP to include only true overlapping variation (compare +set to \fBrecord\fP or \fB1\fP if also overlapping records with POS outside a region +should be included (this is the default behavior of \fB\-r/\-R\fP, and includes indels +with POS at the end of a region, which are technically outside the region); or set +to \fBvariant\fP or \fB2\fP to include only true overlapping variation (compare the full VCF representation "\f(CRTA>T\-\fP" vs the true sequence variation "\f(CRA>\-\fP"). 
.RE .sp -\fB\-s, \-\-samples\fP [^]\fILIST\fP +\fB\-s, \-\-samples\fP [^]\fILIST\fP .RS 4 Comma\-separated list of samples to include or exclude if prefixed -with "^". +with "^". (Note that when multiple samples are to be excluded, +the "^" prefix is still present only once, e.g. "^SAMPLE1,SAMPLE2".) The sample order is updated to reflect that given on the command line. Note that in general tags such as INFO/AC, INFO/AN, etc are not updated to correspond to the subset samples. \fBbcftools view\fP is the @@ -495,9 +508,9 @@ that command. For example: .fi .if n .RE .sp -\fB\-S, \-\-samples\-file\fP \fIFILE\fP +\fB\-S, \-\-samples\-file\fP [^]\fIFILE\fP .RS 4 -File of sample names to include or exclude if prefixed with "^". +File of sample names to include or exclude if prefixed with "^". One sample per line. See also the note above for the \fB\-s, \-\-samples\fP option. The sample order is updated to reflect that given in the input file. @@ -541,20 +554,21 @@ The program ignores the first column and the last indicates sex (1=male, 2=femal .fi .if n .RE .sp -\fB\-t, \-\-targets\fP [^]\fIchr\fP|\fIchr:pos\fP|\fIchr:from\-to\fP|\fIchr:from\-\fP[,...] +\fB\-t, \-\-targets\fP [^]\fIchr\fP|\fIchr:pos\fP|\fIchr:from\-to\fP|\fIchr:from\-\fP[,...] .RS 4 Similar as \fB\-r, \-\-regions\fP, but the next position is accessed by streaming the whole VCF/BCF rather than using the tbi/csi index. Both \fB\-r\fP and \fB\-t\fP options can be applied simultaneously: \fB\-r\fP uses the index to jump to a region and \fB\-t\fP discards positions which are not in the targets. Unlike \fB\-r\fP, targets -can be prefixed with "^" to request logical complement. For example, "^X,Y,MT" +can be prefixed with "^" to request logical complement. For example, "^X,Y,MT" indicates that sequences X, Y and MT should be skipped. 
Yet another difference between the \fB\-t/\-T\fP and \fB\-r/\-R\fP is that \fB\-r/\-R\fP checks for proper overlaps and considers both POS and the end position of an indel, while \fB\-t/\-T\fP -considers the POS coordinate only. Note that \fB\-t\fP cannot be used in combination with \fB\-T\fP. +considers the POS coordinate only (by default; see also \fB\-\-regions\-overlap\fP and \fB\-\-targets\-overlap\fP). +Note that \fB\-t\fP cannot be used in combination with \fB\-T\fP. .RE .sp -\fB\-T, \-\-targets\-file\fP [^]\fIFILE\fP +\fB\-T, \-\-targets\-file\fP [^]\fIFILE\fP .RS 4 Same \fB\-t, \-\-targets\fP, but reads regions from a file. Note that \fB\-T\fP cannot be used in combination with \fB\-t\fP. @@ -573,7 +587,7 @@ Such a file can be easily created from a VCF using: .fi .if n .RE .sp -\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +\fB\-\-targets\-overlap\fP \fIpos\fP|\fIrecord\fP|\fIvariant\fP|\fI0\fP|\fI1\fP|\fI2\fP .RS 4 Same as \fB\-\-regions\-overlap\fP but for \fB\-t/\-T\fP. .RE @@ -635,13 +649,6 @@ See also \fB\-c, \-\-columns\fP and \fB\-h, \-\-header\-lines\fP. .fi .if n .RE .sp -\fB\-\-collapse\fP \fIsnps\fP|\fIindels\fP|\fIboth\fP|\fIall\fP|\fIsome\fP|\fInone\fP -.RS 4 -Controls how to match records from the annotation file to the target VCF. -Effective only when \fB\-a\fP is a VCF or BCF. -See \fBCommon Options\fP for more. -.RE -.sp \fB\-c, \-\-columns\fP \fIlist\fP .RS 4 Comma\-separated list of columns or tags to carry over from the annotation file @@ -663,7 +670,7 @@ The imported VCF annotations can be renamed as "DST_TAG:=SRC_TAG" or "FMT/DST_TA \~ .br To carry over all INFO annotations, use "INFO". To add all INFO annotations except -"TAG", use "^INFO/TAG". By default, existing values are replaced. +"TAG", use "^INFO/TAG". By default, existing values are replaced. \~ .br \~ @@ -776,6 +783,15 @@ This is an experimental feature. 
annotate sites which are present ("+") or absent ("\-") in the \fB\-a\fP file with a new INFO/TAG flag .RE .sp +\fB\-\-min\-overlap\fP \fIANN\fP:\fIVCF\fP +.RS 4 +minimum overlap required as a fraction of the variant in the annotation \fB\-a\fP file (\fIANN\fP), in the +target VCF file (\fI:VCF\fP), or both for reciprocal overlap (\fIANN:VCF\fP). +By default overlaps of arbitrary length are sufficient. +The option can be used only with the tab\-delimited annotation \fB\-a\fP file and with \fIBEG\fP and \fIEND\fP +columns present. +.RE +.sp \fB\-\-no\-version\fP .RS 4 see \fBCommon Options\fP @@ -791,6 +807,14 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-\-pair\-logic\fP \fIsnps\fP|\fIindels\fP|\fIboth\fP|\fIall\fP|\fIsome\fP|\fIexact\fP +.RS 4 +Controls how to match records from the annotation file to the target VCF. +Effective only when \fB\-a\fP is a VCF or BCF. The option replaces the former +unintuitive \fB\-\-collapse\fP. +See \fBCommon Options\fP for more. +.RE +.sp \fB\-r, \-\-regions\fP \fIchr\fP|\fIchr:pos\fP|\fIchr:from\-to\fP|\fIchr:from\-\fP[,...] .RS 4 see \fBCommon Options\fP @@ -801,6 +825,11 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE +.sp \fB\-\-rename\-annots\fP \fIfile\fP .RS 4 rename annotations according to the map in \fIfile\fP, with @@ -816,7 +845,7 @@ rename chromosomes according to the map in \fIfile\fP, with line. .RE .sp -\fB\-s, \-\-samples\fP [^]\fILIST\fP +\fB\-s, \-\-samples\fP [^]\fILIST\fP .RS 4 subset of samples to annotate, see also \fBCommon Options\fP .RE @@ -848,7 +877,7 @@ List of annotations to remove. Use "FILTER" to remove all filters or "FILTER/SomeFilter" to remove a specific filter. Similarly, "INFO" can be used to remove all INFO tags and "FORMAT" to remove all FORMAT tags except GT. 
To remove all INFO tags except "FOO" and "BAR", use -"^INFO/FOO,INFO/BAR" (and similarly for FORMAT and FILTER). +"^INFO/FOO,INFO/BAR" (and similarly for FORMAT and FILTER). "INFO" can be abbreviated to "INF" and "FORMAT" to "FMT". .RE .sp @@ -954,6 +983,11 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE +.sp \fB\-s, \-\-samples\fP \fILIST\fP .RS 4 see \fBCommon Options\fP @@ -974,6 +1008,11 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE +.sp \fB\-\-threads\fP \fIINT\fP .RS 4 see \fBCommon Options\fP @@ -990,7 +1029,7 @@ appear in any of the genotypes .RS 4 comma\-separated list of FORMAT fields to output for each sample. Currently GQ and GP fields are supported. For convenience, the fields can be given -as lower case letters. Prefixed with "^" indicates a request for tag +as lower case letters. Prefixed with "^" indicates a request for tag removal of auxiliary tags useful only for calling. .RE .sp @@ -1165,6 +1204,11 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE +.sp \fB\-s, \-\-query\-sample\fP \fIstring\fP .RS 4 query sample name @@ -1179,6 +1223,11 @@ see \fBCommon Options\fP .RS 4 see \fBCommon Options\fP .RE +.sp +\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE .SS "HMM Options:" .sp \fB\-a, \-\-aberrant\fP \fIfloat\fP[,\fIfloat\fP] @@ -1343,6 +1392,11 @@ see \fBCommon Options\fP. Requires \fB\-a, \-\-allow\-overlaps\fP. see \fBCommon Options\fP. Requires \fB\-a, \-\-allow\-overlaps\fP. 
.RE .sp +\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE +.sp \fB\-\-threads\fP \fIINT\fP .RS 4 see \fBCommon Options\fP @@ -1357,6 +1411,11 @@ Note that the program does not act as a primitive variant caller and ignores all depth information, such as INFO/AD or FORMAT/AD. For that, consider using the \fBsetGT\fP plugin. .sp +\fB\-a, \-\-absent\fP \fICHAR\fP +.RS 4 +replace positions absent from VCF with CHAR +.RE +.sp \fB\-c, \-\-chain\fP \fIFILE\fP .RS 4 write a chain file for liftover @@ -1519,6 +1578,11 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE +.sp \fB\-s, \-\-samples\fP \fILIST\fP .RS 4 see \fBCommon Options\fP @@ -1538,6 +1602,11 @@ see \fBCommon Options\fP .RS 4 see \fBCommon Options\fP .RE +.sp +\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE .SS "VCF output options:" .sp \fB\-\-no\-version\fP @@ -1563,10 +1632,17 @@ see \fBCommon Options\fP .sp \fB\-G, \-\-gensample2vcf\fP \fIprefix\fP or \fIgen\-file\fP,\fIsample\-file\fP .RS 4 -convert IMPUTE2 output to VCF. The second column must be of the form -"CHROM:POS_REF_ALT" to detect possible strand swaps; IMPUTE2 leaves the -first one empty ("\-\-") when sites from reference panel are filled in. See -also \fB\-g\fP below. +convert IMPUTE2 output to VCF. One of the ID columns ("SNP ID" or "rsID" in +.URL "https://www.cog\-genomics.org/plink/2.0/formats#gen" "" ")" +must be of the form +"CHROM:POS_REF_ALT" to detect possible strand swaps. +\~ +.br +When the \fB\-\-vcf\-ids\fP option is given, the other column (autodetected) is used +to fill the ID column of the VCF. +\~ +.br +See also \fB\-g\fP and \fB\-\-3N6\fP options. .RE .sp \fB\-g, \-\-gensample\fP \fIprefix\fP or \fIgen\-file\fP,\fIsample\-file\fP @@ -1575,14 +1651,31 @@ convert from VCF to gen/sample format used by IMPUTE2 and SHAPEIT. 
The columns of .gen file format are ID1,ID2,POS,A,B followed by three genotype probabilities P(AA), P(AB), P(BB) for each sample. In order to prevent strand swaps, the program uses IDs of the form "CHROM:POS_REF_ALT". -For example: +When the \fB\-\-vcf\-ids\fP option is given, the second column is set to match the ID +column of the VCF. +\~ +.br +See also \fB\-G\fP and \fB\-\-3N6\fP options. +\~ +.br +The file .gen and .sample file format are: .RE .sp .if n .RS 4 .nf .fam C - .gen - \-\-\-\- + .gen (with \-\-3N6 \-\-vcf\-ids) + \-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\- + chr1 1:111485207_G_A rsID1 111485207 G A 0 1 0 0 1 0 + chr1 1:111494194_C_T rsID2 111494194 C T 0 1 0 0 0 1 + + .gen (with \-\-vcf\-ids) + \-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\- + 1:111485207_G_A rsID1 111485207 G A 0 1 0 0 1 0 + 1:111494194_C_T rsID2 111494194 C T 0 1 0 0 0 1 + + .gen (the default) + \-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\- 1:111485207_G_A 1:111485207_G_A 111485207 G A 0 1 0 0 1 0 1:111494194_C_T 1:111494194_C_T 111494194 C T 0 1 0 0 0 1 @@ -1596,14 +1689,16 @@ For example: .fi .if n .RE .sp -\fB\-\-tag\fP \fISTRING\fP +\fB\-\-3N6\fP .RS 4 -tag to take values for .gen file: GT,PL,GL,GP +Expect/Create files in the 3*N+6 column format. This is the new .gen file format with the first +column containing the chromosome name, see \c +.URL "https://www.cog\-genomics.org/plink/2.0/formats#gen" "" "" .RE .sp -\fB\-\-chrom\fP +\fB\-\-tag\fP \fISTRING\fP .RS 4 -output chromosome in the first column instead of CHROM:POS_REF_ALT +tag to take values for .gen file: GT,PL,GL,GP .RE .sp \fB\-\-sex\fP \fIFILE\fP @@ -1645,19 +1740,23 @@ reference sequence in fasta format. Must be indexed with samtools faidx .RS 4 convert from hap/sample format to VCF. The columns of .hap file are similar to .gen file above, but there are only two haplotype columns per -sample. 
Note that the first column of the .hap file is expected to be in -the form "CHR:POS_REF_ALT(_END)?", with the _END being optional for -defining the INFO/END tag when ALT is a symbolic allele, for example: +sample. Note that the first or the second column of the .hap file is expected to be in +the form "CHR:POS_REF_ALT[_END]", with the _END being optional for +defining the INFO/END tag when ALT is a symbolic allele. For example: .RE .sp .if n .RS 4 .nf .fam C - .hap - \-\-\-\- + .hap (with \-\-vcf\-ids) + \-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\- 1:111485207_G_A rsID1 111485207 G A 0 1 0 0 - 1:111494194_C_T rsID2 111494194 C T 0 1 0 0 1:111495231_A__111495784 rsID3 111495231 A 0 0 1 0 + + .hap (the default) + \-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\- + 1 1:111485207_G_A 111485207 G A 0 1 0 0 + 1 1:111495231_A__111495784 111495231 A 0 0 1 0 .fam .fi .if n .RE @@ -1694,22 +1793,26 @@ output sex column in the sample file. The FILE format is .sp \fB\-\-vcf\-ids\fP .RS 4 -output VCF IDs instead of "CHROM:POS_REF_ALT" IDs +the second column of the .hap file holds the VCF ids, the first +column is of the form "CHR:POS_REF_ALT[_END]". Without the option, +the format follows \c +.URL "https://www.cog\-genomics.org/plink/2.0/formats#haps" "" +with ids (the second column) of the form "CHR:POS_REF_ALT[_END]" .RE .SS "HAP/LEGEND/SAMPLE conversion:" .sp \fB\-H, \-\-haplegendsample2vcf\fP \fIprefix\fP or \fIhap\-file\fP,\fIlegend\-file\fP,\fIsample\-file\fP .RS 4 -convert from hap/legend/sample format used by IMPUTE2 to VCF, see -also \fB\-h, \-\-hapslegendsample\fP below. +convert from hap/legend/sample format used by IMPUTE2 to VCF. +See also \fB\-h, \-\-hapslegendsample\fP below. .RE .sp \fB\-h, \-\-haplegendsample\fP \fIprefix\fP or \fIhap\-file\fP,\fIlegend\-file\fP,\fIsample\-file\fP .RS 4 convert from VCF to hap/legend/sample format used by IMPUTE2 and SHAPEIT. The columns of .legend file ID,POS,REF,ALT. 
In order to prevent strand -swaps, the program uses IDs of the form "CHROM:POS_REF_ALT". The .sample -file is quite basic at the moment with columns for population, group and +swaps, the program uses IDs of the form "CHROM:POS_REF_ALT". +The .sample file is quite basic at the moment with columns for population, group and sex expected to be edited by the user. For example: .RE .sp @@ -1760,7 +1863,8 @@ output sex column in the sample file. The FILE format is .sp \fB\-\-vcf\-ids\fP .RS 4 -output VCF IDs instead of "CHROM:POS_REF_ALT" IDs +output VCF IDs instead of "CHROM:POS_REF_ALT". Note that this option can +be used with \fB\-\-haplegendsample\fP but not with \fB\-\-haplegendsample2vcf\fP. .RE .SS "TSV conversion:" .sp @@ -1993,6 +2097,11 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE +.sp \fB\-s, \-\-samples\fP \fILIST\fP .RS 4 samples to include or "\-" to apply all variants and ignore samples @@ -2013,6 +2122,11 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE +.sp \fBExamples:\fP .sp .if n .RS 4 @@ -2123,6 +2237,22 @@ include only sites for which \fIEXPRESSION\fP is true. For valid expressions see \fBEXPRESSIONS\fP. .RE .sp +\fB\-\-mask\fP [^]\fIREGION\fP +.RS 4 +Soft filter regions, prepepend "^" to negate. Requires \fB\-s, \-\-soft\-filter\fP. +.RE +.sp +\fB\-M, \-\-mask\-file\fP [^]\fIFILE\fP +.RS 4 +Soft filter regions listed in a file, "^" to negate. Requires \fB\-s, \-\-soft\-filter\fP. +.RE +.sp +\fB\-\-mask\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +Same as \fB\-\-regions\-overlap\fP but for \fB\-\-mask/\-\-mask\-file\fP. +See \fBCommon Options\fP. [1] +.RE +.sp \fB\-m, \-\-mode\fP [\fI+x\fP] .RS 4 define behaviour at sites with existing FILTER annotations. 
The default @@ -2158,6 +2288,11 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE +.sp \fB\-s, \-\-soft\-filter\fP \fISTRING\fP|\fI+\fP .RS 4 annotate FILTER column with \fISTRING\fP or, with \fI+\fP, a unique filter name generated @@ -2179,6 +2314,11 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE +.sp \fB\-\-threads\fP \fIINT\fP .RS 4 see \fBCommon Options\fP @@ -2262,6 +2402,11 @@ Restrict to comma\-separated list of regions, see \fBCommon Options\fP Restrict to regions listed in a file, see \fBCommon Options\fP .RE .sp +\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE +.sp \fB\-s, \-\-samples\fP [\fIqry\fP|\fIgt\fP]:\*(AqLIST\*(Aq: List of query samples or \fB\-g\fP samples. If neither \fB\-s\fP nor \fB\-S\fP are given, all possible sample pair combinations are compared @@ -2280,6 +2425,11 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE +.sp \fB\-u, \-\-use\fP \fITAG1\fP[,\fITAG2\fP] .RS 4 specifies which tag to use in the query file (\fITAG1\fP) and the \fB\-g\fP (\fITAG2\fP) file. @@ -2303,12 +2453,35 @@ available. .fam .fi .if n .RE +.SS "bcftools head [\fIOPTIONS\fP] [\fIFILE\fP]" +.sp +By default, prints all headers from the specified input file to standard output +in VCF format. The input file may be in VCF or BCF format; if no \fIFILE\fP is +specified, standard input will be read. With appropriate options, only some +of the headers and/or additionally some of the variant records will be printed. 
+.sp +The \fBbcftools head\fP command outputs VCF headers almost exactly as they appear +in the input file: it may add a \f(CR##FILTER=\fP header if not already +present, but it never adds version or command line information itself. +.SS "Options:" +.sp +\fB\-h, \-\-header\fP \fIINT\fP +.RS 4 +Display only the first \fIINT\fP header lines. +By default, all header lines are displayed. +.RE +.sp +\fB\-n, \-\-records\fP \fIINT\fP +.RS 4 +Also display the first \fIINT\fP variant records. +By default, no variant records are displayed. +.RE .SS "bcftools index [\fIOPTIONS\fP] \fIin.bcf\fP|\fIin.vcf.gz\fP" .sp Creates index for bgzip compressed VCF/BCF files for random access. CSI (coordinate\-sorted index) is created by default. The CSI format -supports indexing of chromosomes up to length 2^31. TBI (tabix index) -index files, which support chromosome lengths up to 2^29, can be +supports indexing of chromosomes up to length 2^31. TBI (tabix index) +index files, which support chromosome lengths up to 2^29, can be created by using the \fI\-t/\-\-tbi\fP option or using the \fItabix\fP program packaged with htslib. When loading an index file, bcftools will try the CSI first and then the TBI. @@ -2326,7 +2499,7 @@ overwrite index if it already exists .sp \fB\-m, \-\-min\-shift \fIINT\fP\fP .RS 4 -set minimal interval size for CSI indices to 2^INT; default: 14 +set minimal interval size for CSI indices to 2^INT; default: 14 .RE .sp \fB\-o, \-\-output \fIFILE\fP\fP @@ -2428,6 +2601,11 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE +.sp \fB\-t, \-\-targets\fP \fIchr\fP|\fIchr:pos\fP|\fIchr:from\-to\fP|\fIchr:from\-\fP[,...] 
.RS 4 see \fBCommon Options\fP @@ -2438,6 +2616,11 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE +.sp \fB\-w, \-\-write\fP \fILIST\fP .RS 4 list of input files to output given as 1\-based indices. With \fB\-p\fP and no @@ -2633,6 +2816,11 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE +.sp \fB\-\-threads\fP \fIINT\fP .RS 4 see \fBCommon Options\fP @@ -2743,7 +2931,7 @@ Do not require the \fB\-\-fasta\-ref\fP option. .sp \fB\-G, \-\-read\-groups\fP \fIFILE\fP .RS 4 -list of read groups to include or exclude if prefixed with "^". +list of read groups to include or exclude if prefixed with "^". One read group per line. This file can also be used to assign new sample names to read groups by giving the new sample name as a second white\-space\-separated field, like this: "read_group_id new_sample_name". @@ -2780,7 +2968,7 @@ Minimum mapping quality for an alignment to be used [0] Minimum base quality for a base to be considered [13] .RE .sp -* \-\-max\-BQ* \fIINT\fP +\fB\-\-max\-BQ\fP \fIINT\fP .RS 4 Caps the base quality to a maximum value [60]. This can be particularly useful on technologies that produce overly optimistic @@ -2801,19 +2989,36 @@ As for \fB\-r, \-\-regions\fP, but regions read from FILE; see \fBCommon Options\fP .RE .sp +\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE +.sp \fB\-\-ignore\-RG\fP .RS 4 Ignore RG tags. Treat all reads in one alignment file as one sample. .RE .sp -\fB\-\-rf, \-\-incl\-flags\fP \fISTR\fP|\fIINT\fP +\fB\-\-ls, \-\-skip\-all\-set\fP +.RS 4 +Skip reads with all of the FLAG bits set [null] +.RE +.sp +\fB\-\-ns, \-\-skip\-any\-set\fP .RS 4 -Required flags: skip reads with mask bits unset [null] +Skip reads with any of the FLAG bits set. 
This option replaces and +is synonymous to the deprecated \fB\-\-ff, \-\-excl\-flags\fP [UNMAP,SECONDARY,QCFAIL,DUP] .RE .sp -\fB\-\-ff, \-\-excl\-flags\fP \fISTR\fP|\fIINT\fP +\fB\-\-lu, \-\-skip\-all\-unset\fP .RS 4 -Filter flags: skip reads with mask bits set [UNMAP,SECONDARY,QCFAIL,DUP] +Skip reads with all of the FLAG bits unset. This option replaces and +is synonymous to the deprecated \fB\-\-rf, \-\-incl\-flags\fP [null] +.RE +.sp +\fB\-\-nu, \-\-skip\-any\-unset\fP +.RS 4 +Skip reads with any of the FLAG bits unset [null] .RE .sp \fB\-s, \-\-samples\fP \fILIST\fP @@ -2823,7 +3028,7 @@ list of sample names. See \fBCommon Options\fP .sp \fB\-S, \-\-samples\-file\fP \fIFILE\fP .RS 4 -file of sample names to include or exclude if prefixed with "^". +file of sample names to include or exclude if prefixed with "^". One sample per line. This file can also be used to rename samples by giving the new sample name as a second white\-space\-separated column, like this: "old_name new_name". If a sample name contains spaces, the spaces can be @@ -2841,6 +3046,11 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE +.sp \fB\-x, \-\-ignore\-overlaps\fP .RS 4 Disable read\-pair overlap detection. 
@@ -3218,6 +3428,11 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE +.sp \fB\-s, \-\-strict\-filter\fP .RS 4 when merging (\fI\-m+\fP), merged site is PASS only if all sites being merged PASS @@ -3233,6 +3448,11 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE +.sp \fB\-\-threads\fP \fIINT\fP .RS 4 see \fBCommon Options\fP @@ -3276,6 +3496,11 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE +.sp \fB\-t, \-\-targets\fP \fIchr\fP|\fIchr:pos\fP|\fIchr:from\-to\fP|\fIchr:from\-\fP[,...] .RS 4 see \fBCommon Options\fP @@ -3285,6 +3510,11 @@ see \fBCommon Options\fP .RS 4 see \fBCommon Options\fP .RE +.sp +\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE .SS "VCF output options:" .sp \fB\-\-no\-version\fP @@ -3867,6 +4097,11 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE +.sp \fB\-s, \-\-sample\fP \fIstring\fP .RS 4 sample name @@ -3882,6 +4117,11 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE +.sp \fB\-v, \-\-verbose\fP .RS 4 verbose debugging output which gives hints about the thresholds and decisions made @@ -3936,6 +4176,11 @@ exclude sites for which \fIEXPRESSION\fP is true. For valid expressions see \fBEXPRESSIONS\fP. 
.RE .sp +\fB\-\-force\-samples\fP +.RS 4 +continue even when some samples requested via \fB\-s/\-S\fP do not exist +.RE +.sp \fB\-f, \-\-format\fP \fIFORMAT\fP .RS 4 learn by example, see below @@ -3972,6 +4217,11 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE +.sp \fB\-s, \-\-samples\fP \fILIST\fP .RS 4 see \fBCommon Options\fP @@ -3992,6 +4242,11 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE +.sp \fB\-u, \-\-allow\-undef\-tags\fP .RS 4 do not throw an error if there are undefined tags in the format string, @@ -4303,6 +4558,11 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE +.sp \fB\-s, \-\-samples\fP \fILIST\fP .RS 4 see \fBCommon Options\fP @@ -4322,6 +4582,11 @@ see \fBCommon Options\fP .RS 4 see \fBCommon Options\fP .RE +.sp +\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE .SS "HMM Options:" .sp \fB\-a, \-\-hw\-to\-az\fP \fIFLOAT\fP @@ -4461,6 +4726,11 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE +.sp \fB\-s, \-\-samples\fP \fILIST\fP .RS 4 see \fBCommon Options\fP @@ -4481,6 +4751,11 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE +.sp \fB\-u, \-\-user\-tstv\fP \fI\fP .RS 4 collect Ts/Tv stats for any tag using the given binning [0:1:100] @@ -4503,7 +4778,7 @@ drop individual genotype information (after subsetting if \fB\-s\fP option is se .sp \fB\-h, \-\-header\-only\fP .RS 4 -output the VCF header only +output the VCF header only (see also \fBbcftools head\fP) .RE .sp \fB\-H, \-\-no\-header\fP @@ -4546,6 
+4821,11 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-\-regions\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE +.sp \fB\-t, \-\-targets\fP \fIchr\fP|\fIchr:pos\fP|\fIchr:from\-to\fP|\fIchr:from\-\fP[,...] .RS 4 see \fBCommon Options\fP @@ -4556,6 +4836,11 @@ see \fBCommon Options\fP see \fBCommon Options\fP .RE .sp +\fB\-\-targets\-overlap\fP \fI0\fP|\fI1\fP|\fI2\fP +.RS 4 +see \fBCommon Options\fP +.RE +.sp \fB\-\-threads\fP \fIINT\fP .RS 4 see \fBCommon Options\fP @@ -4637,11 +4922,11 @@ exclude sites for which \fIEXPRESSION\fP is true. For valid expressions see see \fBCommon Options\fP .RE .sp -\fB\-g, \-\-genotype\fP [^][\fIhom\fP|\fIhet\fP|\fImiss\fP] +\fB\-g, \-\-genotype\fP [^][\fIhom\fP|\fIhet\fP|\fImiss\fP] .RS 4 include only sites with one or more homozygous (\fIhom\fP), heterozygous -(\fIhet\fP) or missing (\fImiss\fP) genotypes. When prefixed with \fI^\fP, the logic -is reversed; thus \fI^het\fP excludes sites with heterozygous genotypes. +(\fIhet\fP) or missing (\fImiss\fP) genotypes. When prefixed with \fI^\fP, the logic +is reversed; thus \fI^het\fP excludes sites with heterozygous genotypes. .RE .sp \fB\-i, \-\-include\fP \fIEXPRESSION\fP diff --git a/doc/bcftools.html b/doc/bcftools.html index 5bfc82986..febeaa9c9 100644 --- a/doc/bcftools.html +++ b/doc/bcftools.html @@ -50,7 +50,7 @@

DESCRIPTION

VERSION

-

This manual page was last updated 2021-10-22 and refers to bcftools git version 1.14.

+

This manual page was last updated 2022-02-21 and refers to bcftools git version 1.15.

@@ -118,6 +118,9 @@

LIST OF COMMANDS

gtcheck .. check sample concordance, detect sample swaps and contamination

  • +

    head .. view VCF/BCF file headers

    +
  • +
  • index .. index VCF/BCF

  • @@ -297,20 +300,22 @@

    Common Options

    This option requires indexed VCF/BCF files. Note that -R cannot be used in combination with -r.

    -
    --regions-overlap 0|1|2
    +
    --regions-overlap pos|record|variant|0|1|2

    This option controls how overlapping records are determined: -set to 0 if the VCF record has to have POS inside a region +set to pos or 0 if the VCF record has to have POS inside a region (this corresponds to the default behavior of -t/-T); -set to 1 if also overlapping records with POS outside a region -should be included (this is the default behavior of -r/-R); or set -to 2 to include only true overlapping variation (compare +set to record or 1 if also overlapping records with POS outside a region +should be included (this is the default behavior of -r/-R, and includes indels +with POS at the end of a region, which are technically outside the region); or set +to variant or 2 to include only true overlapping variation (compare the full VCF representation "TA>T-" vs the true sequence variation "A>-").

    -
    -s, --samples [^]LIST
    +
    -s, --samples [^]LIST

    Comma-separated list of samples to include or exclude if prefixed -with "^". +with "^". (Note that when multiple samples are to be excluded, +the "^" prefix is still present only once, e.g. "^SAMPLE1,SAMPLE2".) The sample order is updated to reflect that given on the command line. Note that in general tags such as INFO/AC, INFO/AN, etc are not updated to correspond to the subset samples. bcftools view is the @@ -328,9 +333,9 @@

    Common Options

  • -
    -S, --samples-file FILE
    +
    -S, --samples-file [^]FILE
    -

    File of sample names to include or exclude if prefixed with "^". +

    File of sample names to include or exclude if prefixed with "^". One sample per line. See also the note above for the -s, --samples option. The sample order is updated to reflect that given in the input file. @@ -370,19 +375,20 @@

    Common Options

    -
    -t, --targets [^]chr|chr:pos|chr:from-to|chr:from-[,…​]
    +
    -t, --targets [^]chr|chr:pos|chr:from-to|chr:from-[,…​]

    Similar as -r, --regions, but the next position is accessed by streaming the whole VCF/BCF rather than using the tbi/csi index. Both -r and -t options can be applied simultaneously: -r uses the index to jump to a region and -t discards positions which are not in the targets. Unlike -r, targets -can be prefixed with "^" to request logical complement. For example, "^X,Y,MT" +can be prefixed with "^" to request logical complement. For example, "^X,Y,MT" indicates that sequences X, Y and MT should be skipped. Yet another difference between the -t/-T and -r/-R is that -r/-R checks for proper overlaps and considers both POS and the end position of an indel, while -t/-T -considers the POS coordinate only. Note that -t cannot be used in combination with -T.

    +considers the POS coordinate only (by default; see also --regions-overlap and --targets-overlap). +Note that -t cannot be used in combination with -T.

    -
    -T, --targets-file [^]FILE
    +
    -T, --targets-file [^]FILE

    Same -t, --targets, but reads regions from a file. Note that -T cannot be used in combination with -t.

    @@ -402,7 +408,7 @@

    Common Options

    -
    --targets-overlap 0|1|2
    +
    --targets-overlap pos|record|variant|0|1|2

    Same as --regions-overlap but for -t/-T.

    @@ -463,12 +469,6 @@

    bcftools annotate [OPTIONS] FILE

    -
    --collapse snps|indels|both|all|some|none
    -
    -

    Controls how to match records from the annotation file to the target VCF. -Effective only when -a is a VCF or BCF. -See Common Options for more.

    -
    -c, --columns list

    Comma-separated list of columns or tags to carry over from the annotation file @@ -486,7 +486,7 @@

    bcftools annotate [OPTIONS] FILE

     
     
    To carry over all INFO annotations, use "INFO". To add all INFO annotations except -"TAG", use "^INFO/TAG". By default, existing values are replaced. +"TAG", use "^INFO/TAG". By default, existing values are replaced.  
     
    By default, existing tags are overwritten unless the source value is a missing value (i.e. "."). @@ -541,7 +541,7 @@

    bcftools annotate [OPTIONS] FILE

    -
    -I, --set-id [+]FORMAT
    +
    -I, --set-id [+]FORMAT

    assign ID on the fly. The format is the same as in the query command (see below). By default all existing IDs are replaced. If the @@ -585,6 +585,14 @@

    bcftools annotate [OPTIONS] FILE

    annotate sites which are present ("+") or absent ("-") in the -a file with a new INFO/TAG flag

    +
    --min-overlap ANN:'VCF'
    +
    +

    minimum overlap required as a fraction of the variant in the annotation -a file (ANN), in the +target VCF file (:VCF), or both for reciprocal overlap (ANN:VCF). +By default overlaps of arbitrary length are sufficient. +The option can be used only with the tab-delimited annotation -a file and with BEG and END +columns present.

    +
    --no-version

    see Common Options

    @@ -597,6 +605,13 @@

    bcftools annotate [OPTIONS] FILE

    see Common Options

    +
    --pair-logic snps|indels|both|all|some|exact
    +
    +

    Controls how to match records from the annotation file to the target VCF. +Effective only when -a is a VCF or BCF. The option replaces the former +unintuitive --collapse. +See Common Options for more.

    +
    -r, --regions chr|chr:pos|chr:from-to|chr:from-[,…​]

    see Common Options

    @@ -605,6 +620,10 @@

    bcftools annotate [OPTIONS] FILE

    see Common Options

    +
    --regions-overlap 0|1|2
    +
    +

    see Common Options

    +
    --rename-annots file

    rename annotations according to the map in file, with @@ -618,7 +637,7 @@

    bcftools annotate [OPTIONS] FILE

    "old_name new_name\n" pairs separated by whitespaces, each on a separate line.

    -
    -s, --samples [^]LIST
    +
    -s, --samples [^]LIST

    subset of samples to annotate, see also Common Options

    @@ -646,7 +665,7 @@

    bcftools annotate [OPTIONS] FILE

    "FILTER/SomeFilter" to remove a specific filter. Similarly, "INFO" can be used to remove all INFO tags and "FORMAT" to remove all FORMAT tags except GT. To remove all INFO tags except "FOO" and "BAR", use -"^INFO/FOO,INFO/BAR" (and similarly for FORMAT and FILTER). +"^INFO/FOO,INFO/BAR" (and similarly for FORMAT and FILTER). "INFO" can be abbreviated to "INF" and "FORMAT" to "FMT".

    @@ -750,6 +769,10 @@

    File format options:

    see Common Options

    +
    --regions-overlap 0|1|2
    +
    +

    see Common Options

    +
    -s, --samples LIST

    see Common Options

    @@ -766,6 +789,10 @@

    File format options:

    see Common Options

    +
    --targets-overlap 0|1|2
    +
    +

    see Common Options

    +
    --threads INT

    see Common Options

    @@ -786,7 +813,7 @@

    Input/output options:

    comma-separated list of FORMAT fields to output for each sample. Currently GQ and GP fields are supported. For convenience, the fields can be given -as lower case letters. Prefixed with "^" indicates a request for tag +as lower case letters. Prefixed with "^" indicates a request for tag removal of auxiliary tags useful only for calling.

    -F, --prior-freqs AN,AC
    @@ -956,6 +983,10 @@

    General Options:

    see Common Options

    +
    --regions-overlap 0|1|2
    +
    +

    see Common Options

    +
    -s, --query-sample string

    query sample name

    @@ -968,6 +999,10 @@

    General Options:

    see Common Options

    +
    --targets-overlap 0|1|2
    +
    +

    see Common Options

    +
    @@ -1118,6 +1153,10 @@

    bcftools concat [OPTIONS] FILE1 FILE2

    see Common Options. Requires -a, --allow-overlaps.

    +
    --regions-overlap 0|1|2
    +
    +

    see Common Options

    +
    --threads INT

    see Common Options

    @@ -1138,6 +1177,10 @@

    bcftools consensus [OPTIONS] FILE

    +
    -a, --absent CHAR
    +
    +

    replace positions absent from VCF with CHAR

    +
    -c, --chain FILE

    write a chain file for liftover

    @@ -1280,6 +1323,10 @@

    VCF input options:

    see Common Options

    +
    --regions-overlap 0|1|2
    +
    +

    see Common Options

    +
    -s, --samples LIST

    see Common Options

    @@ -1296,6 +1343,10 @@

    VCF input options:

    see Common Options

    +
    --targets-overlap 0|1|2
    +
    +

    see Common Options

    +
    @@ -1328,10 +1379,14 @@

    GEN/SAMPLE conversion:

    -G, --gensample2vcf prefix or gen-file,sample-file
    -

    convert IMPUTE2 output to VCF. The second column must be of the form -"CHROM:POS_REF_ALT" to detect possible strand swaps; IMPUTE2 leaves the -first one empty ("--") when sites from reference panel are filled in. See -also -g below.

    +

    convert IMPUTE2 output to VCF. One of the ID columns ("SNP ID" or "rsID" in +https://www.cog-genomics.org/plink/2.0/formats#gen) must be of the form +"CHROM:POS_REF_ALT" to detect possible strand swaps. + 
    +When the --vcf-ids option is given, the other column (autodetected) is used +to fill the ID column of the VCF. + 
    +See also -g and --3N6 options.

    -g, --gensample prefix or gen-file,sample-file
    @@ -1339,14 +1394,29 @@

    GEN/SAMPLE conversion:

    The columns of .gen file format are ID1,ID2,POS,A,B followed by three genotype probabilities P(AA), P(AB), P(BB) for each sample. In order to prevent strand swaps, the program uses IDs of the form "CHROM:POS_REF_ALT". -For example:

    +When the --vcf-ids option is given, the second column is set to match the ID +column of the VCF. + 
    +See also -G and --3N6 options. + 
    +The .gen and .sample file formats are:

    -
      .gen
    -  ----
    +
      .gen (with --3N6 --vcf-ids)
    +  ---------------------------
    +  chr1 1:111485207_G_A rsID1 111485207 G A 0 1 0 0 1 0
    +  chr1 1:111494194_C_T rsID2 111494194 C T 0 1 0 0 0 1
    +
    +  .gen (with --vcf-ids)
    +  ---------------------------
    +  1:111485207_G_A rsID1 111485207 G A 0 1 0 0 1 0
    +  1:111494194_C_T rsID2 111494194 C T 0 1 0 0 0 1
    +
    +  .gen (the default)
    +  ------------------------------
       1:111485207_G_A 1:111485207_G_A 111485207 G A 0 1 0 0 1 0
       1:111494194_C_T 1:111494194_C_T 111494194 C T 0 1 0 0 0 1
     
    @@ -1360,13 +1430,14 @@ 

    GEN/SAMPLE conversion:

    -
    --tag STRING
    +
    --3N6
    -

    tag to take values for .gen file: GT,PL,GL,GP

    +

    Expect/Create files in the 3*N+6 column format. This is the new .gen file format with the first +column containing the chromosome name, see https://www.cog-genomics.org/plink/2.0/formats#gen

    -
    --chrom
    +
    --tag STRING
    -

    output chromosome in the first column instead of CHROM:POS_REF_ALT

    +

    tag to take values for .gen file: GT,PL,GL,GP

    --sex FILE
    @@ -1416,19 +1487,23 @@

    HAP/SAMPLE conversion:

    convert from hap/sample format to VCF. The columns of .hap file are similar to .gen file above, but there are only two haplotype columns per -sample. Note that the first column of the .hap file is expected to be in -the form "CHR:POS_REF_ALT(_END)?", with the _END being optional for -defining the INFO/END tag when ALT is a symbolic allele, for example:

    +sample. Note that the first or the second column of the .hap file is expected to be in +the form "CHR:POS_REF_ALT[_END]", with the _END being optional for +defining the INFO/END tag when ALT is a symbolic allele. For example:

    -
      .hap
    -  ----
    +
      .hap (with --vcf-ids)
    +  ---------------------
       1:111485207_G_A rsID1 111485207 G A 0 1 0 0
    -  1:111494194_C_T rsID2 111494194 C T 0 1 0 0
    -  1:111495231_A_<DEL>_111495784 rsID3 111495231 A <DEL> 0 0 1 0
    + 1:111495231_A_<DEL>_111495784 rsID3 111495231 A <DEL> 0 0 1 0 + + .hap (the default) + ------------------ + 1 1:111485207_G_A 111485207 G A 0 1 0 0 + 1 1:111495231_A_<DEL>_111495784 111495231 A <DEL> 0 0 1 0
    @@ -1463,7 +1538,10 @@

    HAP/SAMPLE conversion:

    --vcf-ids
    -

    output VCF IDs instead of "CHROM:POS_REF_ALT" IDs

    +

    the second column of the .hap file holds the VCF IDs; the first +column is of the form "CHR:POS_REF_ALT[_END]". Without the option, +the format follows https://www.cog-genomics.org/plink/2.0/formats#haps +with IDs (the second column) of the form "CHR:POS_REF_ALT[_END]".

    @@ -1474,15 +1552,15 @@

    HAP/LEGEND/SAMPLE conversion:

    -H, --haplegendsample2vcf prefix or hap-file,legend-file,sample-file
    -

    convert from hap/legend/sample format used by IMPUTE2 to VCF, see -also -h, --hapslegendsample below.

    +

    convert from hap/legend/sample format used by IMPUTE2 to VCF. +See also -h, --haplegendsample below.

    -h, --haplegendsample prefix or hap-file,legend-file,sample-file

    convert from VCF to hap/legend/sample format used by IMPUTE2 and SHAPEIT. The columns of .legend file ID,POS,REF,ALT. In order to prevent strand -swaps, the program uses IDs of the form "CHROM:POS_REF_ALT". The .sample -file is quite basic at the moment with columns for population, group and +swaps, the program uses IDs of the form "CHROM:POS_REF_ALT". +The .sample file is quite basic at the moment with columns for population, group and sex expected to be edited by the user. For example:

    @@ -1532,7 +1610,8 @@

    HAP/LEGEND/SAMPLE conversion:

    --vcf-ids
    -

    output VCF IDs instead of "CHROM:POS_REF_ALT" IDs

    +

    output VCF IDs instead of "CHROM:POS_REF_ALT". Note that this option can +be used with --haplegendsample but not with --haplegendsample2vcf.

    @@ -1759,6 +1838,10 @@

    bcftools csq [OPTIONS] FILE

    see Common Options

    +
    --regions-overlap 0|1|2
    +
    +

    see Common Options

    +
    -s, --samples LIST

    samples to include or "-" to apply all variants and ignore samples

    @@ -1775,6 +1858,10 @@

    bcftools csq [OPTIONS] FILE

    see Common Options

    +
    --targets-overlap 0|1|2
    +
    +

    see Common Options

    +
    @@ -1886,6 +1973,19 @@

    bcftools filter [OPTIONS] FILE

    include only sites for which EXPRESSION is true. For valid expressions see EXPRESSIONS.

    +
    --mask [^]REGION
    +
    +

    Soft filter regions, prepend "^" to negate. Requires -s, --soft-filter.

    +
    +
    -M, --mask-file [^]FILE
    +
    +

    Soft filter regions listed in a file, "^" to negate. Requires -s, --soft-filter.

    +
    +
    --mask-overlap 0|1|2
    +
    +

    Same as --regions-overlap but for --mask/--mask-file. +See Common Options. [1]

    +
    -m, --mode [+x]

    define behaviour at sites with existing FILTER annotations. The default @@ -1915,6 +2015,10 @@

    bcftools filter [OPTIONS] FILE

    see Common Options

    +
    --regions-overlap 0|1|2
    +
    +

    see Common Options

    +
    -s, --soft-filter STRING|+

    annotate FILTER column with STRING or, with +, a unique filter name generated @@ -1932,6 +2036,10 @@

    bcftools filter [OPTIONS] FILE

    see Common Options

    +
    --targets-overlap 0|1|2
    +
    +

    see Common Options

    +
    --threads INT

    see Common Options

    @@ -2010,6 +2118,10 @@

    bcftools gtcheck [OPTIONS] [-g ge

    Restrict to regions listed in a file, see Common Options

    +
    --regions-overlap 0|1|2
    +
    +

    see Common Options

    +

    @@ -2032,6 +2144,10 @@

    bcftools gtcheck [OPTIONS] [-g ge

    see Common Options

    +
    --targets-overlap 0|1|2
    +
    +

    see Common Options

    +
    -u, --use TAG1[,TAG2]

    specifies which tag to use in the query file (TAG1) and the -g (TAG2) file. @@ -2057,12 +2173,43 @@

    bcftools gtcheck [OPTIONS] [-g ge

    + +
    +

    By default, prints all headers from the specified input file to standard output +in VCF format. The input file may be in VCF or BCF format; if no FILE is +specified, standard input will be read. With appropriate options, only some +of the headers and/or additionally some of the variant records will be printed.

    +
    +
    +

    The bcftools head command outputs VCF headers almost exactly as they appear +in the input file: it may add a ##FILTER=<ID=PASS> header if not already +present, but it never adds version or command line information itself.

    +
    +
    +

    Options:

    +
    +
    +
    -h, --header INT
    +
    +

    Display only the first INT header lines. +By default, all header lines are displayed.

    +
    +
    -n, --records INT
    +
    +

    Also display the first INT variant records. +By default, no variant records are displayed.

    +
    +
    +
    +
    +
    +

    bcftools index [OPTIONS] in.bcf|in.vcf.gz

    Creates index for bgzip compressed VCF/BCF files for random access. CSI (coordinate-sorted index) is created by default. The CSI format -supports indexing of chromosomes up to length 2^31. TBI (tabix index) -index files, which support chromosome lengths up to 2^29, can be +supports indexing of chromosomes up to length 2^31. TBI (tabix index) +index files, which support chromosome lengths up to 2^29, can be created by using the -t/--tbi option or using the tabix program packaged with htslib. When loading an index file, bcftools will try the CSI first and then the TBI.

    @@ -2081,7 +2228,7 @@

    Indexing options:

    -m, --min-shift INT
    -

    set minimal interval size for CSI indices to 2^INT; default: 14

    +

    set minimal interval size for CSI indices to 2^INT; default: 14

    -o, --output FILE
    @@ -2180,6 +2327,10 @@

    bcftools isec [OPTIONS] A.vcf.gz B.vcf.gz

    see Common Options

    +
    --regions-overlap 0|1|2
    +
    +

    see Common Options

    +
    -t, --targets chr|chr:pos|chr:from-to|chr:from-[,…​]

    see Common Options

    @@ -2188,6 +2339,10 @@

    bcftools isec [OPTIONS] A.vcf.gz B.vcf.gz

    see Common Options

    +
    --targets-overlap 0|1|2
    +
    +

    see Common Options

    +
    -w, --write LIST

    list of input files to output given as 1-based indices. With -p and no @@ -2367,6 +2522,10 @@

    bcftools merge [OPTIONS] A.vcf.gz B.vcf.gz<

    see Common Options

    +
    --regions-overlap 0|1|2
    +
    +

    see Common Options

    +
    --threads INT

    see Common Options

    @@ -2475,9 +2634,9 @@

    Input options

    Do not require the --fasta-ref option.

    -
    -G, --read-groups FILE
    +
    -G, --read-groups FILE
    -

    list of read groups to include or exclude if prefixed with "^". +

    list of read groups to include or exclude if prefixed with "^". One read group per line. This file can also be used to assign new sample names to read groups by giving the new sample name as a second white-space-separated field, like this: "read_group_id new_sample_name". @@ -2512,7 +2671,7 @@

    Input options

    Minimum base quality for a base to be considered [13]

    -
    * --max-BQ* INT
    +
    --max-BQ INT

    Caps the base quality to a maximum value [60]. This can be particularly useful on technologies that produce overly optimistic @@ -2530,25 +2689,39 @@

    Input options

    As for -r, --regions, but regions read from FILE; see Common Options

    +
    --regions-overlap 0|1|2
    +
    +

    see Common Options

    +
    --ignore-RG

    Ignore RG tags. Treat all reads in one alignment file as one sample.

    -
    --rf, --incl-flags STR|INT
    +
    --ls, --skip-all-set
    +
    +

    Skip reads with all of the FLAG bits set [null]

    +
    +
    --ns, --skip-any-set
    +
    +

    Skip reads with any of the FLAG bits set. This option replaces and +is synonymous with the deprecated --ff, --excl-flags [UNMAP,SECONDARY,QCFAIL,DUP]

    +
    +
    --lu, --skip-all-unset
    -

    Required flags: skip reads with mask bits unset [null]

    +

    Skip reads with all of the FLAG bits unset. This option replaces and +is synonymous with the deprecated --rf, --incl-flags [null]

    -
    --ff, --excl-flags STR|INT
    +
    --nu, --skip-any-unset
    -

    Filter flags: skip reads with mask bits set [UNMAP,SECONDARY,QCFAIL,DUP]

    +

    Skip reads with any of the FLAG bits unset [null]

    -
    -s, --samples LIST
    +
    -s, --samples LIST

    list of sample names. See Common Options

    -
    -S, --samples-file FILE
    +
    -S, --samples-file FILE
    -

    file of sample names to include or exclude if prefixed with "^". +

    file of sample names to include or exclude if prefixed with "^". One sample per line. This file can also be used to rename samples by giving the new sample name as a second white-space-separated column, like this: "old_name new_name". If a sample name contains spaces, the spaces can be @@ -2563,6 +2736,10 @@

    Input options

    see Common Options

    +
    --targets-overlap 0|1|2
    +
    +

    see Common Options

    +
    -x, --ignore-overlaps

    Disable read-pair overlap detection.

    @@ -2914,6 +3091,10 @@

    bcftools norm [OPTIONS] file.vcf.gz

    see Common Options

    +
    --regions-overlap 0|1|2
    +
    +

    see Common Options

    +
    -s, --strict-filter

    when merging (-m+), merged site is PASS only if all sites being merged PASS

    @@ -2926,6 +3107,10 @@

    bcftools norm [OPTIONS] file.vcf.gz

    see Common Options

    +
    --targets-overlap 0|1|2
    +
    +

    see Common Options

    +
    --threads INT

    see Common Options

    @@ -2971,6 +3156,10 @@

    VCF input options:

    see Common Options

    +
    --regions-overlap 0|1|2
    +
    +

    see Common Options

    +
    -t, --targets chr|chr:pos|chr:from-to|chr:from-[,…​]

    see Common Options

    @@ -2979,6 +3168,10 @@

    VCF input options:

    see Common Options

    +
    --targets-overlap 0|1|2
    +
    +

    see Common Options

    +

    @@ -3368,6 +3561,10 @@

    General options:

    see Common Options

    +
    --regions-overlap 0|1|2
    +
    +

    see Common Options

    +
    -s, --sample string

    sample name

    @@ -3380,6 +3577,10 @@

    General options:

    see Common Options

    +
    --targets-overlap 0|1|2
    +
    +

    see Common Options

    +
    -v, --verbose

    verbose debugging output which gives hints about the thresholds and decisions made @@ -3440,6 +3641,10 @@

    bcftools query [OPTIONS] file.vcf.gz [file.

    exclude sites for which EXPRESSION is true. For valid expressions see EXPRESSIONS.

    +
    --force-samples
    +
    +

    continue even when some samples requested via -s/-S do not exist

    +
    -f, --format FORMAT

    learn by example, see below

    @@ -3469,6 +3674,10 @@

    bcftools query [OPTIONS] file.vcf.gz [file.

    see Common Options

    +
    --regions-overlap 0|1|2
    +
    +

    see Common Options

    +
    -s, --samples LIST

    see Common Options

    @@ -3485,6 +3694,10 @@

    bcftools query [OPTIONS] file.vcf.gz [file.

    see Common Options

    +
    --targets-overlap 0|1|2
    +
    +

    see Common Options

    +
    -u, --allow-undef-tags

    do not throw an error if there are undefined tags in the format string, @@ -3762,6 +3975,10 @@

    General Options:

    see Common Options

    +
    --regions-overlap 0|1|2
    +
    +

    see Common Options

    +
    -s, --samples LIST

    see Common Options

    @@ -3778,6 +3995,10 @@

    General Options:

    see Common Options

    +
    --targets-overlap 0|1|2
    +
    +

    see Common Options

    +
    @@ -3916,6 +4137,10 @@

    bcftools stats [OPTIONS] A.vcf.gz [B.vcf.gz

    see Common Options

    +
    --regions-overlap 0|1|2
    +
    +

    see Common Options

    +
    -s, --samples LIST

    see Common Options

    @@ -3932,6 +4157,10 @@

    bcftools stats [OPTIONS] A.vcf.gz [B.vcf.gz

    see Common Options

    +
    --targets-overlap 0|1|2
    +
    +

    see Common Options

    +
    -u, --user-tstv <TAG[:min:max:n]>

    collect Ts/Tv stats for any tag using the given binning [0:1:100]

    @@ -3959,7 +4188,7 @@

    Output options

    -h, --header-only
    -

    output the VCF header only

    +

    output the VCF header only (see also bcftools head)

    -H, --no-header
    @@ -3999,6 +4228,10 @@

    Output options

    see Common Options

    +
    --regions-overlap 0|1|2
    +
    +

    see Common Options

    +
    -t, --targets chr|chr:pos|chr:from-to|chr:from-[,…​]

    see Common Options

    @@ -4007,6 +4240,10 @@

    Output options

    see Common Options

    +
    --targets-overlap 0|1|2
    +
    +

    see Common Options

    +
    --threads INT

    see Common Options

    @@ -4092,11 +4329,11 @@

    Filter options:

    see Common Options

    -
    -g, --genotype [^][hom|het|miss]
    +
    -g, --genotype [^][hom|het|miss]

    include only sites with one or more homozygous (hom), heterozygous -(het) or missing (miss) genotypes. When prefixed with ^, the logic -is reversed; thus ^het excludes sites with heterozygous genotypes.

    +(het) or missing (miss) genotypes. When prefixed with ^, the logic +is reversed; thus ^het excludes sites with heterozygous genotypes.

    -i, --include EXPRESSION
    @@ -4751,7 +4988,7 @@

    COPYING

    diff --git a/doc/bcftools.txt b/doc/bcftools.txt index e2139edf7..aac53f531 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -85,6 +85,7 @@ list of available options, run *bcftools* 'COMMAND' without arguments. - *<>* .. haplotype aware consequence caller - *<>* .. filter VCF/BCF files using fixed thresholds - *<>* .. check sample concordance, detect sample swaps and contamination +- *<>* .. view VCF/BCF file headers - *<>* .. index VCF/BCF - *<>* .. intersections of VCF/BCF files - *<>* .. merge VCF/BCF files files from non-overlapping sample sets @@ -209,18 +210,20 @@ specific commands to see if they apply. This option requires indexed VCF/BCF files. Note that *-R* cannot be used in combination with *-r*. -*--regions-overlap* '0'|'1'|'2':: +*--regions-overlap* 'pos'|'record'|'variant'|'0'|'1'|'2':: This option controls how overlapping records are determined: - set to *0* if the VCF record has to have POS inside a region - (this corresponds to the default behavior of *-t/-T*); - set to *1* if also overlapping records with POS outside a region - should be included (this is the default behavior of *-r/-R*); or set - to *2* to include only true overlapping variation (compare + set to *pos* or *0* if the VCF record has to have POS inside a region + (this corresponds to the default behavior of *-t/-T*); + set to *record* or *1* if also overlapping records with POS outside a region + should be included (this is the default behavior of *-r/-R*, and includes indels + with POS at the end of a region, which are technically outside the region); or set + to *variant* or *2* to include only true overlapping variation (compare the full VCF representation "`TA>T-`" vs the true sequence variation "`A>-`"). -*-s, --samples* \[^]'LIST':: +*-s, --samples* \[^]'LIST':: Comma-separated list of samples to include or exclude if prefixed - with "^". + with "^." (Note that when multiple samples are to be excluded, + the "^" prefix is still present only once, e.g. 
"^SAMPLE1,SAMPLE2".) The sample order is updated to reflect that given on the command line. Note that in general tags such as INFO/AC, INFO/AN, etc are not updated to correspond to the subset samples. *<>* is the @@ -232,8 +235,8 @@ specific commands to see if they apply. bcftools view -Ou -s sample1,sample2 file.vcf | bcftools query -f %INFO/AC\t%INFO/AN\n ---- -*-S, --samples-file* [^]'FILE'[[samples_file]]:: - File of sample names to include or exclude if prefixed with "^". +*-S, --samples-file* \[^]'FILE'[[samples_file]]:: + File of sample names to include or exclude if prefixed with "^". One sample per line. See also the note above for the *-s, --samples* option. The sample order is updated to reflect that given in the input file. @@ -263,20 +266,21 @@ The program ignores the first column and the last indicates sex (1=male, 2=femal ignored_column sonB fatherB motherB 1 ---- -*-t, --targets* \[^]'chr'|'chr:pos'|'chr:from-to'|'chr:from-'[,...]:: +*-t, --targets* \[^]'chr'|'chr:pos'|'chr:from-to'|'chr:from-'[,...]:: Similar as *-r, --regions*, but the next position is accessed by streaming the whole VCF/BCF rather than using the tbi/csi index. Both *-r* and *-t* options can be applied simultaneously: *-r* uses the index to jump to a region and *-t* discards positions which are not in the targets. Unlike *-r*, targets - can be prefixed with "^" to request logical complement. For example, "^X,Y,MT" + can be prefixed with "^" to request logical complement. For example, "^X,Y,MT" indicates that sequences X, Y and MT should be skipped. Yet another difference between the *-t/-T* and *-r/-R* is that *-r/-R* checks for proper overlaps and considers both POS and the end position of an indel, while *-t/-T* - considers the POS coordinate only. Note that *-t* cannot be used in combination with *-T*. + considers the POS coordinate only (by default; see also *--regions-overlap* and *--targets-overlap*). + Note that *-t* cannot be used in combination with *-T*. 
-*-T, --targets-file* \[^]'FILE':: +*-T, --targets-file* \[^]'FILE':: Same *-t, --targets*, but reads regions from a file. Note that *-T* - cannot be used in combination with *-t*. + cannot be used in combination with *-t*. + With the *call -C* 'alleles' command, third column of the targets file must be comma-separated list of alleles, starting with the reference allele. @@ -287,7 +291,7 @@ Such a file can be easily created from a VCF using: bcftools query -f'%CHROM\t%POS\t%REF,%ALT\n' file.vcf | bgzip -c > als.tsv.gz && tabix -s1 -b2 -e2 als.tsv.gz ---- -*--targets-overlap* '0'|'1'|'2':: +*--targets-overlap* 'pos'|'record'|'variant'|'0'|'1'|'2':: Same as *--regions-overlap* but for *-t/-T*. *--threads* 'INT':: @@ -335,11 +339,6 @@ Add or remove annotations. 1 798959 SomeOtherString 6 ---- -*--collapse* 'snps'|'indels'|'both'|'all'|'some'|'none':: - Controls how to match records from the annotation file to the target VCF. - Effective only when *-a* is a VCF or BCF. - See *<>* for more. - *-c, --columns* 'list':: Comma-separated list of columns or tags to carry over from the annotation file (see also *-a, --annotations*). If the annotation file is not a VCF/BCF, @@ -356,7 +355,7 @@ Add or remove annotations. {nbsp} + {nbsp} + To carry over all INFO annotations, use "INFO". To add all INFO annotations except - "TAG", use "^INFO/TAG". By default, existing values are replaced. + "TAG", use "^INFO/TAG". By default, existing values are replaced. {nbsp} + {nbsp} + By default, existing tags are overwritten unless the source value is a missing value (i.e. "."). @@ -401,7 +400,7 @@ Add or remove annotations. ##INFO= ---- -*-I, --set-id* \[+]'FORMAT':: +*-I, --set-id* \[+]'FORMAT':: assign ID on the fly. The format is the same as in the *<>* command (see below). By default all existing IDs are replaced. If the format string is preceded by "+", only missing IDs will be set. For example, @@ -434,6 +433,13 @@ Add or remove annotations. 
*-m, --mark-sites* [+-]'TAG':: annotate sites which are present ("+") or absent ("-") in the *-a* file with a new INFO/TAG flag +*--min-overlap* 'ANN':'VCF':: + minimum overlap required as a fraction of the variant in the annotation *-a* file ('ANN'), in the + target VCF file (':VCF'), or both for reciprocal overlap ('ANN:VCF'). + By default overlaps of arbitrary length are sufficient. + The option can be used only with the tab-delimited annotation *-a* file and with 'BEG' and 'END' + columns present. + *--no-version*:: see *<>* @@ -443,6 +449,11 @@ Add or remove annotations. *-O, --output-type* 'b'|'u'|'z'|'v'[0-9]:: see *<>* +*--pair-logic* 'snps'|'indels'|'both'|'all'|'some'|'exact':: + Controls how to match records from the annotation file to the target VCF. + Effective only when *-a* is a VCF or BCF. The option replaces the former + uninuitive *--collapse*. + See *<>* for more. *-r, --regions* 'chr'|'chr:pos'|'chr:from-to'|'chr:from-'[,...]:: see *<>* @@ -450,6 +461,9 @@ Add or remove annotations. *-R, --regions-file* 'file':: see *<>* +*--regions-overlap* '0'|'1'|'2':: + see *<>* + *--rename-annots* 'file':: rename annotations according to the map in 'file', with "old_name new_name\n" pairs separated by whitespaces, each on a separate @@ -461,7 +475,7 @@ Add or remove annotations. "old_name new_name\n" pairs separated by whitespaces, each on a separate line. -*-s, --samples* \[^]'LIST':: +*-s, --samples* \[^]'LIST':: subset of samples to annotate, see also *<>* *-S, --samples-file* 'FILE':: @@ -473,7 +487,7 @@ Add or remove annotations. *--single-overlaps*:: use this option to keep memory requirements low with very large annotation files. Note, however, that this comes at a cost, only single overlapping intervals - are considered in this mode. This was the default mode until the commit + are considered in this mode. This was the default mode until the commit af6f0c9 (Feb 24 2019). *--threads* 'INT':: @@ -484,7 +498,7 @@ Add or remove annotations. 
"FILTER/SomeFilter" to remove a specific filter. Similarly, "INFO" can be used to remove all INFO tags and "FORMAT" to remove all FORMAT tags except GT. To remove all INFO tags except "FOO" and "BAR", use - "^INFO/FOO,INFO/BAR" (and similarly for FORMAT and FILTER). + "^INFO/FOO,INFO/BAR" (and similarly for FORMAT and FILTER). "INFO" can be abbreviated to "INF" and "FORMAT" to "FMT". *Examples:* @@ -569,6 +583,9 @@ demand. The original calling model can be invoked with the *-c* option. *-R, --regions-file* 'file':: see *<>* +*--regions-overlap* '0'|'1'|'2':: + see *<>* + *-s, --samples* 'LIST':: see *<>* @@ -581,6 +598,9 @@ demand. The original calling model can be invoked with the *-c* option. *-T, --targets-file* 'FILE':: see *<>* +*--targets-overlap* '0'|'1'|'2':: + see *<>* + *--threads* 'INT':: see *<>* @@ -593,7 +613,7 @@ demand. The original calling model can be invoked with the *-c* option. *-f, --format-fields* 'list':: comma-separated list of FORMAT fields to output for each sample. Currently GQ and GP fields are supported. For convenience, the fields can be given - as lower case letters. Prefixed with "^" indicates a request for tag + as lower case letters. Prefixed with "^" indicates a request for tag removal of auxiliary tags useful only for calling. *-F, --prior-freqs* 'AN','AC':: @@ -622,7 +642,7 @@ demand. The original calling model can be invoked with the *-c* option. text file with sample names in the first column and group names in the second column. If '-' is given instead, no HWE assumption is made at all and single-sample calling is performed. (Note that in low coverage data this inflates the rate of false positives.) The *-G* option requires the presence of - per-sample FORMAT/QS or FORMAT/AD tag generated with *bcftools mpileup -a QS* (or *-a AD*). + per-sample FORMAT/QS or FORMAT/AD tag generated with *bcftools mpileup -a QS* (or *-a AD*). *-g, --gvcf* 'INT':: output also gVCF blocks of homozygous REF calls. 
The parameter 'INT' is the @@ -720,6 +740,9 @@ loss), 0 (complete loss), 3 (single-copy gain). *-R, --regions-file* 'file':: see *<>* +*--regions-overlap* '0'|'1'|'2':: + see *<>* + *-s, --query-sample* 'string':: query sample name @@ -729,6 +752,9 @@ loss), 0 (complete loss), 3 (single-copy gain). *-T, --targets-file* 'FILE':: see *<>* +*--targets-overlap* '0'|'1'|'2':: + see *<>* + ==== HMM Options: *-a, --aberrant* 'float'[,'float']:: @@ -846,6 +872,9 @@ are concatenated without being recompressed, which is very fast.. *-R, --regions-file* 'FILE':: see *<>*. Requires *-a, --allow-overlaps*. +*--regions-overlap* '0'|'1'|'2':: + see *<>* + *--threads* 'INT':: see *<>* @@ -860,6 +889,9 @@ Note that the program does not act as a primitive variant caller and ignores all depth information, such as INFO/AD or FORMAT/AD. For that, consider using the *setGT* plugin. +*-a, --absent* 'CHAR':: + replace positions absent from VCF with CHAR + *-c, --chain* 'FILE':: write a chain file for liftover @@ -872,7 +904,7 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the *-H, --haplotype* '1'|'2'|'R'|'A'|'I'|'LR'|'LA'|'SR'|'SA'|'1pIu'|'2pIu':: choose which allele from the FORMAT/GT field to use (the codes are case-insensitive): - + '1';; the first allele, regardless of phasing @@ -963,6 +995,9 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the *-R, --regions-file* 'FILE':: see *<>* +*--regions-overlap* '0'|'1'|'2':: + see *<>* + *-s, --samples* 'LIST':: see *<>* @@ -975,6 +1010,9 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the *-T, --targets-file* 'FILE':: see *<>* +*--targets-overlap* '0'|'1'|'2':: + see *<>* + ==== VCF output options: *--no-version*:: @@ -991,20 +1029,39 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the ==== GEN/SAMPLE conversion: *-G, --gensample2vcf* 'prefix' or 'gen-file','sample-file':: - convert IMPUTE2 output to VCF. 
The second column must be of the form - "CHROM:POS_REF_ALT" to detect possible strand swaps; IMPUTE2 leaves the - first one empty ("--") when sites from reference panel are filled in. See - also *-g* below. + convert IMPUTE2 output to VCF. One of the ID columns ("SNP ID" or "rsID" in + https://www.cog-genomics.org/plink/2.0/formats#gen) must be of the form + "CHROM:POS_REF_ALT" to detect possible strand swaps. + {nbsp} + + When the *--vcf-ids* option is given, the other column (autodetected) is used + to fill the ID column of the VCF. + {nbsp} + + See also *-g* and *--3N6* options. *-g, --gensample* 'prefix' or 'gen-file','sample-file':: convert from VCF to gen/sample format used by IMPUTE2 and SHAPEIT. The columns of .gen file format are ID1,ID2,POS,A,B followed by three genotype probabilities P(AA), P(AB), P(BB) for each sample. In order to prevent strand swaps, the program uses IDs of the form "CHROM:POS_REF_ALT". - For example: + When the *--vcf-ids* option is given, the second column is set to match the ID + column of the VCF. + {nbsp} + + See also *-G* and *--3N6* options. + {nbsp} + + The file .gen and .sample file format are: ---- - .gen - ---- + .gen (with --3N6 --vcf-ids) + --------------------------- + chr1 1:111485207_G_A rsID1 111485207 G A 0 1 0 0 1 0 + chr1 1:111494194_C_T rsID2 111494194 C T 0 1 0 0 0 1 + + .gen (with --vcf-ids) + --------------------------- + 1:111485207_G_A rsID1 111485207 G A 0 1 0 0 1 0 + 1:111494194_C_T rsID2 111494194 C T 0 1 0 0 0 1 + + .gen (the default) + ------------------------------ 1:111485207_G_A 1:111485207_G_A 111485207 G A 0 1 0 0 1 0 1:111494194_C_T 1:111494194_C_T 111494194 C T 0 1 0 0 0 1 @@ -1016,12 +1073,13 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the sample2 sample2 0 ---- +*--3N6*:: + Expect/Create files in the 3*N+6 column format. 
This is the new .gen file format with the first + column containing the chromosome name, see https://www.cog-genomics.org/plink/2.0/formats#gen + *--tag* 'STRING':: tag to take values for .gen file: GT,PL,GL,GP -*--chrom*:: - output chromosome in the first column instead of CHROM:POS_REF_ALT - *--sex* 'FILE':: output sex column in the sample file. The FILE format is ---- @@ -1049,15 +1107,19 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the *--hapsample2vcf* 'prefix' or 'hap-file','sample-file':: convert from hap/sample format to VCF. The columns of .hap file are similar to .gen file above, but there are only two haplotype columns per - sample. Note that the first column of the .hap file is expected to be in - the form "CHR:POS_REF_ALT(_END)?", with the _END being optional for - defining the INFO/END tag when ALT is a symbolic allele, for example: + sample. Note that the first or the second column of the .hap file is expected to be in + the form "CHR:POS_REF_ALT[_END]", with the _END being optional for + defining the INFO/END tag when ALT is a symbolic allele. For example: ---- - .hap - ---- + .hap (with --vcf-ids) + --------------------- 1:111485207_G_A rsID1 111485207 G A 0 1 0 0 - 1:111494194_C_T rsID2 111494194 C T 0 1 0 0 1:111495231_A__111495784 rsID3 111495231 A 0 0 1 0 + + .hap (the default) + ------------------ + 1 1:111485207_G_A 111485207 G A 0 1 0 0 + 1 1:111495231_A__111495784 111495231 A 0 0 1 0 ---- *--hapsample* 'prefix' or 'hap-file','sample-file':: @@ -1080,18 +1142,22 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the ---- *--vcf-ids*:: - output VCF IDs instead of "CHROM:POS_REF_ALT" IDs + the second column of the .hap file holds the VCF ids, the first + column is of the form "CHR:POS_REF_ALT[_END]". 
Without the option, + the format follows https://www.cog-genomics.org/plink/2.0/formats#haps + with ids (the second column) of the form "CHR:POS_REF_ALT[_END]" + ==== HAP/LEGEND/SAMPLE conversion: *-H, --haplegendsample2vcf* 'prefix' or 'hap-file','legend-file','sample-file':: - convert from hap/legend/sample format used by IMPUTE2 to VCF, see - also *-h, --hapslegendsample* below. + convert from hap/legend/sample format used by IMPUTE2 to VCF. + See also *-h, --hapslegendsample* below. *-h, --haplegendsample* 'prefix' or 'hap-file','legend-file','sample-file':: convert from VCF to hap/legend/sample format used by IMPUTE2 and SHAPEIT. The columns of .legend file ID,POS,REF,ALT. In order to prevent strand - swaps, the program uses IDs of the form "CHROM:POS_REF_ALT". The .sample - file is quite basic at the moment with columns for population, group and + swaps, the program uses IDs of the form "CHROM:POS_REF_ALT". + The .sample file is quite basic at the moment with columns for population, group and sex expected to be edited by the user. For example: ---- .hap @@ -1126,7 +1192,8 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the ---- *--vcf-ids*:: - output VCF IDs instead of "CHROM:POS_REF_ALT" IDs + output VCF IDs instead of "CHROM:POS_REF_ALT". Note that this option can + be used with *--haplegendsample* but not with *--haplegendsample2vcf*. ==== TSV conversion: *--tsv2vcf* 'file':: @@ -1224,7 +1291,7 @@ output VCF and are ignored for the prediction analysis. # # Attributes required for # gene lines: - # - ID=gene: + # - ID=gene: # - biotype= # - Name= [optional] # @@ -1299,6 +1366,9 @@ output VCF and are ignored for the prediction analysis. *-R, --regions-file* 'FILE':: see *<>* +*--regions-overlap* '0'|'1'|'2':: + see *<>* + *-s, --samples* 'LIST':: samples to include or "-" to apply all variants and ignore samples @@ -1311,6 +1381,9 @@ output VCF and are ignored for the prediction analysis. 
*-T, --targets-file* 'FILE':: see *<>* +*--targets-overlap* '0'|'1'|'2':: + see *<>* + *Examples:* ---- # Basic usage @@ -1397,6 +1470,16 @@ And similarly here, the second is filtered: include only sites for which 'EXPRESSION' is true. For valid expressions see *<>*. +*--mask* \[^]'REGION':: + Soft filter regions, prepepend "^" to negate. Requires *-s, --soft-filter*. + +*-M, --mask-file* \[^]'FILE':: + Soft filter regions listed in a file, "^" to negate. Requires *-s, --soft-filter*. + +*--mask-overlap* '0'|'1'|'2':: + Same as *--regions-overlap* but for *--mask/--mask-file*. + See *<>*. [1] + *-m, --mode* ['+x']:: define behaviour at sites with existing FILTER annotations. The default mode replaces existing filters of failed sites with a new FILTER string @@ -1420,6 +1503,9 @@ And similarly here, the second is filtered: *-R, --regions-file* 'file':: see *<>* +*--regions-overlap* '0'|'1'|'2':: + see *<>* + *-s, --soft-filter* 'STRING'|'+':: annotate FILTER column with 'STRING' or, with '+', a unique filter name generated by the program ("Filter%d"). @@ -1433,6 +1519,9 @@ And similarly here, the second is filtered: *-T, --targets-file* 'file':: see *<>* +*--targets-overlap* '0'|'1'|'2':: + see *<>* + *--threads* 'INT':: see *<>* @@ -1476,7 +1565,7 @@ Without the *-g* option, multi-sample cross-check of samples in 'query.vcf.gz' i that average score is used to determine the top matches, not absolute values. *--no-HWE-prob*:: - Disable calculation of HWE probability to reduce memory requirements with + Disable calculation of HWE probability to reduce memory requirements with comparisons between very large number of sample pairs. *-p, --pairs* 'LIST':: @@ -1495,6 +1584,9 @@ Without the *-g* option, multi-sample cross-check of samples in 'query.vcf.gz' i *-R, --regions-file' 'FILE':: Restrict to regions listed in a file, see *<>* +*--regions-overlap* '0'|'1'|'2':: + see *<>* + *-s, --samples* ['qry'|'gt']:'LIST': List of query samples or *-g* samples. 
If neither *-s* nor *-S* are given, all possible sample pair combinations are compared @@ -1509,6 +1601,9 @@ Without the *-g* option, multi-sample cross-check of samples in 'query.vcf.gz' i *-T, --targets-file* 'file':: see *<>* +*--targets-overlap* '0'|'1'|'2':: + see *<>* + *-u, --use* 'TAG1'[,'TAG2']:: specifies which tag to use in the query file ('TAG1') and the *-g* ('TAG2') file. By default, the PL tag is used in the query file and GT in the *-g* file when @@ -1539,11 +1634,11 @@ Without the *-g* option, multi-sample cross-check of samples in 'query.vcf.gz' i // present, a constant value '99' is used for the unseen genotypes. With // *-G*, the value '1' can be used instead; the discordance value then // gives exactly the number of differing genotypes. -// +// // ERR, error rate;; // Pairwise error rate calculated as number of differences divided // by the total number of comparisons. -// +// // CLUSTER, TH, DOT;; // In presence of multiple samples, related samples and outliers can be // identified by clustering samples by error rate. A simple hierarchical @@ -1556,12 +1651,33 @@ Without the *-g* option, multi-sample cross-check of samples in 'query.vcf.gz' i // most dissimilar to the rest. +[[head]] +=== bcftools head ['OPTIONS'] ['FILE'] +By default, prints all headers from the specified input file to standard output +in VCF format. The input file may be in VCF or BCF format; if no 'FILE' is +specified, standard input will be read. With appropriate options, only some +of the headers and/or additionally some of the variant records will be printed. + +The *bcftools head* command outputs VCF headers almost exactly as they appear +in the input file: it may add a `##FILTER=` header if not already +present, but it never adds version or command line information itself. + +==== Options: +*-h, --header* 'INT':: + Display only the first 'INT' header lines. + By default, all header lines are displayed. 
+ +*-n, --records* 'INT':: + Also display the first 'INT' variant records. + By default, no variant records are displayed. + + [[index]] === bcftools index ['OPTIONS'] 'in.bcf'|'in.vcf.gz' Creates index for bgzip compressed VCF/BCF files for random access. CSI (coordinate-sorted index) is created by default. The CSI format -supports indexing of chromosomes up to length 2^31. TBI (tabix index) -index files, which support chromosome lengths up to 2^29, can be +supports indexing of chromosomes up to length 2^31. TBI (tabix index) +index files, which support chromosome lengths up to 2^29, can be created by using the '-t/--tbi' option or using the 'tabix' program packaged with htslib. When loading an index file, bcftools will try the CSI first and then the TBI. @@ -1574,7 +1690,7 @@ the CSI first and then the TBI. overwrite index if it already exists *-m, --min-shift 'INT'*:: - set minimal interval size for CSI indices to 2^INT; default: 14 + set minimal interval size for CSI indices to 2^INT; default: 14 *-o, --output 'FILE'*:: output file name. If not set, then the index will be created @@ -1644,12 +1760,18 @@ in the other files. *-R, --regions-file* 'file':: see *<>* +*--regions-overlap* '0'|'1'|'2':: + see *<>* + *-t, --targets* 'chr'|'chr:pos'|'chr:from-to'|'chr:from-'[,...]:: see *<>* *-T, --targets-file* 'file':: see *<>* +*--targets-overlap* '0'|'1'|'2':: + see *<>* + *-w, --write* 'LIST':: list of input files to output given as 1-based indices. With *-p* and no *-w*, all files are written. @@ -1751,7 +1873,7 @@ For "vertical" merge take a look at *<>* or *<>* or *<>* +*--regions-overlap* '0'|'1'|'2':: + see *<>* + *--threads* 'INT':: see *<>* @@ -1873,8 +1998,8 @@ those scenarios. *--no-reference*:: Do not require the *--fasta-ref* option. -*-G, --read-groups* [^]'FILE':: - list of read groups to include or exclude if prefixed with "^". +*-G, --read-groups* [^]'FILE':: + list of read groups to include or exclude if prefixed with "^". One read group per line. 
This file can also be used to assign new sample names to read groups by giving the new sample name as a second white-space-separated field, like this: "read_group_id new_sample_name". @@ -1901,7 +2026,7 @@ those scenarios. *-Q, --min-BQ* 'INT':: Minimum base quality for a base to be considered [13] -* --max-BQ* 'INT':: +*--max-BQ* 'INT':: Caps the base quality to a maximum value [60]. This can be particularly useful on technologies that produce overly optimistic high qualities, leading to too many false positives or incorrect @@ -1916,20 +2041,31 @@ those scenarios. As for *-r, --regions*, but regions read from FILE; see *<>* +*--regions-overlap* '0'|'1'|'2':: + see *<>* + *--ignore-RG*:: Ignore RG tags. Treat all reads in one alignment file as one sample. -*--rf, --incl-flags* 'STR'|'INT':: - Required flags: skip reads with mask bits unset [null] +*--ls, --skip-all-set*:: + Skip reads with all of the FLAG bits set [null] + +*--ns, --skip-any-set*:: + Skip reads with any of the FLAG bits set. This option replaces and + is synonymous to the deprecated *--ff, --excl-flags* [UNMAP,SECONDARY,QCFAIL,DUP] + +*--lu, --skip-all-unset*:: + Skip reads with all of the FLAG bits unset. This option replaces and + is synonymous to the deprecated *--rf, --incl-flags* [null] -*--ff, --excl-flags* 'STR'|'INT':: - Filter flags: skip reads with mask bits set [UNMAP,SECONDARY,QCFAIL,DUP] +*--nu, --skip-any-unset*:: + Skip reads with any of the FLAG bits unset [null] -*-s, --samples* [^]'LIST':: +*-s, --samples* [^]'LIST':: list of sample names. See *<>* -*-S, --samples-file* [^]'FILE':: - file of sample names to include or exclude if prefixed with "^". +*-S, --samples-file* [^]'FILE':: + file of sample names to include or exclude if prefixed with "^". One sample per line. This file can also be used to rename samples by giving the new sample name as a second white-space-separated column, like this: "old_name new_name". 
If a sample name contains spaces, the spaces can be @@ -1942,6 +2078,9 @@ those scenarios. *-T, --targets-file* 'FILE':: see *<>* +*--targets-overlap* '0'|'1'|'2':: + see *<>* + *-x, --ignore-overlaps*:: Disable read-pair overlap detection. @@ -2129,7 +2268,7 @@ the *<>* option is supplied. See also *--atom-overlaps* and *--old-rec-tag*. *--atom-overlaps* '.'|'*':: - Alleles missing because of an overlapping variant can be set either + Alleles missing because of an overlapping variant can be set either to missing (.) or to the star alele (*), as recommended by the VCF specification. IMPORTANT: Note that asterisk is expaneded by shell and must be put in quotes or escaped by a backslash: @@ -2159,7 +2298,7 @@ the *<>* option is supplied. can swap alleles and will update genotypes (GT) and AC counts, but will not attempt to fix PL or other fields. Also note, and this cannot be stressed enough, that 's' will NOT fix strand issues in - your VCF, do NOT use it for that purpose!!! (Instead see + your VCF, do NOT use it for that purpose!!! (Instead see and .) @@ -2203,7 +2342,7 @@ the *<>* option is supplied. *--old-rec-tag* 'STR':: Add INFO/STR annotation with the original record. The format of the - annotation is CHROM|POS|REF|ALT|USED_ALT_IDX. + annotation is CHROM|POS|REF|ALT|USED_ALT_IDX. *-o, --output* 'FILE':: see *<>* @@ -2217,6 +2356,9 @@ the *<>* option is supplied. *-R, --regions-file* 'file':: see *<>* +*--regions-overlap* '0'|'1'|'2':: + see *<>* + *-s, --strict-filter*:: when merging ('-m+'), merged site is PASS only if all sites being merged PASS @@ -2226,6 +2368,9 @@ the *<>* option is supplied. *-T, --targets-file* 'FILE':: see *<>* +*--targets-overlap* '0'|'1'|'2':: + see *<>* + *--threads* 'INT':: see *<>* @@ -2264,12 +2409,18 @@ the usage examples that each plugin comes with. 
*-R, --regions-file* 'file':: see *<>* +*--regions-overlap* '0'|'1'|'2':: + see *<>* + *-t, --targets* 'chr'|'chr:pos'|'chr:from-to'|'chr:from-'[,...]:: see *<>* *-T, --targets-file* 'file':: see *<>* +*--targets-overlap* '0'|'1'|'2':: + see *<>* + ==== VCF output options: *--no-version*:: @@ -2529,6 +2680,9 @@ file for help. *-R, --regions-file* 'file':: see *<>* +*--regions-overlap* '0'|'1'|'2':: + see *<>* + *-s, --sample* 'string':: sample name @@ -2538,6 +2692,9 @@ file for help. *-T, --targets-file* 'FILE':: see *<>* +*--targets-overlap* '0'|'1'|'2':: + see *<>* + *-v, --verbose*:: verbose debugging output which gives hints about the thresholds and decisions made by the program. Note that the exact output can change between versions. @@ -2579,6 +2736,9 @@ Extracts fields from VCF or BCF files and outputs them in user-defined format. exclude sites for which 'EXPRESSION' is true. For valid expressions see *<>*. +*--force-samples*:: + continue even when some samples requested via *-s/-S* do not exist + *-f, --format* 'FORMAT':: learn by example, see below @@ -2601,6 +2761,9 @@ Extracts fields from VCF or BCF files and outputs them in user-defined format. *-R, --regions-file* 'file':: see *<>* +*--regions-overlap* '0'|'1'|'2':: + see *<>* + *-s, --samples* 'LIST':: see *<>* @@ -2613,6 +2776,9 @@ Extracts fields from VCF or BCF files and outputs them in user-defined format. *-T, --targets-file* 'file':: see *<>* +*--targets-overlap* '0'|'1'|'2':: + see *<>* + *-u, --allow-undef-tags*:: do not throw an error if there are undefined tags in the format string, print "." instead @@ -2795,11 +2961,11 @@ Transition probabilities: *-M, --rec-rate* 'FLOAT':: constant recombination rate per bp. 
In combination with *--genetic-map*, - the *--rec-rate* parameter is interpreted differently, as 'FLOAT'-fold increase of + the *--rec-rate* parameter is interpreted differently, as 'FLOAT'-fold increase of transition probabilities, which allows the model to become more sensitive yet still account for recombination hotspots. Note that also the range of the values is therefore different in both cases: normally the - parameter will be in the range (1e-3,1e-9) but with *--genetic-map* + parameter will be in the range (1e-3,1e-9) but with *--genetic-map* it will be in the range (10,1000). *-o, --output* 'FILE':: @@ -2816,6 +2982,9 @@ Transition probabilities: *-R, --regions-file* 'file':: see *<>* +*--regions-overlap* '0'|'1'|'2':: + see *<>* + *-s, --samples* 'LIST':: see *<>* @@ -2828,6 +2997,9 @@ Transition probabilities: *-T, --targets-file* 'file':: see *<>* +*--targets-overlap* '0'|'1'|'2':: + see *<>* + ==== HMM Options: *-a, --hw-to-az* 'FLOAT':: @@ -2926,6 +3098,9 @@ and correlation are also printed. Per-site discordance (PSD) is also printed in *-R, --regions-file* 'file':: see *<>* +*--regions-overlap* '0'|'1'|'2':: + see *<>* + *-s, --samples* 'LIST':: see *<>* @@ -2938,6 +3113,9 @@ and correlation are also printed. Per-site discordance (PSD) is also printed in *-T, --targets-file* 'file':: see *<>* +*--targets-overlap* '0'|'1'|'2':: + see *<>* + *-u, --user-tstv* '':: collect Ts/Tv stats for any tag using the given binning [0:1:100] @@ -2956,7 +3134,7 @@ Convert between VCF and BCF. Former *bcftools subset*. drop individual genotype information (after subsetting if *-s* option is set) *-h, --header-only*:: - output the VCF header only + output the VCF header only (see also *<>*) *-H, --no-header*:: suppress the header in VCF output @@ -2984,12 +3162,18 @@ Convert between VCF and BCF. Former *bcftools subset*. 
*-R, --regions-file* 'file':: see *<>* +*--regions-overlap* '0'|'1'|'2':: + see *<>* + *-t, --targets* 'chr'|'chr:pos'|'chr:from-to'|'chr:from-'[,...]:: see *<>* *-T, --targets-file* 'file':: see *<>* +*--targets-overlap* '0'|'1'|'2':: + see *<>* + *--threads* 'INT':: see *<>* @@ -3020,7 +3204,7 @@ Convert between VCF and BCF. Former *bcftools subset*. Note that filter options below dealing with counting the number of alleles will, for speed, first check for the values of AC and AN in the INFO column to avoid parsing all the genotype (FORMAT/GT) fields in the VCF. This means -that a filter like '--min-af 0.1' will be calculated from INFO/AC and INFO/AN +that a filter like '--min-af 0.1' will be calculated from INFO/AC and INFO/AN when available or FORMAT/GT otherwise. However, it will not attempt to use any other existing field, like INFO/AF for example. For that, use '--exclude AF<0.1' instead. @@ -3053,10 +3237,10 @@ when piping!) *-f, --apply-filters* 'LIST':: see *<>* -*-g, --genotype* [^]['hom'|'het'|'miss']:: +*-g, --genotype* [^]['hom'|'het'|'miss']:: include only sites with one or more homozygous ('hom'), heterozygous - ('het') or missing ('miss') genotypes. When prefixed with '^', the logic - is reversed; thus '^het' excludes sites with heterozygous genotypes. + ('het') or missing ('miss') genotypes. When prefixed with '^', the logic + is reversed; thus '^het' excludes sites with heterozygous genotypes. *-i, --include* 'EXPRESSION':: include sites for which 'EXPRESSION' is true. For valid expressions see @@ -3239,7 +3423,7 @@ to require that all alleles are of the given type. Compare * array subscripts (0-based), "*" for any element, "-" to indicate a range. Note that for querying FORMAT vectors, the colon ":" can be used to select a sample and an element of the vector, as shown in the examples below - + INFO/AF[0] > 0.3 .. first AF value bigger than 0.3 FORMAT/AD[0:0] > 30 .. first AD value of the first sample bigger than 30 FORMAT/AD[0:1] .. 
first sample, second AD value @@ -3352,7 +3536,7 @@ used on the result. For example, when querying "TAG=1,2,3,4", it will be evaluat TYPE="snp" && QUAL>=10 && (DP4[2]+DP4[3] > 2) - COUNT(GT="hom")=0 .. no homozygous genotypes at the site + COUNT(GT="hom")=0 .. no homozygous genotypes at the site AVG(GQ)>50 .. average (arithmetic mean) of genotype qualities bigger than 50 diff --git a/filter.c b/filter.c index 3c451950f..7ff006ebb 100644 --- a/filter.c +++ b/filter.c @@ -1,6 +1,6 @@ /* filter.c -- filter expressions. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2022 Genome Research Ltd. Author: Petr Danecek @@ -152,6 +152,8 @@ struct _filter_t static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 }; #define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcpis" // this is only for debugging, not maintained diligently +static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok); + // Return negative values if it is a function with variable number of arguments static int filters_next_token(char **str, int *len) { @@ -471,16 +473,15 @@ static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1 } static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line) { - // multiple IDs not supported yet (easy to add though) - if ( rtok->tok_type!=TOK_EQ && rtok->tok_type!=TOK_NE ) - error("Only == and != operators are supported for ID\n"); - if ( btok->hash ) { token_t *tmp = atok; atok = btok; btok = tmp; } if ( atok->hash ) { + if ( rtok->tok_type!=TOK_EQ && rtok->tok_type!=TOK_NE ) + error("Only == and != operators are supported for strings read from a file\n"); + int ret = khash_str2int_has_key(atok->hash, line->d.id); if ( rtok->tok_type==TOK_NE ) ret = ret ? 
0 : 1; rtok->pass_site = ret; @@ -491,8 +492,19 @@ static void filters_cmp_id(token_t *atok, token_t *btok, token_t *rtok, bcf1_t * if ( rtok->tok_type==TOK_EQ ) rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 0 : 1; - else + else if ( rtok->tok_type==TOK_NE ) rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 1 : 0; + else + { + if ( rtok->tok_type!=TOK_LIKE && rtok->tok_type!=TOK_NLIKE ) + error("Only the following operators are supported for querying ID: ==, !=, ~, !~; the operator type %d is not supported (%p %p)\n", + rtok->tok_type,atok->regex,btok->regex); + + regex_t *regex = atok->regex ? atok->regex : (btok->regex ? btok->regex : NULL); + if ( !regex ) error("fixme: regex initialization failed\n"); + rtok->pass_site = regexec(regex,line->d.id, 0,NULL,0) ? 0 : 1; + if ( rtok->tok_type==TOK_NLIKE ) rtok->pass_site = rtok->pass_site ? 0 : 1; + } } /** @@ -1902,7 +1914,11 @@ static int func_phred(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stac } inline static void tok_init_values(token_t *atok, token_t *btok, token_t *rtok) { - token_t *tok = atok->nvalues > btok->nvalues ? atok : btok; + token_t *tok; + if ( (atok->nsamples || btok->nsamples) && (!atok->nsamples || !btok->nsamples) ) + tok = atok->nsamples ? atok : btok; + else + tok = atok->nvalues > btok->nvalues ? 
atok : btok; rtok->nvalues = tok->nvalues; rtok->nval1 = tok->nval1; hts_expand(double, rtok->nvalues, rtok->mvalues, rtok->values); diff --git a/main.c b/main.c index f89271108..3a0d557b7 100644 --- a/main.c +++ b/main.c @@ -43,6 +43,7 @@ int main_vcfsom(int argc, char *argv[]); int main_vcfnorm(int argc, char *argv[]); int main_vcfgtcheck(int argc, char *argv[]); int main_vcfview(int argc, char *argv[]); +int main_vcfhead(int argc, char *argv[]); int main_vcfcall(int argc, char *argv[]); int main_vcfannotate(int argc, char *argv[]); int main_vcfroh(int argc, char *argv[]); @@ -55,6 +56,7 @@ int main_polysomy(int argc, char *argv[]); #endif #ifdef ENABLE_BCF_PLUGINS int main_plugin(int argc, char *argv[]); +int count_plugins(void); #endif int main_consensus(int argc, char *argv[]); int main_csq(int argc, char *argv[]); @@ -100,6 +102,10 @@ static cmd_t cmds[] = .alias = "convert", .help = "convert VCF/BCF files to different formats and back" }, + { .func = main_vcfhead, + .alias = "head", + .help = "view VCF/BCF file headers" + }, { .func = main_vcfisec, .alias = "isec", .help = "intersections of VCF/BCF files" @@ -225,6 +231,14 @@ static void usage(FILE *fp) if ( cmds[i].func && cmds[i].help[0]!='-' ) fprintf(fp, " %-12s %s\n", cmds[i].alias, cmds[i].help); i++; } +#if ENABLE_BCF_PLUGINS + fprintf(fp,"\n -- Plugins (collection of programs for calling, file manipulation & analysis)\n"); + int nplugins = count_plugins(); + if ( nplugins ) + fprintf(fp," %d plugins available, run \"bcftools plugin -lv\" to see a complete list\n", nplugins); + else + fprintf(fp," 0 plugins available, run \"bcftools plugin -l\" for help\n"); +#endif fprintf(fp,"\n"); fprintf(fp, " Most commands accept VCF, bgzipped VCF, and BCF with the file type detected\n" @@ -251,7 +265,7 @@ int main(int argc, char *argv[]) if (argc < 2) { usage(stderr); return 1; } if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) { - printf("bcftools %s\nUsing 
htslib %s\nCopyright (C) 2021 Genome Research Ltd.\n", bcftools_version(), hts_version()); + printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2022 Genome Research Ltd.\n", bcftools_version(), hts_version()); #if USE_GPL printf("License GPLv3+: GNU GPL version 3 or later \n"); #else diff --git a/mpileup.c b/mpileup.c index eb0cc6425..fd5aa510e 100644 --- a/mpileup.c +++ b/mpileup.c @@ -1,6 +1,6 @@ /* mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools - Copyright (C) 2008-2021 Genome Research Ltd. + Copyright (C) 2008-2022 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. Author: Heng Li @@ -69,7 +69,7 @@ typedef struct _mplp_pileup_t mplp_pileup_t; typedef struct { int min_mq, flag, min_baseQ, max_baseQ, delta_baseQ, capQ_thres, max_depth, max_indel_depth, max_read_len, fmt_flag, ambig_reads; - int rflag_require, rflag_filter, output_type; + int rflag_skip_any_unset, rflag_skip_all_unset, rflag_skip_any_set, rflag_skip_all_set, output_type; int openQ, extQ, tandemQ, min_support, indel_win_size; // for indels double min_frac; // for indels double indel_bias; @@ -197,8 +197,10 @@ static int mplp_func(void *data, bam1_t *b) // The 'B' cigar operation is not part of the specification, considering as obsolete. 
// bam_remove_B(b); if (b->core.tid < 0 || (b->core.flag&BAM_FUNMAP)) continue; // exclude unmapped reads - if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) continue; - if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) continue; + if (ma->conf->rflag_skip_any_unset && (ma->conf->rflag_skip_any_unset&b->core.flag)!=ma->conf->rflag_skip_any_unset) continue; + if (ma->conf->rflag_skip_all_set && (ma->conf->rflag_skip_all_set&b->core.flag)==ma->conf->rflag_skip_all_set) continue; + if (ma->conf->rflag_skip_all_unset && !(ma->conf->rflag_skip_all_unset&b->core.flag)) continue; + if (ma->conf->rflag_skip_any_set && ma->conf->rflag_skip_any_set&b->core.flag) continue; if (ma->conf->bed) { // test overlap @@ -1087,8 +1089,10 @@ static void list_annotations(FILE *fp) static void print_usage(FILE *fp, const mplp_conf_t *mplp) { - char *tmp_require = bam_flag2str(mplp->rflag_require); - char *tmp_filter = bam_flag2str(mplp->rflag_filter); + char *tmp_skip_all_set = bam_flag2str(mplp->rflag_skip_all_set); + char *tmp_skip_any_unset = bam_flag2str(mplp->rflag_skip_any_unset); + char *tmp_skip_all_unset = bam_flag2str(mplp->rflag_skip_all_unset); + char *tmp_skip_any_set = bam_flag2str(mplp->rflag_skip_any_set); // Display usage information, formatted for the standard 80 columns. // (The unusual string formatting here aids the readability of this @@ -1122,10 +1126,12 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " -r, --regions REG[,...] 
Comma separated list of regions in which pileup is generated\n" " -R, --regions-file FILE Restrict to regions listed in a file\n" " --ignore-RG Ignore RG tags (one BAM = one sample)\n" - " --rf, --incl-flags STR|INT Required flags: skip reads with mask bits unset [%s]\n", tmp_require); + " --ls, --skip-all-set STR|INT Skip reads with all of the bits set []\n"); fprintf(fp, - " --ff, --excl-flags STR|INT Filter flags: skip reads with mask bits set\n" - " [%s]\n", tmp_filter); + " --ns, --skip-any-set STR|INT Skip reads with any of the bits set [%s]\n", tmp_skip_any_set); + fprintf(fp, + " --lu, --skip-all-unset STR|INT Skip reads with all of the bits unset []\n" + " --nu, --skip-any-unset STR|INT Skip reads with any of the bits unset []\n"); fprintf(fp, " -s, --samples LIST Comma separated list of samples to include\n" " -S, --samples-file FILE File of samples to include\n" @@ -1184,8 +1190,10 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " bcftools mpileup -Ou -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n" "\n"); - free(tmp_require); - free(tmp_filter); + free(tmp_skip_all_set); + free(tmp_skip_any_unset); + free(tmp_skip_all_unset); + free(tmp_skip_any_set); } int main_mpileup(int argc, char *argv[]) @@ -1206,7 +1214,7 @@ int main_mpileup(int argc, char *argv[]) mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_REALN_PARTIAL | MPLP_SMART_OVERLAPS; mplp.argc = argc; mplp.argv = argv; - mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; + mplp.rflag_skip_any_set = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; mplp.output_fname = NULL; mplp.output_type = FT_VCF; mplp.record_cmd_line = 1; @@ -1222,10 +1230,16 @@ int main_mpileup(int argc, char *argv[]) static const struct option lopts[] = { - {"rf", required_argument, NULL, 1}, // require flag - {"ff", required_argument, NULL, 2}, // filter flag - {"incl-flags", required_argument, NULL, 1}, - {"excl-flags", required_argument, NULL, 2}, + {"nu", 
required_argument, NULL, 16}, + {"lu", required_argument, NULL, 17}, + {"rf", required_argument, NULL, 17}, // old --rf, --incl-flags = --lu, --skip-all-unset + {"ns", required_argument, NULL, 18}, + {"ff", required_argument, NULL, 18}, // old --ff, --excl-flags = --ns, --skip-any-set + {"ls", required_argument, NULL, 19}, + {"skip-any-unset", required_argument, NULL, 16}, + {"skip-all-unset", required_argument, NULL, 17}, + {"skip-any-set", required_argument, NULL, 18}, + {"skip-all-set", required_argument, NULL, 19}, {"output", required_argument, NULL, 3}, {"open-prob", required_argument, NULL, 4}, {"ignore-RG", no_argument, NULL, 5}, @@ -1287,13 +1301,21 @@ int main_mpileup(int argc, char *argv[]) while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) { switch (c) { case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break; - case 1 : - mplp.rflag_require = bam_str2flag(optarg); - if ( mplp.rflag_require<0 ) { fprintf(stderr,"Could not parse --rf %s\n", optarg); return 1; } + case 16 : + mplp.rflag_skip_any_unset = bam_str2flag(optarg); + if ( mplp.rflag_skip_any_unset <0 ) { fprintf(stderr,"Could not parse --nf %s\n", optarg); return 1; } + break; + case 17 : + mplp.rflag_skip_all_unset = bam_str2flag(optarg); + if ( mplp.rflag_skip_all_unset<0 ) { fprintf(stderr,"Could not parse --if %s\n", optarg); return 1; } + break; + case 18 : + mplp.rflag_skip_any_set = bam_str2flag(optarg); + if ( mplp.rflag_skip_any_set <0 ) { fprintf(stderr,"Could not parse --ef %s\n", optarg); return 1; } break; - case 2 : - mplp.rflag_filter = bam_str2flag(optarg); - if ( mplp.rflag_filter<0 ) { fprintf(stderr,"Could not parse --ff %s\n", optarg); return 1; } + case 19 : + mplp.rflag_skip_all_set = bam_str2flag(optarg); + if ( mplp.rflag_skip_all_set <0 ) { fprintf(stderr,"Could not parse --df %s\n", optarg); return 1; } break; case 3 : mplp.output_fname = optarg; break; case 4 : mplp.openQ = atoi(optarg); break; diff --git 
a/plugins/contrast.c b/plugins/contrast.c index 8291092d0..71d9d3d45 100644 --- a/plugins/contrast.c +++ b/plugins/contrast.c @@ -515,16 +515,12 @@ int run(int argc, char **argv) } break; case 3 : - if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + args->regions_overlap = parse_overlap_option(optarg); + if ( args->regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 4 : - if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + args->targets_overlap = parse_overlap_option(optarg); + if ( args->targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 'h': case '?': diff --git a/plugins/fill-tags.c b/plugins/fill-tags.c index 69eda4ac8..414808214 100644 --- a/plugins/fill-tags.c +++ b/plugins/fill-tags.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2015-2021 Genome Research Ltd. + Copyright (c) 2015-2022 Genome Research Ltd. 
Author: Petr Danecek @@ -509,8 +509,8 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) if ( args->tags & SET_END ) bcf_hdr_printf(args->out_hdr, "##INFO="); if ( args->tags & SET_TYPE ) bcf_hdr_printf(args->out_hdr, "##INFO="); if ( args->tags & SET_ExcHet ) hdr_append(args, "##INFO="); - if ( args->tags & SET_VAF ) bcf_hdr_append(args->out_hdr, "##FORMAT=tags & SET_VAF1 ) bcf_hdr_append(args->out_hdr, "##FORMAT=tags & SET_VAF ) bcf_hdr_append(args->out_hdr, "##FORMAT="); + if ( args->tags & SET_VAF1 ) bcf_hdr_append(args->out_hdr, "##FORMAT="); return 0; } diff --git a/plugins/fixref.c b/plugins/fixref.c index 16f6110d5..94c2419f8 100644 --- a/plugins/fixref.c +++ b/plugins/fixref.c @@ -1,19 +1,19 @@ /* The MIT License - Copyright (c) 2016-2019 Genome Research Ltd. + Copyright (c) 2016-2022 Genome Research Ltd. Author: Petr Danecek - + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE @@ -93,6 +93,7 @@ #define MODE_FLIP2FWD 3 #define MODE_USE_ID 4 #define MODE_REF_ALT 5 +#define MODE_FLIP_ALL 6 typedef struct { @@ -101,7 +102,7 @@ typedef struct } marker_t; -KHASH_MAP_INIT_INT(i2m, marker_t) +KHASH_MAP_INIT_STR(i2m, marker_t) typedef khash_t(i2m) i2m_t; typedef struct @@ -127,15 +128,16 @@ const char *about(void) const char *usage(void) { - return + return "\n" "About: This tool helps to determine and fix strand orientation.\n" " Currently the following modes are recognised:\n" - " flip .. flip REF/ALT columns and GTs for non-ambiguous SNPs and ignore the rest\n" - " id .. swap REF/ALT columns and GTs using the ID column to determine the REF allele\n" - " ref-alt .. swap REF/ALT columns to match the reference but not modify the genotypes\n" - " stats .. collect and print stats\n" - " top .. convert from Illumina TOP strand to fwd\n" + " flip .. flip REF/ALT columns and GTs for non-ambiguous SNPs and ignore the rest\n" + " flip-all .. flip REF/ALT columns and GTs for all SNPs, including ambiguous (A/T, C/G) sites\n" + " id .. swap REF/ALT columns and GTs using the ID column to determine the REF allele\n" + " ref-alt .. swap REF/ALT columns to match the reference but not modify the genotypes\n" + " stats .. collect and print stats\n" + " top .. convert from Illumina TOP strand to fwd\n" "\n" " WARNING: Do not use the program blindly, make an effort to\n" " understand what strand convention your data uses! 
Make sure\n" @@ -190,14 +192,15 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) int c; while ((c = getopt_long(argc, argv, "?hf:m:di:",loptions,NULL)) >= 0) { - switch (c) + switch (c) { - case 'm': - if ( !strcasecmp(optarg,"top") ) args.mode = MODE_TOP2FWD; - else if ( !strcasecmp(optarg,"flip") ) args.mode = MODE_FLIP2FWD; - else if ( !strcasecmp(optarg,"id") ) args.mode = MODE_USE_ID; - else if ( !strcasecmp(optarg,"ref-alt") ) args.mode = MODE_REF_ALT; - else if ( !strcasecmp(optarg,"stats") ) args.mode = MODE_STATS; + case 'm': + if ( !strcasecmp(optarg,"top") ) args.mode = MODE_TOP2FWD; + else if ( !strcasecmp(optarg,"flip") ) args.mode = MODE_FLIP2FWD; + else if ( !strcasecmp(optarg,"flip-all") ) args.mode = MODE_FLIP_ALL; + else if ( !strcasecmp(optarg,"id") ) args.mode = MODE_USE_ID; + else if ( !strcasecmp(optarg,"ref-alt") ) args.mode = MODE_REF_ALT; + else if ( !strcasecmp(optarg,"stats") ) args.mode = MODE_STATS; else error("The source strand convention not recognised: %s\n", optarg); break; case 'i': args.dbsnp_fname = optarg; args.mode = MODE_USE_ID; break; @@ -241,7 +244,7 @@ static bcf1_t *set_ref_alt(args_t *args, bcf1_t *rec, const char ref, const char } } bcf_update_genotypes(args->hdr,rec,args->gts,args->ngts); - + return rec; } @@ -257,21 +260,6 @@ static inline int nt2int(char nt) #define int2nt(x) "ACGT"[x] #define revint(x) ("3210"[x]-'0') -static inline uint32_t parse_rsid(char *name) -{ - if ( name[0]!='r' || name[1]!='s' ) - { - name = strstr(name, "rs"); - if ( !name ) return 0; - } - char *tmp; - name += 2; - uint64_t id = strtol(name, &tmp, 10); - if ( tmp==name || *tmp ) return 0; - if ( id > UINT32_MAX ) error("FIXME: the ID is too big for uint32_t: %s\n", name-2); - return id; -} - static int fetch_ref(args_t *args, bcf1_t *rec) { // Get the reference allele @@ -292,9 +280,17 @@ static int fetch_ref(args_t *args, bcf1_t *rec) return ir; } +static void dbsnp_destroy(args_t *args) +{ + if ( !args->i2m ) return; + 
khint_t k; + for (k = 0; k < kh_end(args->i2m); ++k) + if (kh_exist(args->i2m, k)) free((char*)kh_key(args->i2m, k)); + kh_destroy(i2m, args->i2m); +} static void dbsnp_init(args_t *args, const char *chr) { - if ( args->i2m ) kh_destroy(i2m, args->i2m); + dbsnp_destroy(args); args->i2m = kh_init(i2m); bcf_srs_t *sr = bcf_sr_init(); if ( bcf_sr_set_regions(sr, chr, 0) != 0 ) goto done; @@ -308,13 +304,13 @@ static void dbsnp_init(args_t *args, const char *chr) int ref = nt2int(rec->d.allele[0][0]); if ( ref<0 ) continue; // non-[ACGT] base - uint32_t id = parse_rsid(rec->d.id); - if ( !id ) continue; + if ( !rec->d.id || (rec->d.id[0]=='.' && !rec->d.id[1]) ) continue; + char *id = strdup(rec->d.id); int ret, k; k = kh_put(i2m, args->i2m, id, &ret); - if ( ret<0 ) error("An error occurred while inserting the key %u\n", id); - if ( ret==0 ) continue; // skip ambiguous id + if ( ret<0 ) error("An error occurred while inserting the key \"%s\"\n", id); + if ( ret==0 ) { free(id); continue; } // skip ambiguous id kh_val(args->i2m, k).pos = (uint32_t)rec->pos; kh_val(args->i2m, k).ref = ref; } @@ -325,14 +321,14 @@ static void dbsnp_init(args_t *args, const char *chr) static bcf1_t *dbsnp_check(args_t *args, bcf1_t *rec, int ir, int ia, int ib) { int k, ref,pos; - uint32_t id = parse_rsid(rec->d.id); - if ( !id ) goto no_info; + char *id = rec->d.id; + if ( !id || (id[0]=='.' && !id[1]) ) goto no_info; k = kh_get(i2m, args->i2m, id); if ( k==kh_end(args->i2m) ) goto no_info; pos = (int)kh_val(args->i2m, k).pos; - if ( pos != rec->pos ) + if ( pos != rec->pos ) { rec->pos = pos; ir = fetch_ref(args, rec); @@ -340,7 +336,7 @@ static bcf1_t *dbsnp_check(args_t *args, bcf1_t *rec, int ir, int ia, int ib) } ref = kh_val(args->i2m, k).ref; - if ( ref!=ir ) + if ( ref!=ir ) error("Reference base mismatch at %s:%"PRId64" .. 
%c vs %c\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,int2nt(ref),int2nt(ir)); if ( ia==ref ) return rec; @@ -435,10 +431,10 @@ bcf1_t *process(bcf1_t *rec) if ( ir==revint(ib) ) { args.nflip_swap++; return set_ref_alt(&args,rec,int2nt(revint(ib)),int2nt(revint(ia)),0); } error("FIXME: this should not happen %s:%"PRId64"\n", bcf_seqname(args.hdr,rec),(int64_t) rec->pos+1); } - else if ( args.mode==MODE_FLIP2FWD ) + else if ( args.mode==MODE_FLIP2FWD || args.mode==MODE_FLIP_ALL ) { int pair = 1 << ia | 1 << ib; - if ( pair==0x9 || pair==0x6 ) // skip ambiguous pairs: A/T or C/G + if ( args.mode==MODE_FLIP2FWD && (pair==0x9 || pair==0x6) ) // skip ambiguous pairs: A/T or C/G { args.nunresolved++; return args.discard ? NULL : ret; @@ -491,7 +487,7 @@ bcf1_t *process(bcf1_t *rec) break; } free(ref); - + if ( strand==1 ) { if ( ir==ia ) return ret; @@ -524,15 +520,15 @@ bcf1_t *process(bcf1_t *rec) return ret; } -int top_mask[4][4] = -{ +int top_mask[4][4] = +{ {0,1,1,1}, {0,0,1,0}, {0,0,0,0}, {0,0,0,0}, }; -int bot_mask[4][4] = -{ +int bot_mask[4][4] = +{ {0,0,0,0}, {0,0,0,0}, {0,1,0,0}, @@ -587,5 +583,5 @@ void destroy(void) free(args.gts); if ( args.fai ) fai_destroy(args.fai); - if ( args.i2m ) kh_destroy(i2m, args.i2m); + dbsnp_destroy(&args); } diff --git a/plugins/mendelian.c b/plugins/mendelian.c index 6c3befbe6..3f5128448 100644 --- a/plugins/mendelian.c +++ b/plugins/mendelian.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2015-2021 Genome Research Ltd. + Copyright (c) 2015-2022 Genome Research Ltd. 
Author: Petr Danecek @@ -487,6 +487,7 @@ int run(int argc, char **argv) if ( trio_file ) { list = hts_readlist(trio_file, 1, &n); + if ( !list ) error("Error: could not read file %s\n",trio_file); args.ntrios = n; args.trios = (trio_t*) calloc(n,sizeof(trio_t)); for (i=0; iregion ) { args->sr->require_index = 1; + bcf_sr_set_opt(args->sr, BCF_SR_REGIONS_OVERLAP, args->regions_overlap); if ( bcf_sr_set_regions(args->sr, args->region, args->region_is_file)<0 ) error("Failed to read the regions: %s\n", args->region); } - if ( args->target && bcf_sr_set_targets(args->sr, args->target, args->target_is_file, 0)<0 ) - error("Failed to read the targets: %s\n", args->target); + if ( args->target ) + { + bcf_sr_set_opt(args->sr, BCF_SR_TARGETS_OVERLAP, args->targets_overlap); + if ( bcf_sr_set_targets(args->sr, args->target, args->target_is_file, 0)<0 ) + error("Failed to read the targets: %s\n", args->target); + } if ( bcf_sr_set_threads(args->sr, args->n_threads)<0 ) error("Failed to create threads\n"); if ( !bcf_sr_add_reader(args->sr, args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); @@ -285,7 +293,7 @@ static void process(args_t *args) args->chunk_cnt++; } } else { - if ( regidx_overlap(args->reg_idx, bcf_hdr_id2name(hdr, rec->rid), rec->pos, rec->pos + rec->rlen-1, args->reg_itr) ) { + if ( regidx_overlap(args->reg_idx, bcf_hdr_id2name(hdr, rec->rid), rec->pos, rec->pos, args->reg_itr) ) { while (regitr_overlap(args->reg_itr)) { int idx = regitr_payload(args->reg_itr, int); set = &args->sets[idx]; @@ -304,6 +312,8 @@ int run(int argc, char **argv) args->argc = argc; args->argv = argv; args->output_type = FT_VCF; args->record_cmd_line = 1; + args->regions_overlap = 1; + args->targets_overlap = 0; args->clevel = -1; static struct option loptions[] = { @@ -315,14 +325,16 @@ int run(int argc, char **argv) {"threads",required_argument,NULL,2}, {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, - {"targets", 
required_argument, NULL, 't'}, - {"targets-file", required_argument, NULL, 'T'}, + {"regions-overlap",required_argument,NULL,3}, + {"targets", required_argument, NULL,'t'}, + {"targets-file", required_argument, NULL,'T'}, + {"targets-overlap",required_argument,NULL,4}, {"nsites-per-chunk",required_argument,NULL,'n'}, {"scatter",required_argument,NULL,'s'}, {"scatter-file",required_argument,NULL,'S'}, {"extra",required_argument,NULL,'x'}, {"prefix",required_argument,NULL,'p'}, - {"hts-opts",required_argument,NULL,3}, + {"hts-opts",required_argument,NULL,5}, {NULL,0,NULL,0} }; int c; @@ -340,28 +352,36 @@ int run(int argc, char **argv) case 1 : args->record_cmd_line = 0; break; case 'o': args->output_dir = optarg; break; case 'O': - switch (optarg[0]) { - case 'b': args->output_type = FT_BCF_GZ; break; - case 'u': args->output_type = FT_BCF; break; - case 'z': args->output_type = FT_VCF_GZ; break; - case 'v': args->output_type = FT_VCF; break; - default: - { - args->clevel = strtol(optarg,&tmp,10); - if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); - } - } - if ( optarg[1] ) - { - args->clevel = strtol(optarg+1,&tmp,10); - if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); - } - break; + switch (optarg[0]) { + case 'b': args->output_type = FT_BCF_GZ; break; + case 'u': args->output_type = FT_BCF; break; + case 'z': args->output_type = FT_VCF_GZ; break; + case 'v': args->output_type = FT_VCF; break; + default: + { + args->clevel = strtol(optarg,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); + } + } + if ( optarg[1] ) + { + args->clevel = strtol(optarg+1,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --compression-level %s\n", optarg+1); + } + break; case 2 : args->n_threads = strtol(optarg, 0, 0); break; case 'r': args->region = 
optarg; break; case 'R': args->region = optarg; args->region_is_file = 1; break; + case 3 : + args->regions_overlap = parse_overlap_option(optarg); + if ( args->regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); + break; case 't': args->target = optarg; break; case 'T': args->target = optarg; args->target_is_file = 1; break; + case 4 : + args->targets_overlap = parse_overlap_option(optarg); + if ( args->targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); + break; case 'n': args->nsites = strtod(optarg, &tmp); if ( tmp==optarg || *tmp ) error("Could not parse: --nsites-per-chunk %s\n", optarg); @@ -371,7 +391,7 @@ int run(int argc, char **argv) case 'S': args->scatter = optarg; args->scatter_is_file = 1; break; case 'x': args->extra = optarg; break; case 'p': args->prefix = optarg; break; - case 3 : args->hts_opts = hts_readlist(optarg, 0, &args->nhts_opts); break; + case 5 : args->hts_opts = hts_readlist(optarg, 0, &args->nhts_opts); break; case 'h': case '?': default: error("%s", usage_text()); break; diff --git a/plugins/setGT.c b/plugins/setGT.c index 27aa146fd..2de21102c 100644 --- a/plugins/setGT.c +++ b/plugins/setGT.c @@ -487,7 +487,7 @@ bcf1_t *process(bcf1_t *rec) else if ( args->tgt_mask>_QUERY ) { int pass_site = filter_test(args->filter,rec,(const uint8_t **)&args->smpl_pass); - if ( pass_site && args->filter_logic==FLT_EXCLUDE ) + if ( args->filter_logic==FLT_EXCLUDE ) { // -i can include a site but exclude a sample, -e exclude a site but include a sample if ( pass_site ) diff --git a/plugins/split-vep.c b/plugins/split-vep.c index 9ad537a02..86fa92148 100644 --- a/plugins/split-vep.c +++ b/plugins/split-vep.c @@ -1120,16 +1120,12 @@ int run(int argc, char **argv) } break; case 3 : - if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2; - else error("Could not parse: 
--regions-overlap %s\n",optarg); + args->regions_overlap = parse_overlap_option(optarg); + if ( args->regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 4 : - if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + args->targets_overlap = parse_overlap_option(optarg); + if ( args->targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 'h': case '?': diff --git a/plugins/split.c b/plugins/split.c index a56265683..a362e0ed9 100644 --- a/plugins/split.c +++ b/plugins/split.c @@ -683,16 +683,12 @@ int run(int argc, char **argv) } break; case 2 : - if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + args->regions_overlap = parse_overlap_option(optarg); + if ( args->regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 3 : - if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + args->targets_overlap = parse_overlap_option(optarg); + if ( args->targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 'h': case '?': diff --git a/plugins/trio-dnm2.c b/plugins/trio-dnm2.c index 08d4683a1..a23bc01ca 100644 --- a/plugins/trio-dnm2.c +++ b/plugins/trio-dnm2.c @@ -953,12 +953,13 @@ static double process_trio_DNG(args_t *args, priors_t *priors, int nals, double #endif return log2phred(subtract_log(0,max-sum)); } -static int 
process_trio_naive(args_t *args, priors_t *priors, int nals, int32_t gts[3]) +static int process_trio_naive(args_t *args, priors_t *priors, int nals, int32_t gts[3], int *denovo_allele) { int fi = seq3[gts[iFATHER]]; int mi = seq3[gts[iMOTHER]]; int ci = seq3[gts[iCHILD]]; assert( fi!=-1 && mi!=-1 && ci!=-1 ); + *denovo_allele = priors->denovo_allele[fi][mi][ci]; return priors->denovo[fi][mi][ci]; } static int test_filters(args_t *args, bcf1_t *rec) @@ -1158,7 +1159,7 @@ static void set_trio_QS_noisy(args_t *args, trio_t *trio, double *pqs[3], int nq #endif } } -static int set_trio_GT(args_t *args, trio_t *trio, int32_t gts[3], int ngts) +static int set_trio_GT(args_t *args, trio_t *trio, int32_t gts[3], int ngts, int ignore_father) { int j,k; for (j=0; j<3; j++) // iFATHER,iMOTHER,iCHILD @@ -1166,16 +1167,23 @@ static int set_trio_GT(args_t *args, trio_t *trio, int32_t gts[3], int ngts) int32_t *src = args->gt + ngts * trio->idx[j]; for (k=0; k0 && gts[j]<13); } - if ( !gts[j] ) return -1; + if ( !gts[j] && (j!=iFATHER || !ignore_father) ) return -1; } return 0; } -static int set_trio_GT_many_alts(args_t *args, trio_t *trio, int32_t gts[3], int ngts, int nals) +static int set_trio_GT_many_alts(args_t *args, trio_t *trio, int32_t gts[3], int ngts, int nals, int ignore_father) { int i,j,k, nused = 0; hts_expand(int,nals,args->malt_idx,args->alt_idx); @@ -1185,9 +1193,15 @@ static int set_trio_GT_many_alts(args_t *args, trio_t *trio, int32_t gts[3], int int32_t *src = args->gt + ngts * trio->idx[j]; for (k=0; k= nals ) error("Error: FMT/GT contains incorrect allele \"%d\" at a site with %d alleles\n",ial,nals); if ( args->alt_idx[ial]==-1 ) { @@ -1219,21 +1233,25 @@ static void process_record_naive(args_t *args, bcf1_t *rec) int i, write_dnm = 0; for (i=0; idnm_qual_int[i] = bcf_int32_missing; + for (i=0; idnm_allele[i] = bcf_int32_missing; for (i=0; intrio; i++) { if ( args->filter && !args->trio[i].pass ) continue; - int32_t gts[3] = {0,0,0}; - int ret = 
rec->n_allele<=4 ? set_trio_GT(args,&args->trio[i],gts,ngts) : set_trio_GT_many_alts(args,&args->trio[i],gts,ngts,rec->n_allele); - if ( ret<0 ) continue; - + int ignore_father = 0; // father is irrelevant for male proband on chrX and can have missing GT priors_t *priors; if ( !is_chrX ) priors = &args->priors; - else if ( args->trio[i].is_male ) priors = &args->priors_X; + else if ( args->trio[i].is_male ) priors = &args->priors_X, ignore_father = 1; else priors = &args->priors_XX; - double is_dnm = process_trio_naive(args, priors, rec->n_allele, gts); + int32_t gts[3] = {0,0,0}; + int ret = rec->n_allele<=4 ? set_trio_GT(args,&args->trio[i],gts,ngts,ignore_father) : set_trio_GT_many_alts(args,&args->trio[i],gts,ngts,rec->n_allele,ignore_father); + if ( ret<0 ) continue; + + int dnm_allele; + double is_dnm = process_trio_naive(args, priors, rec->n_allele, gts, &dnm_allele); args->dnm_qual_int[ args->trio[i].idx[iCHILD] ] = is_dnm; + args->dnm_allele[ args->trio[i].idx[iCHILD] ] = dnm_allele; if ( is_dnm ) write_dnm = 1; } if ( write_dnm ) @@ -1241,8 +1259,12 @@ static void process_record_naive(args_t *args, bcf1_t *rec) int ret = bcf_update_format_int32(args->hdr_out,rec,args->dnm_score_tag,args->dnm_qual_int,nsmpl); if ( ret ) error("Failed to write FORMAT/%s at %s:%"PRId64"\n", args->dnm_score_tag, bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); + ret = bcf_update_format_int32(args->hdr_out,rec,args->dnm_allele_tag,args->dnm_allele,nsmpl); + if ( ret ) + error("Failed to write FORMAT/%s at %s:%"PRId64"\n", args->dnm_allele_tag,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); } - if ( bcf_write(args->out_fh, args->hdr_out, rec)!=0 ) error("[%s] Error: cannot write to %s at %s:%"PRId64"\n", __func__,args->output_fname,bcf_seqname(args->hdr,rec),(int64_t)rec->pos+1); + if ( bcf_write(args->out_fh, args->hdr_out, rec)!=0 ) + error("[%s] Error: cannot write to %s at %s:%"PRId64"\n", __func__,args->output_fname,bcf_seqname(args->hdr,rec),(int64_t)rec->pos+1); } 
static void process_record(args_t *args, bcf1_t *rec) { @@ -1571,16 +1593,12 @@ int run(int argc, char **argv) case 12 : args->record_cmd_line = 0; break; case 13 : args->with_pad = 1; break; case 14 : - if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + args->regions_overlap = parse_overlap_option(optarg); + if ( args->regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 15 : - if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + args->targets_overlap = parse_overlap_option(optarg); + if ( args->targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 'X': args->chrX_list_str = optarg; break; case 'u': set_option(args,optarg); break; diff --git a/polysomy.c b/polysomy.c index 943515b8b..1a99f98af 100644 --- a/polysomy.c +++ b/polysomy.c @@ -693,16 +693,12 @@ int main_polysomy(int argc, char *argv[]) case 1 : args->ra_rr_scaling = 0; break; case 2 : args->force_cn = atoi(optarg); break; case 3 : - if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + args->regions_overlap = parse_overlap_option(optarg); + if ( args->regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 4 : - if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2; - else 
error("Could not parse: --targets-overlap %s\n",optarg); + args->targets_overlap = parse_overlap_option(optarg); + if ( args->targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 'n': args->nbins = atoi(optarg); break; case 'S': args->smooth = atoi(optarg); break; diff --git a/reheader.c b/reheader.c index ae7c6226e..4458f27bc 100644 --- a/reheader.c +++ b/reheader.c @@ -1,6 +1,6 @@ /* reheader.c -- reheader subcommand. - Copyright (C) 2014-2021 Genome Research Ltd. + Copyright (C) 2014-2022 Genome Research Ltd. Author: Petr Danecek @@ -142,14 +142,16 @@ static char *copy_and_update_contig_line(faidx_t *fai, char *line, void *chr_see } char *init_tmp_prefix(const char *tmp_prefix) { - char *prefix = NULL; + kstring_t prefix = {0,0,0}; if ( tmp_prefix ) { - int len = strlen(tmp_prefix); - prefix = (char*) calloc(len+7,1); - memcpy(prefix,tmp_prefix,len); - memcpy(prefix+len,"XXXXXX",6); + ksprintf(&prefix,"%sXXXXXX",tmp_prefix); + return prefix.s; } + + char *tmpdir = getenv("TMPDIR"); + if ( tmpdir ) + kputs(tmpdir, &prefix); else { #ifdef _WIN32 @@ -157,15 +159,13 @@ char *init_tmp_prefix(const char *tmp_prefix) int ret = GetTempPath(MAX_PATH, tmp_path); if (!ret || ret > MAX_PATH) error("Could not get the path to the temporary folder\n"); - if (strlen(tmp_path) + strlen("/bcftools.XXXXXX") >= MAX_PATH) - error("Full path to the temporary folder is too long\n"); - strcat(tmp_path, "/bcftools.XXXXXX"); - prefix = strdup(tmp_path); + kputs(tmp_path, &prefix); #else - prefix = strdup("/tmp/bcftools.XXXXXX"); + kputs("/tmp", &prefix); #endif } - return prefix; + kputs("/bcftools.XXXXXX", &prefix); + return prefix.s; } static void update_from_fai(args_t *args) { diff --git a/smpl_ilist.c b/smpl_ilist.c index d170db5ac..e3fbaccfb 100644 --- a/smpl_ilist.c +++ b/smpl_ilist.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2016, 2018 Genome Research Ltd. + Copyright (C) 2016-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -63,7 +63,10 @@ smpl_ilist_t *smpl_ilist_init(bcf_hdr_t *hdr, char *sample_list, int is_file, in char **list = hts_readlist(negate?sample_list+1:sample_list, is_file, &nlist); if ( !list ) error("Could not parse %s\n", sample_list); - // preserve the VCF order + if ( negate && (flags&SMPL_REORDER) ) flags &= ~SMPL_REORDER; + + // preserve the VCF order unless flags&SMPL_REORDER is set + int j = 0; int *tmp = (int*)calloc(bcf_hdr_nsamples(hdr),sizeof(int)); char **pair = NULL; for (i=0; in++; } + if ( flags & SMPL_REORDER ) + { + smpl->idx = tmp; + for (i=0; in = bcf_hdr_nsamples(hdr) - smpl->n; smpl->idx = (int*) malloc(sizeof(int)*smpl->n); - int j = 0; + j = 0; if ( !negate ) { if ( pair ) smpl->pair = (char**) calloc(smpl->n,sizeof(char*)); diff --git a/smpl_ilist.h b/smpl_ilist.h index 23a0e539f..79292c3ea 100644 --- a/smpl_ilist.h +++ b/smpl_ilist.h @@ -1,5 +1,5 @@ /* - Copyright (C) 2016 Genome Research Ltd. + Copyright (C) 2016-2021 Genome Research Ltd. Author: Petr Danecek @@ -36,6 +36,7 @@ #define SMPL_PAIR1 4 // two samples expected, the first is from the bcf hdr #define SMPL_PAIR2 8 // two samples expected, the second is from the bcf hdr #define SMPL_VERBOSE 16 // print warnings +#define SMPL_REORDER 32 // reorder samples as asked, sample_list[i] points to the VCF header index typedef struct { diff --git a/test/annotate.id.1.out b/test/annotate.id.1.out new file mode 100644 index 000000000..2fe40a802 --- /dev/null +++ b/test/annotate.id.1.out @@ -0,0 +1,10 @@ +##fileformat=VCFv4.1 +##FILTER= +##FORMAT= +##contig= +##reference=ref.fasta +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A +1 50 id50 C G . . . GT 0/0 +1 51 id51 C A . . . GT 0/0 +1 52 id52 C T . . . GT 0/0 +1 53 id53 C T . . . 
GT 0/1 diff --git a/test/annotate.id.2.out b/test/annotate.id.2.out new file mode 100644 index 000000000..580f628bb --- /dev/null +++ b/test/annotate.id.2.out @@ -0,0 +1,10 @@ +##fileformat=VCFv4.1 +##FILTER= +##FORMAT= +##contig= +##reference=ref.fasta +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A +1 50 id50 C G . . . GT 0/0 +1 51 id51 C A . . . GT 0/0 +1 52 id52 C T . . . GT 0/0 +1 53 id53 C A . . . GT 0/1 diff --git a/test/annotate.id.vcf b/test/annotate.id.vcf new file mode 100644 index 000000000..7496923be --- /dev/null +++ b/test/annotate.id.vcf @@ -0,0 +1,9 @@ +##fileformat=VCFv4.1 +##FORMAT= +##contig= +##reference=ref.fasta +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A +1 50 id50 C . . . . GT 0/0 +1 51 id51 C . . . . GT 0/0 +1 52 id52 C . . . . GT 0/0 +1 53 id53 C A . . . GT 0/1 diff --git a/test/annotate.olap.1.out b/test/annotate.olap.1.out new file mode 100644 index 000000000..ef7272439 --- /dev/null +++ b/test/annotate.olap.1.out @@ -0,0 +1,10 @@ +##fileformat=VCFv4.3 +##FILTER= +##reference=ref.fa +##contig= +##ALT= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 10 . C . . END=19;DB=cnv10_15,cnv18_19 +1 20 . C . . END=30;DB=cnv20_50 diff --git a/test/annotate.olap.2.out b/test/annotate.olap.2.out new file mode 100644 index 000000000..eab0ef4fc --- /dev/null +++ b/test/annotate.olap.2.out @@ -0,0 +1,10 @@ +##fileformat=VCFv4.3 +##FILTER= +##reference=ref.fa +##contig= +##ALT= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 10 . C . . END=19;DB=cnv10_15 +1 20 . C . . END=30 diff --git a/test/annotate.olap.vcf b/test/annotate.olap.vcf new file mode 100644 index 000000000..6e23e7f7a --- /dev/null +++ b/test/annotate.olap.vcf @@ -0,0 +1,9 @@ +##fileformat=VCFv4.3 +##reference=ref.fa +##contig= +##ALT= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 10 . C . . END=19 +1 20 . C . . 
END=30 diff --git a/test/annots.id.vcf b/test/annots.id.vcf new file mode 100644 index 000000000..fb6643e90 --- /dev/null +++ b/test/annots.id.vcf @@ -0,0 +1,10 @@ +##fileformat=VCFv4.1 +##FORMAT= +##INFO= +##contig= +##reference=ref.fasta +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A +1 50 id50 C G . . . GT 0/0 +1 51 id51 C A . . . GT 0/0 +1 52 id52 C T . . . GT 0/0 +1 53 id53_T C T . . . GT 0/1 diff --git a/test/annots.olap.tab b/test/annots.olap.tab new file mode 100644 index 000000000..2c60ad6ad --- /dev/null +++ b/test/annots.olap.tab @@ -0,0 +1,3 @@ +1 10 15 cnv10_15 +1 18 19 cnv18_19 +1 20 50 cnv20_50 diff --git a/test/check.gs.chrom.gen b/test/check.gs.chrom.gen index 740602843..adc10bc1f 100644 --- a/test/check.gs.chrom.gen +++ b/test/check.gs.chrom.gen @@ -1,10 +1,10 @@ -1 1:3062915_GTTT_G 3062915 GTTT G 0 1 0 0 1 0 -1 1:3106154_CAAA_C 3106154 CAAA C 0 1 0 0 1 0 -1 1:3157410_G_A 3157410 G A 0 0 1 0 0 1 -1 1:3162006_G_A 3162006 G A 0 0 1 0 1 0 -1 1:3177144_GT_G 3177144 GT G 0 1 0 0 1 0 -4 4:3258448_TACACACAC_T 3258448 TACACACAC T 0 1 0 0 1 0 -4 4:3258451_AAA_AGT 3258451 AAA AGT 0 1 0 0 1 0 -4 4:3258452_AAA_AGA 3258452 AAA AGA 0 1 0 0 1 0 -4 4:3258453_AACA_AGA 3258453 AACA AGA 0 1 0 0 1 0 -4 4:3258454_AACA_AACA 3258454 AACA AACA 0 1 0 0 1 0 +1 1:3062915_GTTT_G 1:3062915_GTTT_G 3062915 GTTT G 0 1 0 0 1 0 +1 1:3106154_CAAA_C 1:3106154_CAAA_C 3106154 CAAA C 0 1 0 0 1 0 +1 1:3157410_G_A 1:3157410_G_A 3157410 G A 0 0 1 0 0 1 +1 1:3162006_G_A 1:3162006_G_A 3162006 G A 0 0 1 0 1 0 +1 1:3177144_GT_G 1:3177144_GT_G 3177144 GT G 0 1 0 0 1 0 +4 4:3258448_TACACACAC_T 4:3258448_TACACACAC_T 3258448 TACACACAC T 0 1 0 0 1 0 +4 4:3258451_AAA_AGT 4:3258451_AAA_AGT 3258451 AAA AGT 0 1 0 0 1 0 +4 4:3258452_AAA_AGA 4:3258452_AAA_AGA 3258452 AAA AGA 0 1 0 0 1 0 +4 4:3258453_AACA_AGA 4:3258453_AACA_AGA 3258453 AACA AGA 0 1 0 0 1 0 +4 4:3258454_AACA_AACA 4:3258454_AACA_AACA 3258454 AACA AACA 0 1 0 0 1 0 diff --git a/test/check.gs.vcfids_chrom.gen 
b/test/check.gs.vcfids_chrom.gen index 80c12be32..6b31ddf57 100644 --- a/test/check.gs.vcfids_chrom.gen +++ b/test/check.gs.vcfids_chrom.gen @@ -1,10 +1,10 @@ -1 id3D 3062915 GTTT G 0 1 0 0 1 0 -1 . 3106154 CAAA C 0 1 0 0 1 0 -1 . 3157410 G A 0 0 1 0 0 1 -1 . 3162006 G A 0 0 1 0 1 0 -1 . 3177144 GT G 0 1 0 0 1 0 -4 . 3258448 TACACACAC T 0 1 0 0 1 0 -4 . 3258451 AAA AGT 0 1 0 0 1 0 -4 . 3258452 AAA AGA 0 1 0 0 1 0 -4 . 3258453 AACA AGA 0 1 0 0 1 0 -4 . 3258454 AACA AACA 0 1 0 0 1 0 +1 1:3062915_GTTT_G id3D 3062915 GTTT G 0 1 0 0 1 0 +1 1:3106154_CAAA_C . 3106154 CAAA C 0 1 0 0 1 0 +1 1:3157410_G_A . 3157410 G A 0 0 1 0 0 1 +1 1:3162006_G_A . 3162006 G A 0 0 1 0 1 0 +1 1:3177144_GT_G . 3177144 GT G 0 1 0 0 1 0 +4 4:3258448_TACACACAC_T . 3258448 TACACACAC T 0 1 0 0 1 0 +4 4:3258451_AAA_AGT . 3258451 AAA AGT 0 1 0 0 1 0 +4 4:3258452_AAA_AGA . 3258452 AAA AGA 0 1 0 0 1 0 +4 4:3258453_AACA_AGA . 3258453 AACA AGA 0 1 0 0 1 0 +4 4:3258454_AACA_AACA . 3258454 AACA AACA 0 1 0 0 1 0 diff --git a/test/consensus.18.fa b/test/consensus.18.fa new file mode 100644 index 000000000..36998be2b --- /dev/null +++ b/test/consensus.18.fa @@ -0,0 +1,10 @@ +>1 +TACCATATGTGACATATAAAAAAGAACATAACCTACGTATCAACTAAAGTGGTTGTTTGC +CAGAAAAGGAAGACTTAAAAAGAGTCAGTACTAACCTACATAATATATACAATGTTCATT +AAATAATAAAATGAGCTCATCATACTTAGGTCATCATAAATATATCTGAAATTCACAAAT +ATTGATCAAATGGTAAAATAGACAAGTAGATTTTAATAGGTTAAACAATTACTGATTCTC +TTGAAAGAATAAATTTAATATGAGACCTATTTCATTATAATGAACTCACAAATTAGAAAC +TTCACACTGGGGGCTGGAGAGATGGCTCAGTAGTTAAGAACACTGACTGCTCTTCTGAAG +GTCCTGAGTTCAAATCCCAGCAACCACATGGTGACTTACAACCATCTGTAATGACATCTG +ATGCCCTCTGGTGTGTCTGAAGACAGCTACAGTGTACTTACATAAAATAATAAATAAATC +TTTAAAAACAAAAAAAAAGAA diff --git a/test/consensus.18.vcf b/test/consensus.18.vcf new file mode 100644 index 000000000..1ac3726b9 --- /dev/null +++ b/test/consensus.18.vcf @@ -0,0 +1,19 @@ +##fileformat=VCFv4.2 +##FORMAT= +##reference=file://some/path/human_g1k_v37.fasta +##INFO= +##ALT= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 
NA001 +1 1 . T . . END=1 GT 0/0 +1 2 . A . . END=2 GT 0/0 +1 3 . C . . END=11 GT 0/0 +1 12 . A . . END=21 GT 0/0 +1 22 . A . . END=23 GT 0/0 +1 24 . G . . END=25 GT 0/0 +1 26 . A . . END=26 GT 0/0 +1 27 . C . . END=29 GT 0/0 +1 30 . A C, . . . GT 1/1 +1 31 . A . . END=31 GT 0/0 +1 32 . C . . END=43 GT 0/0 +1 44 . C . . END=48 GT 0/0 diff --git a/test/consensus.19.fa b/test/consensus.19.fa new file mode 100644 index 000000000..3306d902b --- /dev/null +++ b/test/consensus.19.fa @@ -0,0 +1,2 @@ +>chr19:1048895-1049005 +GCCAGTACGGGATCCCTGAACCATGGAATTTTCCTTTTCGGAGGAGCTACTGGTGCGGACCTCGGCCCCCCAAGAGTCCAGCCCCTTGCCCCACCCCGCTGGACCCAAAGG diff --git a/test/consensus.19.vcf b/test/consensus.19.vcf new file mode 100644 index 000000000..a78e9ebe9 --- /dev/null +++ b/test/consensus.19.vcf @@ -0,0 +1,16 @@ +##fileformat=VCFv4.2 +##reference=file://some/path/human_g1k_v37.fasta +##FORMAT= +##INFO= +##ALT= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA001 +chr19 1048985 . C . . END=1048985 GT 0/0 +chr19 1048986 . C . . END=1048994 GT 0/0 +chr19 1048995 . G . . END=1049004 GT 0/0 +chr19 1049005 . G . . END=1049006 GT 0/0 +chr19 1049007 . T . . END=1049008 GT 0/0 +chr19 1049009 . A . . END=1049009 GT 0/0 +chr19 1049010 . G . . END=1049012 GT 0/0 +chr19 1049013 . A C, . . . GT 1/1 +chr19 1049014 . C . . 
END=1049014 GT 0/0 diff --git a/test/consensus18.1.out b/test/consensus18.1.out new file mode 100644 index 000000000..c98fd4c30 --- /dev/null +++ b/test/consensus18.1.out @@ -0,0 +1,10 @@ +>1 +TACCATATGTGACATATAAAAAAGAACATCACCTACGTATCAACTAAAGTGGTTGTTTGC +CAGAAAAGGAAGACTTAAAAAGAGTCAGTACTAACCTACATAATATATACAATGTTCATT +AAATAATAAAATGAGCTCATCATACTTAGGTCATCATAAATATATCTGAAATTCACAAAT +ATTGATCAAATGGTAAAATAGACAAGTAGATTTTAATAGGTTAAACAATTACTGATTCTC +TTGAAAGAATAAATTTAATATGAGACCTATTTCATTATAATGAACTCACAAATTAGAAAC +TTCACACTGGGGGCTGGAGAGATGGCTCAGTAGTTAAGAACACTGACTGCTCTTCTGAAG +GTCCTGAGTTCAAATCCCAGCAACCACATGGTGACTTACAACCATCTGTAATGACATCTG +ATGCCCTCTGGTGTGTCTGAAGACAGCTACAGTGTACTTACATAAAATAATAAATAAATC +TTTAAAAACAAAAAAAAAGAA diff --git a/test/consensus19.1.out b/test/consensus19.1.out new file mode 100644 index 000000000..2281503b9 --- /dev/null +++ b/test/consensus19.1.out @@ -0,0 +1,3 @@ +>chr19:1048895-1049005 +GCCAGTACGGGATCCCTGAACCATGGAATTTTCCTTTTCGGAGGAGCTACTGGTGCGGAC +CTCGGCCCCCCAAGAGTCCAGCCCCTTGCCCCACCCCGCTGGACCCAAAGG diff --git a/test/convert.gs.gt.ids.3N6.gen b/test/convert.gs.gt.ids.3N6.gen new file mode 100644 index 000000000..385f0a955 --- /dev/null +++ b/test/convert.gs.gt.ids.3N6.gen @@ -0,0 +1,32 @@ +X X:2698560_G_A id9 2698560 G A 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2698630_A_G id10 2698630 A G 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2698758_CAA_C id11 2698758 CAA C 1 0 0 0 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2698769_AAG_A id12 2698769 AAG A 0 1 0 0 0 1 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2698789_C_G id14 2698789 C G 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2698822_A_C id15 2698822 A C 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2698831_G_A id16 2698831 G A 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2698889_T_C id17 2698889 T C 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2698923_G_A id18 2698923 
G A 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2698953_A_AGG id19 2698953 A AGG 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2698954_G_A id20 2698954 G A 0 1 0 0 0 1 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 1 0 0 1 0 0 +X X:2699002_C_A id21 2699002 C A 1 0 0 1 0 0 0.33 0.33 0.33 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2699025_T_C id22 2699025 T C 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2699091_G_A id23 2699091 G A 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2699187_T_C id24 2699187 T C 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +X X:2699188_G_C id25 2699188 G C 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +X X:2699189_T_C id26 2699189 T C 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +X X:2699217_C_T id27 2699217 C T 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2699246_C_A id28 2699246 C A 0 1 0 0 0 1 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 1 0 1 0 0 0 1 0 +X X:2699275_T_G id29 2699275 T G 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 0 1 0 1 0 1 0 0 1 0 0 0 1 0 +X X:2699350_A_T id30 2699350 A T 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +X X:2699360_T_C id31 2699360 T C 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +X X:2699450_A_C id32 2699450 A C 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +X X:2699507_T_C id33 2699507 T C 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2699555_C_A id34 2699555 C A 1 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 1 0 1 0 0 1 0 1 0 0 0 1 0 +X X:2699645_G_T id35 2699645 G T 1 0 0 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 1 0 1 0 0 1 0 0 +X X:2699676_G_A id36 2699676 G A 1 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +X X:2699728_C_T id37 2699728 C T 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2699775_C_A id38 2699775 C A 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2699898_C_CT id39 2699898 C CT 1 0 0 
1 0 0 0 0 1 1 0 0 0 0 1 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +X X:2699968_A_G id40 2699968 A G 0.5 0.0 0.5 1 0 0 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 0 1 0 0 1 0 +X X:2699970_T_C id41 2699970 T C 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 diff --git a/test/convert.gs.gt.ids.gen b/test/convert.gs.gt.ids.gen new file mode 100644 index 000000000..4391a2251 --- /dev/null +++ b/test/convert.gs.gt.ids.gen @@ -0,0 +1,32 @@ +X:2698560_G_A id9 2698560 G A 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X:2698630_A_G id10 2698630 A G 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X:2698758_CAA_C id11 2698758 CAA C 1 0 0 0 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X:2698769_AAG_A id12 2698769 AAG A 0 1 0 0 0 1 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X:2698789_C_G id14 2698789 C G 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X:2698822_A_C id15 2698822 A C 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X:2698831_G_A id16 2698831 G A 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X:2698889_T_C id17 2698889 T C 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X:2698923_G_A id18 2698923 G A 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X:2698953_A_AGG id19 2698953 A AGG 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X:2698954_G_A id20 2698954 G A 0 1 0 0 0 1 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 1 0 0 1 0 0 +X:2699002_C_A id21 2699002 C A 1 0 0 1 0 0 0.33 0.33 0.33 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X:2699025_T_C id22 2699025 T C 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X:2699091_G_A id23 2699091 G A 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X:2699187_T_C id24 2699187 T C 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +X:2699188_G_C id25 2699188 G C 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +X:2699189_T_C id26 2699189 T C 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 
0 1 0 0 0 1 0 +X:2699217_C_T id27 2699217 C T 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X:2699246_C_A id28 2699246 C A 0 1 0 0 0 1 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 1 0 1 0 0 0 1 0 +X:2699275_T_G id29 2699275 T G 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 0 1 0 1 0 1 0 0 1 0 0 0 1 0 +X:2699350_A_T id30 2699350 A T 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +X:2699360_T_C id31 2699360 T C 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +X:2699450_A_C id32 2699450 A C 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +X:2699507_T_C id33 2699507 T C 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X:2699555_C_A id34 2699555 C A 1 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 1 0 1 0 0 1 0 1 0 0 0 1 0 +X:2699645_G_T id35 2699645 G T 1 0 0 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 1 0 1 0 0 1 0 0 +X:2699676_G_A id36 2699676 G A 1 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +X:2699728_C_T id37 2699728 C T 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X:2699775_C_A id38 2699775 C A 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X:2699898_C_CT id39 2699898 C CT 1 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +X:2699968_A_G id40 2699968 A G 0.5 0.0 0.5 1 0 0 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 0 1 0 0 1 0 +X:2699970_T_C id41 2699970 T C 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 diff --git a/test/convert.gs.gt.ids.gen.rev b/test/convert.gs.gt.ids.gen.rev new file mode 100644 index 000000000..57880dfca --- /dev/null +++ b/test/convert.gs.gt.ids.gen.rev @@ -0,0 +1,32 @@ +id9 X:2698560_G_A 2698560 G A 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +id10 X:2698630_A_G 2698630 A G 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +id11 X:2698758_CAA_C 2698758 CAA C 1 0 0 0 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +id12 X:2698769_AAG_A 2698769 AAG A 0 1 0 0 0 1 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +id14 X:2698789_C_G 2698789 C 
G 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +id15 X:2698822_A_C 2698822 A C 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +id16 X:2698831_G_A 2698831 G A 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +id17 X:2698889_T_C 2698889 T C 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +id18 X:2698923_G_A 2698923 G A 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +id19 X:2698953_A_AGG 2698953 A AGG 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +id20 X:2698954_G_A 2698954 G A 0 1 0 0 0 1 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 1 0 0 1 0 0 +id21 X:2699002_C_A 2699002 C A 1 0 0 1 0 0 0.33 0.33 0.33 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +id22 X:2699025_T_C 2699025 T C 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +id23 X:2699091_G_A 2699091 G A 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +id24 X:2699187_T_C 2699187 T C 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +id25 X:2699188_G_C 2699188 G C 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +id26 X:2699189_T_C 2699189 T C 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +id27 X:2699217_C_T 2699217 C T 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +id28 X:2699246_C_A 2699246 C A 0 1 0 0 0 1 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 1 0 1 0 0 0 1 0 +id29 X:2699275_T_G 2699275 T G 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 0 1 0 1 0 1 0 0 1 0 0 0 1 0 +id30 X:2699350_A_T 2699350 A T 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +id31 X:2699360_T_C 2699360 T C 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +id32 X:2699450_A_C 2699450 A C 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +id33 X:2699507_T_C 2699507 T C 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +id34 X:2699555_C_A 2699555 C A 1 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 1 0 1 0 0 1 0 1 0 0 0 1 0 +id35 X:2699645_G_T 2699645 G T 1 0 0 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 1 0 1 0 
0 1 0 0 +id36 X:2699676_G_A 2699676 G A 1 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +id37 X:2699728_C_T 2699728 C T 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +id38 X:2699775_C_A 2699775 C A 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +id39 X:2699898_C_CT 2699898 C CT 1 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +id40 X:2699968_A_G 2699968 A G 0.5 0.0 0.5 1 0 0 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 0 1 0 0 1 0 +id41 X:2699970_T_C 2699970 T C 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 diff --git a/test/convert.gs.gt.ids.gen6 b/test/convert.gs.gt.ids.gen6 new file mode 100644 index 000000000..385f0a955 --- /dev/null +++ b/test/convert.gs.gt.ids.gen6 @@ -0,0 +1,32 @@ +X X:2698560_G_A id9 2698560 G A 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2698630_A_G id10 2698630 A G 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2698758_CAA_C id11 2698758 CAA C 1 0 0 0 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2698769_AAG_A id12 2698769 AAG A 0 1 0 0 0 1 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2698789_C_G id14 2698789 C G 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2698822_A_C id15 2698822 A C 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2698831_G_A id16 2698831 G A 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2698889_T_C id17 2698889 T C 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2698923_G_A id18 2698923 G A 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2698953_A_AGG id19 2698953 A AGG 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2698954_G_A id20 2698954 G A 0 1 0 0 0 1 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 1 0 0 1 0 0 +X X:2699002_C_A id21 2699002 C A 1 0 0 1 0 0 0.33 0.33 0.33 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2699025_T_C id22 2699025 T C 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X 
X:2699091_G_A id23 2699091 G A 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2699187_T_C id24 2699187 T C 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +X X:2699188_G_C id25 2699188 G C 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +X X:2699189_T_C id26 2699189 T C 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +X X:2699217_C_T id27 2699217 C T 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2699246_C_A id28 2699246 C A 0 1 0 0 0 1 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 1 0 1 0 0 0 1 0 +X X:2699275_T_G id29 2699275 T G 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 0 1 0 1 0 1 0 0 1 0 0 0 1 0 +X X:2699350_A_T id30 2699350 A T 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +X X:2699360_T_C id31 2699360 T C 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +X X:2699450_A_C id32 2699450 A C 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +X X:2699507_T_C id33 2699507 T C 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2699555_C_A id34 2699555 C A 1 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 1 0 1 0 0 1 0 1 0 0 0 1 0 +X X:2699645_G_T id35 2699645 G T 1 0 0 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 1 0 1 0 0 1 0 0 +X X:2699676_G_A id36 2699676 G A 1 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +X X:2699728_C_T id37 2699728 C T 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2699775_C_A id38 2699775 C A 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 +X X:2699898_C_CT id39 2699898 C CT 1 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 +X X:2699968_A_G id40 2699968 A G 0.5 0.0 0.5 1 0 0 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 0 1 0 0 1 0 +X X:2699970_T_C id41 2699970 T C 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 diff --git a/test/convert.gs.noids.vcf b/test/convert.gs.noids.vcf new file mode 100644 index 000000000..f31d84d9c --- /dev/null +++ b/test/convert.gs.noids.vcf @@ -0,0 +1,33 @@ +#CHROM POS ID REF ALT QUAL 
FILTER INFO FORMAT NA00001 NA00002 NA00003 NA00004 NA00005 NA00006 NA00007 NA00008 NA00009 NA00010 +X 2698560 . G A . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2698630 . A G . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2698758 . CAA C . . . GT:GP 0/0:1,0,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2698769 . AAG A . . . GT:GP 0/1:0,1,0 1/1:0,0,1 0/1:0,1,0 0/1:0,1,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2698789 . C G . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2698822 . A C . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2698831 . G A . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2698889 . T C . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2698923 . G A . . . GT:GP 0/1:0,1,0 0/1:0,1,0 0/1:0,1,0 0/1:0,1,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2698953 . A AGG . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2698954 . G A . . . GT:GP 0/1:0,1,0 1/1:0,0,1 0/1:0,1,0 0/1:0,1,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 +X 2699002 . C A . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:0.33,0.33,0.33 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2699025 . T C . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2699091 . G A . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2699187 . T C . . . 
GT:GP 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 0/0:1,0,0 0/1:0,1,0 0/1:0,1,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 +X 2699188 . G C . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 0/0:1,0,0 0/1:0,1,0 0/1:0,1,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 +X 2699189 . T C . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 0/0:1,0,0 0/1:0,1,0 0/1:0,1,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 +X 2699217 . C T . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2699246 . C A . . . GT:GP 0/1:0,1,0 1/1:0,0,1 0/1:0,1,0 1/1:0,0,1 0/1:0,1,0 0/1:0,1,0 0/0:1,0,0 0/1:0,1,0 0/0:1,0,0 0/1:0,1,0 +X 2699275 . T G . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 0/0:1,0,0 0/1:0,1,0 1/1:0,0,1 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 +X 2699350 . A T . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 0/0:1,0,0 0/1:0,1,0 0/1:0,1,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 +X 2699360 . T C . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 0/0:1,0,0 0/1:0,1,0 0/1:0,1,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 +X 2699450 . A C . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 0/0:1,0,0 0/1:0,1,0 0/1:0,1,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 +X 2699507 . T C . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2699555 . C A . . . GT:GP 0/0:1,0,0 1/1:0,0,1 1/1:0,0,1 0/0:1,0,0 1/1:0,0,1 1/1:0,0,1 0/1:0,1,0 0/1:0,1,0 0/0:1,0,0 0/1:0,1,0 +X 2699645 . G T . . . GT:GP 0/0:1,0,0 1/1:0,0,1 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 +X 2699676 . G A . . . GT:GP 0/0:1,0,0 0/0:1,0,0 1/1:0,0,1 0/0:1,0,0 1/1:0,0,1 0/1:0,1,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 +X 2699728 . C T . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2699775 . C A . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2699898 . C CT . . . 
GT:GP 0/0:1,0,0 0/0:1,0,0 1/1:0,0,1 0/0:1,0,0 1/1:0,0,1 0/1:0,1,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 +X 2699968 . A G . . . GT:GP 0/0:0.5,0,0.5 0/0:1,0,0 0/0:1,0,0 1/1:0,0,1 0/0:1,0,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 0/1:0,1,0 +X 2699970 . T C . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 diff --git a/test/convert.gs.vcf b/test/convert.gs.vcf new file mode 100644 index 000000000..150a6f0ac --- /dev/null +++ b/test/convert.gs.vcf @@ -0,0 +1,33 @@ +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 NA00004 NA00005 NA00006 NA00007 NA00008 NA00009 NA00010 +X 2698560 id9 G A . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2698630 id10 A G . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2698758 id11 CAA C . . . GT:GP 0/0:1,0,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2698769 id12 AAG A . . . GT:GP 0/1:0,1,0 1/1:0,0,1 0/1:0,1,0 0/1:0,1,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2698789 id14 C G . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2698822 id15 A C . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2698831 id16 G A . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2698889 id17 T C . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2698923 id18 G A . . . GT:GP 0/1:0,1,0 0/1:0,1,0 0/1:0,1,0 0/1:0,1,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2698953 id19 A AGG . . . 
GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2698954 id20 G A . . . GT:GP 0/1:0,1,0 1/1:0,0,1 0/1:0,1,0 0/1:0,1,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 +X 2699002 id21 C A . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:0.33,0.33,0.33 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2699025 id22 T C . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2699091 id23 G A . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2699187 id24 T C . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 0/0:1,0,0 0/1:0,1,0 0/1:0,1,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 +X 2699188 id25 G C . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 0/0:1,0,0 0/1:0,1,0 0/1:0,1,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 +X 2699189 id26 T C . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 0/0:1,0,0 0/1:0,1,0 0/1:0,1,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 +X 2699217 id27 C T . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2699246 id28 C A . . . GT:GP 0/1:0,1,0 1/1:0,0,1 0/1:0,1,0 1/1:0,0,1 0/1:0,1,0 0/1:0,1,0 0/0:1,0,0 0/1:0,1,0 0/0:1,0,0 0/1:0,1,0 +X 2699275 id29 T G . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 0/0:1,0,0 0/1:0,1,0 1/1:0,0,1 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 +X 2699350 id30 A T . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 0/0:1,0,0 0/1:0,1,0 0/1:0,1,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 +X 2699360 id31 T C . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 0/0:1,0,0 0/1:0,1,0 0/1:0,1,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 +X 2699450 id32 A C . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 0/0:1,0,0 0/1:0,1,0 0/1:0,1,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 +X 2699507 id33 T C . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2699555 id34 C A . . . 
GT:GP 0/0:1,0,0 1/1:0,0,1 1/1:0,0,1 0/0:1,0,0 1/1:0,0,1 1/1:0,0,1 0/1:0,1,0 0/1:0,1,0 0/0:1,0,0 0/1:0,1,0 +X 2699645 id35 G T . . . GT:GP 0/0:1,0,0 1/1:0,0,1 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 +X 2699676 id36 G A . . . GT:GP 0/0:1,0,0 0/0:1,0,0 1/1:0,0,1 0/0:1,0,0 1/1:0,0,1 0/1:0,1,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 +X 2699728 id37 C T . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2699775 id38 C A . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 +X 2699898 id39 C CT . . . GT:GP 0/0:1,0,0 0/0:1,0,0 1/1:0,0,1 0/0:1,0,0 1/1:0,0,1 0/1:0,1,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 +X 2699968 id40 A G . . . GT:GP 0/0:0.5,0,0.5 0/0:1,0,0 0/0:1,0,0 1/1:0,0,1 0/0:1,0,0 0/1:0,1,0 0/0:1,0,0 0/0:1,0,0 0/1:0,1,0 0/1:0,1,0 +X 2699970 id41 T C . . . GT:GP 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 0/0:1,0,0 diff --git a/test/convert.gt.noHead.ids.vcf b/test/convert.gt.noHead.ids.vcf index 68f2c521e..70e65fb35 100644 --- a/test/convert.gt.noHead.ids.vcf +++ b/test/convert.gt.noHead.ids.vcf @@ -1,36 +1,33 @@ #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 NA00004 NA00005 NA00006 NA00007 NA00008 NA00009 NA00010 -X 2698560 X:2698560_G_A G A . . . GT 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 -X 2698630 X:2698630_A_G A G . . . GT 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 -X 2698758 X:2698758_CAA_C CAA C . . . GT 0|0 0|1 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 -X 2698769 X:2698769_AAG_A AAG A . . . GT 1|0 1|1 0|1 1|0 1|0 0|0 0|0 0|0 0|0 0|0 -X 2698770 X:2698770_AG_A AG A . . . GT 0|0 0|1 0|0 0|0 1|0 0|0 0|0 0|0 0|0 0|0 -X 2698770 X:2698770_AG_AAGG AG AAGG . . . GT 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 -X 2698789 X:2698789_C_G C G . . . GT 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 -X 2698822 X:2698822_A_C A C . . . 
GT 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 -X 2698831 X:2698831_G_A G A . . . GT 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 -X 2698889 X:2698889_T_C T C . . . GT 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 -X 2698923 X:2698923_G_A G A . . . GT 1|0 0|1 0|1 1|0 1|0 0|0 0|0 0|0 0|0 0|0 -X 2698953 X:2698953_A_AGG A AGG . . . GT 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 -X 2698954 X:2698954_G_A G A . . . GT 1|0 1|1 0|1 1|0 1|0 0|0 0|0 0|1 0|0 0|0 -X 2699002 X:2699002_C_A C A . . . GT 0|0 0|0 .|. 0|0 0|0 0|0 0|0 0|0 0|0 0|0 -X 2699025 X:2699025_T_C T C . . . GT 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 -X 2699091 X:2699091_G_A G A . . . GT 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 -X 2699187 X:2699187_T_C T C . . . GT 0|0 0|0 1|0 0|0 0|1 1|0 1|0 0|0 0|0 0/1 -X 2699188 X:2699188_G_C G C . . . GT 0|0 0|0 1|0 0|0 0|1 1|0 1|0 0|0 0|0 0|1 -X 2699189 X:2699189_T_C T C . . . GT 0|0 0|0 1|0 0|0 0|1 1|0 1|0 0|0 0|0 0|1 -X 2699217 X:2699217_C_T C T . . . GT 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 -X 2699246 X:2699246_C_A C A . . . GT 1|0 1|1 0|1 1|1 1|0 0|1 0|0 0|1 0|0 1|0 -X 2699275 X:2699275_T_G T G . . . GT 0|0 0|0 1|0 0|0 0|1 1|1 1|0 0|0 0|0 0|1 -X 2699350 X:2699350_A_T A T . . . GT 0|0 0|0 1|0 0|0 0|1 1|0 1|0 0|0 0|0 0|1 -X 2699360 X:2699360_T_C T C . . . GT 0|0 0|0 1|0 0|0 0|1 1|0 1|0 0|0 0|0 0|1 -X 2699450 X:2699450_A_C A C . . . GT 0|0 0|0 1|0 0|0 0|1 1|0 1|0 0|0 0|0 0|1 -X 2699507 X:2699507_T_C T C . . . GT 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 -X 2699555 X:2699555_C_A C A . . . GT 0 1 1 0 1 1|1 1|0 0|1 0|0 0|1 -X 2699645 X:2699645_G_T G T . . . GT 0 1 0 0 0 0|0 0|0 0|1 0|0 0|0 -X 2699676 X:2699676_G_A G A . . . GT 0 0 1 0 1 1|0 1|0 0|0 0|0 0|1 -X 2699728 X:2699728_C_T C T . . . GT 0 0 0 0 0 0|0 0|0 0|0 0|0 0|0 -X 2699775 X:2699775_C_A C A . . . GT 0 0 0 0 0 0|0 0|0 0|0 0|0 0|0 -X 2699898 X:2699898_C_CT C CT . . . GT 0 0 1 0 1 1|0 1|0 0|0 0|0 0|1 -X 2699968 X:2699968_A_G A G . . . GT . 0 0 1 0 0|1 0|0 0|0 0|1 1|0 -X 2699970 X:2699970_T_C T C . . . 
GT 0 0 0 0 0 0|0 0|0 0|0 0|0 0|0 -X 2699990 X:2699990_C__2700054 C . . END=2700054 GT 0|0 0|0 1|0 0|0 0|1 1|0 1|0 0|0 0|0 0|1 +X 2698560 id9 G A . . . GT 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 +X 2698630 id10 A G . . . GT 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 +X 2698758 id11 CAA C . . . GT 0|0 0|1 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 +X 2698769 id12 AAG A . . . GT 1|0 1|1 0|1 1|0 1|0 0|0 0|0 0|0 0|0 0|0 +X 2698789 id14 C G . . . GT 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 +X 2698822 id15 A C . . . GT 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 +X 2698831 id16 G A . . . GT 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 +X 2698889 id17 T C . . . GT 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 +X 2698923 id18 G A . . . GT 1|0 0|1 0|1 1|0 1|0 0|0 0|0 0|0 0|0 0|0 +X 2698953 id19 A AGG . . . GT 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 +X 2698954 id20 G A . . . GT 1|0 1|1 0|1 1|0 1|0 0|0 0|0 0|1 0|0 0|0 +X 2699002 id21 C A . . . GT 0|0 0|0 .|. 0|0 0|0 0|0 0|0 0|0 0|0 0|0 +X 2699025 id22 T C . . . GT 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 +X 2699091 id23 G A . . . GT 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 +X 2699187 id24 T C . . . GT 0|0 0|0 1|0 0|0 0|1 1|0 1|0 0|0 0|0 0/1 +X 2699188 id25 G C . . . GT 0|0 0|0 1|0 0|0 0|1 1|0 1|0 0|0 0|0 0|1 +X 2699189 id26 T C . . . GT 0|0 0|0 1|0 0|0 0|1 1|0 1|0 0|0 0|0 0|1 +X 2699217 id27 C T . . . GT 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 +X 2699246 id28 C A . . . GT 1|0 1|1 0|1 1|1 1|0 0|1 0|0 0|1 0|0 1|0 +X 2699275 id29 T G . . . GT 0|0 0|0 1|0 0|0 0|1 1|1 1|0 0|0 0|0 0|1 +X 2699350 id30 A T . . . GT 0|0 0|0 1|0 0|0 0|1 1|0 1|0 0|0 0|0 0|1 +X 2699360 id31 T C . . . GT 0|0 0|0 1|0 0|0 0|1 1|0 1|0 0|0 0|0 0|1 +X 2699450 id32 A C . . . GT 0|0 0|0 1|0 0|0 0|1 1|0 1|0 0|0 0|0 0|1 +X 2699507 id33 T C . . . GT 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 +X 2699555 id34 C A . . . GT 0 1 1 0 1 1|1 1|0 0|1 0|0 0|1 +X 2699645 id35 G T . . . GT 0 1 0 0 0 0|0 0|0 0|1 0|0 0|0 +X 2699676 id36 G A . . . GT 0 0 1 0 1 1|0 1|0 0|0 0|0 0|1 +X 2699728 id37 C T . . . 
GT 0 0 0 0 0 0|0 0|0 0|0 0|0 0|0 +X 2699775 id38 C A . . . GT 0 0 0 0 0 0|0 0|0 0|0 0|0 0|0 +X 2699898 id39 C CT . . . GT 0 0 1 0 1 1|0 1|0 0|0 0|0 0|1 +X 2699968 id40 A G . . . GT . 0 0 1 0 0|1 0|0 0|0 0|1 1|0 +X 2699970 id41 T C . . . GT 0 0 0 0 0 0|0 0|0 0|0 0|0 0|0 diff --git a/test/convert.hls.ids.legend b/test/convert.hls.ids.legend new file mode 100644 index 000000000..4d40f8db8 --- /dev/null +++ b/test/convert.hls.ids.legend @@ -0,0 +1,33 @@ +id position a0 a1 +id9 2698560 G A +id10 2698630 A G +id11 2698758 CAA C +id12 2698769 AAG A +id14 2698789 C G +id15 2698822 A C +id16 2698831 G A +id17 2698889 T C +id18 2698923 G A +id19 2698953 A AGG +id20 2698954 G A +id21 2699002 C A +id22 2699025 T C +id23 2699091 G A +id24 2699187 T C +id25 2699188 G C +id26 2699189 T C +id27 2699217 C T +id28 2699246 C A +id29 2699275 T G +id30 2699350 A T +id31 2699360 T C +id32 2699450 A C +id33 2699507 T C +id34 2699555 C A +id35 2699645 G T +id36 2699676 G A +id37 2699728 C T +id38 2699775 C A +id39 2699898 C CT +id40 2699968 A G +id41 2699970 T C diff --git a/test/convert.hs.gt.ids.hap b/test/convert.hs.gt.ids.hap new file mode 100644 index 000000000..a091427fe --- /dev/null +++ b/test/convert.hs.gt.ids.hap @@ -0,0 +1,32 @@ +X:2698560_G_A id9 2698560 G A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2698630_A_G id10 2698630 A G 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2698758_CAA_C id11 2698758 CAA C 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2698769_AAG_A id12 2698769 AAG A 1 0 1 1 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 +X:2698789_C_G id14 2698789 C G 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2698822_A_C id15 2698822 A C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2698831_G_A id16 2698831 G A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2698889_T_C id17 2698889 T C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2698923_G_A id18 2698923 G A 1 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 +X:2698953_A_AGG id19 2698953 A AGG 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2698954_G_A id20 2698954 G A 
1 0 1 1 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 +X:2699002_C_A id21 2699002 C A 0 0 0 0 ? ? 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2699025_T_C id22 2699025 T C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2699091_G_A id23 2699091 G A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2699187_T_C id24 2699187 T C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0* 1* +X:2699188_G_C id25 2699188 G C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 +X:2699189_T_C id26 2699189 T C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 +X:2699217_C_T id27 2699217 C T 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2699246_C_A id28 2699246 C A 1 0 1 1 0 1 1 1 1 0 0 1 0 0 0 1 0 0 1 0 +X:2699275_T_G id29 2699275 T G 0 0 0 0 1 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 +X:2699350_A_T id30 2699350 A T 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 +X:2699360_T_C id31 2699360 T C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 +X:2699450_A_C id32 2699450 A C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 +X:2699507_T_C id33 2699507 T C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2699555_C_A id34 2699555 C A 0 - 1 - 1 - 0 - 1 - 1 1 1 0 0 1 0 0 0 1 +X:2699645_G_T id35 2699645 G T 0 - 1 - 0 - 0 - 0 - 0 0 0 0 0 1 0 0 0 0 +X:2699676_G_A id36 2699676 G A 0 - 0 - 1 - 0 - 1 - 1 0 1 0 0 0 0 0 0 1 +X:2699728_C_T id37 2699728 C T 0 - 0 - 0 - 0 - 0 - 0 0 0 0 0 0 0 0 0 0 +X:2699775_C_A id38 2699775 C A 0 - 0 - 0 - 0 - 0 - 0 0 0 0 0 0 0 0 0 0 +X:2699898_C_CT id39 2699898 C CT 0 - 0 - 1 - 0 - 1 - 1 0 1 0 0 0 0 0 0 1 +X:2699968_A_G id40 2699968 A G ? 
- 0 - 0 - 1 - 0 - 0 1 0 0 0 0 0 1 1 0 +X:2699970_T_C id41 2699970 T C 0 - 0 - 0 - 0 - 0 - 0 0 0 0 0 0 0 0 0 0 diff --git a/test/convert.hs.hap b/test/convert.hs.hap index 47589ade6..a9cbbdd80 100644 --- a/test/convert.hs.hap +++ b/test/convert.hs.hap @@ -1,32 +1,32 @@ -X:2698560_G_A X:2698560_G_A 2698560 G A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -X:2698630_A_G X:2698630_A_G 2698630 A G 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -X:2698758_CAA_C X:2698758_CAA_C 2698758 CAA C 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -X:2698769_AAG_A X:2698769_AAG_A 2698769 AAG A 1 0 1 1 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 -X:2698789_C_G X:2698789_C_G 2698789 C G 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -X:2698822_A_C X:2698822_A_C 2698822 A C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -X:2698831_G_A X:2698831_G_A 2698831 G A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -X:2698889_T_C X:2698889_T_C 2698889 T C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -X:2698923_G_A X:2698923_G_A 2698923 G A 1 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 -X:2698953_A_AGG X:2698953_A_AGG 2698953 A AGG 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -X:2698954_G_A X:2698954_G_A 2698954 G A 1 0 1 1 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 -X:2699002_C_A X:2699002_C_A 2699002 C A 0 0 0 0 ? ? 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 -X:2699025_T_C X:2699025_T_C 2699025 T C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -X:2699091_G_A X:2699091_G_A 2699091 G A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -X:2699187_T_C X:2699187_T_C 2699187 T C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0* 1* -X:2699188_G_C X:2699188_G_C 2699188 G C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 -X:2699189_T_C X:2699189_T_C 2699189 T C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 -X:2699217_C_T X:2699217_C_T 2699217 C T 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -X:2699246_C_A X:2699246_C_A 2699246 C A 1 0 1 1 0 1 1 1 1 0 0 1 0 0 0 1 0 0 1 0 -X:2699275_T_G X:2699275_T_G 2699275 T G 0 0 0 0 1 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 -X:2699350_A_T X:2699350_A_T 2699350 A T 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 -X:2699360_T_C X:2699360_T_C 2699360 T C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 -X:2699450_A_C X:2699450_A_C 2699450 A C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 -X:2699507_T_C X:2699507_T_C 2699507 T C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -X:2699555_C_A X:2699555_C_A 2699555 C A 0 - 1 - 1 - 0 - 1 - 1 1 1 0 0 1 0 0 0 1 -X:2699645_G_T X:2699645_G_T 2699645 G T 0 - 1 - 0 - 0 - 0 - 0 0 0 0 0 1 0 0 0 0 -X:2699676_G_A X:2699676_G_A 2699676 G A 0 - 0 - 1 - 0 - 1 - 1 0 1 0 0 0 0 0 0 1 -X:2699728_C_T X:2699728_C_T 2699728 C T 0 - 0 - 0 - 0 - 0 - 0 0 0 0 0 0 0 0 0 0 -X:2699775_C_A X:2699775_C_A 2699775 C A 0 - 0 - 0 - 0 - 0 - 0 0 0 0 0 0 0 0 0 0 -X:2699898_C_CT X:2699898_C_CT 2699898 C CT 0 - 0 - 1 - 0 - 1 - 1 0 1 0 0 0 0 0 0 1 -X:2699968_A_G X:2699968_A_G 2699968 A G ? 
- 0 - 0 - 1 - 0 - 0 1 0 0 0 0 0 1 1 0 -X:2699970_T_C X:2699970_T_C 2699970 T C 0 - 0 - 0 - 0 - 0 - 0 0 0 0 0 0 0 0 0 0 +X X:2698560_G_A 2698560 G A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X X:2698630_A_G 2698630 A G 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X X:2698758_CAA_C 2698758 CAA C 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X X:2698769_AAG_A 2698769 AAG A 1 0 1 1 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 +X X:2698789_C_G 2698789 C G 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X X:2698822_A_C 2698822 A C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X X:2698831_G_A 2698831 G A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X X:2698889_T_C 2698889 T C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X X:2698923_G_A 2698923 G A 1 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 +X X:2698953_A_AGG 2698953 A AGG 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X X:2698954_G_A 2698954 G A 1 0 1 1 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 +X X:2699002_C_A 2699002 C A 0 0 0 0 ? ? 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X X:2699025_T_C 2699025 T C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X X:2699091_G_A 2699091 G A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X X:2699187_T_C 2699187 T C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0* 1* +X X:2699188_G_C 2699188 G C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 +X X:2699189_T_C 2699189 T C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 +X X:2699217_C_T 2699217 C T 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X X:2699246_C_A 2699246 C A 1 0 1 1 0 1 1 1 1 0 0 1 0 0 0 1 0 0 1 0 +X X:2699275_T_G 2699275 T G 0 0 0 0 1 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 +X X:2699350_A_T 2699350 A T 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 +X X:2699360_T_C 2699360 T C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 +X X:2699450_A_C 2699450 A C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 +X X:2699507_T_C 2699507 T C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X X:2699555_C_A 2699555 C A 0 - 1 - 1 - 0 - 1 - 1 1 1 0 0 1 0 0 0 1 +X X:2699645_G_T 2699645 G T 0 - 1 - 0 - 0 - 0 - 0 0 0 0 0 1 0 0 0 0 +X X:2699676_G_A 2699676 G A 0 - 0 - 1 - 0 - 1 - 1 0 1 0 0 0 0 0 0 1 +X 
X:2699728_C_T 2699728 C T 0 - 0 - 0 - 0 - 0 - 0 0 0 0 0 0 0 0 0 0 +X X:2699775_C_A 2699775 C A 0 - 0 - 0 - 0 - 0 - 0 0 0 0 0 0 0 0 0 0 +X X:2699898_C_CT 2699898 C CT 0 - 0 - 1 - 0 - 1 - 1 0 1 0 0 0 0 0 0 1 +X X:2699968_A_G 2699968 A G ? - 0 - 0 - 1 - 0 - 0 1 0 0 0 0 0 1 1 0 +X X:2699970_T_C 2699970 T C 0 - 0 - 0 - 0 - 0 - 0 0 0 0 0 0 0 0 0 0 diff --git a/test/convert.hs.ids.hap b/test/convert.hs.ids.hap new file mode 100644 index 000000000..a091427fe --- /dev/null +++ b/test/convert.hs.ids.hap @@ -0,0 +1,32 @@ +X:2698560_G_A id9 2698560 G A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2698630_A_G id10 2698630 A G 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2698758_CAA_C id11 2698758 CAA C 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2698769_AAG_A id12 2698769 AAG A 1 0 1 1 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 +X:2698789_C_G id14 2698789 C G 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2698822_A_C id15 2698822 A C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2698831_G_A id16 2698831 G A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2698889_T_C id17 2698889 T C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2698923_G_A id18 2698923 G A 1 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 +X:2698953_A_AGG id19 2698953 A AGG 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2698954_G_A id20 2698954 G A 1 0 1 1 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 +X:2699002_C_A id21 2699002 C A 0 0 0 0 ? ? 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2699025_T_C id22 2699025 T C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2699091_G_A id23 2699091 G A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2699187_T_C id24 2699187 T C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0* 1* +X:2699188_G_C id25 2699188 G C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 +X:2699189_T_C id26 2699189 T C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 +X:2699217_C_T id27 2699217 C T 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2699246_C_A id28 2699246 C A 1 0 1 1 0 1 1 1 1 0 0 1 0 0 0 1 0 0 1 0 +X:2699275_T_G id29 2699275 T G 0 0 0 0 1 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 +X:2699350_A_T id30 2699350 A T 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 +X:2699360_T_C id31 2699360 T C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 +X:2699450_A_C id32 2699450 A C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 +X:2699507_T_C id33 2699507 T C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2699555_C_A id34 2699555 C A 0 - 1 - 1 - 0 - 1 - 1 1 1 0 0 1 0 0 0 1 +X:2699645_G_T id35 2699645 G T 0 - 1 - 0 - 0 - 0 - 0 0 0 0 0 1 0 0 0 0 +X:2699676_G_A id36 2699676 G A 0 - 0 - 1 - 0 - 1 - 1 0 1 0 0 0 0 0 0 1 +X:2699728_C_T id37 2699728 C T 0 - 0 - 0 - 0 - 0 - 0 0 0 0 0 0 0 0 0 0 +X:2699775_C_A id38 2699775 C A 0 - 0 - 0 - 0 - 0 - 0 0 0 0 0 0 0 0 0 0 +X:2699898_C_CT id39 2699898 C CT 0 - 0 - 1 - 0 - 1 - 1 0 1 0 0 0 0 0 0 1 +X:2699968_A_G id40 2699968 A G ? - 0 - 0 - 1 - 0 - 0 1 0 0 0 0 0 1 1 0 +X:2699970_T_C id41 2699970 T C 0 - 0 - 0 - 0 - 0 - 0 0 0 0 0 0 0 0 0 0 diff --git a/test/convert.vcf b/test/convert.vcf index b2e1b2e0a..ea071c54e 100644 --- a/test/convert.vcf +++ b/test/convert.vcf @@ -6,36 +6,36 @@ ##FORMAT= ##contig= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 NA00004 NA00005 NA00006 NA00007 NA00008 NA00009 NA00010 -X 2698560 . G A 102 . . GT:PL:GP 0|0:0,21,177:1,0,0 0|0:0,30,206:1,0,0 0|0:0,21,177:1,0,0 0|0:0,15,132:1,0,0 0|0:0,9,90:1,0,0 0|0:0,15,114:1,0,0 0|0:0,15,118:1,0,0 0|0:0,15,133:1,0,0 0|0:0,15,144:1,0,0 0|0:0,24,191:1,0,0 -X 2698630 . 
A G 537 . . GT:PL:GP 0|0:0,21,186:1,0,0 0|0:0,21,176:1,0,0 0|0:0,15,106:1,0,0 0|0:0,18,127:1,0,0 0|0:0,6,62:1,0,0 0|0:0,15,146:1,0,0 0|0:0,18,141:1,0,0 0|0:0,21,173:1,0,0 0|0:0,12,119:1,0,0 0|0:0,15,145:1,0,0 -X 2698758 . CAA C 999 . . GT:PL:GP 0|0:0,6,16:0.8292,0.1708,0 0|1:0,0,0:0.0278,0.5743,0.3979 0|0:0,0,0:0.6336,0.3664,0 0|0:0,3,8:0.8611,0.1389,0 0|0:0,0,8:0.7628,0.2372,0 0|0:0,9,18:1,0,0 0|0:0,9,23:1,0,0 0|0:0,9,15:0.9855,0.0145,0 0|0:0,6,10:1,0,0 0|0:0,21,33:1,0,0 -X 2698769 . AAG A 999 . . GT:PL:GP 1|0:17,0,7:0.0069,0.9931,0 1|1:0,0,0:0.0004,0.0892,0.9104 0|1:17,3,0:0.0045,0.9954,0.0001 1|0:11,0,2:0.0085,0.9915,0 1|0:11,0,15:0.0003,0.9997,0 0|0:0,15,40:1,0,0 0|0:0,9,23:1,0,0 0|0:0,15,25:0.8474,0.1526,0 0|0:0,15,34:1,0,0 0|0:0,33,56:1,0,0 -X 2698770 . AG A,AAGG 999 . . GT:PL:GP 0|0:0,12,103,12,103,103:0.925,0.0717,0,0.0033,0,0 0|1:0,3,21,3,21,21:0.4944,0.368,0.0018,0.1343,0.0013,0.0002 0|0:0,0,0,0,0,0:0.5458,0.4085,0,0.0457,0,0 0|0:0,3,36,3,36,36:0.8126,0.1758,0,0.0116,0,0 1|0:37,0,86,49,92,130:0,1,0,0,0,0 0|0:0,15,125,15,125,125:1,0,0,0,0,0 0|0:0,9,105,9,105,105:1,0,0,0,0,0 0|0:0,15,109,15,109,109:0.9964,0.0034,0,0.0002,0,0 0|0:0,15,137,15,137,137:1,0,0,0,0,0 0|0:0,33,215,33,215,215:1,0,0,0,0,0 -X 2698789 . C G 153 . . GT:PL:GP 0|0:0,21,152:1,0,0 0|0:0,21,131:1,0,0 0|0:0,12,113:1,0,0 0|0:0,12,104:1,0,0 0|0:0,21,137:1,0,0 0|0:0,15,118:0.9999,0.0001,0 0|0:0,15,111:1,0,0 0|0:0,24,152:1,0,0 0|0:0,18,147:1,0,0 0|0:0,33,183:1,0,0 -X 2698822 . A C 85.2 . . GT:PL:GP 0|0:0,21,167:1,0,0 0|0:0,21,171:1,0,0 0|0:0,21,158:1,0,0 0|0:0,18,154:1,0,0 0|0:0,15,135:1,0,0 0|0:0,15,132:1,0,0 0|0:0,21,168:1,0,0 0|0:0,21,175:1,0,0 0|0:0,15,142:1,0,0 0|0:0,21,172:1,0,0 -X 2698831 . G A 303 . . GT:PL:GP 0|0:0,15,129:1,0,0 0|0:0,27,179:1,0,0 0|0:0,24,196:1,0,0 0|0:0,21,158:1,0,0 0|0:0,18,154:1,0,0 0|0:0,12,112:1,0,0 0|0:0,24,162:1,0,0 0|0:0,21,168:1,0,0 0|0:0,9,95:1,0,0 0|0:0,21,164:1,0,0 -X 2698889 . T C 74.4 . . 
GT:PL:GP 0|0:0,27,193:1,0,0 0|0:0,45,255:1,0,0 0|0:0,21,190:1,0,0 0|0:0,36,254:1,0,0 0|0:0,30,226:1,0,0 0|0:0,36,253:1,0,0 0|0:0,18,156:1,0,0 0|0:0,9,87:1,0,0 0|0:0,9,98:1,0,0 0|0:0,24,205:1,0,0 -X 2698923 . G A 999 . . GT:PL:GP 1|0:62,0,133:0,1,0 0|1:164,0,91:0,1,0 0|1:35,0,73:0,1,0 1|0:91,0,108:0,1,0 1|0:67,0,71:0,1,0 0|0:0,30,187:1,0,0 0|0:0,9,73:1,0,0 0|0:0,12,99:1,0,0 0|0:0,18,153:1,0,0 0|0:0,18,138:1,0,0 -X 2698953 . A AGG 267 . . GT:PL:GP 0|0:0,27,111:1,0,0 0|0:0,33,124:1,0,0 0|0:0,12,62:1,0,0 0|0:0,15,86:1,0,0 0|0:0,12,58:1,0,0 0|0:0,15,69:1,0,0 0|0:0,6,34:1,0,0 0|0:0,18,83:1,0,0 0|0:0,18,80:1,0,0 0|0:0,15,74:1,0,0 -X 2698954 . G A 999 . . GT:PL:GP 1|0:69,0,139:0,1,0 1|1:199,24,0:0,0,1 0|1:15,0,82:0,1,0 1|0:32,0,76:0,1,0 1|0:16,0,80:0,1,0 0|0:0,15,131:1,0,0 0|0:0,6,58:1,0,0 0|1:99,0,39:0,1,0 0|0:0,18,163:1,0,0 0|0:0,15,136:1,0,0 -X 2699002 . C A 65.1 . . GT:PL:GP 0|0:0,18,144:1,0,0 0|0:0,12,115:1,0,0 .|.:0,12,120:1,0,0 0|0:0,15,131:1,0,0 0|0:0,6,29:1,0,0 0|0:0,9,95:1,0,0 0|0:0,9,79:1,0,0 0|0:0,24,188:1,0,0 0|0:0,15,124:1,0,0 0|0:0,9,93:1,0,0 -X 2699025 . T C 44.9 . . GT:PL:GP 0|0:0,24,189:1,0,0 0|0:0,12,98:1,0,0 0|0:0,15,130:1,0,0 0|0:0,15,113:1,0,0 0|0:0,6,63:1,0,0 0|0:0,24,198:1,0,0 0|0:0,12,92:1,0,0 0|0:0,24,197:1,0,0 0|0:0,9,97:1,0,0 0|0:0,12,108:1,0,0 -X 2699091 . G A 45 . . GT:PL:GP 0|0:0,18,162:1,0,0 0|0:0,21,153:1,0,0 0|0:0,12,101:1,0,0 0|0:0,12,97:1,0,0 0|0:0,24,188:1,0,0 0|0:0,24,194:1,0,0 0|0:0,15,127:1,0,0 0|0:0,21,169:1,0,0 0|0:0,15,129:1,0,0 0|0:0,21,171:1,0,0 -X 2699187 . T C 999 . . GT:PL:GP 0|0:0,24,200:1,0,0 0|0:0,24,191:1,0,0 1|0:48,0,85:0,1,0 0|0:0,15,145:1,0,0 0|1:58,0,45:0,1,0 1|0:61,0,50:0,1,0 1|0:22,0,51:0,1,0 0|0:0,27,211:1,0,0 0|0:0,9,96:0.9999,0.0001,0 0/1:23,0,160:0,1,0 -X 2699188 . G C 999 . . GT:PL:GP 0|0:0,24,194:1,0,0 0|0:0,24,167:1,0,0 1|0:48,0,78:0,1,0 0|0:0,15,131:1,0,0 0|1:63,0,40:0,1,0 1|0:50,0,44:0,1,0 1|0:22,0,48:0,1,0 0|0:0,27,212:1,0,0 0|0:0,9,87:0.9999,0.0001,0 0|1:23,0,154:0,1,0 -X 2699189 . T C 999 . . 
GT:PL:GP 0|0:0,24,199:1,0,0 0|0:0,24,176:1,0,0 1|0:44,0,87:0,1,0 0|0:0,15,136:1,0,0 0|1:62,0,46:0,1,0 1|0:61,0,46:0,1,0 1|0:22,0,49:0,1,0 0|0:0,27,212:1,0,0 0|0:0,9,93:0.9999,0.0001,0 0|1:23,0,164:0,1,0 -X 2699217 . C T 60.3 . . GT:PL:GP 0|0:0,18,158:1,0,0 0|0:0,18,119:1,0,0 0|0:0,21,152:1,0,0 0|0:0,21,162:1,0,0 0|0:0,12,102:1,0,0 0|0:0,18,144:1,0,0 0|0:0,12,108:1,0,0 0|0:0,18,146:1,0,0 0|0:0,12,98:1,0,0 0|0:0,18,155:1,0,0 -X 2699246 . C A 999 . . GT:PL:GP 1|0:128,0,15:0,0.9998,0.0002 1|1:147,21,0:0,0.0001,0.9999 0|1:130,0,5:0,0.9977,0.0023 1|1:237,33,0:0,0,1 1|0:45,0,75:0,1,0 0|1:145,0,49:0,1,0 0|0:0,15,109:1,0,0 0|1:13,0,63:0.0002,0.9998,0 0|0:0,30,178:0.9953,0.0047,0 1|0:120,0,57:0,1,0 -X 2699275 . T G 999 . . GT:PL:GP 0|0:0,18,165:0.9998,0.0002,0 0|0:0,18,152:1,0,0 1|0:0,9,95:0.0023,0.9977,0 0|0:0,33,239:1,0,0 0|1:125,0,40:0,1,0 1|1:205,27,0:0,0,1 1|0:69,0,43:0,1,0 0|0:0,15,139:1,0,0 0|0:0,30,219:1,0,0 0|1:96,0,54:0,1,0 -X 2699350 . A T 999 . . GT:PL:GP 0|0:0,27,206:1,0,0 0|0:0,15,139:1,0,0 1|0:54,0,25:0,1,0 0|0:0,12,117:0.9996,0.0004,0 0|1:79,0,73:0,1,0 1|0:48,0,82:0,1,0 1|0:68,0,45:0,1,0 0|0:0,30,216:1,0,0 0|0:0,27,224:1,0,0 0|1:48,0,80:0,1,0 -X 2699360 . T C 999 . . GT:PL:GP 0|0:0,21,184:1,0,0 0|0:0,15,133:1,0,0 1|0:53,0,21:0,1,0 0|0:0,12,114:0.9996,0.0004,0 0|1:20,0,66:0,1,0 1|0:40,0,93:0,1,0 1|0:52,0,66:0,1,0 0|0:0,30,220:1,0,0 0|0:0,21,191:1,0,0 0|1:20,0,83:0,1,0 -X 2699450 . A C 999 . . GT:PL:GP 0|0:0,12,124:1,0,0 0|0:0,6,55:0.9976,0.0024,0 1|0:99,0,42:0,1,0 0|0:0,21,186:0.9999,0.0001,0 0|1:64,0,100:0,1,0 1|0:38,0,177:0,1,0 1|0:16,0,103:0,1,0 0|0:0,24,202:1,0,0 0|0:0,12,119:1,0,0 0|1:75,0,115:0,1,0 -X 2699507 . T C 195 . . GT:PL:GP 0|0:0,15,133:1,0,0 0|0:0,12,122:1,0,0 0|0:0,6,60:1,0,0 0|0:0,18,123:1,0,0 0|0:0,15,145:1,0,0 0|0:0,21,173:1,0,0 0|0:0,21,178:1,0,0 0|0:0,24,200:1,0,0 0|0:0,12,125:1,0,0 0|0:0,24,189:1,0,0 -X 2699555 . C A 999 . . 
GT:PL:GP 0:0,156:1,0 1:58,19:0,1 1:51,0:0,1 0:0,91:1,0 1:89,0:0,1 1|1:132,15,0:0,0,1 1|0:99,0,68:0,1,0 0|1:101,0,101:0,1,0 0|0:0,18,161:0.9998,0.0002,0 0|1:118,0,72:0,1,0 -X 2699645 . G T 999 . . GT:PL:GP 0:0,95:1,0 1:49,0:0,1 0:0,58:1,0 0:0,64:1,0 0:0,113:1,0 0|0:0,18,158:1,0,0 0|0:0,18,146:1,0,0 0|1:68,0,136:0,1,0 0|0:0,30,210:1,0,0 0|0:0,27,186:1,0,0 -X 2699676 . G A 999 . . GT:PL:GP 0:0,84:1,0 0:0,87:1,0 1:35,0:0,1 0:0,28:1,0 1:114,0:0,1 1|0:99,0,72:0,1,0 1|0:48,0,89:0,1,0 0|0:0,18,155:1,0,0 0|0:0,24,191:1,0,0 0|1:99,0,61:0,1,0 -X 2699728 . C T 69.7 . . GT:PL:GP 0:0,58:1,0 0:0,64:1,0 0:0,33:1,0 0:0,69:1,0 0:0,81:1,0 0|0:0,27,183:1,0,0 0|0:0,45,220:1,0,0 0|0:0,30,161:1,0,0 0|0:0,15,110:1,0,0 0|0:0,21,156:1,0,0 -X 2699775 . C A 71.1 . . GT:PL:GP 0:0,62:1,0 0:0,101:1,0 0:0,130:1,0 0:0,141:1,0 0:0,54:1,0 0|0:0,30,203:1,0,0 0|0:0,39,208:1,0,0 0|0:0,30,177:1,0,0 0|0:0,18,132:1,0,0 0|0:0,15,103:1,0,0 -X 2699898 . C CT 999 . . GT:PL:GP 0:0,32:1,0 0:0,11:1,0 1:11,0:0,1 0:0,11:1,0 1:31,0:0,1 1|0:11,0,24:0.0438,0.9562,0 1|0:8,0,17:0,1,0 0|0:0,33,72:1,0,0 0|0:0,27,69:1,0,0 0|1:11,4,12:0.0003,0.9997,0 -X 2699968 . A G 999 . . GT:PL:GP .:0,84:1,0 0:0,32:1,0 0:0,57:1,0 1:131,0:0,1 0:0,66:1,0 0|1:89,0,44:0,1,0 0|0:0,18,157:1,0,0 0|0:0,45,255:1,0,0 0|1:75,0,109:0,1,0 1|0:98,0,62:0,1,0 -X 2699970 . T C 55.3 . . GT:PL:GP 0:0,68:1,0 0:0,34:1,0 0:0,32:1,0 0:0,162:1,0 0:0,63:1,0 0|0:0,15,149:1,0,0 0|0:0,21,181:1,0,0 0|0:0,45,255:1,0,0 0|0:0,27,207:1,0,0 0|0:0,24,196:1,0,0 +X 2698560 id9 G A 102 . . GT:PL:GP 0|0:0,21,177:1,0,0 0|0:0,30,206:1,0,0 0|0:0,21,177:1,0,0 0|0:0,15,132:1,0,0 0|0:0,9,90:1,0,0 0|0:0,15,114:1,0,0 0|0:0,15,118:1,0,0 0|0:0,15,133:1,0,0 0|0:0,15,144:1,0,0 0|0:0,24,191:1,0,0 +X 2698630 id10 A G 537 . . GT:PL:GP 0|0:0,21,186:1,0,0 0|0:0,21,176:1,0,0 0|0:0,15,106:1,0,0 0|0:0,18,127:1,0,0 0|0:0,6,62:1,0,0 0|0:0,15,146:1,0,0 0|0:0,18,141:1,0,0 0|0:0,21,173:1,0,0 0|0:0,12,119:1,0,0 0|0:0,15,145:1,0,0 +X 2698758 id11 CAA C 999 . . 
GT:PL:GP 0|0:0,6,16:0.8292,0.1708,0 0|1:0,0,0:0.0278,0.5743,0.3979 0|0:0,0,0:0.6336,0.3664,0 0|0:0,3,8:0.8611,0.1389,0 0|0:0,0,8:0.7628,0.2372,0 0|0:0,9,18:1,0,0 0|0:0,9,23:1,0,0 0|0:0,9,15:0.9855,0.0145,0 0|0:0,6,10:1,0,0 0|0:0,21,33:1,0,0 +X 2698769 id12 AAG A 999 . . GT:PL:GP 1|0:17,0,7:0.0069,0.9931,0 1|1:0,0,0:0.0004,0.0892,0.9104 0|1:17,3,0:0.0045,0.9954,0.0001 1|0:11,0,2:0.0085,0.9915,0 1|0:11,0,15:0.0003,0.9997,0 0|0:0,15,40:1,0,0 0|0:0,9,23:1,0,0 0|0:0,15,25:0.8474,0.1526,0 0|0:0,15,34:1,0,0 0|0:0,33,56:1,0,0 +X 2698770 id13 AG A,AAGG 999 . . GT:PL:GP 0|0:0,12,103,12,103,103:0.925,0.0717,0,0.0033,0,0 0|1:0,3,21,3,21,21:0.4944,0.368,0.0018,0.1343,0.0013,0.0002 0|0:0,0,0,0,0,0:0.5458,0.4085,0,0.0457,0,0 0|0:0,3,36,3,36,36:0.8126,0.1758,0,0.0116,0,0 1|0:37,0,86,49,92,130:0,1,0,0,0,0 0|0:0,15,125,15,125,125:1,0,0,0,0,0 0|0:0,9,105,9,105,105:1,0,0,0,0,0 0|0:0,15,109,15,109,109:0.9964,0.0034,0,0.0002,0,0 0|0:0,15,137,15,137,137:1,0,0,0,0,0 0|0:0,33,215,33,215,215:1,0,0,0,0,0 +X 2698789 id14 C G 153 . . GT:PL:GP 0|0:0,21,152:1,0,0 0|0:0,21,131:1,0,0 0|0:0,12,113:1,0,0 0|0:0,12,104:1,0,0 0|0:0,21,137:1,0,0 0|0:0,15,118:0.9999,0.0001,0 0|0:0,15,111:1,0,0 0|0:0,24,152:1,0,0 0|0:0,18,147:1,0,0 0|0:0,33,183:1,0,0 +X 2698822 id15 A C 85.2 . . GT:PL:GP 0|0:0,21,167:1,0,0 0|0:0,21,171:1,0,0 0|0:0,21,158:1,0,0 0|0:0,18,154:1,0,0 0|0:0,15,135:1,0,0 0|0:0,15,132:1,0,0 0|0:0,21,168:1,0,0 0|0:0,21,175:1,0,0 0|0:0,15,142:1,0,0 0|0:0,21,172:1,0,0 +X 2698831 id16 G A 303 . . GT:PL:GP 0|0:0,15,129:1,0,0 0|0:0,27,179:1,0,0 0|0:0,24,196:1,0,0 0|0:0,21,158:1,0,0 0|0:0,18,154:1,0,0 0|0:0,12,112:1,0,0 0|0:0,24,162:1,0,0 0|0:0,21,168:1,0,0 0|0:0,9,95:1,0,0 0|0:0,21,164:1,0,0 +X 2698889 id17 T C 74.4 . . GT:PL:GP 0|0:0,27,193:1,0,0 0|0:0,45,255:1,0,0 0|0:0,21,190:1,0,0 0|0:0,36,254:1,0,0 0|0:0,30,226:1,0,0 0|0:0,36,253:1,0,0 0|0:0,18,156:1,0,0 0|0:0,9,87:1,0,0 0|0:0,9,98:1,0,0 0|0:0,24,205:1,0,0 +X 2698923 id18 G A 999 . . 
GT:PL:GP 1|0:62,0,133:0,1,0 0|1:164,0,91:0,1,0 0|1:35,0,73:0,1,0 1|0:91,0,108:0,1,0 1|0:67,0,71:0,1,0 0|0:0,30,187:1,0,0 0|0:0,9,73:1,0,0 0|0:0,12,99:1,0,0 0|0:0,18,153:1,0,0 0|0:0,18,138:1,0,0 +X 2698953 id19 A AGG 267 . . GT:PL:GP 0|0:0,27,111:1,0,0 0|0:0,33,124:1,0,0 0|0:0,12,62:1,0,0 0|0:0,15,86:1,0,0 0|0:0,12,58:1,0,0 0|0:0,15,69:1,0,0 0|0:0,6,34:1,0,0 0|0:0,18,83:1,0,0 0|0:0,18,80:1,0,0 0|0:0,15,74:1,0,0 +X 2698954 id20 G A 999 . . GT:PL:GP 1|0:69,0,139:0,1,0 1|1:199,24,0:0,0,1 0|1:15,0,82:0,1,0 1|0:32,0,76:0,1,0 1|0:16,0,80:0,1,0 0|0:0,15,131:1,0,0 0|0:0,6,58:1,0,0 0|1:99,0,39:0,1,0 0|0:0,18,163:1,0,0 0|0:0,15,136:1,0,0 +X 2699002 id21 C A 65.1 . . GT:PL:GP 0|0:0,18,144:1,0,0 0|0:0,12,115:1,0,0 .|.:0,12,120:1,0,0 0|0:0,15,131:1,0,0 0|0:0,6,29:1,0,0 0|0:0,9,95:1,0,0 0|0:0,9,79:1,0,0 0|0:0,24,188:1,0,0 0|0:0,15,124:1,0,0 0|0:0,9,93:1,0,0 +X 2699025 id22 T C 44.9 . . GT:PL:GP 0|0:0,24,189:1,0,0 0|0:0,12,98:1,0,0 0|0:0,15,130:1,0,0 0|0:0,15,113:1,0,0 0|0:0,6,63:1,0,0 0|0:0,24,198:1,0,0 0|0:0,12,92:1,0,0 0|0:0,24,197:1,0,0 0|0:0,9,97:1,0,0 0|0:0,12,108:1,0,0 +X 2699091 id23 G A 45 . . GT:PL:GP 0|0:0,18,162:1,0,0 0|0:0,21,153:1,0,0 0|0:0,12,101:1,0,0 0|0:0,12,97:1,0,0 0|0:0,24,188:1,0,0 0|0:0,24,194:1,0,0 0|0:0,15,127:1,0,0 0|0:0,21,169:1,0,0 0|0:0,15,129:1,0,0 0|0:0,21,171:1,0,0 +X 2699187 id24 T C 999 . . GT:PL:GP 0|0:0,24,200:1,0,0 0|0:0,24,191:1,0,0 1|0:48,0,85:0,1,0 0|0:0,15,145:1,0,0 0|1:58,0,45:0,1,0 1|0:61,0,50:0,1,0 1|0:22,0,51:0,1,0 0|0:0,27,211:1,0,0 0|0:0,9,96:0.9999,0.0001,0 0/1:23,0,160:0,1,0 +X 2699188 id25 G C 999 . . GT:PL:GP 0|0:0,24,194:1,0,0 0|0:0,24,167:1,0,0 1|0:48,0,78:0,1,0 0|0:0,15,131:1,0,0 0|1:63,0,40:0,1,0 1|0:50,0,44:0,1,0 1|0:22,0,48:0,1,0 0|0:0,27,212:1,0,0 0|0:0,9,87:0.9999,0.0001,0 0|1:23,0,154:0,1,0 +X 2699189 id26 T C 999 . . 
GT:PL:GP 0|0:0,24,199:1,0,0 0|0:0,24,176:1,0,0 1|0:44,0,87:0,1,0 0|0:0,15,136:1,0,0 0|1:62,0,46:0,1,0 1|0:61,0,46:0,1,0 1|0:22,0,49:0,1,0 0|0:0,27,212:1,0,0 0|0:0,9,93:0.9999,0.0001,0 0|1:23,0,164:0,1,0 +X 2699217 id27 C T 60.3 . . GT:PL:GP 0|0:0,18,158:1,0,0 0|0:0,18,119:1,0,0 0|0:0,21,152:1,0,0 0|0:0,21,162:1,0,0 0|0:0,12,102:1,0,0 0|0:0,18,144:1,0,0 0|0:0,12,108:1,0,0 0|0:0,18,146:1,0,0 0|0:0,12,98:1,0,0 0|0:0,18,155:1,0,0 +X 2699246 id28 C A 999 . . GT:PL:GP 1|0:128,0,15:0,0.9998,0.0002 1|1:147,21,0:0,0.0001,0.9999 0|1:130,0,5:0,0.9977,0.0023 1|1:237,33,0:0,0,1 1|0:45,0,75:0,1,0 0|1:145,0,49:0,1,0 0|0:0,15,109:1,0,0 0|1:13,0,63:0.0002,0.9998,0 0|0:0,30,178:0.9953,0.0047,0 1|0:120,0,57:0,1,0 +X 2699275 id29 T G 999 . . GT:PL:GP 0|0:0,18,165:0.9998,0.0002,0 0|0:0,18,152:1,0,0 1|0:0,9,95:0.0023,0.9977,0 0|0:0,33,239:1,0,0 0|1:125,0,40:0,1,0 1|1:205,27,0:0,0,1 1|0:69,0,43:0,1,0 0|0:0,15,139:1,0,0 0|0:0,30,219:1,0,0 0|1:96,0,54:0,1,0 +X 2699350 id30 A T 999 . . GT:PL:GP 0|0:0,27,206:1,0,0 0|0:0,15,139:1,0,0 1|0:54,0,25:0,1,0 0|0:0,12,117:0.9996,0.0004,0 0|1:79,0,73:0,1,0 1|0:48,0,82:0,1,0 1|0:68,0,45:0,1,0 0|0:0,30,216:1,0,0 0|0:0,27,224:1,0,0 0|1:48,0,80:0,1,0 +X 2699360 id31 T C 999 . . GT:PL:GP 0|0:0,21,184:1,0,0 0|0:0,15,133:1,0,0 1|0:53,0,21:0,1,0 0|0:0,12,114:0.9996,0.0004,0 0|1:20,0,66:0,1,0 1|0:40,0,93:0,1,0 1|0:52,0,66:0,1,0 0|0:0,30,220:1,0,0 0|0:0,21,191:1,0,0 0|1:20,0,83:0,1,0 +X 2699450 id32 A C 999 . . GT:PL:GP 0|0:0,12,124:1,0,0 0|0:0,6,55:0.9976,0.0024,0 1|0:99,0,42:0,1,0 0|0:0,21,186:0.9999,0.0001,0 0|1:64,0,100:0,1,0 1|0:38,0,177:0,1,0 1|0:16,0,103:0,1,0 0|0:0,24,202:1,0,0 0|0:0,12,119:1,0,0 0|1:75,0,115:0,1,0 +X 2699507 id33 T C 195 . . GT:PL:GP 0|0:0,15,133:1,0,0 0|0:0,12,122:1,0,0 0|0:0,6,60:1,0,0 0|0:0,18,123:1,0,0 0|0:0,15,145:1,0,0 0|0:0,21,173:1,0,0 0|0:0,21,178:1,0,0 0|0:0,24,200:1,0,0 0|0:0,12,125:1,0,0 0|0:0,24,189:1,0,0 +X 2699555 id34 C A 999 . . 
GT:PL:GP 0:0,156:1,0 1:58,19:0,1 1:51,0:0,1 0:0,91:1,0 1:89,0:0,1 1|1:132,15,0:0,0,1 1|0:99,0,68:0,1,0 0|1:101,0,101:0,1,0 0|0:0,18,161:0.9998,0.0002,0 0|1:118,0,72:0,1,0 +X 2699645 id35 G T 999 . . GT:PL:GP 0:0,95:1,0 1:49,0:0,1 0:0,58:1,0 0:0,64:1,0 0:0,113:1,0 0|0:0,18,158:1,0,0 0|0:0,18,146:1,0,0 0|1:68,0,136:0,1,0 0|0:0,30,210:1,0,0 0|0:0,27,186:1,0,0 +X 2699676 id36 G A 999 . . GT:PL:GP 0:0,84:1,0 0:0,87:1,0 1:35,0:0,1 0:0,28:1,0 1:114,0:0,1 1|0:99,0,72:0,1,0 1|0:48,0,89:0,1,0 0|0:0,18,155:1,0,0 0|0:0,24,191:1,0,0 0|1:99,0,61:0,1,0 +X 2699728 id37 C T 69.7 . . GT:PL:GP 0:0,58:1,0 0:0,64:1,0 0:0,33:1,0 0:0,69:1,0 0:0,81:1,0 0|0:0,27,183:1,0,0 0|0:0,45,220:1,0,0 0|0:0,30,161:1,0,0 0|0:0,15,110:1,0,0 0|0:0,21,156:1,0,0 +X 2699775 id38 C A 71.1 . . GT:PL:GP 0:0,62:1,0 0:0,101:1,0 0:0,130:1,0 0:0,141:1,0 0:0,54:1,0 0|0:0,30,203:1,0,0 0|0:0,39,208:1,0,0 0|0:0,30,177:1,0,0 0|0:0,18,132:1,0,0 0|0:0,15,103:1,0,0 +X 2699898 id39 C CT 999 . . GT:PL:GP 0:0,32:1,0 0:0,11:1,0 1:11,0:0,1 0:0,11:1,0 1:31,0:0,1 1|0:11,0,24:0.0438,0.9562,0 1|0:8,0,17:0,1,0 0|0:0,33,72:1,0,0 0|0:0,27,69:1,0,0 0|1:11,4,12:0.0003,0.9997,0 +X 2699968 id40 A G 999 . . GT:PL:GP .:0,84:1,0 0:0,32:1,0 0:0,57:1,0 1:131,0:0,1 0:0,66:1,0 0|1:89,0,44:0,1,0 0|0:0,18,157:1,0,0 0|0:0,45,255:1,0,0 0|1:75,0,109:0,1,0 1|0:98,0,62:0,1,0 +X 2699970 id41 T C 55.3 . . 
GT:PL:GP 0:0,68:1,0 0:0,34:1,0 0:0,32:1,0 0:0,162:1,0 0:0,63:1,0 0|0:0,15,149:1,0,0 0|0:0,21,181:1,0,0 0|0:0,45,255:1,0,0 0|0:0,27,207:1,0,0 0|0:0,24,196:1,0,0 diff --git a/test/filter.40.out b/test/filter.40.out new file mode 100644 index 000000000..c7e1bd874 --- /dev/null +++ b/test/filter.40.out @@ -0,0 +1,35 @@ +##fileformat=VCFv4.1 +##FILTER= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FILTER= +##test= +##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta +##contig= +##contig= +##INFO= +##INFO= +##readme=AAAAAA +##readme=BBBBBB +##FILTER= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A +1 1000 . G A 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 1001 . G A 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 1003 . GT G 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 1006 . G A 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 1007 . G A 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 2000 . T C 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 2001 . T C 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 2003 . T TC 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 2005 . T C 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 2006 . T C 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +2 1001 . GT G 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +2 1004 . GT G 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +2 1008 . GT G 1806 xxx DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +2 2001 . A AT 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +2 2003 . A AT 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +2 2006 . 
A AT 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 diff --git a/test/filter.41.out b/test/filter.41.out new file mode 100644 index 000000000..c525df6e4 --- /dev/null +++ b/test/filter.41.out @@ -0,0 +1,35 @@ +##fileformat=VCFv4.1 +##FILTER= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FILTER= +##test= +##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta +##contig= +##contig= +##INFO= +##INFO= +##readme=AAAAAA +##readme=BBBBBB +##FILTER= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A +1 1000 . G A 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 1001 . G A 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 1003 . GT G 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 1006 . G A 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 1007 . G A 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 2000 . T C 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 2001 . T C 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 2003 . T TC 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 2005 . T C 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 2006 . T C 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +2 1001 . GT G 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +2 1004 . GT G 1806 xxx DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +2 1008 . GT G 1806 xxx DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +2 2001 . A AT 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +2 2003 . A AT 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +2 2006 . 
A AT 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 diff --git a/test/filter.42.out b/test/filter.42.out new file mode 100644 index 000000000..e451d4c75 --- /dev/null +++ b/test/filter.42.out @@ -0,0 +1,35 @@ +##fileformat=VCFv4.1 +##FILTER= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FILTER= +##test= +##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta +##contig= +##contig= +##INFO= +##INFO= +##readme=AAAAAA +##readme=BBBBBB +##FILTER= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A +1 1000 . G A 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 1001 . G A 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 1003 . GT G 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 1006 . G A 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 1007 . G A 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 2000 . T C 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 2001 . T C 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 2003 . T TC 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 2005 . T C 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 2006 . T C 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +2 1001 . GT G 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +2 1004 . GT G 1806 xxx DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +2 1008 . GT G 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +2 2001 . A AT 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +2 2003 . A AT 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +2 2006 . 
A AT 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 diff --git a/test/filter.43.out b/test/filter.43.out new file mode 100644 index 000000000..0ae377789 --- /dev/null +++ b/test/filter.43.out @@ -0,0 +1,35 @@ +##fileformat=VCFv4.1 +##FILTER= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FILTER= +##test= +##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta +##contig= +##contig= +##INFO= +##INFO= +##readme=AAAAAA +##readme=BBBBBB +##FILTER= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A +1 1000 . G A 1806 xxx DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 1001 . G A 1806 xxx DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 1003 . GT G 1806 xxx DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 1006 . G A 1806 xxx DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 1007 . G A 1806 xxx DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 2000 . T C 1806 xxx DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 2001 . T C 1806 xxx DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 2003 . T TC 1806 xxx DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 2005 . T C 1806 xxx DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +1 2006 . T C 1806 xxx DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +2 1001 . GT G 1806 xxx DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +2 1004 . GT G 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +2 1008 . GT G 1806 PASS DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +2 2001 . A AT 1806 xxx DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +2 2003 . A AT 1806 xxx DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 +2 2006 . 
A AT 1806 xxx DP=35;DP4=1,2,3,4;AN=2;AC=1 GT:GQ:DP:GL 0/1:409:35:-20,-5,-20 diff --git a/test/fixref.6.out b/test/fixref.6.out new file mode 100644 index 000000000..6c1aaac40 --- /dev/null +++ b/test/fixref.6.out @@ -0,0 +1,12 @@ +##fileformat=VCFv4.2 +##FILTER= +##reference=file:///some/path/1000GenomesPilot-NCBI36.fasta +##FORMAT= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT XY00001 +1 1 rs1 C T . . . GT 0/1 +1 2 rs2 T A . . . GT 0/1 +1 3 rs3 A T . . . GT 0/1 +1 4 rs4 A T . . . GT 0/1 +1 5 rs5 C T . . . GT 0/1 +1 6 rs6 C T . . . GT 0/1 diff --git a/test/mpileup/mpileup-filter.1.out b/test/mpileup/mpileup-filter.1.out new file mode 100644 index 000000000..d62b7e72d --- /dev/null +++ b/test/mpileup/mpileup-filter.1.out @@ -0,0 +1,22 @@ +##fileformat=VCFv4.2 +##FILTER= +##contig= +##ALT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample +1 100 . A <*> 0 . DP=1;I16=1,0,0,0,34,1156,0,0,0,0,0,0,25,625,0,0;QS=1,0;FS=0;MQ0F=1 PL 0,3,4 diff --git a/test/mpileup/mpileup-filter.2.out b/test/mpileup/mpileup-filter.2.out new file mode 100644 index 000000000..bf79c3921 --- /dev/null +++ b/test/mpileup/mpileup-filter.2.out @@ -0,0 +1,22 @@ +##fileformat=VCFv4.2 +##FILTER= +##contig= +##ALT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample +1 100 . A <*> 0 . 
DP=1;I16=1,0,0,0,34,1156,0,0,60,3600,0,0,25,625,0,0;QS=1,0;FS=0;MQ0F=0 PL 0,3,34 diff --git a/test/mpileup/mpileup-filter.sam b/test/mpileup/mpileup-filter.sam new file mode 100644 index 000000000..d9b46bf84 --- /dev/null +++ b/test/mpileup/mpileup-filter.sam @@ -0,0 +1,5 @@ +@HD VN:1.4 GO:none SO:coordinate +@SQ SN:1 LN:150 M5:de3677cffe7c371fbb69fd44ce9bc181 UR:/nfs/users/nfs_p/pd3/git/stools/bcftools/test/mpileup/mpileup-SCR.fa +@RG ID:rg SM:sample +HS40_17920:7:2214:9967:66664#16 99 1 64 60 75M * 0 167 CCTAGGTCCCCCCGTGCCACCATGATGCCGTCGCTCACCTCCAGGATTTCATCAAACCTGAGAGGTTGGGAGAAT @A??DE?FEEEEEA@HGEBEDCBHBBHGEA@FAGDFCFEDFECGFBBBBFCBFCCCFEDHBGBGF@BHFEAC>?? BC:Z:TCCGTCTT MC:Z:75M BD:Z:LLMMQPOPNJJJJLMKNMMKMMNNNMNNMLMNMMNKMKNLLMNONMNKEMNOONMDMNMOOMMMOONNPLOMMMM PG:Z:MarkDuplicates BI:Z:PPQQRPRRPLLLLOQORPQNQQQRRQRRPOQQPPROQOQPOOQRPQRPIPRRRRPHQRQSSPPQQSQSSOSPPPQ MQ:i:60 AS:i:75 XS:i:24 QT:Z:BBBBBFFF mc:i:155264286 ms:i:2755 BQ:Z:@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ MD:Z:75 NM:i:0 RG:Z:rg +HS40_17920:7:2214:9967:66664#16 3 1 64 0 75M * 0 167 CCTAGGTCCCCCCGTGCCACCATGATGCCGTCGCTCACCTCCAGGATTTCATCAAACCTGAGAGGTTGGGAGAAT @A??DE?FEEEEEA@HGEBEDCBHBBHGEA@FAGDFCFEDFECGFBBBBFCBFCCCFEDHBGBGF@BHFEAC>?? 
BC:Z:TCCGTCTT MC:Z:75M BD:Z:LLMMQPOPNJJJJLMKNMMKMMNNNMNNMLMNMMNKMKNLLMNONMNKEMNOONMDMNMOOMMMOONNPLOMMMM PG:Z:MarkDuplicates BI:Z:PPQQRPRRPLLLLOQORPQNQQQRRQRRPOQQPPROQOQPOOQRPQRPIPRRRRPHQRQSSPPQQSQSSOSPPPQ MQ:i:60 AS:i:75 XS:i:24 QT:Z:BBBBBFFF mc:i:155264286 ms:i:2755 BQ:Z:@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ MD:Z:75 NM:i:0 RG:Z:rg diff --git a/test/query.84.out b/test/query.84.out new file mode 100644 index 000000000..f97f98f68 --- /dev/null +++ b/test/query.84.out @@ -0,0 +1 @@ + 0,9,1 diff --git a/test/query.filter.13.vcf b/test/query.filter.13.vcf new file mode 100644 index 000000000..ee96fff55 --- /dev/null +++ b/test/query.filter.13.vcf @@ -0,0 +1,7 @@ +##fileformat=VCFv4.2 +##reference=test.fa +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12890 +22 16053659 . A C,T 352 . . AD 0,9,1 +22 16053791 . C A,T 287 . . AD 4,4,1 diff --git a/test/query.filter.id.1.out b/test/query.filter.id.1.out new file mode 100644 index 000000000..cd03addd0 --- /dev/null +++ b/test/query.filter.id.1.out @@ -0,0 +1,2 @@ +rs123 +ss124;abc diff --git a/test/query.filter.id.2.out b/test/query.filter.id.2.out new file mode 100644 index 000000000..8c4b5e63a --- /dev/null +++ b/test/query.filter.id.2.out @@ -0,0 +1 @@ +rs123 diff --git a/test/query.filter.id.vcf b/test/query.filter.id.vcf new file mode 100644 index 000000000..d8f700fad --- /dev/null +++ b/test/query.filter.id.vcf @@ -0,0 +1,7 @@ +##fileformat=VCFv4.1 +##reference=file:///seq/references/1000Genomes-NCBI37.fasta +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 123 rs123 A . . . . +1 124 ss124;abc A . . . . +1 125 . A . . . . 
diff --git a/test/query.smpl.1.out b/test/query.smpl.1.out new file mode 100644 index 000000000..97854e6c4 --- /dev/null +++ b/test/query.smpl.1.out @@ -0,0 +1 @@ +11 1/1 diff --git a/test/query.smpl.11.txt b/test/query.smpl.11.txt new file mode 100644 index 000000000..b4de39476 --- /dev/null +++ b/test/query.smpl.11.txt @@ -0,0 +1 @@ +11 diff --git a/test/query.smpl.2.out b/test/query.smpl.2.out new file mode 100644 index 000000000..b4de39476 --- /dev/null +++ b/test/query.smpl.2.out @@ -0,0 +1 @@ +11 diff --git a/test/query.smpl.3.out b/test/query.smpl.3.out new file mode 100644 index 000000000..c1cde23b6 --- /dev/null +++ b/test/query.smpl.3.out @@ -0,0 +1 @@ +00 0/0 diff --git a/test/query.smpl.4.out b/test/query.smpl.4.out new file mode 100644 index 000000000..4daddb72f --- /dev/null +++ b/test/query.smpl.4.out @@ -0,0 +1 @@ +00 diff --git a/test/query.smpl.5.out b/test/query.smpl.5.out new file mode 100644 index 000000000..b9a466071 --- /dev/null +++ b/test/query.smpl.5.out @@ -0,0 +1,2 @@ +11 1/1 +00 0/0 diff --git a/test/query.smpl.6.out b/test/query.smpl.6.out new file mode 100644 index 000000000..73a8a0c17 --- /dev/null +++ b/test/query.smpl.6.out @@ -0,0 +1,2 @@ +00 +11 diff --git a/test/query.smpl.txt b/test/query.smpl.txt new file mode 100644 index 000000000..a61d10ee2 --- /dev/null +++ b/test/query.smpl.txt @@ -0,0 +1,2 @@ +11 +00 diff --git a/test/query.smpl.vcf b/test/query.smpl.vcf new file mode 100644 index 000000000..570d40854 --- /dev/null +++ b/test/query.smpl.vcf @@ -0,0 +1,5 @@ +##fileformat=VCFv4.2 +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 00 11 +chr1 10000 . A C . . . 
GT 0/0 1/1 diff --git a/test/setGT.4.1.out b/test/setGT.4.1.out new file mode 100644 index 000000000..c5f17f7c9 --- /dev/null +++ b/test/setGT.4.1.out @@ -0,0 +1,9 @@ +##fileformat=VCFv4.2 +##FILTER= +##contig= +##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT B A C +1 1890 . A G . . . GT:DP ./.:12 0/0:102 ./.:10 +1 1991 . G T . . . GT:DP ./.:8 0/1:100 ./.:11 diff --git a/test/setGT.4.2.out b/test/setGT.4.2.out new file mode 100644 index 000000000..901a3a8c5 --- /dev/null +++ b/test/setGT.4.2.out @@ -0,0 +1,9 @@ +##fileformat=VCFv4.2 +##FILTER= +##contig= +##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT B A C +1 1890 . A G . . . GT:DP ./.:12 0/0:102 ./.:10 +1 1991 . G T . . . GT:DP ./.:8 ./.:100 ./.:11 diff --git a/test/setGT.4.vcf b/test/setGT.4.vcf new file mode 100644 index 000000000..b8c18378f --- /dev/null +++ b/test/setGT.4.vcf @@ -0,0 +1,8 @@ +##fileformat=VCFv4.2 +##contig= +##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT B A C +1 1890 . A G . . . GT:DP 1/1:12 0/0:102 0/1:10 1/1:20 +1 1991 . G T . . . GT:DP 1/1:8 0/1:100 0/0:11 0/0:11 diff --git a/test/test.pl b/test/test.pl index e22024612..1de8ef7d6 100755 --- a/test/test.pl +++ b/test/test.pl @@ -1,6 +1,6 @@ #!/usr/bin/env perl # -# Copyright (C) 2012-2021 Genome Research Ltd. +# Copyright (C) 2012-2022 Genome Research Ltd. 
# # Author: Petr Danecek # @@ -205,11 +205,20 @@ test_vcf_query($opts,in=>'query.filter.8',out=>'query.70.out',args=>q[-f'%POS\\t%REF\\t%ALT\\t%ILEN\\n' -i'ILEN==1']); test_vcf_query($opts,in=>'query.filter.9',out=>'query.71.out',args=>q[-f'[%POS %SAMPLE %AD\\n]' -i'FMT/AD[:0] < FMT/AD[:1]']); test_vcf_query($opts,in=>'query.filter.9',out=>'query.72.out',args=>q[-f'[%POS %SAMPLE %AD\\n]' -i'FMT/AD[:0] > FMT/AD[:1]']); +test_vcf_query($opts,in=>'query.filter.13',out=>'query.84.out',args=>q[-f'[ %AD\\n]' -i'AD[:1] / sum(AD[*]) > 0.5']); test_vcf_query($opts,in=>'query.filter.10',out=>'query.73.out',args=>q[-f'%POS %NUM_TAG\\n' -i'COUNT(INFO/NUM_TAG)=2']); test_vcf_query($opts,in=>'query.filter.10',out=>'query.74.out',args=>q[-f'%POS %STR_TAG\\n' -i'COUNT(INFO/STR_TAG)=2']); test_vcf_query($opts,in=>'query',out=>'query.75.out',args=>q[-f '%CHROM:%POS\\t%N_PASS(GT="alt" & GQ>110)\\t[\\t%GT]\\t[\\t%GQ]\n']); test_vcf_query($opts,in=>'query.filter.12',out=>'query.82.out',args=>q[-f '%CHROM:%POS[\\t%SAMPLE=%GT]\\n' -e 'GT="mis"' -s 1,3,0]); test_vcf_query($opts,in=>'query.filter.12',out=>'query.83.out',args=>q[-f '%CHROM:%POS[\\t%SAMPLE=%GT]\\n' -e 'GT="mis"' -s 0,1,3]); +test_vcf_query($opts,in=>'query.smpl',out=>'query.smpl.1.out',args=>q[-f "[%SAMPLE %GT\n]" -S {PATH}/query.smpl.11.txt]); +test_vcf_query($opts,in=>'query.smpl',out=>'query.smpl.2.out',args=>q[-l -S {PATH}/query.smpl.11.txt]); +test_vcf_query($opts,in=>'query.smpl',out=>'query.smpl.3.out',args=>q[-f "[%SAMPLE %GT\n]" -S ^{PATH}/query.smpl.11.txt]); +test_vcf_query($opts,in=>'query.smpl',out=>'query.smpl.4.out',args=>q[-l -S ^{PATH}/query.smpl.11.txt]); +test_vcf_query($opts,in=>'query.smpl',out=>'query.smpl.5.out',args=>q[-f "[%SAMPLE %GT\n]" -S {PATH}/query.smpl.txt]); +test_vcf_query($opts,in=>'query.smpl',out=>'query.smpl.6.out',args=>q[-l -S {PATH}/query.smpl.txt]); +test_vcf_query($opts,in=>'query.filter.id',out=>'query.filter.id.1.out',args=>q[-f'%ID\\n' -i'ID~"s12"']); 
+test_vcf_query($opts,in=>'query.filter.id',out=>'query.filter.id.2.out',args=>q[-f'%ID\\n' -i'ID="rs123"']); test_vcf_norm($opts,in=>'norm',out=>'norm.out',fai=>'norm',args=>'-cx'); test_vcf_norm($opts,in=>'norm.split',out=>'norm.split.out',args=>'-m-'); test_vcf_norm($opts,in=>'norm.split.2',out=>'norm.split.2.out',args=>'-m-'); @@ -296,6 +305,10 @@ test_vcf_view($opts,in=>'view.filter.annovar',out=>'view.filter.annovar.1.out',args=>q[-H -i 'Gene.refGene=="RAD21L1"'],reg=>''); test_vcf_view($opts,in=>'view.filter.annovar',out=>'view.filter.annovar.2.out',args=>q[-H -i 'Gene.refGene~"NOD"'],reg=>''); test_vcf_view($opts,in=>'view.filter.annovar',out=>'view.filter.annovar.3.out',args=>q[-H -i 'LJB2_MutationTaster=="0.291000"'],reg=>''); +test_vcf_view($opts,in=>'view-a',out=>'view-a.1.out',args=>q[-H -a]); +test_vcf_view($opts,in=>'view.sites',out=>'view.sites.1.out',args=>'',tgts=>'view.sites.txt'); +test_vcf_view($opts,in=>'view.sites',out=>'view.sites.1.out',args=>'',tgts=>'view.sites.txt.gz'); +test_vcf_head($opts,in=>'mpileup.2.vcf',in_nheaders=>22); test_vcf_call($opts,in=>'mpileup',out=>'mpileup.1.out',args=>'-mv'); test_vcf_call($opts,in=>'mpileup',out=>'mpileup.2.out',args=>'-mg0'); test_vcf_call($opts,in=>'mpileup',out=>'mpileup.3.out',args=>'-mv -S {PATH}/mpileup.3.samples'); @@ -402,6 +415,10 @@ test_vcf_filter($opts,in=>'filter.10',out=>'filter.38.out',args=>q[-i 'FORMAT/DP < sum(AD[*])']); test_vcf_filter($opts,in=>'filter.10',out=>'filter.39.out',args=>q[-i 'sum(AD[*]) < FORMAT/DP']); test_vcf_filter($opts,in=>'filter.10',out=>'filter.39.out',args=>q[-i 'FORMAT/DP > sum(AD[*])']); +test_vcf_filter($opts,in=>'filter.1',out=>'filter.40.out',args=>q[--soft-filter xxx --mask 2:1005-1008 --mask-overlap 0]); +test_vcf_filter($opts,in=>'filter.1',out=>'filter.41.out',args=>q[--soft-filter xxx --mask 2:1005-1008 --mask-overlap 1]); +test_vcf_filter($opts,in=>'filter.1',out=>'filter.42.out',args=>q[--soft-filter xxx --mask 2:1005-1008 --mask-overlap 2]); 
+test_vcf_filter($opts,in=>'filter.1',out=>'filter.43.out',args=>q[--soft-filter xxx --mask ^2:1005-1008]); test_vcf_sort($opts,in=>'sort',out=>'sort.out',args=>q[-m 0],fmt=>'%CHROM\\t%POS\\t%REF,%ALT\\n'); test_vcf_sort($opts,in=>'sort',out=>'sort.out',args=>q[-m 1000],fmt=>'%CHROM\\t%POS\\t%REF,%ALT\\n'); test_vcf_regions($opts,in=>'regions'); @@ -448,6 +465,7 @@ test_vcf_annotate($opts,in=>'annotate.missing-append',tab=>'annotate.missing-append',out=>'annotate.missing-append.1.out',args=>'-c CHROM,POS,REF,ALT,STR,INT,FLT -l STR:append-missing,INT:append-missing,FLT:append-missing'); test_vcf_annotate($opts,in=>'annotate9',tab=>'annots9',out=>'annotate9.out',args=>'-c CHROM,POS,REF,ALT,+ID'); test_vcf_annotate($opts,in=>'annotate21',out=>'annotate29.out',args=>'--rename-annots {PATH}/annotate21.txt'); +test_vcf_annotate($opts,in=>'annotate21',out=>'annotate29.out',args=>'-c XX:=FORMAT/X-X,YY:=FORMAT/Y-Y,AA:=FORMAT/A-A,AA:=INFO/A-A,BB:=INFO/B-B,XX:=INFO/X-X,YY:=INFO/Y-Y,fltA:=FILTER/flt-A,fltB:=FILTER/flt-B,fltX:=FILTER/flt-X,fltY:=FILTER/flt-Y'); test_vcf_annotate($opts,in=>'annotate22',vcf=>'annotate22',out=>'annotate30.out',args=>'-c FMT/XX,INFO/XX -x FMT/XX,INFO/XX'); test_vcf_annotate($opts,in=>'annotate23',tab=>'annotate23',out=>'annotate31.out',args=>'-c CHROM,POS,~ID,REF,ALT,INFO/END'); test_vcf_annotate($opts,in=>'annotate24.dst',vcf=>'annotate24.src',out=>'annotate24.1.out',args=>'-c XX'); @@ -459,6 +477,10 @@ test_vcf_annotate($opts,in=>'annotate.missing',tab=>'annotate.missing',out=>'annotate.missing.4.out',args=>'-c CHROM,POS,REF,ALT,+TSTR,+TFLT,+TINT'); test_vcf_annotate($opts,in=>'annotate.missing',tab=>'annotate.missing',out=>'annotate.missing.5.out',args=>'-c CHROM,POS,REF,ALT,.=TSTR,.=TFLT,.=TINT'); test_vcf_annotate($opts,in=>'annotate.missing',tab=>'annotate.missing',out=>'annotate.missing.6.out',args=>'-c CHROM,POS,REF,ALT,=TSTR,=TFLT,=TINT'); +test_vcf_annotate($opts,in=>'annotate.olap',tab=>'annots.olap',out=>'annotate.olap.1.out',args=>'-c 
CHROM,BEG,END,DB -l DB:unique'); +test_vcf_annotate($opts,in=>'annotate.olap',tab=>'annots.olap',out=>'annotate.olap.2.out',args=>'-c CHROM,BEG,END,DB -l DB:unique --min-overlap 0.4:0.5'); +test_vcf_annotate($opts,in=>'annotate.id',vcf=>'annots.id',out=>'annotate.id.1.out',args=>'-c ALT'); +test_vcf_annotate($opts,in=>'annotate.id',vcf=>'annots.id',out=>'annotate.id.2.out',args=>'-c +ALT'); test_vcf_plugin($opts,in=>'checkploidy',out=>'checkploidy.out',cmd=>'+check-ploidy --no-version'); test_vcf_plugin($opts,in=>'checkploidy.2',out=>'checkploidy.2.out',cmd=>'+check-ploidy --no-version'); test_vcf_plugin($opts,in=>'checkploidy.2',out=>'checkploidy.3.out',cmd=>'+check-ploidy --no-version',args=>'-- -m'); @@ -473,6 +495,8 @@ test_vcf_plugin($opts,in=>'setGT.3',out=>'setGT.3.4.out',cmd=>'+setGT --no-version',args=>'-- -t a -n c:"1|1"'); test_vcf_plugin($opts,in=>'setGT.3',out=>'setGT.3.5.out',cmd=>'+setGT --no-version',args=>'-- -t a -n c:"m|M"'); test_vcf_plugin($opts,in=>'setGT.3',out=>'setGT.3.6.out',cmd=>'+setGT --no-version',args=>'-- -t a -n c:0/1/1'); +test_vcf_plugin($opts,in=>'setGT.4',out=>'setGT.4.1.out',cmd=>'+setGT --no-version',args=>q[-- -t q -n . -e 'FMT/DP>90']); +test_vcf_plugin($opts,in=>'setGT.4',out=>'setGT.4.2.out',cmd=>'+setGT --no-version',args=>q[-- -t q -n . 
-e 'FMT/DP>100']); test_vcf_plugin($opts,in=>'plugin1',out=>'fill-AN-AC.out',cmd=>'+fill-AN-AC --no-version'); test_vcf_plugin($opts,in=>'dosage',out=>'dosage.1.out',cmd=>'+dosage',args=>'-- -t PL'); test_vcf_plugin($opts,in=>'dosage',out=>'dosage.2.out',cmd=>'+dosage',args=>'-- -t GL'); @@ -521,6 +545,7 @@ test_vcf_plugin($opts,in=>'fixref.3',out=>'fixref.3.out',cmd=>'+fixref',args=>'-- -f {PATH}/fixref.3.fa -m top'); test_vcf_plugin($opts,in=>'fixref.2a',out=>'fixref.4.out',index=>['fixref.2b'],cmd=>'+fixref',args=>'-- -f {PATH}/norm.fa -m ref-alt'); test_vcf_plugin($opts,in=>'fixref.2a',out=>'fixref.5.out',index=>['fixref.2b'],cmd=>'+fixref',args=>'-- -f {PATH}/norm.fa -m flip'); +test_vcf_plugin($opts,in=>'fixref.2a',out=>'fixref.2.out',cmd=>'+fixref',args=>'-- -f {PATH}/norm.fa -m flip-all'); test_vcf_plugin($opts,in=>'aa',out=>'aa.out',cmd=>'+fill-from-fasta',args=>'-- -f {PATH}/aa.fa -c AA -h {PATH}/aa.hdr -i \'TYPE="snp"\''); test_vcf_plugin($opts,in=>'aa',out=>'aa.2.out',cmd=>'+fill-from-fasta',args=>'-- -f {PATH}/aa.fa -c REF -N'); test_vcf_plugin($opts,in=>'ref',out=>'ref.out',cmd=>'+fill-from-fasta',args=>'-- -f {PATH}/norm.fa -c REF'); @@ -632,25 +657,33 @@ test_vcf_reheader($opts,in=>'reheader.2',out=>'reheader.5.out',args=>'-h {PATH}/reheader.2.hdr -f {PATH}/reheader.fai',nostdin=>1); test_rename_chrs($opts,in=>'annotate'); test_vcf_convert($opts,in=>'convert',out=>'convert.gs.gt.gen',args=>'-g -,.'); +test_vcf_convert($opts,in=>'convert',out=>'convert.gs.gt.ids.gen',args=>'-g -,. --vcf-ids'); +test_vcf_convert($opts,in=>'convert',out=>'convert.gs.gt.ids.gen6',args=>'-g -,. 
--vcf-ids --3N6'); test_vcf_convert($opts,in=>'convert',out=>'convert.gs.gt.samples',args=>'-g .,-'); +test_vcf_convert_hs2vcf($opts,h=>'convert.gs.gt.ids.gen',s=>'convert.gs.gt.samples',out=>'convert.gs.vcf',args=>' --vcf-ids -G'); +test_vcf_convert_hs2vcf($opts,h=>'convert.gs.gt.ids.gen',s=>'convert.gs.gt.samples',out=>'convert.gs.noids.vcf',args=>'-G'); +test_vcf_convert_hs2vcf($opts,h=>'convert.gs.gt.ids.3N6.gen',s=>'convert.gs.gt.samples',out=>'convert.gs.noids.vcf',args=>'--3N6 -G'); +test_vcf_convert_hs2vcf($opts,h=>'convert.gs.gt.ids.gen.rev',s=>'convert.gs.gt.samples',out=>'convert.gs.vcf',args=>'--vcf-ids -G'); +test_vcf_convert_hs2vcf($opts,h=>'convert.gs.gt.ids.gen.rev',s=>'convert.gs.gt.samples',out=>'convert.gs.noids.vcf',args=>'-G'); test_vcf_convert($opts,in=>'convert',out=>'convert.gs.pl.gen',args=>'-g -,. --tag PL'); test_vcf_convert($opts,in=>'convert',out=>'convert.gs.pl.samples',args=>'-g .,- --tag PL'); test_vcf_convert($opts,in=>'check',out=>'check.gs.vcfids.gen',args=>'-g -,. --vcf-ids'); test_vcf_convert($opts,in=>'check',out=>'check.gs.vcfids.samples',args=>'-g .,- --vcf-ids'); -test_vcf_convert($opts,in=>'check',out=>'check.gs.chrom.gen',args=>'-g -,. --chrom'); -test_vcf_convert($opts,in=>'check',out=>'check.gs.chrom.samples',args=>'-g .,- --chrom'); -test_vcf_convert($opts,in=>'check',out=>'check.gs.vcfids_chrom.gen',args=>'-g -,. --chrom --vcf-ids'); -test_vcf_convert($opts,in=>'check',out=>'check.gs.vcfids_chrom.samples',args=>'-g .,- --chrom --vcf-ids'); +test_vcf_convert($opts,in=>'check',out=>'check.gs.chrom.gen',args=>'-g -,. --3N6'); +test_vcf_convert($opts,in=>'check',out=>'check.gs.chrom.samples',args=>'-g .,- --3N6'); +test_vcf_convert($opts,in=>'check',out=>'check.gs.vcfids_chrom.gen',args=>'-g -,. 
--3N6 --vcf-ids'); +test_vcf_convert($opts,in=>'check',out=>'check.gs.vcfids_chrom.samples',args=>'-g .,- --3N6 --vcf-ids'); test_vcf_convert($opts,in=>'convert',out=>'convert.hls.haps',args=>'-h -,.,.'); test_vcf_convert($opts,in=>'convert',out=>'convert.hls.legend',args=>'-h .,-,.'); +test_vcf_convert($opts,in=>'convert',out=>'convert.hls.ids.legend',args=>'-h .,-,. --vcf-ids'); test_vcf_convert($opts,in=>'convert',out=>'convert.hls.samples',args=>'-h .,.,-'); test_vcf_convert_hls2vcf($opts,h=>'convert.hls.gt.hap',l=>'convert.hls.gt.legend',s=>'convert.hls.gt.samples',out=>'convert.gt.noHead.vcf',args=>'-H'); -test_vcf_convert_hls2vcf($opts,h=>'convert.hls.gt.hap',l=>'convert.hls.gt.legend',s=>'convert.hls.gt.samples',out=>'convert.gt.noHead.ids.vcf',args=>'--vcf-ids -H'); -test_vcf_convert_hs2vcf($opts,h=>'convert.hs.gt.hap',s=>'convert.hs.gt.samples',out=>'convert.gt.noHead.vcf',args=>'--hapsample2vcf'); -test_vcf_convert_hs2vcf($opts,h=>'convert.hs.gt.hap',s=>'convert.hs.gt.samples',out=>'convert.gt.noHead.ids.vcf',args=>'--vcf-ids --hapsample2vcf'); +test_vcf_convert($opts,in=>'convert.hap-missing',out=>'convert.hap-missing.haps',args=>'--haplegendsample -,.,.'); test_vcf_convert($opts,in=>'convert',out=>'convert.hs.hap',args=>'--hapsample -,.'); +test_vcf_convert($opts,in=>'convert',out=>'convert.hs.ids.hap',args=>'--hapsample -,. 
--vcf-ids'); test_vcf_convert($opts,in=>'convert',out=>'convert.hs.sample',args=>'--hapsample .,-'); -test_vcf_convert($opts,in=>'convert.hap-missing',out=>'convert.hap-missing.haps',args=>'--haplegendsample -,.,.'); +test_vcf_convert_hs2vcf($opts,h=>'convert.hs.gt.hap',s=>'convert.hs.gt.samples',out=>'convert.gt.noHead.vcf',args=>'--hapsample2vcf'); +test_vcf_convert_hs2vcf($opts,h=>'convert.hs.gt.ids.hap',s=>'convert.hs.gt.samples',out=>'convert.gt.noHead.ids.vcf',args=>'--vcf-ids --hapsample2vcf'); test_vcf_convert_gvcf($opts,in=>'convert.gvcf',out=>'convert.gvcf.out',fa=>'gvcf.fa',args=>'--gvcf2vcf -i\'FILTER="PASS"\''); test_vcf_convert_tsv2vcf($opts,in=>'convert.23andme',out=>'convert.23andme.vcf',args=>'-c ID,CHROM,POS,AA -s SAMPLE1',fai=>'23andme'); test_vcf_consensus($opts,in=>'consensus',out=>'consensus.1.out',fa=>'consensus.fa',mask=>'consensus.tab',args=>''); @@ -694,6 +727,8 @@ test_vcf_consensus($opts,in=>'consensus.16',out=>'consensus.18.out',fa=>'consensus.fa',args=>'-I'); test_vcf_consensus($opts,in=>'consensus.16',out=>'consensus.18.out',fa=>'consensus.fa',args=>'-H I'); test_vcf_consensus($opts,in=>'consensus.17',out=>'consensus17.1.out',fa=>'consensus2.fa',mask=>'consensus.17.bed',args=>'-M N'); +test_vcf_consensus($opts,in=>'consensus.18',out=>'consensus18.1.out',fa=>'consensus.18.fa',args=>''); +test_vcf_consensus($opts,in=>'consensus.19',out=>'consensus19.1.out',fa=>'consensus.19.fa',args=>''); test_mpileup($opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.1.out',args=>q[-r17:100-150],test_list=>1); test_mpileup($opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.2.out',args=>q[-a DP,DV -r17:100-600]); # test files from samtools mpileup test suite test_mpileup($opts,in=>[qw(mpileup.1)],out=>'mpileup/mpileup.3.out',args=>q[-B --ff 0x14 -r17:1050-1060]); # test file converted to vcf from samtools mpileup test suite @@ -717,6 +752,10 @@ 
test_mpileup($opts,in=>[qw(indel-AD.2)],out=>'mpileup/indel-AD.3.out',ref=>'indel-AD.2.fa',args=>q[-a AD -r 11:75 --ambig-reads incAD]); test_mpileup($opts,in=>[qw(indel-AD.2)],out=>'mpileup/indel-AD.4.out',ref=>'indel-AD.2.fa',args=>q[-a AD -r 11:75 --ambig-reads incAD0]); test_mpileup($opts,in=>[qw(mpileup-SCR)],out=>'mpileup/mpileup-SCR.out',ref=>'mpileup-SCR.fa',args=>q[-a INFO/SCR,FMT/SCR]); +test_mpileup($opts,in=>[qw(mpileup-filter)],out=>'mpileup/mpileup-filter.1.out',ref=>'mpileup-SCR.fa',args=>q[-t 1:100 --skip-all-set PAIRED,PROPER_PAIR,MREVERSE]); +test_mpileup($opts,in=>[qw(mpileup-filter)],out=>'mpileup/mpileup-filter.1.out',ref=>'mpileup-SCR.fa',args=>q[-t 1:100 --skip-any-set READ1]); +test_mpileup($opts,in=>[qw(mpileup-filter)],out=>'mpileup/mpileup-filter.2.out',ref=>'mpileup-SCR.fa',args=>q[-t 1:100 --skip-all-unset READ1]); +test_mpileup($opts,in=>[qw(mpileup-filter)],out=>'mpileup/mpileup-filter.2.out',ref=>'mpileup-SCR.fa',args=>q[-t 1:100 --skip-any-unset READ1]); test_csq($opts,in=>'csq',out=>'csq.1.out',cmd=>'-f {PATH}/csq.fa -g {PATH}/csq.gff3'); test_csq($opts,in=>'csq',out=>'csq.1.out',cmd=>'-f {PATH}/csq.fa -g {PATH}/csq.chr.gff3'); test_csq($opts,in=>'csq.2',out=>'csq.2.out',cmd=>'-f {PATH}/csq.fa -g {PATH}/csq.2.gff',tbcsq=>1); @@ -832,7 +871,7 @@ sub parse_params $$opts{path} = cygpath($$opts{path}); $$opts{bin} = cygpath($$opts{bin}); } - + return $opts; } sub _cmd @@ -1182,6 +1221,7 @@ sub test_vcf_query { my ($opts,%args) = @_; bgzip_tabix_vcf($opts,$args{in}); + $args{args} =~ s/{PATH}/$$opts{path}/g; test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools query $args{args} $$opts{tmp}/$args{in}.vcf.gz", exp_fix=>1); test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools view -Ob $$opts{tmp}/$args{in}.vcf.gz | $$opts{bin}/bcftools query $args{args}", exp_fix=>1); } @@ -1258,6 +1298,52 @@ sub test_vcf_64bit test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools view $$opts{path}/$args{in}.vcf -Ou | $$opts{bin}/bcftools view -H", exp_fix=>1); } } +sub 
gen_head_output +{ + my ($h, $n, $desc, $infile) = @_; + + my $expected = ""; + open my $in, '<', $infile or die "Couldn't open $infile: $!\n"; + while (<$in>) { + my $counter = /^#/? \$h : \$n; + next unless $$counter > 0; + $expected .= $_; + $$counter--; + } + close $in; + return (out => "head.$desc.out", exp => $expected); +} +sub test_vcf_head +{ + my ($opts, %args) = @_; + + my $infile = "$$opts{path}/$args{in}"; + + test_cmd($opts, %args, gen_head_output(1000, 0, "all", $infile), + cmd => "$$opts{bin}/bcftools head $infile"); + test_cmd($opts, %args, gen_head_output(0, 0, "none", $infile), + cmd => "$$opts{bin}/bcftools head -h 0 $infile"); + test_cmd($opts, %args, gen_head_output(1, 0, "one", $infile), + cmd => "$$opts{bin}/bcftools head -h 1 $infile"); + test_cmd($opts, %args, gen_head_output(5, 0, "five", $infile), + cmd => "$$opts{bin}/bcftools head -h 5 $infile"); + + my $nh = $args{in_nheaders}; # Test exactly the number of headers + test_cmd($opts, %args, gen_head_output($nh, 0, "exact", $infile), + cmd => "$$opts{bin}/bcftools head -h $nh $infile"); + $nh++; # Also test asking for one more line than there are headers + test_cmd($opts, %args, gen_head_output($nh, 0, "toomany", $infile), + cmd => "$$opts{bin}/bcftools head -h $nh $infile"); + + test_cmd($opts, %args, gen_head_output(1000, 0, "alln0", $infile), + cmd => "$$opts{bin}/bcftools head -n 0 $infile"); + test_cmd($opts, %args, gen_head_output(1000, 1, "onerec", $infile), + cmd => "$$opts{bin}/bcftools head -n 1 $infile"); + test_cmd($opts, %args, gen_head_output(1000, 5, "fiverecs", $infile), + cmd => "$$opts{bin}/bcftools head -n 5 $infile"); + test_cmd($opts, %args, gen_head_output(5, 5, "fiveboth", $infile), + cmd => "$$opts{bin}/bcftools head -h 5 -n 5 < $infile"); +} sub test_vcf_call { my ($opts,%args) = @_; @@ -1457,7 +1543,7 @@ sub test_vcf_plugin if ( !$$opts{test_plugins} ) { return; } $ENV{BCFTOOLS_PLUGINS} = "$$opts{bin}/plugins"; if ( !exists($args{args}) ) { $args{args} = ''; } - 
my $wpath = $$opts{path}; + my $wpath = $$opts{path}; if ($^O =~ /^msys/) { $wpath = `cygpath -w $$opts{path}`; $wpath =~ s/\r?\n//; @@ -1645,10 +1731,16 @@ sub test_mpileup my $ref = exists($args{ref}) ? $args{ref} : "mpileup.ref.fa"; $args{args} =~ s/{PATH}/$$opts{path}/g; - for my $fmt ('bam','cram') + for my $fmt ('sam','bam','cram') { + if ( $fmt eq 'sam' && ($args{args}=~/-r/ or $args{args}=~/-R/) ) { next; } my @files = (); - for my $file (@{$args{in}}) { push @files, "$$opts{path}/mpileup/$file.$fmt"; } + for my $file (@{$args{in}}) + { + if ( !-e "$$opts{path}/mpileup/$file.$fmt" ) { next; } + push @files, "$$opts{path}/mpileup/$file.$fmt"; + } + if ( !@files ) { next; } my $files = join(' ',@files); my $grep_hdr = "grep -v ^##bcftools | grep -v ^##reference"; test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools mpileup $args{args} -f $$opts{path}/mpileup/$ref $files | $grep_hdr"); diff --git a/test/view-a.1.out b/test/view-a.1.out new file mode 100644 index 000000000..7df37c5a5 --- /dev/null +++ b/test/view-a.1.out @@ -0,0 +1 @@ +chr1 36 . A C . . AC=2;AN=6 GT:GQ:MIN_DP:PL:DP:AD:VAF:DNM:VA 0/0:1:0:0,.,.:.:.:.:1:0 1/1:8:.:28,7,0:10:0,10:1:.:. 0/0:18:6:0,.,.:.:.:.:.:. diff --git a/test/view-a.vcf b/test/view-a.vcf new file mode 100644 index 000000000..5b4118674 --- /dev/null +++ b/test/view-a.vcf @@ -0,0 +1,14 @@ +##fileformat=VCFv4.2 +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##contig= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT EGAN00001406653 EGAN00001405940 EGAN00001406033 +chr1 36 . A <*>,C . . . GT:GQ:MIN_DP:PL:DP:AD:VAF:DNM:VA 0/0:1:0:0,0,0,.,.,.:.:.:.:1:0 2/2:8:.:28,990,990,7,990,0:10:0,0,10:0,1:.:. 0/0:18:6:0,18,179,.,.,.:.:.:.:.:. 
diff --git a/test/view.sites.1.out b/test/view.sites.1.out new file mode 100644 index 000000000..30a471653 --- /dev/null +++ b/test/view.sites.1.out @@ -0,0 +1,8 @@ +##fileformat=VCFv4.2 +##FILTER= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 10001 . G C 40 . . +1 10002 . T A 50 . . +1 10003 . A G 60 . . +1 10004 . A T 70 . . diff --git a/test/view.sites.txt b/test/view.sites.txt new file mode 100644 index 000000000..81af4e73b --- /dev/null +++ b/test/view.sites.txt @@ -0,0 +1,4 @@ +1 10001 G A +1 10002 T A +1 10003 G C +1 10004 A C diff --git a/test/view.sites.txt.gz b/test/view.sites.txt.gz new file mode 100644 index 000000000..338845aa0 Binary files /dev/null and b/test/view.sites.txt.gz differ diff --git a/test/view.sites.txt.gz.tbi b/test/view.sites.txt.gz.tbi new file mode 100644 index 000000000..f1d9b4635 Binary files /dev/null and b/test/view.sites.txt.gz.tbi differ diff --git a/test/view.sites.vcf b/test/view.sites.vcf new file mode 100644 index 000000000..30a471653 --- /dev/null +++ b/test/view.sites.vcf @@ -0,0 +1,8 @@ +##fileformat=VCFv4.2 +##FILTER= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 10001 . G C 40 . . +1 10002 . T A 50 . . +1 10003 . A G 60 . . +1 10004 . A T 70 . . diff --git a/tsv2vcf.c b/tsv2vcf.c index 2e1aa529f..596e75a0a 100644 --- a/tsv2vcf.c +++ b/tsv2vcf.c @@ -1,6 +1,6 @@ /* tsv2vcf.c -- convert from whitespace-separated fields to VCF - Copyright (C) 2014 Genome Research Ltd. + Copyright (C) 2014-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -119,4 +119,17 @@ int tsv_setter_id(tsv_t *tsv, bcf1_t *rec, void *usr) return 0; } +int tsv_setter_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr) +{ + bcf_hdr_t *hdr = (bcf_hdr_t*)usr; + char *sb = tsv->ss; + while ( *sb && !isspace(*sb) ) sb++; + if ( !*sb ) return -1; + char tmp = *sb; + *sb = ','; + bcf_update_alleles_str(hdr, rec, tsv->ss); + *sb = tmp; + return 0; +} + diff --git a/tsv2vcf.h b/tsv2vcf.h index 6fe5b4568..68757d459 100644 --- a/tsv2vcf.h +++ b/tsv2vcf.h @@ -1,6 +1,6 @@ /* tsv2vcf.h -- convert from whitespace-separated fields to VCF - Copyright (C) 2014 Genome Research Ltd. + Copyright (C) 2014-2021 Genome Research Ltd. Author: Petr Danecek @@ -80,6 +80,7 @@ static inline int tsv_next(tsv_t *tsv) int tsv_setter_chrom(tsv_t *tsv, bcf1_t *rec, void *usr); int tsv_setter_pos(tsv_t *tsv, bcf1_t *rec, void *usr); int tsv_setter_id(tsv_t *tsv, bcf1_t *rec, void *usr); +int tsv_setter_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr); // usr must point to bcf_hdr_t #endif diff --git a/vcfannotate.c b/vcfannotate.c index 14ee5deb3..42cab68ca 100644 --- a/vcfannotate.c +++ b/vcfannotate.c @@ -1,6 +1,6 @@ /* vcfannotate.c -- Annotate and edit VCF/BCF files. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2022 Genome Research Ltd. 
Author: Petr Danecek @@ -159,9 +159,13 @@ typedef struct _args_t char **argv, *output_fname, *targets_fname, *regions_list, *header_fname; char *remove_annots, *columns, *rename_chrs, *rename_annots, *sample_names, *mark_sites; + char **rename_annots_map; + char *min_overlap_str; + float min_overlap_ann, min_overlap_vcf; + int rename_annots_nmap; kstring_t merge_method_str; int argc, drop_header, record_cmd_line, tgts_is_vcf, mark_sites_logic, force, single_overlaps; - int columns_is_file, has_append_mode; + int columns_is_file, has_append_mode, pair_logic; } args_t; @@ -651,6 +655,7 @@ static int vcf_setter_alt(args_t *args, bcf1_t *line, annot_col_t *col, void *da { bcf1_t *rec = (bcf1_t*) data; int i; + if ( line->n_allele>1 && (col->replace & REPLACE_MISSING) ) return 0; if ( rec->n_allele==line->n_allele ) { for (i=1; in_allele; i++) if ( strcmp(rec->d.allele[i],line->d.allele[i]) ) break; @@ -2115,6 +2120,7 @@ static char *set_replace_mode(char *ss, int *replace) *replace = mode; return ss; } +static void rename_annots_push(args_t *args, char *src, char *dst); static void init_columns(args_t *args) { int need_sample_map = 0; @@ -2164,6 +2170,7 @@ static void init_columns(args_t *args) int icol = -1, has_fmt_str = 0; while ( *ss ) { + char *ptr; if ( *se && *se!=',' ) { se++; continue; } int replace; ss = set_replace_mode(ss, &replace); @@ -2198,6 +2205,8 @@ static void init_columns(args_t *args) col->setter = vcf_setter_alt; col->hdr_key_src = strdup(str.s); col->hdr_key_dst = strdup(str.s); + col->replace = replace; + if ( args->pair_logic==-1 ) bcf_sr_set_opt(args->files,BCF_SR_PAIR_LOGIC,BCF_SR_PAIR_BOTH_REF); } else args->alt_idx = icol; } @@ -2262,6 +2271,12 @@ static void init_columns(args_t *args) if ( bcf_hdr_id2type(args->tgts_hdr,BCF_HL_INFO,hdr_id)!=BCF_HT_STR ) error("Only Type=String tags can be used to annotate the ID column\n"); } + else if ( (ptr=strstr(str.s,":=")) && !args->targets_fname ) + { + *ptr = 0; + 
rename_annots_push(args,ptr+2,str.s); + *ptr = ':'; + } else if ( !strcasecmp("FILTER",str.s) ) { if ( replace & REPLACE_NON_MISSING ) error("Apologies, the -FILTER feature has not been implemented yet.\n"); @@ -2536,7 +2551,7 @@ static void init_columns(args_t *args) hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); } else - error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_src, args->targets_fname); + error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_dst, args->targets_fname); assert( bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) ); } if ( args->tgts_is_vcf ) @@ -2686,41 +2701,58 @@ static void rename_chrs(args_t *args, char *fname) for (i=0; ihdr_out, BCF_DT_ID, ori_tag); + if ( id<0 ) return 1; + bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, type, "ID", ori_tag, NULL); + if ( !hrec ) return 1; // the ID attribute not present + int j = bcf_hrec_find_key(hrec, "ID"); + assert( j>=0 ); + free(hrec->vals[j]); + char *ptr = new_tag; + while ( *ptr && !isspace(*ptr) ) ptr++; + *ptr = 0; + hrec->vals[j] = strdup(new_tag); + args->hdr_out->id[BCF_DT_ID][id].key = hrec->vals[j]; + return 0; +} +static void rename_annots(args_t *args) { - int n, i; - char **map = hts_readlist(fname, 1, &n); - if ( !map ) error("Could not read: %s\n", fname); - for (i=0; irename_annots ) { - char *sb = NULL, *ss = map[i]; - while ( *ss && !isspace(*ss) ) ss++; - if ( !*ss ) error("Could not parse: %s\n", fname); - *ss = 0; - int type; - if ( !strncasecmp("info/",map[i],5) ) type = BCF_HL_INFO, sb = map[i] + 5; - else if ( !strncasecmp("format/",map[i],7) ) type = BCF_HL_FMT, sb = map[i] + 7; - else if ( !strncasecmp("fmt/",map[i],4) ) type = BCF_HL_FMT, sb = map[i] + 4; - else if ( !strncasecmp("filter/",map[i],7) ) type = BCF_HL_FLT, sb = map[i] + 7; - else error("Could not parse \"%s\", expected INFO, FORMAT, or FILTER prefix for each line: %s\n",map[i],fname); - int id = bcf_hdr_id2int(args->hdr_out, 
BCF_DT_ID, sb); - if ( id<0 ) continue; - bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, type, "ID", sb, NULL); - if ( !hrec ) continue; // the sequence not present - int j = bcf_hrec_find_key(hrec, "ID"); - assert( j>=0 ); - free(hrec->vals[j]); - ss++; - while ( *ss && isspace(*ss) ) ss++; - char *se = ss; - while ( *se && !isspace(*se) ) se++; - *se = 0; - hrec->vals[j] = strdup(ss); - args->hdr_out->id[BCF_DT_ID][id].key = hrec->vals[j]; + args->rename_annots_map = hts_readlist(args->rename_annots, 1, &args->rename_annots_nmap); + if ( !args->rename_annots_map ) error("Could not read: %s\n", args->rename_annots); } - for (i=0; irename_annots_nmap; i++) + { + char *ptr = args->rename_annots_map[i]; + while ( *ptr && !isspace(*ptr) ) ptr++; + if ( !*ptr ) error("Could not parse: %s\n", args->rename_annots_map[i]); + char *rmme = ptr; + *ptr = 0; + ptr++; + while ( *ptr && isspace(*ptr) ) ptr++; + if ( !*ptr ) { *rmme = ' '; error("Could not parse: %s\n", args->rename_annots_map[i]); } + if ( rename_annots_core(args, args->rename_annots_map[i], ptr) < 0 ) + error("Could not parse \"%s %s\", expected INFO, FORMAT, or FILTER prefix\n",args->rename_annots_map[i],ptr); + } +} +static void rename_annots_push(args_t *args, char *src, char *dst) +{ + args->rename_annots_nmap++; + args->rename_annots_map = (char**)realloc(args->rename_annots_map,sizeof(*args->rename_annots_map)*args->rename_annots_nmap); + kstring_t str = {0,0,0}; + ksprintf(&str,"%s %s",src,dst); + args->rename_annots_map[ args->rename_annots_nmap - 1 ] = str.s; } static void init_data(args_t *args) @@ -2769,6 +2801,22 @@ static void init_data(args_t *args) args->nalines++; hts_expand0(annot_line_t,args->nalines,args->malines,args->alines); } + if ( args->min_overlap_str ) + { + char *tmp = args->min_overlap_str; + if ( args->min_overlap_str[0] != ':' ) + { + args->min_overlap_ann = strtod(args->min_overlap_str,&tmp); + if ( args->min_overlap_ann < 0 || args->min_overlap_ann > 1 || (*tmp && 
*tmp!=':') ) + error("Could not parse \"--min-overlap %s\", expected value(s) between 0-1\n", args->min_overlap_str); + } + if ( *tmp && *tmp==':' ) + { + args->min_overlap_vcf = strtod(tmp+1,&tmp); + if ( args->min_overlap_vcf < 0 || args->min_overlap_vcf > 1 || *tmp ) + error("Could not parse \"--min-overlap %s\", expected value(s) between 0-1\n", args->min_overlap_str); + } + } } init_merge_method(args); args->vcmp = vcmp_init(); @@ -2787,7 +2835,7 @@ static void init_data(args_t *args) if ( !args->drop_header ) { if ( args->rename_chrs ) rename_chrs(args, args->rename_chrs); - if ( args->rename_annots ) rename_annots(args, args->rename_annots); + if ( args->rename_annots || args->rename_annots_map ) rename_annots(args); char wmode[8]; set_wmode(wmode,args->output_type,args->output_fname,args->clevel); @@ -2835,6 +2883,11 @@ static void destroy_data(args_t *args) regidx_destroy(args->tgt_idx); regitr_destroy(args->tgt_itr); } + if ( args->rename_annots_map ) + { + for (i=0; irename_annots_nmap; i++) free(args->rename_annots_map[i]); + free(args->rename_annots_map); + } if ( args->tgts ) bcf_sr_regions_destroy(args->tgts); free(args->tmpks.s); free(args->tmpi); @@ -2988,6 +3041,15 @@ static void annotate(args_t *args, bcf1_t *line) tmp->rid = line->rid; tmp->start = args->tgt_itr->beg; tmp->end = args->tgt_itr->end; + + // Check min overlap + int len_ann = tmp->end - tmp->start + 1; + int len_vcf = line->rlen; + int isec = (tmp->end < line->pos+line->rlen-1 ? tmp->end : line->pos+line->rlen-1) - (tmp->start > line->pos ? 
tmp->start : line->pos) + 1; + assert( isec > 0 ); + if ( args->min_overlap_ann && args->min_overlap_ann > (float)isec/len_ann ) continue; + if ( args->min_overlap_vcf && args->min_overlap_vcf > (float)isec/len_vcf ) continue; + parse_annot_line(args, regitr_payload(args->tgt_itr,char*), tmp); for (j=0; jncols; j++) { @@ -3186,11 +3248,10 @@ static void usage(args_t *args) { fprintf(stderr, "\n"); fprintf(stderr, "About: Annotate and edit VCF/BCF files.\n"); - fprintf(stderr, "Usage: bcftools annotate [options] \n"); + fprintf(stderr, "Usage: bcftools annotate [options] VCF\n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " -a, --annotations FILE VCF file or tabix-indexed FILE with annotations: CHR\\tPOS[\\tVALUE]+\n"); - fprintf(stderr, " --collapse STR Matching records by , see man page for details [some]\n"); fprintf(stderr, " -c, --columns LIST List of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n"); fprintf(stderr, " -C, --columns-file FILE Read -c columns from FILE, one name per row, with optional --merge-logic TYPE: NAME[ TYPE]\n"); fprintf(stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n"); @@ -3201,9 +3262,11 @@ static void usage(args_t *args) fprintf(stderr, " -k, --keep-sites Leave -i/-e sites unchanged instead of discarding them\n"); fprintf(stderr, " -l, --merge-logic TAG:TYPE Merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n"); fprintf(stderr, " -m, --mark-sites [+-]TAG Add INFO/TAG flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n"); + fprintf(stderr, " --min-overlap ANN:VCF Required overlap as a fraction of variant in the -a file (ANN), the VCF (:VCF), or reciprocal (ANN:VCF)\n"); fprintf(stderr, " --no-version Do not append version and command line to the header\n"); fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n"); 
fprintf(stderr, " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"); + fprintf(stderr, " --pair-logic STR Matching records by , see man page for details [some]\n"); fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in FILE\n"); fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); @@ -3235,7 +3298,8 @@ int main_vcfannotate(int argc, char *argv[]) args->set_ids_replace = 1; args->match_id = -1; args->clevel = -1; - int regions_is_file = 0, collapse = 0; + args->pair_logic = -1; + int regions_is_file = 0; int regions_overlap = 1; static struct option loptions[] = @@ -3249,6 +3313,7 @@ int main_vcfannotate(int argc, char *argv[]) {"annotations",required_argument,NULL,'a'}, {"merge-logic",required_argument,NULL,'l'}, {"collapse",required_argument,NULL,2}, + {"pair-logic",required_argument,NULL,2}, {"include",required_argument,NULL,'i'}, {"exclude",required_argument,NULL,'e'}, {"regions",required_argument,NULL,'r'}, @@ -3263,6 +3328,7 @@ int main_vcfannotate(int argc, char *argv[]) {"samples",required_argument,NULL,'s'}, {"samples-file",required_argument,NULL,'S'}, {"single-overlaps",no_argument,NULL,10}, + {"min-overlap",required_argument,NULL,12}, {"no-version",no_argument,NULL,8}, {"force",no_argument,NULL,'f'}, {NULL,0,NULL,0} @@ -3320,25 +3386,25 @@ int main_vcfannotate(int argc, char *argv[]) case 'h': args->header_fname = optarg; break; case 1 : args->rename_chrs = optarg; break; case 2 : - if ( !strcmp(optarg,"snps") ) collapse |= COLLAPSE_SNPS; - else if ( !strcmp(optarg,"indels") ) collapse |= COLLAPSE_INDELS; - else if ( !strcmp(optarg,"both") ) collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS; - else if ( !strcmp(optarg,"any") ) collapse |= COLLAPSE_ANY; - else if ( !strcmp(optarg,"all") ) collapse |= COLLAPSE_ANY; - else if ( 
!strcmp(optarg,"some") ) collapse |= COLLAPSE_SOME; - else if ( !strcmp(optarg,"none") ) collapse = COLLAPSE_NONE; - else error("The --collapse string \"%s\" not recognised.\n", optarg); + if ( !strcmp(optarg,"snps") ) args->pair_logic |= BCF_SR_PAIR_SNP_REF; + else if ( !strcmp(optarg,"indels") ) args->pair_logic |= BCF_SR_PAIR_INDEL_REF; + else if ( !strcmp(optarg,"both") ) args->pair_logic |= BCF_SR_PAIR_BOTH_REF; + else if ( !strcmp(optarg,"any") ) args->pair_logic |= BCF_SR_PAIR_ANY; + else if ( !strcmp(optarg,"all") ) args->pair_logic |= BCF_SR_PAIR_ANY; + else if ( !strcmp(optarg,"some") ) args->pair_logic |= BCF_SR_PAIR_SOME; + else if ( !strcmp(optarg,"none") ) args->pair_logic = BCF_SR_PAIR_EXACT; + else if ( !strcmp(optarg,"exact") ) args->pair_logic = BCF_SR_PAIR_EXACT; + else error("The --pair-logic string \"%s\" not recognised.\n", optarg); break; case 3 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case 10 : args->single_overlaps = 1; break; case 11 : args->rename_annots = optarg; break; + case 12 : args->min_overlap_str = optarg; break; case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); } @@ -3369,9 +3435,11 @@ int main_vcfannotate(int argc, char *argv[]) { args->tgts_is_vcf = 1; args->files->require_index = 1; - args->files->collapse = collapse ? collapse : COLLAPSE_SOME; + bcf_sr_set_opt(args->files,BCF_SR_PAIR_LOGIC,args->pair_logic>=0 ? 
args->pair_logic : BCF_SR_PAIR_SOME); + if ( args->min_overlap_str ) error("The --min-overlap option cannot be used when annotating from a VCF\n"); } } + if ( args->min_overlap_str && args->single_overlaps ) error("The options --single-overlaps and --min-overlap cannot be combined\n"); if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n"); if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to read from %s: %s\n", !strcmp("-",fname)?"standard input":fname,bcf_sr_strerror(args->files->errnum)); diff --git a/vcfcall.c b/vcfcall.c index ca2a89908..0418d8e0f 100644 --- a/vcfcall.c +++ b/vcfcall.c @@ -1071,10 +1071,8 @@ int main_vcfcall(int argc, char *argv[]) case 9 : args.n_threads = strtol(optarg, 0, 0); break; case 8 : args.record_cmd_line = 0; break; case 4 : - if ( !strcasecmp(optarg,"0") ) args.regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args.regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args.regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + args.regions_overlap = parse_overlap_option(optarg); + if ( args.regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; default: usage(&args); } diff --git a/vcfcnv.c b/vcfcnv.c index 02f56b97f..0302261d5 100644 --- a/vcfcnv.c +++ b/vcfcnv.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2014-2021 Genome Research Ltd. + Copyright (c) 2014-2022 Genome Research Ltd. Author: Petr Danecek @@ -1131,8 +1131,6 @@ static int parse_lrr_baf(sample_t *smpl, bcf_fmt_t *baf_fmt, bcf_fmt_t *lrr_fmt, return *baf<0 ? 
0 : 1; } -int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq); - static void cnv_next_line(args_t *args, bcf1_t *line) { if ( !line ) @@ -1381,16 +1379,12 @@ int main_vcfcnv(int argc, char *argv[]) case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 3 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 4 : - if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + targets_overlap = parse_overlap_option(optarg); + if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 'h': case '?': usage(args); break; diff --git a/vcfconcat.c b/vcfconcat.c index 50013a1fe..0246b59f5 100644 --- a/vcfconcat.c +++ b/vcfconcat.c @@ -1014,10 +1014,8 @@ int main_vcfconcat(int argc, char *argv[]) case 8 : args->record_cmd_line = 0; break; case 7 : args->naive_concat = 1; args->naive_concat_trust_headers = 1; break; case 12 : - if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + args->regions_overlap = parse_overlap_option(optarg); + if ( args->regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 'v': args->verbose = strtol(optarg, 0, 0); diff --git a/vcfconvert.c b/vcfconvert.c index c0fddac51..4a5d7bab1 100644 --- a/vcfconvert.c 
+++ b/vcfconvert.c @@ -64,7 +64,7 @@ struct _args_t kstring_t str; int32_t *gts; float *flt; - int rev_als, output_vcf_ids, hap2dip, output_chrom_first_col; + int rev_als, output_vcf_ids, hap2dip, gen_3N6; int nsamples, *samples, sample_is_file, targets_is_file, regions_is_file, output_type; int regions_overlap, targets_overlap; char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns; @@ -139,54 +139,83 @@ static void open_vcf(args_t *args, const char *format_str) free(samples); } -static int tsv_setter_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr) +// Try to set CHROM:POS_REF_ALT[_END]. Return 0 on success, -1 on error +static int _set_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr) { args_t *args = (args_t*) usr; char tmp, *se = tsv->ss, *ss = tsv->ss; while ( se < tsv->se && *se!=':' ) se++; - if ( *se!=':' ) error("Could not parse CHROM in CHROM:POS_REF_ALT id: %s\n", tsv->ss); + if ( *se!=':' ) return -1; tmp = *se; *se = 0; - rec->rid = bcf_hdr_name2id(args->header,ss); - if ( rec->rid<0 ) error("Could not determine sequence name or multiple sequences present: %s\n", tsv->ss); + int rid = bcf_hdr_name2id(args->header,ss); *se = tmp; + if ( rid<0 ) return -1; // POS - rec->pos = strtol(se+1,&ss,10); - if ( ss==se+1 ) error("Could not parse POS in CHROM:POS_REF_ALT: %s\n", tsv->ss); - rec->pos--; - - // ID - if ( args->output_vcf_ids ) - { - char tmp = *tsv->se; - *tsv->se = 0; - bcf_update_id(args->header, rec, tsv->ss); - *tsv->se = tmp; - } + hts_pos_t pos = strtol(se+1,&ss,10); + if ( ss==se+1 ) return -1; + pos--; // REF,ALT args->str.l = 0; se = ++ss; while ( se < tsv->se && *se!='_' ) se++; - if ( *se!='_' ) error("Could not parse REF in CHROM:POS_REF_ALT id: %s\n", tsv->ss); + if ( *se!='_' ) return -1; kputsn(ss,se-ss,&args->str); ss = ++se; while ( se < tsv->se && *se!='_' && isspace(*tsv->se) ) se++; - if ( se < tsv->se && *se!='_' && isspace(*tsv->se) ) error("Could not parse ALT in CHROM:POS_REF_ALT id: %s\n", 
tsv->ss); + if ( se < tsv->se && *se!='_' && isspace(*tsv->se) ) return -1; kputc(',',&args->str); kputsn(ss,se-ss,&args->str); - bcf_update_alleles_str(args->header, rec, args->str.s); // END - optional - if (*se && *se=='_') { + if (*se && *se=='_') + { long end = strtol(se+1,&ss,10); - if ( ss==se+1 ) error("Could not parse END in CHROM:POS_REF_ALT_END: %s\n", tsv->ss); + if ( ss==se+1 ) return -1; bcf_update_info_int32(args->header, rec, "END", &end, 1); } + rec->rid = rid; + rec->pos = pos; + bcf_update_alleles_str(args->header, rec, args->str.s); + return 0; } +static int tsv_setter_chrom_pos_ref_alt_or_chrom(tsv_t *tsv, bcf1_t *rec, void *usr) +{ + args_t *args = (args_t*)usr; + int ret = _set_chrom_pos_ref_alt(tsv,rec,usr); + if ( !ret ) return ret; + return tsv_setter_chrom(tsv,rec,args->header); +} +static int tsv_setter_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr) +{ + int ret = _set_chrom_pos_ref_alt(tsv,rec,usr); + if ( ret!=0 ) error("Could not parse the CHROM:POS_REF_ALT[_END] string: %s\n", tsv->ss); + return ret; +} +// This function must be called first, then tsv_setter_chrom_pos_ref_alt_id_or_die. +// One of them is expected to find the CHROM:POS_REF_ALT[_END] string, if not, die. 
+static int tsv_setter_chrom_pos_ref_alt_or_id(tsv_t *tsv, bcf1_t *rec, void *usr) +{ + args_t *args = (args_t*)usr; + if ( _set_chrom_pos_ref_alt(tsv,rec,usr)==0 ) return 0; + rec->pos = -1; // mark the record as unset + if ( !args->output_vcf_ids) return 0; + return tsv_setter_id(tsv,rec,usr); +} +static int tsv_setter_chrom_pos_ref_alt_id_or_die(tsv_t *tsv, bcf1_t *rec, void *usr) +{ + args_t *args = (args_t*)usr; + if ( rec->pos!=-1 ) + { + if ( !args->output_vcf_ids ) return 0; + return tsv_setter_id(tsv,rec,usr); + } + return tsv_setter_chrom_pos_ref_alt(tsv,rec,usr); +} static int tsv_setter_verify_pos(tsv_t *tsv, bcf1_t *rec, void *usr) { char *se; @@ -334,7 +363,8 @@ static void gensample_to_vcf(args_t *args) * * Second column is expected in the form of CHROM:POS_REF_ALT. We use second * column because the first can be empty ("--") when filling sites from reference - * panel. + * panel. When the option --vcf-ids is given, the first column is used to set the + * VCF ID. * * Output: VCF with filled GT,GP * @@ -362,22 +392,29 @@ static void gensample_to_vcf(args_t *args) if ( !gen_fh ) error("Could not read: %s\n", gen_fname); if ( hts_getline(gen_fh, KS_SEP_LINE, &line) <= 0 ) error("Empty file: %s\n", gen_fname); - // Find out the chromosome name, sample names, init and print the VCF header + // Find out the chromosome name, depending on the format variant (--3N6 or plain) and the ordering + // of the columns (CHROM:POS_REF_ALT comes first or second) args->str.l = 0; - char *ss, *se = line.s; + char *sb = line.s, *se = line.s; while ( *se && !isspace(*se) ) se++; - if ( !*se ) error("Could not parse %s: %s\n", gen_fname,line.s); - ss = se+1; - se = strchr(ss,':'); - if ( !se ) error("Expected CHROM:POS_REF_ALT in second column of %s\n", gen_fname); - kputsn(ss, se-ss, &args->str); - - tsv_t *tsv = tsv_init("-,CHROM_POS_REF_ALT,POS,REF_ALT,GT_GP"); - tsv_register(tsv, "CHROM_POS_REF_ALT", tsv_setter_chrom_pos_ref_alt, args); - tsv_register(tsv, "POS", 
tsv_setter_verify_pos, NULL); - tsv_register(tsv, "REF_ALT", tsv_setter_verify_ref_alt, args); - tsv_register(tsv, "GT_GP", tsv_setter_gt_gp, args); + if ( !*se ) error("Could not determine CHROM in %s: %s\n", gen_fname,line.s); + if ( args->gen_3N6 ) // first column, just CHROM + kputsn(sb, se-sb, &args->str); + else // first or second column, part of CHROM:POS_REF_ALT + { + char *sc = strchr(sb,':'); + if ( !sc || sc > se ) + { + while ( *se && !isspace(*se) ) se++; + if ( !*se ) error("Could not determine CHROM in %s: %s\n", gen_fname,line.s); + sb = ++se; + sc = strchr(sb,':'); + if ( !sc ) error("Could not determine CHROM in %s: %s\n", gen_fname,line.s); + } + kputsn(sb, sc-sb, &args->str); + } + // Initialize and print the VCF header, args->str.s contains the chr name args->header = bcf_hdr_init("w"); bcf_hdr_append(args->header, "##INFO="); bcf_hdr_append(args->header, "##FORMAT="); @@ -385,6 +422,21 @@ static void gensample_to_vcf(args_t *args) bcf_hdr_printf(args->header, "##contig=", args->str.s,0x7fffffff); // MAX_CSI_COOR if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert"); + tsv_t *tsv; + if ( args->gen_3N6 ) + { + tsv = tsv_init("CHROM,CHROM_POS_REF_ALT,ID,POS,REF_ALT,GT_GP"); + tsv_register(tsv, "CHROM", tsv_setter_chrom, args); + } + else + tsv = tsv_init("CHROM_POS_REF_ALT,ID,POS,REF_ALT,GT_GP"); + tsv_register(tsv, "CHROM_POS_REF_ALT", tsv_setter_chrom_pos_ref_alt_or_id, args); + tsv_register(tsv, "ID", tsv_setter_chrom_pos_ref_alt_id_or_die, args); + tsv_register(tsv, "POS", tsv_setter_verify_pos, NULL); + tsv_register(tsv, "REF_ALT", tsv_setter_verify_ref_alt, args); + tsv_register(tsv, "GT_GP", tsv_setter_gt_gp, args); + + // Find out sample names int i, nsamples; char **samples = hts_readlist(sample_fname, 1, &nsamples); if ( !samples ) error("Could not read %s\n", sample_fname); @@ -456,6 +508,11 @@ static void haplegendsample_to_vcf(args_t *args) */ kstring_t line = {0,0,0}; + if ( 
args->output_vcf_ids ) + error( + "The option --haplegendsample2vcf cannot be combined with --vcf-ids. This is because the\n" + "ID column must be formatted as \"CHROM:POS_REF_ALT\" to check sanity of the operation\n"); + char *hap_fname = NULL, *leg_fname = NULL, *sample_fname = NULL; sample_fname = strchr(args->infname,','); if ( !sample_fname ) @@ -500,7 +557,6 @@ static void haplegendsample_to_vcf(args_t *args) tsv_register(leg_tsv, "CHROM_POS_REF_ALT", tsv_setter_chrom_pos_ref_alt, args); tsv_register(leg_tsv, "POS", tsv_setter_verify_pos, NULL); tsv_register(leg_tsv, "REF_ALT", tsv_setter_verify_ref_alt, args); - tsv_t *hap_tsv = tsv_init("HAPS"); tsv_register(hap_tsv, "HAPS", tsv_setter_haps, args); @@ -582,7 +638,8 @@ static void hapsample_to_vcf(args_t *args) /* * Input: SHAPEIT output * - * 20:19995888_A_G 20:19995888 19995888 A G 0 0 0 0 ... + * 20:19995888_A_G rsid1 19995888 A G 0 0 0 0 ... + * 20 20:19995888_A_G 19995888 A G 0 0 0 0 ... * * First column is expected in the form of CHROM:POS_REF_ALT * @@ -612,24 +669,49 @@ static void hapsample_to_vcf(args_t *args) if ( !hap_fh ) error("Could not read: %s\n", hap_fname); if ( hts_getline(hap_fh, KS_SEP_LINE, &line) <= 0 ) error("Empty file: %s\n", hap_fname); - // Find out the chromosome name, sample names, init and print the VCF header + // Find out the chromosome name, it can be either in the first or second column args->str.l = 0; - char *se = strchr(line.s,':'); - if ( !se ) error("Expected CHROM:POS_REF_ALT in first column of %s\n", hap_fname); - kputsn(line.s, se-line.s, &args->str); - - tsv_t *tsv = tsv_init("CHROM_POS_REF_ALT,-,POS,REF_ALT,HAPS"); - tsv_register(tsv, "CHROM_POS_REF_ALT", tsv_setter_chrom_pos_ref_alt, args); - tsv_register(tsv, "POS", tsv_setter_verify_pos, NULL); - tsv_register(tsv, "REF_ALT", tsv_setter_verify_ref_alt, args); - tsv_register(tsv, "HAPS", tsv_setter_haps, args); + char *sb = line.s, *se = line.s; + while ( *se && !isspace(*se) ) se++; + if ( !*se ) error("Could 
not determine CHROM in %s: %s\n", hap_fname,line.s); + if ( !args->output_vcf_ids ) + { + // first column should be just CHROM, but the second must be CHROM:POS_REF_ALT, use that + sb = ++se; + while ( *se && !isspace(*se) ) se++; + if ( !*se ) error("Could not determine CHROM in %s: %s\n", hap_fname,line.s); + if ( !strchr(sb,':') ) + error("Could not determine CHROM in the second column of %s: %s\n", hap_fname,line.s); + } + // Parse CHROM:POS_REF_ALT + char *sc = strchr(sb,':'); + if ( !sc || sc > se ) + error("Could not determine CHROM in %s: %s\n", hap_fname,line.s); + kputsn(sb, sc-sb, &args->str); + // Initialize and print the VCF header, args->str.s contains the chr name args->header = bcf_hdr_init("w"); bcf_hdr_append(args->header, "##INFO="); bcf_hdr_append(args->header, "##FORMAT="); bcf_hdr_printf(args->header, "##contig=", args->str.s,0x7fffffff); // MAX_CSI_COOR if (args->record_cmd_line) bcf_hdr_append_version(args->header, args->argc, args->argv, "bcftools_convert"); + tsv_t *tsv; + if ( args->output_vcf_ids ) + { + tsv = tsv_init("CHROM_POS_REF_ALT,ID,POS,REF_ALT,HAPS"); + tsv_register(tsv, "ID", tsv_setter_id, args); + } + else + { + tsv = tsv_init("CHROM,CHROM_POS_REF_ALT,POS,REF_ALT,HAPS"); + tsv_register(tsv, "CHROM", tsv_setter_chrom_pos_ref_alt_or_chrom, args); + } + tsv_register(tsv, "CHROM_POS_REF_ALT", tsv_setter_chrom_pos_ref_alt, args); + tsv_register(tsv, "POS", tsv_setter_verify_pos, NULL); + tsv_register(tsv, "REF_ALT", tsv_setter_verify_ref_alt, args); + tsv_register(tsv, "HAPS", tsv_setter_haps, args); + int i, nsamples; char **samples = hts_readlist(sample_fname, 1, &nsamples); if ( !samples ) error("Could not read %s\n", sample_fname); @@ -712,13 +794,13 @@ static void vcf_to_gensample(args_t *args) kstring_t str = {0,0,0}; // insert chrom as first column if needed - if(args->output_chrom_first_col) + if ( args->gen_3N6 ) kputs("%CHROM ", &str); - else - kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT ", &str); + + 
kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT ", &str); // insert rsid as second column if needed - if(args->output_vcf_ids) + if ( args->output_vcf_ids ) kputs("%ID ", &str); else kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT ", &str); @@ -993,9 +1075,9 @@ static void vcf_to_hapsample(args_t *args) // print ID instead of CHROM:POS_REF_ALT1 if ( args->output_vcf_ids ) - kputs("%CHROM %ID %POS %REF %FIRST_ALT ", &str); + kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT %ID %POS %REF %FIRST_ALT ", &str); else - kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str); + kputs("%CHROM %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str); if ( args->hap2dip ) kputs("%_GT_TO_HAP2\n", &str); @@ -1419,6 +1501,7 @@ static void usage(void) fprintf(stderr, "GEN/SAMPLE conversion (input/output from IMPUTE2):\n"); fprintf(stderr, " -G, --gensample2vcf ... |,\n"); fprintf(stderr, " -g, --gensample ... |,\n"); + fprintf(stderr, " --3N6 Use 3*N+6 column format instead of the old 3*N+5 column format\n"); fprintf(stderr, " --tag STRING Tag to take values for .gen file: GT,PL,GL,GP [GT]\n"); fprintf(stderr, " --chrom Output chromosome in first column instead of CHROM:POS_REF_ALT\n"); fprintf(stderr, " --keep-duplicates Keep duplicate positions\n"); @@ -1493,7 +1576,8 @@ int main_vcfconvert(int argc, char *argv[]) {"gensample",required_argument,NULL,'g'}, {"gensample2vcf",required_argument,NULL,'G'}, {"tag",required_argument,NULL,1}, - {"chrom",no_argument,NULL,8}, + {"chrom",no_argument,NULL,8}, + {"3N6",no_argument,NULL,15}, {"tsv2vcf",required_argument,NULL,2}, {"hapsample",required_argument,NULL,7}, {"hapsample2vcf",required_argument,NULL,3}, @@ -1532,7 +1616,8 @@ int main_vcfconvert(int argc, char *argv[]) case 5 : args->hap2dip = 1; break; case 6 : args->convert_func = gvcf_to_vcf; break; case 7 : args->convert_func = vcf_to_hapsample; args->outfname = optarg; break; - case 8 : args->output_chrom_first_col = 1; break; + case 8 : error("The --chrom 
option has been deprecated, please use --3N6 instead\n"); break; + case 15 : args->gen_3N6 = 1; break; case 'H': args->convert_func = haplegendsample_to_vcf; args->infname = optarg; break; case 'f': args->ref_fname = optarg; break; case 'c': args->columns = optarg; break; @@ -1561,16 +1646,12 @@ int main_vcfconvert(int argc, char *argv[]) case 11 : args->sex_fname = optarg; break; case 12 : args->keep_duplicates = 1; break; case 13 : - if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + args->regions_overlap = parse_overlap_option(optarg); + if ( args->regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 14 : - if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + args->targets_overlap = parse_overlap_option(optarg); + if ( args->targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); diff --git a/vcffilter.c b/vcffilter.c index 06b8d19e9..68d867247 100644 --- a/vcffilter.c +++ b/vcffilter.c @@ -1,6 +1,6 @@ /* vcffilter.c -- Apply fixed-threshold filters. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2022 Genome Research Ltd. Author: Petr Danecek @@ -39,6 +39,7 @@ THE SOFTWARE. */ #include "bcftools.h" #include "filter.h" #include "rbuf.h" +#include "regidx.h" // Logic of the filters: include or exclude sites which match the filters? 
#define FLT_INCLUDE 1 @@ -73,8 +74,9 @@ typedef struct _args_t htsFile *out_fh; int output_type, n_threads, clevel; - char **argv, *output_fname, *targets_list, *regions_list; - int argc, record_cmd_line; + char **argv, *output_fname, *targets_list, *regions_list, *mask_list; + int argc, record_cmd_line, mask_is_file, mask_overlap, mask_negate; + regidx_t *mask; } args_t; @@ -86,11 +88,30 @@ static void init_data(args_t *args) if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); + if ( args->mask_list ) + { + if ( args->mask_list[0]=='^' ) args->mask_negate = 1; + if ( args->mask_is_file ) + args->mask = regidx_init(args->mask_negate?args->mask_list+1:args->mask_list,NULL,NULL,0,NULL); + else + { + char *rmme = strdup(args->mask_negate?args->mask_list+1:args->mask_list), *tmp = rmme; + while ( *tmp ) + { + if ( *tmp==',' ) *tmp = '\n'; + tmp++; + } + args->mask = regidx_init_string(rmme, regidx_parse_reg, NULL, 0, NULL); + free(rmme); + } + if ( !args->mask ) + error("Could not initialize the mask: %s\n",args->mask_list); + } + args->hdr = args->files->readers[0].header; args->flt_pass = bcf_hdr_id2int(args->hdr,BCF_DT_ID,"PASS"); assert( !args->flt_pass ); // sanity check: required by BCF spec - // -i or -e: append FILTER line - if ( args->soft_filter && args->filter_logic ) + if ( args->soft_filter && (args->filter_logic || args->mask_list) ) { kstring_t flt_name = {0,0,0}; if ( strcmp(args->soft_filter,"+") ) @@ -106,18 +127,28 @@ static void init_data(args_t *args) } while ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FLT,id) ); } - // escape quotes + kstring_t tmp = {0,0,0}; - char *t = args->filter_str; - while ( *t ) + if ( args->filter_logic ) { - if ( *t=='"' ) kputc('\\',&tmp); - kputc(*t,&tmp); - t++; + // -i or -e: append FILTER line + ksprintf(&tmp,"Set if %s: ",args->filter_logic & FLT_INCLUDE ? 
"not true" : "true"); + + // escape quotes + char *t = args->filter_str; + while ( *t ) + { + if ( *t=='"' ) kputc('\\',&tmp); + kputc(*t,&tmp); + t++; + } } - int ret = bcf_hdr_printf(args->hdr, "##FILTER=", flt_name.s,args->filter_logic & FLT_INCLUDE ? "not true" : "true", tmp.s); + else if ( args->mask_list ) + ksprintf(&tmp,"Record masked by region"); + + int ret = bcf_hdr_printf(args->hdr, "##FILTER=", flt_name.s,tmp.s); if ( ret!=0 ) - error("Failed to append header line: ##FILTER=\n", flt_name.s,args->filter_logic & FLT_INCLUDE ? "not true" : "true", tmp.s); + error("Failed to append header line: ##FILTER=\n", flt_name.s,tmp.s); args->flt_fail = bcf_hdr_id2int(args->hdr,BCF_DT_ID,flt_name.s); assert( args->flt_fail>=0 ); free(flt_name.s); free(tmp.s); @@ -174,6 +205,7 @@ static void destroy_data(args_t *args) filter_destroy(args->filter); free(args->tmpi); free(args->tmp_ac); + if ( args->mask ) regidx_destroy(args->mask); } static void flush_buffer(args_t *args, int n) @@ -403,6 +435,35 @@ static void set_genotypes(args_t *args, bcf1_t *line, int pass_site) if ( has_ac ) bcf_update_info_int32(args->hdr,line,"AC",args->tmp_ac,line->n_allele-1); } +static void _set_variant_boundaries(bcf1_t *rec, hts_pos_t *beg, hts_pos_t *end) +{ + hts_pos_t off; + if ( rec->n_allele ) + { + off = rec->rlen; + bcf_unpack(rec, BCF_UN_STR); + int i; + for (i=1; in_allele; i++) + { + // Make symbolic alleles start at POS, although this is not strictly true for + // , where POS should be the position BEFORE the deletion/insertion. + // However, since arbitrary symbolic alleles can be defined by the user, we + // will simplify the interpretation of --targets-overlap and --region-overlap. 
+ int j = 0; + char *ref = rec->d.allele[0]; + char *alt = rec->d.allele[i]; + while ( ref[j] && alt[j] && ref[j]==alt[j] ) j++; + if ( off > j ) off = j; + if ( !off ) break; + } + } + else + off = 0; + + *beg = rec->pos + off; + *end = rec->pos + rec->rlen - 1; +} + static void usage(args_t *args) { fprintf(stderr, "\n"); @@ -414,6 +475,9 @@ static void usage(args_t *args) fprintf(stderr, " -g, --SnpGap INT[:TYPE] Filter SNPs within base pairs of an indel (the default) or any combination of indel,mnp,bnd,other,overlap\n"); fprintf(stderr, " -G, --IndelGap INT Filter clusters of indels separated by or fewer base pairs allowing only one to pass\n"); fprintf(stderr, " -i, --include EXPR Include only sites for which the expression is true (see man page for details\n"); + fprintf(stderr, " --mask [^]REGION Soft filter regions, \"^\" to negate\n"); + fprintf(stderr, " -M, --mask-file [^]FILE Soft filter regions listed in a file, \"^\" to negate\n"); + fprintf(stderr, " --mask-overlap 0|1|2 Mask if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(stderr, " -m, --mode [+x] \"+\": do not replace but add to existing FILTER; \"x\": reset filters at sites which pass\n"); fprintf(stderr, " --no-version Do not append version and command line to the header\n"); fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n"); @@ -445,11 +509,15 @@ int main_vcffilter(int argc, char *argv[]) int regions_is_file = 0, targets_is_file = 0; int regions_overlap = 1; int targets_overlap = 0; + args->mask_overlap = 1; static struct option loptions[] = { {"set-GTs",required_argument,NULL,'S'}, {"mode",required_argument,NULL,'m'}, + {"mask",required_argument,NULL,10}, + {"mask-file",required_argument,NULL,'M'}, + {"mask-overlap",required_argument,NULL,11}, {"soft-filter",required_argument,NULL,'s'}, {"exclude",required_argument,NULL,'e'}, {"include",required_argument,NULL,'i'}, @@ -542,16 +610,20 @@ int main_vcffilter(int argc, char 
*argv[]) case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case 3 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 4 : - if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + targets_overlap = parse_overlap_option(optarg); + if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); + break; + case 10 : args->mask_list = optarg; break; + case 'M' : args->mask_list = optarg; args->mask_is_file = 1; break; + case 11 : + if ( !strcasecmp(optarg,"0") ) args->mask_overlap = 0; + else if ( !strcasecmp(optarg,"1") ) args->mask_overlap = 1; + else if ( !strcasecmp(optarg,"2") ) args->mask_overlap = 2; + else error("Could not parse: --mask-overlap %s\n",optarg); break; case 'h': case '?': usage(args); break; @@ -568,6 +640,8 @@ int main_vcffilter(int argc, char *argv[]) } else fname = argv[optind]; + if ( args->mask_list && !args->soft_filter ) error("The option --soft-filter is required with --mask and --mask-file options\n"); + // read in the regions from the command line if ( args->regions_list ) { @@ -607,6 +681,16 @@ int main_vcffilter(int argc, char *argv[]) pass = filter_test(args->filter, line, &args->smpl_pass); if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 
0 : 1; } + if ( args->mask ) + { + hts_pos_t beg, end; + if ( args->mask_overlap==0 ) beg = end = line->pos; + else if ( args->mask_overlap==1 ) beg = line->pos, end = line->pos + line->rlen - 1; + else _set_variant_boundaries(line,&beg,&end); + int mpass = regidx_overlap(args->mask,bcf_seqname(args->hdr,line),beg,end,NULL) ? 0 : 1; + if ( args->mask_negate ) mpass = mpass ? 0 : 1; + pass &= mpass; + } if ( args->soft_filter || args->set_gts || pass ) { if ( pass ) diff --git a/vcfgtcheck.c b/vcfgtcheck.c index 4d36b91db..f646e1f6d 100644 --- a/vcfgtcheck.c +++ b/vcfgtcheck.c @@ -1214,16 +1214,12 @@ int main_vcfgtcheck(int argc, char *argv[]) case 't': args->targets = optarg; break; case 'T': args->targets = optarg; args->targets_is_file = 1; break; case 7 : - if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + args->regions_overlap = parse_overlap_option(optarg); + if ( args->regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 8 : - if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + args->targets_overlap = parse_overlap_option(optarg); + if ( args->targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 'h': case '?': usage(); break; diff --git a/vcfhead.c b/vcfhead.c new file mode 100644 index 000000000..20be2a947 --- /dev/null +++ b/vcfhead.c @@ -0,0 +1,133 @@ +/* vcfhead.c -- view VCF/BCF file headers. + + Copyright (C) 2021 University of Glasgow. 
+ + Author: John Marshall + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. */ + +#include +#include +#include +#include +#include + +#include +#include + +#include "bcftools.h" + +int main_vcfhead(int argc, char *argv[]) +{ + static const char usage[] = +"\n" +"About: Displays VCF/BCF headers and optionally the first few variant records\n" +"Usage: bcftools head [OPTION]... 
[FILE]\n" +"\n" +"Options:\n" +" -h, --headers INT Display INT header lines [all]\n" +" -n, --records INT Display INT variant record lines [none]\n" +"\n"; + + static const struct option loptions[] = { + { "headers", required_argument, NULL, 'h' }, + { "records", required_argument, NULL, 'n' }, + { NULL, 0, NULL, 0 } + }; + + int all_headers = 1; + uint64_t nheaders = 0; + uint64_t nrecords = 0; + + int c, nargs; + while ((c = getopt_long(argc, argv, "h:n:", loptions, NULL)) >= 0) + switch (c) { + case 'h': all_headers = 0; nheaders = strtoull(optarg, NULL, 0); break; + case 'n': nrecords = strtoull(optarg, NULL, 0); break; + default: + fputs(usage, stderr); + return EXIT_FAILURE; + } + + nargs = argc - optind; + if (nargs == 0 && isatty(STDIN_FILENO)) { + fputs(usage, stdout); + return EXIT_SUCCESS; + } + else if (nargs > 1) { + fputs(usage, stderr); + return EXIT_FAILURE; + } + + const char *fname = (nargs == 1)? argv[optind] : "-"; + vcfFile *fp = bcf_open(fname, "r"); + if (fp == NULL) { + if (strcmp(fname, "-") != 0) + error_errno("[%s] Can't open \"%s\"", __func__, fname); + else + error_errno("[%s] Can't open standard input", __func__); + } + + bcf_hdr_t *hdr = bcf_hdr_read(fp); + if (hdr == NULL) { + bcf_close(fp); + if (strcmp(fname, "-") != 0) + error("[%s] Can't read headers from \"%s\"\n", __func__, fname); + else + error("[%s] Can't read headers\n", __func__); + } + + kstring_t str = KS_INITIALIZE; + + if (all_headers) { + bcf_hdr_format(hdr, 0, &str); + fputs(ks_str(&str), stdout); + } + else if (nheaders > 0) { + bcf_hdr_format(hdr, 0, &str); + char *lim = str.s; + uint64_t n; + for (n = 0; n < nheaders; n++) { + lim = strchr(lim, '\n'); + if (lim) lim++; + else break; + } + if (lim) *lim = '\0'; + fputs(ks_str(&str), stdout); + } + + if (nrecords > 0) { + bcf1_t *rec = bcf_init(); + uint64_t n; + for (n = 0; n < nrecords && bcf_read(fp, hdr, rec) >= 0; n++) { + ks_clear(&str); + if (vcf_format(hdr, rec, &str) >= 0) + fputs(ks_str(&str), stdout); + 
else + fprintf(stderr, "[%s] Record #%"PRIu64 " is invalid\n", __func__, n+1); + } + bcf_destroy(rec); + } + + ks_free(&str); + bcf_hdr_destroy(hdr); + bcf_close(fp); + + return EXIT_SUCCESS; +} diff --git a/vcfisec.c b/vcfisec.c index acc188595..c4c09f50d 100644 --- a/vcfisec.c +++ b/vcfisec.c @@ -1,6 +1,6 @@ /* vcfisec.c -- Create intersections, unions and complements of VCF files. - Copyright (C) 2012-2021 Genome Research Ltd. + Copyright (C) 2012-2022 Genome Research Ltd. Author: Petr Danecek @@ -182,6 +182,7 @@ void isec_vcf(args_t *args) } ret |= 1<isec_op) { @@ -598,16 +599,12 @@ int main_vcfisec(int argc, char *argv[]) } break; case 3 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 4 : - if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + targets_overlap = parse_overlap_option(optarg); + if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; diff --git a/vcfmerge.c b/vcfmerge.c index f87bce7c3..92aaa8030 100644 --- a/vcfmerge.c +++ b/vcfmerge.c @@ -814,7 +814,7 @@ void maux_expand1(buffer_t *buf, int size) buf->mrec = size; } } -void maux_reset(maux_t *ma) +void maux_reset(maux_t *ma, int *rid_tab) { int i,j; for (i=0; in; i++) maux_expand1(&ma->buf[i],ma->files->readers[i].nbuffer+1); @@ -846,7 +846,10 @@ void maux_reset(maux_t *ma) for (i=0; in; i++) { bcf_hdr_t *hdr = bcf_sr_get_header(ma->files,i); - 
ma->buf[i].rid = bcf_hdr_name2id(hdr,chr); + if (new_chr) + rid_tab[i] = bcf_hdr_name2id(hdr,chr); + + ma->buf[i].rid = rid_tab[i]; ma->buf[i].beg = bcf_sr_has_line(ma->files,i) ? 0 : 1; for (j=ma->buf[i].beg; j<=ma->files->readers[i].nbuffer; j++) { @@ -1267,7 +1270,12 @@ void merge_info(args_t *args, bcf1_t *out) bcf_info_t *inf = &line->d.info[j]; const char *key = hdr->id[BCF_DT_ID][inf->key].key; - if ( !args->keep_AC_AN && (!strcmp("AC",key) || !strcmp("AN",key)) ) continue; // AC and AN are done in merge_format() after genotypes are done + // AC and AN are done in merge_format() after genotypes are done + if (!args->keep_AC_AN && + (key[0] == 'A' + && (key[1] == 'C' || key[1] == 'N') + && key[2] == 0)) + continue; int id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, key); if ( id==-1 ) error("Error: The INFO field is not defined in the header: %s\n", key); @@ -3060,13 +3068,17 @@ void merge_vcf(args_t *args) args->out_line = bcf_init1(); args->tmph = kh_init(strdict); + int *rid_tab = calloc(args->maux->n, sizeof(*rid_tab)); + if (!rid_tab) + error("[%s:%d] Could not allocate %zu bytes\n", __FILE__, __LINE__, args->maux->n*sizeof(*rid_tab)); + while ( bcf_sr_next_line(args->files) ) { // output cached gVCF blocks which end before the new record if ( args->do_gvcf ) gvcf_flush(args,0); - maux_reset(args->maux); + maux_reset(args->maux, rid_tab); // determine which of the new records are gvcf blocks if ( args->do_gvcf ) @@ -3080,6 +3092,7 @@ void merge_vcf(args_t *args) clean_buffer(args); // debug_state(args); } + free(rid_tab); if ( args->do_gvcf ) gvcf_flush(args,1); @@ -3226,10 +3239,8 @@ int main_vcfmerge(int argc, char *argv[]) case 2 : args->header_only = 1; break; case 3 : args->force_samples = 1; break; case 4 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + 
regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; diff --git a/vcfnorm.c b/vcfnorm.c index 8a140fbec..460a757a5 100644 --- a/vcfnorm.c +++ b/vcfnorm.c @@ -2243,16 +2243,12 @@ int main_vcfnorm(int argc, char *argv[]) case 8 : args->record_cmd_line = 0; break; case 7 : args->force = 1; break; case 1 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 2 : - if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + targets_overlap = parse_overlap_option(optarg); + if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 'h': case '?': usage(); break; diff --git a/vcfplugin.c b/vcfplugin.c index 765619234..45686680a 100644 --- a/vcfplugin.c +++ b/vcfplugin.c @@ -455,11 +455,22 @@ static int cmp_plugin_name(const void *p1, const void *p2) return strcmp(a->name,b->name); } +// If args=NULL then returns the number of plugins available. Otherwise prints the +// plugins on stdout and returns 0 on success. 
static int list_plugins(args_t *args) { plugin_t *plugins = NULL; int nplugins = 0, mplugins = 0; + int count_only = 0; + args_t _args; + if ( !args ) + { + memset(&_args,0,sizeof(_args)); + args = &_args; + args->nplugin_paths = -1; + count_only = 1; + } init_plugin_paths(args); kstring_t str = {0,0,0}; @@ -490,6 +501,11 @@ static int list_plugins(args_t *args) } closedir(dp); } + if ( count_only ) + { + free(str.s); + return nplugins; + } if ( nplugins ) { qsort(plugins, nplugins, sizeof(plugins[0]), cmp_plugin_name); @@ -508,6 +524,10 @@ static int list_plugins(args_t *args) free(str.s); return nplugins ? 0 : 1; } +int count_plugins(void) +{ + return list_plugins(NULL); +} static void init_data(args_t *args) { @@ -694,16 +714,12 @@ int main_plugin(int argc, char *argv[]) case 'T': args->targets_list = optarg; targets_is_file = 1; break; case 'l': args->plist_only = 1; break; case 1 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 2 : - if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + targets_overlap = parse_overlap_option(optarg); + if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; diff --git a/vcfquery.c b/vcfquery.c index 882c3bba3..70b5f3061 100644 --- a/vcfquery.c +++ b/vcfquery.c @@ -1,6 +1,6 @@ /* vcfquery.c -- Extracts fields from VCF/BCF file. - Copyright (C) 2013-2021 Genome Research Ltd. 
+ Copyright (C) 2013-2022 Genome Research Ltd. Author: Petr Danecek @@ -37,6 +37,7 @@ THE SOFTWARE. */ #include "bcftools.h" #include "filter.h" #include "convert.h" +#include "smpl_ilist.h" // Logic of the filters: include or exclude sites which match the filters? @@ -52,9 +53,9 @@ typedef struct convert_t *convert; bcf_srs_t *files; bcf_hdr_t *header; - int nsamples, *samples, sample_is_file; + int sample_is_file; char **argv, *format_str, *sample_list, *targets_list, *regions_list, *vcf_list, *fn_out; - int argc, list_columns, print_header, allow_undef_tags; + int argc, list_columns, print_header, allow_undef_tags, force_samples; FILE *out; } args_t; @@ -76,28 +77,21 @@ static void init_data(args_t *args) { for (i=0; ifiles->nreaders; i++) { + // This tells htslib to subset samples directly when reading. Also the header is modified to + // include only the requested samples int ret = bcf_hdr_set_samples(args->files->readers[i].header,args->sample_list,args->sample_is_file); if ( ret<0 ) error("Error parsing the sample list\n"); - else if ( ret>0 ) error("Sample name mismatch: sample #%d not found in the header\n", ret); + else if ( ret>0 && !args->force_samples ) + error("Error: sample #%d not found in the header, user --force-samples to proceed anyway\n", ret); } - if ( args->sample_list[0]!='^' ) - { - // the sample ordering may be different if not negated - int n; - char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n); - if ( !smpls ) error("Could not parse %s\n", args->sample_list); - if ( n!=bcf_hdr_nsamples(args->files->readers[0].header) ) - error("The number of samples does not match, perhaps some are present multiple times?\n"); - nsamples = bcf_hdr_nsamples(args->files->readers[0].header); - samples = (int*) malloc(sizeof(int)*nsamples); - for (i=0; ifiles->readers[0].header, BCF_DT_SAMPLE,smpls[i]); - free(smpls[i]); - } - free(smpls); - } + int flags = SMPL_REORDER; + smpl_ilist_t *ilist = 
smpl_ilist_init(args->files->readers[0].header, args->sample_list, args->sample_is_file, flags); + nsamples = ilist->n; + samples = (int*) malloc(sizeof(int)*nsamples); + for (i=0; in; i++) + samples[i] = ilist->idx[i]; + smpl_ilist_destroy(ilist); } args->convert = convert_init(args->header, samples, nsamples, args->format_str); convert_set_option(args->convert, subset_samples, &args->smpl_pass); @@ -118,7 +112,6 @@ static void destroy_data(args_t *args) convert_destroy(args->convert); if ( args->filter ) filter_destroy(args->filter); - free(args->samples); } static void query_vcf(args_t *args) @@ -175,21 +168,35 @@ static void query_vcf(args_t *args) static void list_columns(args_t *args) { + int negate = 0; + int i; + bcf_sr_t *reader = &args->files->readers[0]; void *has_sample = NULL; if ( args->sample_list ) { + if ( args->sample_list[0]=='^' ) negate = 1; has_sample = khash_str2int_init(); int i, nsmpl; - char **smpl = hts_readlist(args->sample_list, args->sample_is_file, &nsmpl); - for (i=0; isample_list+1 : args->sample_list, args->sample_is_file, &nsmpl); + if ( !smpl ) error("Error: failed to read %s\n", negate ? 
args->sample_list+1 : args->sample_list); + for (i=0; iheader,BCF_DT_SAMPLE,smpl[i])<0 && !args->force_samples ) + error("Error: sample #%d not found in the header, user --force-samples to proceed anyway\n", i+1); + khash_str2int_inc(has_sample, smpl[i]); + } free(smpl); } - int i; - bcf_sr_t *reader = &args->files->readers[0]; for (i=0; iheader); i++) { - if ( has_sample && !khash_str2int_has_key(has_sample, reader->header->samples[i]) ) continue; + int skip = 0; + if ( negate ) + { + if ( khash_str2int_has_key(has_sample, reader->header->samples[i]) ) skip = 1; + } + else if ( has_sample && !khash_str2int_has_key(has_sample, reader->header->samples[i]) ) skip = 1; + if ( skip ) continue; printf("%s\n", reader->header->samples[i]); } @@ -222,6 +229,7 @@ static void usage(void) fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n"); + fprintf(stderr, " --force-samples Only warn about unknown subset samples\n"); fprintf(stderr, " -f, --format STRING See man page for details\n"); fprintf(stderr, " -H, --print-header Print header\n"); fprintf(stderr, " -i, --include EXPR Select sites for which the expression is true (see man page for details)\n"); @@ -260,6 +268,7 @@ int main_vcfquery(int argc, char *argv[]) {"include",1,0,'i'}, {"exclude",1,0,'e'}, {"format",1,0,'f'}, + {"force-samples",0,0,3}, {"output-file",1,0,'o'}, {"output",1,0,'o'}, {"regions",1,0,'r'}, @@ -318,17 +327,14 @@ int main_vcfquery(int argc, char *argv[]) case 's': args->sample_list = optarg; break; case 'S': args->sample_list = optarg; args->sample_is_file = 1; break; case 1 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) 
error("Could not parse: --regions-overlap %s\n",optarg); break; case 2 : - if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + targets_overlap = parse_overlap_option(optarg); + if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; + case 3 : args->force_samples = 1; break; case 'h': case '?': usage(); break; default: error("Unknown argument: %s\n", optarg); @@ -397,6 +403,7 @@ int main_vcfquery(int argc, char *argv[]) int i, k, nfiles, prev_nsamples = 0; char **fnames, **prev_samples = NULL; fnames = hts_readlist(args->vcf_list, 1, &nfiles); + if ( !fnames ) error("Error: failed to read %s\n", args->vcf_list); if ( !nfiles ) error("No files in %s?\n", args->vcf_list); for (i=0; i @@ -658,10 +658,10 @@ static void flush_viterbi(args_t *args, int ismpl) } } -int read_AF(args_t *args, bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq) +int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq) { if ( tgt->nals < 2 ) - error("Expected two comma-separated alleles (REF,ALT) in the third column of %s, found:\n\t%s\n", args->af_fname,tgt->line.s); + error("Expected two comma-separated alleles (REF,ALT) in the third column of %s, found:\n\t%s\n", tgt->fname,tgt->line.s); if ( tgt->nals != line->n_allele ) return -1; // number of alleles does not match int i; @@ -841,7 +841,7 @@ int process_line(args_t *args, bcf1_t *line, int ial) else if ( args->af_fname ) { // Read AF from a file - ret = read_AF(args, args->files->targets, line, &alt_freq); + ret = read_AF(args->files->targets, line, &alt_freq); } else if ( args->dflt_AF > 0 ) { @@ -1208,16 +1208,12 @@ int main_vcfroh(int argc, char *argv[]) case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 6 : - if ( !strcasecmp(optarg,"0") ) 
targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + targets_overlap = parse_overlap_option(optarg); + if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 7 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 'V': diff --git a/vcfstats.c b/vcfstats.c index c13b3e3fc..b286c905d 100644 --- a/vcfstats.c +++ b/vcfstats.c @@ -72,7 +72,7 @@ idist_t; typedef struct { - uint32_t n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts; + uint64_t n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts; int *af_ts, *af_tv, *af_snps; // first bin of af_* stats are singletons #if HWE_STATS int *af_hwe; @@ -107,7 +107,7 @@ typedef struct { uint64_t gt2gt[5][5]; // number of RR->RR, RR->RA, etc. matches/mismatches; see type2stats /* - Pearson's R^2 is used for aggregate R^2 + Pearson's R^2 is used for aggregate R^2 y, yy .. sum of dosage and squared dosage in the query VCF (second file) x, xx .. sum of squared dosage in the truth VCF (first file) n .. number of genotypes @@ -436,7 +436,7 @@ static void init_stats(args_t *args) else { args->af_bins = bin_init(args->af_bins_list,0,1); - + // m_af is used also for other af arrays, where the first bin is for // singletons. However, since the last element is unused in af_bins // (n boundaries form n-1 intervals), the m_af count is good for both. 
@@ -892,7 +892,7 @@ static inline void update_dvaf(stats_t *stats, bcf1_t *line, bcf_fmt_t *fmt, int else if ( len > stats->m_indel ) len = stats->m_indel; int bin = stats->m_indel + len; stats->nvaf[bin]++; - stats->dvaf[bin] += dvaf; + stats->dvaf[bin] += dvaf; } static void do_sample_stats(args_t *args, stats_t *stats, bcf_sr_t *reader, int matched) @@ -1199,7 +1199,7 @@ static void do_vcf_stats(args_t *args) do_sample_stats(args, stats, reader, ret); if ( bcf_get_info_int32(reader->header,line,"DP",&args->tmp_iaf,&args->ntmp_iaf)==1 ) - (*idist(&stats->dp_sites, args->tmp_iaf[0]))++; + (*idist(&stats->dp_sites, args->tmp_iaf[0]))++; } } @@ -1270,14 +1270,14 @@ static void print_stats(args_t *args) for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; - printf("SN\t%d\tnumber of records:\t%u\n", id, stats->n_records); - printf("SN\t%d\tnumber of no-ALTs:\t%u\n", id, stats->n_noalts); - printf("SN\t%d\tnumber of SNPs:\t%u\n", id, stats->n_snps); - printf("SN\t%d\tnumber of MNPs:\t%u\n", id, stats->n_mnps); - printf("SN\t%d\tnumber of indels:\t%u\n", id, stats->n_indels); - printf("SN\t%d\tnumber of others:\t%u\n", id, stats->n_others); - printf("SN\t%d\tnumber of multiallelic sites:\t%u\n", id, stats->n_mals); - printf("SN\t%d\tnumber of multiallelic SNP sites:\t%u\n", id, stats->n_snp_mals); + printf("SN\t%d\tnumber of records:\t%"PRIu64"\n", id, stats->n_records); + printf("SN\t%d\tnumber of no-ALTs:\t%"PRIu64"\n", id, stats->n_noalts); + printf("SN\t%d\tnumber of SNPs:\t%"PRIu64"\n", id, stats->n_snps); + printf("SN\t%d\tnumber of MNPs:\t%"PRIu64"\n", id, stats->n_mnps); + printf("SN\t%d\tnumber of indels:\t%"PRIu64"\n", id, stats->n_indels); + printf("SN\t%d\tnumber of others:\t%"PRIu64"\n", id, stats->n_others); + printf("SN\t%d\tnumber of multiallelic sites:\t%"PRIu64"\n", id, stats->n_mals); + printf("SN\t%d\tnumber of multiallelic SNP sites:\t%"PRIu64"\n", id, stats->n_snp_mals); } printf("# TSTV, transitions/transversions:\n# 
TSTV\t[2]id\t[3]ts\t[4]tv\t[5]ts/tv\t[6]ts (1st ALT)\t[7]tv (1st ALT)\t[8]ts/tv (1st ALT)\n"); for (id=0; idnstats; id++) @@ -1419,7 +1419,7 @@ static void print_stats(args_t *args) { if ( usr->vals_ts[j]+usr->vals_tv[j] == 0 ) continue; // skip empty bins float val = usr->min + (usr->max - usr->min)*j/(usr->nbins-1); - const char *fmt = usr->type==BCF_HT_REAL ? "USR:%s/%d\t%d\t%e\t%d\t%d\t%d\n" : "USR:%s/%d\t%d\t%.0f\t%d\t%d\t%d\n"; + const char *fmt = usr->type==BCF_HT_REAL ? "USR:%s/%d\t%d\t%e\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n" : "USR:%s/%d\t%d\t%.0f\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n"; printf(fmt,usr->tag,usr->idx,id,val,usr->vals_ts[j]+usr->vals_tv[j],usr->vals_ts[j],usr->vals_tv[j]); } } @@ -1489,7 +1489,7 @@ static void print_stats(args_t *args) for (k=0; k<4; k++) { n += stats[i].gt2gt[j][k]; - if ( j==k ) + if ( j==k ) { nrd_m[j] += stats[i].gt2gt[j][k]; m[j] += stats[i].gt2gt[j][k]; @@ -1512,8 +1512,8 @@ static void print_stats(args_t *args) } double af = args->af_bins ? (bin_get_value(args->af_bins,i)+bin_get_value(args->af_bins,i-1))*0.5 : (double)(i-1)/(args->m_af-1); printf("GC%cAF\t2\t%f", x==0 ? 's' : 'i', af); - printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", m[T2S(GT_HOM_RR)],m[T2S(GT_HET_RA)],m[T2S(GT_HOM_AA)]); - printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", mm[T2S(GT_HOM_RR)],mm[T2S(GT_HET_RA)],mm[T2S(GT_HOM_AA)]); + printf("\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"", m[T2S(GT_HOM_RR)],m[T2S(GT_HET_RA)],m[T2S(GT_HOM_AA)]); + printf("\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"", mm[T2S(GT_HOM_RR)],mm[T2S(GT_HET_RA)],mm[T2S(GT_HOM_AA)]); if ( stats[i].n && !isnan(r2) ) printf("\t%f", r2); else printf("\t"NA_STRING); printf("\t%.0f\n", stats[i].n); @@ -1571,11 +1571,11 @@ static void print_stats(args_t *args) r2 *= r2; } printf("GC%cS\t2\t%s\t%.3f", x==0 ? 's' : 'i', args->files->samples[i], m+mm ? 
mm*100.0/(m+mm) : 0); - printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", + printf("\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"", stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HOM_RR)], stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HET_RA)], stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HOM_AA)]); - printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", + printf("\t%"PRIu64"\t%"PRIu64"\t%"PRIu64"", stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HET_RA)] + stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HOM_AA)], stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HOM_RR)] + stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HOM_AA)], stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HOM_RR)] + stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HET_RA)]); @@ -1631,7 +1631,7 @@ static void print_stats(args_t *args) printf("GCT%c\t%s", x==0 ? 's' : 'i', args->files->samples[i]); for (j=0; j<5; j++) for (k=0; k<5; k++) - printf("\t%"PRId64, stats[i].gt2gt[j][k]); + printf("\t%"PRIu64, stats[i].gt2gt[j][k]); printf("\n"); } } @@ -1650,8 +1650,8 @@ static void print_stats(args_t *args) if ( i==0 ) printf("<%d", stats->dp.min); else if ( i+1==stats->dp.m_vals ) printf(">%d", stats->dp.max); else printf("%d", idist_i2bin(&stats->dp,i)); - printf("\t%"PRId64"\t%f", stats->dp.vals[i], sum ? stats->dp.vals[i]*100./sum : 0); - printf("\t%"PRId64"\t%f\n", stats->dp_sites.vals[i], sum_sites ? stats->dp_sites.vals[i]*100./sum_sites : 0); + printf("\t%"PRIu64"\t%f", stats->dp.vals[i], sum ? stats->dp.vals[i]*100./sum : 0); + printf("\t%"PRIu64"\t%f\n", stats->dp_sites.vals[i], sum_sites ? 
stats->dp_sites.vals[i]*100./sum_sites : 0); } } @@ -1851,16 +1851,12 @@ int main_vcfstats(int argc, char *argv[]) if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 3 : - if ( !strcasecmp(optarg,"0") ) regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + regions_overlap = parse_overlap_option(optarg); + if ( regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 4 : - if ( !strcasecmp(optarg,"0") ) targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + targets_overlap = parse_overlap_option(optarg); + if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 'h': diff --git a/vcfview.c b/vcfview.c index 1dbcc6173..cc0205814 100644 --- a/vcfview.c +++ b/vcfview.c @@ -716,16 +716,12 @@ int main_vcfview(int argc, char *argv[]) break; } case 2 : - if ( !strcasecmp(optarg,"0") ) args->targets_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->targets_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->targets_overlap = 2; - else error("Could not parse: --targets-overlap %s\n",optarg); + args->targets_overlap = parse_overlap_option(optarg); + if ( args->targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg); break; case 3 : - if ( !strcasecmp(optarg,"0") ) args->regions_overlap = 0; - else if ( !strcasecmp(optarg,"1") ) args->regions_overlap = 1; - else if ( !strcasecmp(optarg,"2") ) args->regions_overlap = 2; - else error("Could not parse: --regions-overlap %s\n",optarg); + 
args->regions_overlap = parse_overlap_option(optarg); + if ( args->regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; diff --git a/version.c b/version.c index 73e0b046c..4306d4011 100644 --- a/version.c +++ b/version.c @@ -69,7 +69,7 @@ const char *hts_bcf_wmode(int file_type) return "w"; // uncompressed VCF } -const char *hts_bcf_wmode2(int file_type, char *fname) +const char *hts_bcf_wmode2(int file_type, const char *fname) { if ( !fname ) return hts_bcf_wmode(file_type); int len = strlen(fname); @@ -80,7 +80,7 @@ const char *hts_bcf_wmode2(int file_type, char *fname) return hts_bcf_wmode(file_type); } -void set_wmode(char dst[8], int file_type, char *fname, int clevel) +void set_wmode(char dst[8], int file_type, const char *fname, int clevel) { const char *ret = NULL; int len = fname ? strlen(fname) : 0; @@ -100,3 +100,10 @@ void set_wmode(char dst[8], int file_type, char *fname, int clevel) strcpy(dst, ret); } +int parse_overlap_option(const char *arg) +{ + if ( strcasecmp(arg, "pos") == 0 || strcmp(arg, "0") == 0 ) return 0; + else if ( strcasecmp(arg, "record") == 0 || strcmp(arg, "1") == 0 ) return 1; + else if ( strcasecmp(arg, "variant") == 0 || strcmp(arg, "2") == 0 ) return 2; + else return -1; +} diff --git a/version.sh b/version.sh index 1bcfceafa..28dd9e6f8 100755 --- a/version.sh +++ b/version.sh @@ -24,7 +24,7 @@ # DEALINGS IN THE SOFTWARE. # Master version, for use in tarballs or non-git source copies -VERSION=1.14 +VERSION=1.15 # If we have a git clone, then check against the current tag if [ -e .git ]