From 67c95eeff392d01fce04df2d8c802fe41401c734 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 13 Oct 2020 11:21:02 +0100 Subject: [PATCH 01/81] Bug fix: do add the NOVELGT annotation when requested --- plugins/contrast.c | 15 ++++++--------- test/contrast.1.1.out | 10 ++++++++++ test/contrast.1.2.out | 9 +++++++++ test/contrast.1.vcf | 7 +++++++ test/contrast.out | 8 +++++--- test/test.pl | 6 ++++-- 6 files changed, 41 insertions(+), 14 deletions(-) create mode 100644 test/contrast.1.1.out create mode 100644 test/contrast.1.2.out create mode 100644 test/contrast.1.vcf diff --git a/plugins/contrast.c b/plugins/contrast.c index f76f1d617..88a45cf73 100644 --- a/plugins/contrast.c +++ b/plugins/contrast.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2018 Genome Research Ltd. + Copyright (c) 2018-2020 Genome Research Ltd. Author: Petr Danecek @@ -203,9 +203,9 @@ static void init_data(args_t *args) if ( args->annots & PRINT_NASSOC ) bcf_hdr_append(args->hdr_out, "##INFO="); if ( args->annots & PRINT_NOVELAL ) - bcf_hdr_append(args->hdr_out, "##INFO="); + bcf_hdr_append(args->hdr_out, "##INFO="); if ( args->annots & PRINT_NOVELGT ) - bcf_hdr_append(args->hdr_out, "##INFO="); + bcf_hdr_append(args->hdr_out, "##INFO="); if ( args->filter_str ) args->filter = filter_init(args->hdr, args->filter_str); @@ -357,13 +357,10 @@ static int process_record(args_t *args, bcf1_t *rec) has_gt = 1; char *smpl = args->hdr->samples[ args->case_smpl[i] ]; - if ( case_al ) + if ( case_al && (args->annots & PRINT_NOVELAL) ) { - if ( args->annots & PRINT_NOVELAL ) - { - if ( args->case_als_smpl.l ) kputc(',', &args->case_als_smpl); - kputs(smpl, &args->case_als_smpl); - } + if ( args->case_als_smpl.l ) kputc(',', &args->case_als_smpl); + kputs(smpl, &args->case_als_smpl); } else if ( (args->annots & PRINT_NOVELGT) && !binary_search(gt, args->control_gts, args->ncontrol_gts) ) { diff --git a/test/contrast.1.1.out b/test/contrast.1.1.out new file mode 100644 index 
000000000..c44e7f1dc --- /dev/null +++ b/test/contrast.1.1.out @@ -0,0 +1,10 @@ +##fileformat=VCFv4.0 +##FILTER= +##FORMAT= +##contig= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B +2 280 . T A 246 . . GT 0/0 0/0 +2 280 . T A 246 . NOVELAL=B GT 0/0 0/1 +2 280 . T A 246 . NOVELAL=B GT 0/0 1/1 diff --git a/test/contrast.1.2.out b/test/contrast.1.2.out new file mode 100644 index 000000000..c08d68837 --- /dev/null +++ b/test/contrast.1.2.out @@ -0,0 +1,9 @@ +##fileformat=VCFv4.0 +##FILTER= +##FORMAT= +##contig= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B +2 280 . T A 246 . . GT 0/0 0/0 +2 280 . T A 246 . NOVELGT=B GT 0/0 0/1 +2 280 . T A 246 . NOVELGT=B GT 0/0 1/1 diff --git a/test/contrast.1.vcf b/test/contrast.1.vcf new file mode 100644 index 000000000..9bfa52ff1 --- /dev/null +++ b/test/contrast.1.vcf @@ -0,0 +1,7 @@ +##fileformat=VCFv4.0 +##FORMAT= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B +2 280 . T A 246 . . GT 0/0 0/0 +2 280 . T A 246 . . GT 0/0 0/1 +2 280 . T A 246 . . GT 0/0 1/1 diff --git a/test/contrast.out b/test/contrast.out index b14137887..760fb60b8 100644 --- a/test/contrast.out +++ b/test/contrast.out @@ -4,8 +4,10 @@ ##contig= ##INFO= ##INFO= +##INFO= +##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT a b c 1 100 . A G . . PASSOC=1;FASSOC=0,0 GT 0/0 0/0 0/0 -1 101 . A G . . PASSOC=0.333333;FASSOC=0,0.5 GT 0/0 0/0 0/1 -1 102 . A G . . PASSOC=0.4;FASSOC=0.25,1 GT 0/0 0/1 1/1 -1 103 . A G . . PASSOC=0.333333;FASSOC=1,0.5 GT 1/1 1/1 0/1 +1 101 . A G . . PASSOC=0.333333;FASSOC=0,0.5;NOVELAL=c GT 0/0 0/0 0/1 +1 102 . A G . . PASSOC=0.4;FASSOC=0.25,1;NOVELGT=c GT 0/0 0/1 1/1 +1 103 . A G . . 
PASSOC=0.333333;FASSOC=1,0.5;NOVELAL=c GT 1/1 1/1 0/1 diff --git a/test/test.pl b/test/test.pl index abc668f18..64b4cb2d5 100755 --- a/test/test.pl +++ b/test/test.pl @@ -474,8 +474,10 @@ test_vcf_plugin($opts,in=>'mendelian',out=>'mendelian.3.out',cmd=>'+mendelian',args=>'-t mom1,dad1,child1 -mx'); test_vcf_plugin($opts,in=>'mendelian',out=>'mendelian.4.out',cmd=>'+mendelian',args=>'-t mom1,dad1,child1 -ma'); test_vcf_plugin($opts,in=>'mendelian',out=>'mendelian.5.out',cmd=>'+mendelian',args=>'-t mom1,dad1,child1 -mu'); -test_vcf_plugin($opts,in=>'contrast',out=>'contrast.out',cmd=>'+contrast',args=>'-0 a,b -1 c'); -test_vcf_plugin($opts,in=>'contrast',out=>'contrast.out',cmd=>'+contrast',args=>'-0 {PATH}/contrast0.txt -1 {PATH}/contrast1.txt'); +test_vcf_plugin($opts,in=>'contrast',out=>'contrast.out',cmd=>'+contrast',args=>'-a PASSOC,FASSOC,NOVELAL,NOVELGT -0 a,b -1 c'); +test_vcf_plugin($opts,in=>'contrast',out=>'contrast.out',cmd=>'+contrast',args=>'-a PASSOC,FASSOC,NOVELAL,NOVELGT -0 {PATH}/contrast0.txt -1 {PATH}/contrast1.txt'); +test_vcf_plugin($opts,in=>'contrast.1',out=>'contrast.1.1.out',cmd=>'+contrast',args=>'-a NOVELAL,NOVELGT -0 A -1 B'); +test_vcf_plugin($opts,in=>'contrast.1',out=>'contrast.1.2.out',cmd=>'+contrast',args=>'-a NOVELGT -0 A -1 B'); test_vcf_plugin($opts,in=>'trio-dnm.1',out=>'trio-dnm.1.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother | $$opts{bin}/bcftools query -f'%CHROM[\\t%DNM]\\t[\\t%VAF]\\n'"); test_vcf_plugin($opts,in=>'trio-dnm.2',out=>'trio-dnm.1.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother --force-AD | $$opts{bin}/bcftools query -f'%CHROM[\\t%DNM]\\t[\\t%VAF]\\n'"); test_vcf_plugin($opts,in=>'trio-dnm.2',out=>'trio-dnm.2.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother | $$opts{bin}/bcftools query -f'%CHROM[\\t%DNM]\\t[\\t%VAF]\\n'"); From f57920e56bd919a26ad6ce8d55558eebe078f4cb Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 14 Oct 2020 15:40:22 +0100 Subject: [PATCH 02/81] Revise MAF definition 
for multiallelic sites, let the second most common allele be the minor allele. Resolves #1313 --- plugins/fill-tags.c | 46 ++++++++++++++++++++------------------- test/fill-tags-AN0.out | 2 +- test/fill-tags-hemi.1.out | 4 ++-- test/fill-tags-hemi.2.out | 4 ++-- test/fill-tags-hwe.out | 4 ++-- test/fill-tags.2.out | 4 ++-- 6 files changed, 33 insertions(+), 31 deletions(-) diff --git a/plugins/fill-tags.c b/plugins/fill-tags.c index 4ed3f5257..f1e90c083 100644 --- a/plugins/fill-tags.c +++ b/plugins/fill-tags.c @@ -424,7 +424,7 @@ void list_tags(void) "INFO/END Number:1 Type:Integer .. End position of the variant\n" "INFO/F_MISSING Number:1 Type:Float .. Fraction of missing genotypes (all samples, experimental)\n" "INFO/HWE Number:A Type:Float .. HWE test (PMID:15789306); 1=good, 0=bad\n" - "INFO/MAF Number:A Type:Float .. Minor Allele frequency\n" + "INFO/MAF Number:1 Type:Float .. Frequency of the second most common allele\n" "INFO/NS Number:1 Type:Integer .. Number of samples with data\n" "INFO/TYPE Number:. Type:String .. The record type (REF,SNP,MNP,INDEL,etc)\n" "TAG=func(TAG) Number:1 Type:Integer .. 
Experimental support for user-defined\n" @@ -479,7 +479,7 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) if ( args->tags & SET_AC_Het ) hdr_append(args, "##INFO="); if ( args->tags & SET_AC_Hemi ) hdr_append(args, "##INFO="); if ( args->tags & SET_AF ) hdr_append(args, "##INFO="); - if ( args->tags & SET_MAF ) hdr_append(args, "##INFO="); + if ( args->tags & SET_MAF ) hdr_append(args, "##INFO="); if ( args->tags & SET_HWE ) hdr_append(args, "##INFO="); if ( args->tags & SET_END ) bcf_hdr_printf(args->out_hdr, "##INFO="); if ( args->tags & SET_TYPE ) bcf_hdr_printf(args->out_hdr, "##INFO="); @@ -581,7 +581,14 @@ static void clean_counts(pop_t *pop, int nals) pop->ns = 0; memset(pop->counts,0,sizeof(counts_t)*nals); } - +static int cmpfloat_desc(const void *a, const void *b) +{ + float fa = *((float*)a); + float fb = *((float*)b); + if ( fafb ) return -1; + return 0; +} bcf1_t *process_fmt(bcf1_t *rec) { bcf_unpack(rec, BCF_UN_FMT); @@ -679,33 +686,29 @@ bcf1_t *process_fmt(bcf1_t *rec) if ( rec->n_allele > 1 ) { pop_t *pop = &args->pop[i]; - memset(args->farr, 0, sizeof(*args->farr)*(rec->n_allele-1)); - for (j=1; jn_allele; j++) - args->farr[j-1] += pop->counts[j].nhet + pop->counts[j].nhom + pop->counts[j].nhemi + pop->counts[j].nac; - an = pop->counts[0].nhet + pop->counts[0].nhom + pop->counts[0].nhemi + pop->counts[0].nac; - for (j=1; jn_allele; j++) an += args->farr[j-1]; + for (j=0; jn_allele; j++) + { + args->farr[j] = pop->counts[j].nhet + pop->counts[j].nhom + pop->counts[j].nhemi + pop->counts[j].nac; + an += args->farr[j]; + } if ( an ) - for (j=1; jn_allele; j++) args->farr[j-1] /= an; + for (j=0; jn_allele; j++) args->farr[j] /= an; else - for (j=1; jn_allele; j++) bcf_float_set_missing(args->farr[j-1]); + for (j=0; jn_allele; j++) bcf_float_set_missing(args->farr[j]); } if ( args->tags & SET_AF ) { args->str.l = 0; ksprintf(&args->str, "AF%s", args->pop[i].suffix); - if ( 
bcf_update_info_float(args->out_hdr,rec,args->str.s,args->farr,rec->n_allele-1)!=0 ) + if ( bcf_update_info_float(args->out_hdr,rec,args->str.s,args->farr+1,rec->n_allele-1)!=0 ) error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); } - if ( args->tags & SET_MAF ) + if ( rec->n_allele > 1 && args->tags & SET_MAF ) { - if ( an ) - { - for (j=1; jn_allele; j++) - if ( args->farr[j-1] > 0.5 ) args->farr[j-1] = 1 - args->farr[j-1]; // todo: this is incorrect for multiallelic sites - } + if ( an ) qsort(args->farr,rec->n_allele,sizeof(float),cmpfloat_desc); args->str.l = 0; ksprintf(&args->str, "MAF%s", args->pop[i].suffix); - if ( bcf_update_info_float(args->out_hdr,rec,args->str.s,args->farr,rec->n_allele-1)!=0 ) + if ( bcf_update_info_float(args->out_hdr,rec,args->str.s,args->farr+1,1)!=0 ) error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); } } @@ -717,13 +720,12 @@ bcf1_t *process_fmt(bcf1_t *rec) if ( rec->n_allele > 1 ) { pop_t *pop = &args->pop[i]; - memset(args->iarr, 0, sizeof(*args->iarr)*(rec->n_allele-1)); - for (j=1; jn_allele; j++) - args->iarr[j-1] += pop->counts[j].nhet + pop->counts[j].nhom + pop->counts[j].nhemi + pop->counts[j].nac; + for (j=0; jn_allele; j++) + args->iarr[j] = pop->counts[j].nhet + pop->counts[j].nhom + pop->counts[j].nhemi + pop->counts[j].nac; } args->str.l = 0; ksprintf(&args->str, "AC%s", args->pop[i].suffix); - if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,args->iarr,rec->n_allele-1)!=0 ) + if ( bcf_update_info_int32(args->out_hdr,rec,args->str.s,args->iarr+1,rec->n_allele-1)!=0 ) error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); } } diff --git a/test/fill-tags-AN0.out b/test/fill-tags-AN0.out index dd807114b..3c9df4975 100644 --- a/test/fill-tags-AN0.out +++ b/test/fill-tags-AN0.out @@ -10,7 +10,7 @@ 
##INFO= ##INFO= ##INFO= -##INFO= +##INFO= ##INFO= ##INFO= ##INFO= diff --git a/test/fill-tags-hemi.1.out b/test/fill-tags-hemi.1.out index ccf6550a3..c037bc1e6 100644 --- a/test/fill-tags-hemi.1.out +++ b/test/fill-tags-hemi.1.out @@ -10,11 +10,11 @@ ##INFO= ##INFO= ##INFO= -##INFO= +##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B -1 3177144 . G T,A 45 PASS NS=2;AN=2;AF=0.5,0.5;MAF=0.5,0.5;AC=1,1;AC_Het=0,0;AC_Hom=0,0;AC_Hemi=1,1;HWE=1,1;ExcHet=1,1 GT 1 2 +1 3177144 . G T,A 45 PASS NS=2;AN=2;AF=0.5,0.5;MAF=0.5;AC=1,1;AC_Het=0,0;AC_Hom=0,0;AC_Hemi=1,1;HWE=1,1;ExcHet=1,1 GT 1 2 1 3177144 . G T 45 PASS NS=2;AN=2;AF=0.5;MAF=0.5;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=1;HWE=1;ExcHet=1 GT 0/. 1/. 1 3177144 . G T 45 PASS NS=2;AN=2;AF=0.5;MAF=0.5;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=1;HWE=1;ExcHet=1 GT ./0 ./1 1 3177144 . G T 45 PASS NS=1;AN=1;AF=1;MAF=0;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=1;HWE=1;ExcHet=1 GT ./. ./1 diff --git a/test/fill-tags-hemi.2.out b/test/fill-tags-hemi.2.out index 8f6fbfd72..4797a7451 100644 --- a/test/fill-tags-hemi.2.out +++ b/test/fill-tags-hemi.2.out @@ -10,11 +10,11 @@ ##INFO= ##INFO= ##INFO= -##INFO= +##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B -1 3177144 . G T,A 45 PASS NS=2;AN=2;AF=0.5,0.5;MAF=0.5,0.5;AC=1,1;AC_Het=0,0;AC_Hom=0,0;AC_Hemi=1,1;HWE=1,1;ExcHet=1,1 GT 1 2 +1 3177144 . G T,A 45 PASS NS=2;AN=2;AF=0.5,0.5;MAF=0.5;AC=1,1;AC_Het=0,0;AC_Hom=0,0;AC_Hemi=1,1;HWE=1,1;ExcHet=1,1 GT 1 2 1 3177144 . G T 45 PASS NS=2;AN=2;AF=0.5;MAF=0.5;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=1 GT 0/. 1/. 1 3177144 . G T 45 PASS NS=2;AN=2;AF=0.5;MAF=0.5;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=1 GT ./0 ./1 1 3177144 . G T 45 PASS NS=1;AN=1;AF=1;MAF=0;AC=1;AC_Het=0;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=1 GT ./. 
./1 diff --git a/test/fill-tags-hwe.out b/test/fill-tags-hwe.out index ff2946dc2..6c0e990e8 100644 --- a/test/fill-tags-hwe.out +++ b/test/fill-tags-hwe.out @@ -10,11 +10,11 @@ ##INFO= ##INFO= ##INFO= -##INFO= +##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 S3 S4 S5 S6 S7 S8 S9 S10 1 3177144 . G T 45 PASS NS=10;AN=20;AF=0;MAF=0;AC=0;AC_Het=0;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=1 GT 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 1 3177144 . G T 45 PASS NS=10;AN=20;AF=0.1;MAF=0.1;AC=2;AC_Het=2;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=0.947368 GT 1/0 1/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 1 3177144 . G T 45 PASS NS=10;AN=20;AF=0.1;MAF=0.1;AC=2;AC_Het=0;AC_Hom=2;AC_Hemi=0;HWE=0.0526316;ExcHet=1 GT 1/1 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 -1 3177144 . G T,C 45 PASS NS=10;AN=20;AF=0.5,0;MAF=0.5,0;AC=10,0;AC_Het=10,0;AC_Hom=0,0;AC_Hemi=0,0;HWE=0.00690641,1;ExcHet=0.00554244,1 GT 1/0 1/0 1/0 1/0 1/0 1/0 1/0 1/0 1/0 1/0 +1 3177144 . G T,C 45 PASS NS=10;AN=20;AF=0.5,0;MAF=0.5;AC=10,0;AC_Het=10,0;AC_Hom=0,0;AC_Hemi=0,0;HWE=0.00690641,1;ExcHet=0.00554244,1 GT 1/0 1/0 1/0 1/0 1/0 1/0 1/0 1/0 1/0 1/0 diff --git a/test/fill-tags.2.out b/test/fill-tags.2.out index 6af19a63f..3cb90c3ca 100644 --- a/test/fill-tags.2.out +++ b/test/fill-tags.2.out @@ -31,7 +31,7 @@ ##FILTER= ##INFO= ##INFO= -##INFO= +##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 11 2343543 . A . 999 PASS DP=100223;NS=3;AN=6 GT:PL:DP:GQ 0/0:0:193:99 0/0:0:211:99 0/0:0:182:99 11 5464562 . C T 999 PASS DP=0;NS=0;AN=0;AF=.;MAF=.;AC=0 GT:PL:DP:GQ ./.:0,0,0:.:. ./.:0,0,0:.:. ./.:0,0,0:.:. @@ -39,7 +39,7 @@ 20 126310 . 
ACC A 999 StrandBias;EndDistBias DP4=125718,95950,113812,80890;DP=461867;HWE=0.24036;ICF=0.01738;INDEL;IS=374,0.937343;MQ=49;PV4=9e-30,1,0,3.8e-13;QD=0.0172;AN=6;AC=4;NS=3;AF=0.666667;MAF=0.333333 GT:DP:GQ:PL 0/1:117:99:255,0,132 0/1:111:99:255,0,139 1/1:78:99:255,213,0 20 138125 rs2298108 G T 999 PASS DP4=174391,20849,82080,4950;DP=286107;Dels=0;FS=3200;HWE=0.199462;ICF=0.01858;MQ0=0;MQ=46;PV4=0,0,0,1;QD=17.22;AN=6;AC=4;NS=3;AF=0.666667;MAF=0.333333 GT:PL:DP:GQ 0/1:135,0,163:66:99 0/1:140,0,255:71:99 1/1:255,199,0:66:99 20 138148 rs2298109 C T 999 PASS DP4=194136,45753,94945,14367;DP=356657;Dels=0;FS=3200;HWE=0.177865;ICF=0.0198;MQ0=0;MQ=47;PV4=0,0,0,1;QD=14.57;AN=6;AC=4;NS=3;AF=0.666667;MAF=0.333333 GT:PL:DP:GQ 0/1:195,0,255:87:99 0/1:192,0,255:82:99 1/1:255,235,0:78:99 -20 271225 . T TTTA,TA 999 StrandBias DP4=29281,42401,27887,29245;DP=272732;INDEL;IS=95,0.748031;MQ=47;PV4=0,1,0,1;QD=0.0948;AN=6;AC=2,2;NS=3;AF=0.333333,0.333333;MAF=0.333333,0.333333 GT:DP:GQ:PL 0/2:33:49:151,53,203,0,52,159 0/1:51:99:255,0,213,255,255,255 1/2:47:99:255,255,255,255,0,241 +20 271225 . T TTTA,TA 999 StrandBias DP4=29281,42401,27887,29245;DP=272732;INDEL;IS=95,0.748031;MQ=47;PV4=0,1,0,1;QD=0.0948;AN=6;AC=2,2;NS=3;AF=0.333333,0.333333;MAF=0.333333 GT:DP:GQ:PL 0/2:33:49:151,53,203,0,52,159 0/1:51:99:255,0,213,255,255,255 1/2:47:99:255,255,255,255,0,241 20 304568 . C T 999 PASS DP4=16413,4543,945,156;DP=43557;Dels=0;FS=3200;HWE=0.076855;ICF=0.0213;MQ0=0;MQ=50;PV4=0,0,0,1;QD=15.45;AN=6;AC=4;NS=3;AF=0.666667;MAF=0.333333 GT:PL:DP:GQ 0|1:95,0,255:90:99 0|1:192,0,255:13:99 1|1:255,95,0:60:99 20 326891 . A AC 999 PASS DP4=125718,95950,113812,80890;DP=461867;HWE=0.24036;ICF=0.01738;INDEL;IS=374,0.937343;MQ=49;PV4=9e-30,1,0,3.8e-13;QD=0.0172;AN=4;AC=2;NS=2;AF=0.5;MAF=0.5 GT:DP:GQ:PL 0|1:117:99:255,0,132 0|1:111:99:255,0,139 ./.:.:.:.,.,. 
X 2928329 rs62584840 C T 999 PASS DP4=302,9137,32,1329;DP=11020;Dels=0;FS=13.38;HWE=0.284332;ICF=0.0253;MQ0=0;MQ=49;PV4=0.094,0,0,1;QD=18.61;AN=4;AC=1;NS=3;AF=0.25;MAF=0.25 GT:PL:DP:GQ 0:0,56:2:73 0:0,81:3:98 0/1:73,0,19:4:30 From feb1ffc19abb4d30e1cc86002f49b47d6c3377b1 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 22 Oct 2020 09:06:51 +0100 Subject: [PATCH 03/81] Append version and command line to the header unless --no-version is given --- plugins/split-vep.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/plugins/split-vep.c b/plugins/split-vep.c index b45938459..4b76cb20e 100644 --- a/plugins/split-vep.c +++ b/plugins/split-vep.c @@ -74,7 +74,7 @@ typedef struct { convert_t *convert; filter_t *filter; - int argc, filter_logic, regions_is_file, targets_is_file, list_hdr; + int argc, filter_logic, regions_is_file, targets_is_file, list_hdr, record_cmd_line; kstring_t kstr; char *filter_str, *vep_tag; // the --annotation INFO tag to process @@ -207,6 +207,7 @@ static const char *usage_text(void) "Common options:\n" " -e, --exclude EXPR Exclude sites and samples for which the expression is true\n" " -i, --include EXPR Include sites and samples for which the expression is true\n" + " --no-version Do not append version and command line to the header\n" " -o, --output FILE Output file name [stdout]\n" " -O, --output-type b|u|z|v b: compressed BCF, u: uncompressed BCF, z: compressed VCF or text, v: uncompressed VCF or text [v]\n" " -r, --regions REG Restrict to comma-separated list of regions\n" @@ -968,6 +969,7 @@ int run(int argc, char **argv) args->output_fname = "-"; args->output_type = FT_VCF; args->vep_tag = "CSQ"; + args->record_cmd_line = 1; static struct option loptions[] = { {"drop-sites",no_argument,0,'x'}, @@ -989,6 +991,7 @@ int run(int argc, char **argv) {"regions-file",1,0,'R'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, + {"no-version",no_argument,NULL,2}, {NULL,0,NULL,0} }; int c; @@ -996,6 +999,7 @@ int run(int 
argc, char **argv) { switch (c) { + case 2 : args->record_cmd_line = 0; break; case 1 : args->column_types = optarg; break; case 'A': if ( !strcasecmp(optarg,"tab") ) args->all_fields_delim = "\t"; @@ -1063,6 +1067,7 @@ int run(int argc, char **argv) else { args->fh_vcf = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); + if ( args->record_cmd_line ) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_split-vep"); if ( bcf_hdr_write(args->fh_vcf, args->hdr_out)!=0 ) error("Failed to write the header to %s\n", args->output_fname); } while ( bcf_sr_next_line(args->sr) ) From 9c15769118cfc2e9c91813b563979eba7afc75c0 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 27 Oct 2020 10:53:02 +0000 Subject: [PATCH 04/81] When first base of the reference genome is deleted, the VCF record has POS=1 and the first REF base cannot precede the event. Fixes #1330 --- consensus.c | 10 ++++++++-- test/consensus.14.fa | 2 ++ test/consensus.14.out | 2 ++ test/consensus.14.vcf | 5 +++++ test/test.pl | 1 + 5 files changed, 18 insertions(+), 2 deletions(-) create mode 100644 test/consensus.14.fa create mode 100644 test/consensus.14.out create mode 100644 test/consensus.14.vcf diff --git a/consensus.c b/consensus.c index c4082bd88..ed50c0c5c 100644 --- a/consensus.c +++ b/consensus.c @@ -580,7 +580,14 @@ static void apply_variant(args_t *args, bcf1_t *rec) int trim_beg = 0; int var_type = bcf_get_variant_type(rec,ialt); int var_len = rec->d.var[ialt].n; - if ( var_type & VCF_INDEL ) trim_beg = 1; + if ( var_type & VCF_INDEL ) + { + // normally indel starts one base after, but not if the first base of the fa reference is deleted + if ( rec->d.allele[0][0] == rec->d.allele[ialt][0] ) + trim_beg = 1; + else + trim_beg = 0; + } else if ( (var_type & VCF_OTHER) && !strcasecmp(rec->d.allele[ialt],"") ) { trim_beg = 1; @@ -698,7 +705,6 @@ static void apply_variant(args_t *args, bcf1_t *rec) if ( len_diff <= 0 ) { // deletion or same size event - assert( 
args->fa_buf.l >= idx+rec->rlen ); args->prev_base = args->fa_buf.s[idx+rec->rlen-1]; args->prev_base_pos = rec->pos + rec->rlen - 1; diff --git a/test/consensus.14.fa b/test/consensus.14.fa new file mode 100644 index 000000000..e17f74682 --- /dev/null +++ b/test/consensus.14.fa @@ -0,0 +1,2 @@ +>1 +GACT diff --git a/test/consensus.14.out b/test/consensus.14.out new file mode 100644 index 000000000..1ab73d61a --- /dev/null +++ b/test/consensus.14.out @@ -0,0 +1,2 @@ +>1 +ACT diff --git a/test/consensus.14.vcf b/test/consensus.14.vcf new file mode 100644 index 000000000..cad487932 --- /dev/null +++ b/test/consensus.14.vcf @@ -0,0 +1,5 @@ +##fileformat=VCFv4.2 +##reference=genome.fa +##contig= +#CHROM POS ID REF ALT QUAL FILTER . +1 1 . GA A . . . diff --git a/test/test.pl b/test/test.pl index 64b4cb2d5..858753786 100755 --- a/test/test.pl +++ b/test/test.pl @@ -602,6 +602,7 @@ test_vcf_consensus($opts,in=>'consensus.11',out=>'consensus.11.2.out',fa=>'consensus.11.fa',args=>q[-s smpl -a N]); test_vcf_consensus($opts,in=>'consensus.12',out=>'consensus.12.out',fa=>'consensus.12.fa',args=>''); test_vcf_consensus($opts,in=>'consensus.13',out=>'consensus.13.out',fa=>'consensus.13.fa',args=>''); +test_vcf_consensus($opts,in=>'consensus.14',out=>'consensus.14.out',fa=>'consensus.14.fa',args=>''); test_mpileup($opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.1.out',args=>q[-r17:100-150],test_list=>1); test_mpileup($opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.2.out',args=>q[-a DP,DV -r17:100-600]); # test files from samtools mpileup test suite test_mpileup($opts,in=>[qw(mpileup.1)],out=>'mpileup/mpileup.3.out',args=>q[-B --ff 0x14 -r17:1050-1060]); # test file converted to vcf from samtools mpileup test suite From 808ced9ae2bb7d0939ed701199fbf876af0d0634 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 3 Nov 2020 15:37:48 +0000 Subject: [PATCH 05/81] Upgrade AppVeyor image This fixes the MSYS2 signing keys and also upgrades pacman 
to support their .zst packages. See https://www.msys2.org/news/#2020-06-29-new-packagers --- .appveyor.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.appveyor.yml b/.appveyor.yml index 3a9170dee..644776d19 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -22,6 +22,9 @@ skip_tags: true # - docs/* # - '**/*.html' +# Appveyor Windows images are based on Visual studio version +image: Visual Studio 2019 + # We use Mingw/Msys, so use pacman for installs install: - set HOME=. From 465a63c92413604d7c639f558cf1891f95af8f50 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 3 Nov 2020 16:12:41 +0000 Subject: [PATCH 06/81] Remove appveyor "only build develop" branch restriction So that it will test branches that will later become pull requests. --- .appveyor.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 644776d19..02554d74e 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -5,11 +5,6 @@ version: 'vers.{build}' # branches to build branches: - # Whitelist - only: - - develop - - # Blacklist except: - gh-pages From dc40b113d15ade803bc9a1b6431b63cb600401d4 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 4 Nov 2020 10:19:46 +0000 Subject: [PATCH 07/81] New -C option and `append-missing` logic The -C option allows to read a long list of options from a file to prevent impossibly long command lines. The `append-missing` logic allows annotations to be added for each ALT allele in the same order as they appear in the VCF. Note that this is not bullet proof. 
In order for this to work: - the annotation file must have one line per ALT allele - fields must contain a single value as multiple values are appended as they are and would break the correspondence between the alleles and values --- bcftools.h | 20 ++ doc/bcftools.txt | 12 +- filter.c | 21 -- test/annotate.missing-append.1.out | 13 + test/annotate.missing-append.tab | 13 + test/annotate.missing-append.vcf | 12 + test/annotate21.out | 2 +- test/test.pl | 1 + vcfannotate.c | 423 +++++++++++++++++++++-------- 9 files changed, 383 insertions(+), 134 deletions(-) create mode 100644 test/annotate.missing-append.1.out create mode 100644 test/annotate.missing-append.tab create mode 100644 test/annotate.missing-append.vcf diff --git a/bcftools.h b/bcftools.h index 96237eefb..07f1db35e 100644 --- a/bcftools.h +++ b/bcftools.h @@ -101,4 +101,24 @@ static inline double phred_score(double prob) return prob>99 ? 99 : prob; } +static const uint64_t bcf_double_missing = 0x7ff0000000000001; +static const uint64_t bcf_double_vector_end = 0x7ff0000000000002; +static inline void bcf_double_set(double *ptr, uint64_t value) +{ + union { uint64_t i; double d; } u; + u.i = value; + *ptr = u.d; +} +static inline int bcf_double_test(double d, uint64_t value) +{ + union { uint64_t i; double d; } u; + u.d = d; + return u.i==value ? 1 : 0; +} +#define bcf_double_set_vector_end(x) bcf_double_set(&(x),bcf_double_vector_end) +#define bcf_double_set_missing(x) bcf_double_set(&(x),bcf_double_missing) +#define bcf_double_is_vector_end(x) bcf_double_test((x),bcf_double_vector_end) +#define bcf_double_is_missing(x) bcf_double_test((x),bcf_double_missing) +#define bcf_double_is_missing_or_vector_end(x) (bcf_double_test((x),bcf_double_missing) || bcf_double_test((x),bcf_double_vector_end)) + #endif diff --git a/doc/bcftools.txt b/doc/bcftools.txt index 0944acc52..b26aa5bc7 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -341,6 +341,13 @@ Add or remove annotations. 
+ See also the *-l, --merge-logic* option. +*-C, --columns-file* 'file':: + Read the list of columns from a file (normally given via the *-c, --columns* option). + "-" to skip a column of the annotation file. + One column name per row, an additional space- or tab-separated field can + be present to indicate the merge logic (normally given via the *-l, --merge-logic* option). + This is useful when many annotations are added at once. + *-e, --exclude* 'EXPRESSION':: exclude sites for which 'EXPRESSION' is true. For valid expressions see *<>*. @@ -373,10 +380,11 @@ Add or remove annotations. *-k, --keep-sites*:: keep sites which do not pass *-i* and *-e* expressions instead of discarding them -*-l, --merge-logic* 'tag':'first'|'append'|'unique'|'sum'|'avg'|'min'|'max'[,...]:: +*-l, --merge-logic* 'tag':'first'|'append'|'append-missing'|'unique'|'sum'|'avg'|'min'|'max'[,...]:: if multiple regions overlap a single record, the option defines how to treat multiple annotation values when setting 'tag' in the destination file: use the first encountered value ignoring - the rest ('first'); append allowing duplicates ('append'); append discarding duplicate values ('unique'); + the rest ('first'); append allowing duplicates ('append'); append even if the appended value is missing, + i.e. is a dot ('append-missing'); append discarding duplicate values ('unique'); sum the values ('sum', numeric fields only); average the values ('avg'); use the minimum value ('min') or the maximum ('max'). 
+ diff --git a/filter.c b/filter.c index 8d8241310..54f17d322 100644 --- a/filter.c +++ b/filter.c @@ -57,27 +57,6 @@ static int filter_ninit = 0; # define __FUNCTION__ __func__ #endif -static const uint64_t bcf_double_missing = 0x7ff0000000000001; -static const uint64_t bcf_double_vector_end = 0x7ff0000000000002; -static inline void bcf_double_set(double *ptr, uint64_t value) -{ - union { uint64_t i; double d; } u; - u.i = value; - *ptr = u.d; -} -static inline int bcf_double_test(double d, uint64_t value) -{ - union { uint64_t i; double d; } u; - u.d = d; - return u.i==value ? 1 : 0; -} -#define bcf_double_set_vector_end(x) bcf_double_set(&(x),bcf_double_vector_end) -#define bcf_double_set_missing(x) bcf_double_set(&(x),bcf_double_missing) -#define bcf_double_is_vector_end(x) bcf_double_test((x),bcf_double_vector_end) -#define bcf_double_is_missing(x) bcf_double_test((x),bcf_double_missing) -#define bcf_double_is_missing_or_vector_end(x) (bcf_double_test((x),bcf_double_missing) || bcf_double_test((x),bcf_double_vector_end)) - - typedef struct _token_t { // read-only values, same for all VCF lines diff --git a/test/annotate.missing-append.1.out b/test/annotate.missing-append.1.out new file mode 100644 index 000000000..d5826998a --- /dev/null +++ b/test/annotate.missing-append.1.out @@ -0,0 +1,13 @@ +##fileformat=VCFv4.2 +##FILTER= +##INFO= +##INFO= +##INFO= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 752566 . G A,*,C,T . . STR=GA,.,GC,G,.,T;INT=1,.,2,3,.,3;FLT=1.0,.,2.0,3.0,.,3.0 +1 752566 . G A,C . . STR=GA,GC;INT=1,2;FLT=1.0,2.0 +1 800000 . ACGTACGT TCGTACGT,ACGT . . STR=AT0,.;INT=0,.;FLT=0.0,. +1 800008 . C T . . . +1 800009 . C T . . . +1 800100 . ACGT ACC,TCGT,AC . . STR=.,AT,.;INT=.,1,.;FLT=.,1.0,. diff --git a/test/annotate.missing-append.tab b/test/annotate.missing-append.tab new file mode 100644 index 000000000..21c83a4ad --- /dev/null +++ b/test/annotate.missing-append.tab @@ -0,0 +1,13 @@ +1 752566 G * . . . 
+1 752566 G T G,.,T 3,.,3 3.0,.,3.0 +1 752566 G C GC 2 2.0 +1 752566 G A GA 1 1.0 +1 800000 A C AC0 0 0.0 +1 800000 A T AT0 0 0.0 +1 800000 A G AG0 0 0.0 +1 800004 A C AC4 4 4.0 +1 800004 A T AT4 4 4.0 +1 800004 A G AG4 4 4.0 +1 800100 A C AC 1 1.0 +1 800100 A T AT 1 1.0 +1 800100 A G AG 1 1.0 diff --git a/test/annotate.missing-append.vcf b/test/annotate.missing-append.vcf new file mode 100644 index 000000000..ac81ccf38 --- /dev/null +++ b/test/annotate.missing-append.vcf @@ -0,0 +1,12 @@ +##fileformat=VCFv4.2 +##INFO= +##INFO= +##INFO= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 752566 . G A,*,C,T . . . +1 752566 . G A,C . . . +1 800000 . ACGTACGT TCGTACGT,ACGT . . . +1 800008 . C T . . . +1 800009 . C T . . . +1 800100 . ACGT ACC,TCGT,AC . . . diff --git a/test/annotate21.out b/test/annotate21.out index cf8690119..cfaf1d3e2 100644 --- a/test/annotate21.out +++ b/test/annotate21.out @@ -5,7 +5,7 @@ ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO 1 1141506 . G A . PASS ABC=A -1 1141517 . C A . PASS ABC=B +1 1141517 . C A . PASS ABC=A 1 1141556 . G C . PASS ABC=B 1 1141570 . G C . PASS ABC=B 1 1141643 . T C . PASS . 
diff --git a/test/test.pl b/test/test.pl index 858753786..20da7fa3e 100755 --- a/test/test.pl +++ b/test/test.pl @@ -412,6 +412,7 @@ test_vcf_annotate($opts,in=>'annotate20.dst',vcf=>'annotate20.src',out=>'annotate20.2.out',args=>'-c +FMT/GT'); test_vcf_annotate($opts,in=>'annotate20.dst',vcf=>'annotate20.src',out=>'annotate20.3.out',args=>'-c -FMT/GT'); test_vcf_annotate($opts,in=>'annotate.multi',tab=>'annotate.multi',out=>'annotate.multi.1.out',args=>'-c CHROM,POS,REF,ALT,ANN -l ANN:append'); +test_vcf_annotate($opts,in=>'annotate.missing-append',tab=>'annotate.missing-append',out=>'annotate.missing-append.1.out',args=>'-c CHROM,POS,REF,ALT,STR,INT,FLT -l STR:append-missing,INT:append-missing,FLT:append-missing'); test_vcf_plugin($opts,in=>'plugin1',out=>'missing2ref.out',cmd=>'+missing2ref --no-version'); test_vcf_plugin($opts,in=>'plugin1',out=>'missing2ref.out',cmd=>'+setGT --no-version',args=>'-- -t . -n 0'); test_vcf_plugin($opts,in=>'setGT',out=>'setGT.1.out',cmd=>'+setGT --no-version',args=>'-- -t q -n 0 -i \'GT~"." 
&& FMT/DP=30 && GQ=150\''); diff --git a/vcfannotate.c b/vcfannotate.c index 66a96526f..15a23bf35 100644 --- a/vcfannotate.c +++ b/vcfannotate.c @@ -78,10 +78,13 @@ annot_line_t; #define MM_AVG 4 #define MM_MIN 5 #define MM_MAX 6 +#define MM_APPEND_MISSING 7 // missing values will be transferred as well typedef struct _annot_col_t { int icol, replace, number; // number: one of BCF_VL_* types char *hdr_key_src, *hdr_key_dst; + // The setters return 0 on successful update of the bcf record, negative value (bcf_update_* return status) on errors, + // or 1 on (repeated partial updates) concluded with a src=NULL call int (*setter)(struct _args_t *, bcf1_t *dst, struct _annot_col_t *, void *src); // the last is the annotation line, either src bcf1_t or annot_line_t int (*getter)(struct _args_t *, bcf1_t *src, struct _annot_col_t *, void **ptr, int *mptr); int merge_method; // one of the MM_* defines @@ -94,7 +97,7 @@ typedef struct _annot_col_t double *mm_dbl; void *ptr; - int mptr; + int mptr, done; } annot_col_t; @@ -128,7 +131,9 @@ typedef struct _args_t vcmp_t *vcmp; // for matching annotation and VCF lines by allele annot_line_t *alines; // buffered annotation lines - int nalines, malines; + annot_line_t *aline_missing; + uint32_t *srt_alines; // sorted indexes (iALT<<16 || iAline) + int nalines, malines, nsrt_alines, msrt_alines; int ref_idx, alt_idx, chr_idx, beg_idx, end_idx; // -1 if not present annot_col_t *cols; // column indexes and setters int ncols; @@ -152,6 +157,7 @@ typedef struct _args_t char *remove_annots, *columns, *rename_chrs, *sample_names, *mark_sites; kstring_t merge_method_str; int argc, drop_header, record_cmd_line, tgts_is_vcf, mark_sites_logic, force, single_overlaps; + int columns_is_file, has_append_mode; } args_t; @@ -496,23 +502,23 @@ static int setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *dat hts_expand(int,1,args->mtmpi,args->tmpi); args->tmpi[0] = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, tab->cols[col->icol]); if 
( args->tmpi[0]<0 ) error("The FILTER is not defined in the header: %s\n", tab->cols[col->icol]); - if ( col->replace==SET_OR_APPEND ) { bcf_add_filter(args->hdr_out,line,args->tmpi[0]); return 0; } + if ( col->replace==SET_OR_APPEND ) return bcf_add_filter(args->hdr_out,line,args->tmpi[0]); if ( col->replace!=REPLACE_MISSING ) { bcf_update_filter(args->hdr_out,line,NULL,0); - bcf_update_filter(args->hdr_out,line,args->tmpi,1); - return 0; + return bcf_update_filter(args->hdr_out,line,args->tmpi,1); } // only update missing FILTER if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT); if ( !line->d.n_flt ) - bcf_update_filter(args->hdr_out,line,args->tmpi,1); + return bcf_update_filter(args->hdr_out,line,args->tmpi,1); + return 0; } static int vcf_setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { - int i; + int i, ret = 0; bcf1_t *rec = (bcf1_t*) data; if ( !(rec->unpacked & BCF_UN_FLT) ) bcf_unpack(rec, BCF_UN_FLT); if ( !(line->unpacked & BCF_UN_FLT) ) bcf_unpack(line, BCF_UN_FLT); @@ -523,9 +529,9 @@ static int vcf_setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void for (i=0; id.n_flt; i++) { const char *flt = bcf_hdr_int2id(args->files->readers[1].header, BCF_DT_ID, rec->d.flt[i]); - bcf_add_filter(args->hdr_out,line,bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt)); + if ( bcf_add_filter(args->hdr_out,line,bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt)) < 0 ) ret = -1; } - return 0; + return ret; } hts_expand(int,rec->d.n_flt,args->mtmpi,args->tmpi); for (i=0; id.n_flt; i++) @@ -534,8 +540,7 @@ static int vcf_setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void args->tmpi[i] = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, flt); } bcf_update_filter(args->hdr_out,line,NULL,0); - bcf_update_filter(args->hdr_out,line,args->tmpi,rec->d.n_flt); - return 0; + return bcf_update_filter(args->hdr_out,line,args->tmpi,rec->d.n_flt); } static int setter_id(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { @@ 
-592,9 +597,9 @@ static int vcf_setter_ref(args_t *args, bcf1_t *line, annot_col_t *col, void *da als[0] = rec->d.allele[0]; int i; for (i=1; in_allele; i++) als[i] = line->d.allele[i]; - bcf_update_alleles(args->hdr_out, line, als, line->n_allele); + int ret = bcf_update_alleles(args->hdr_out, line, als, line->n_allele); free(als); - return 0; + return ret; } static int vcf_setter_alt(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { @@ -608,9 +613,9 @@ static int vcf_setter_alt(args_t *args, bcf1_t *line, annot_col_t *col, void *da const char **als = (const char**) malloc(sizeof(char*)*rec->n_allele); als[0] = line->d.allele[0]; for (i=1; in_allele; i++) als[i] = rec->d.allele[i]; - bcf_update_alleles(args->hdr_out, line, als, rec->n_allele); + int ret = bcf_update_alleles(args->hdr_out, line, als, rec->n_allele); free(als); - return 0; + return ret; } static int setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { @@ -684,8 +689,7 @@ static int setter_ARinfo_int32(args_t *args, bcf1_t *line, annot_col_t *col, int args->tmpi2[i] = args->tmpi[ map[i] ]; } - bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst); - return 0; + return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst); } static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { @@ -697,25 +701,39 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d if ( !tab ) { - if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && col->merge_method!=MM_APPEND ) - error("Error: at the moment only the sum,avg,min,max,append options are supported with --merge-logic for INFO type=Integer\n"); + if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && + col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && + col->merge_method!=MM_APPEND && + col->merge_method!=MM_APPEND_MISSING ) + error("Error: at the moment 
only the sum,avg,min,max,append,append-missing options are supported with --merge-logic for INFO type=Integer\n"); } int i,ntmpi = 0; - if ( tab ) + if ( tab ) // has data, not flushing yet { char *str = tab->cols[col->icol], *end = str; - if ( str[0]=='.' && str[1]==0 ) return 0; + if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING ) return 1; while ( *end ) { - int val = strtol(str, &end, 10); - if ( end==str ) - error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); ntmpi++; hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi); - args->tmpi[ntmpi-1] = val; - str = end+1; + if ( str[0]=='.' && (str[1]==0 || str[1]==',') ) + { + if ( col->merge_method==MM_APPEND_MISSING ) + args->tmpi[ntmpi-1] = bcf_int32_missing; + else + ntmpi--; + if ( str[1]==0 ) end = str+1; + str += 2; + } + else + { + args->tmpi[ntmpi-1] = strtol(str, &end, 10); + if ( end==str ) + error("Could not parse %s at %s:%"PRId64" .. 
[%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); + str = end+1; + } } if ( col->merge_method!=MM_FIRST ) { @@ -728,7 +746,7 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d } else { - if ( col->merge_method==MM_APPEND ) + if ( col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING ) { int nori = col->mm_dbl_nused; col->mm_dbl_nused += ntmpi; @@ -748,9 +766,10 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d } } col->mm_dbl_ndat++; + return 1; } } - else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND ) + else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING ) { ntmpi = col->mm_dbl_nused; hts_expand(int32_t,ntmpi,args->mtmpi,args->tmpi); @@ -774,8 +793,7 @@ static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *d if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0; } - bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi); - return 0; + return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi); } static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { @@ -792,8 +810,7 @@ static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, voi if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0; } - bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi); - return 0; + return bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi); } static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpf) { @@ -824,8 +841,7 @@ static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int args->tmpf2[i] = args->tmpf[ map[i] ]; } - 
bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst); - return 0; + return bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst); } static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { @@ -837,25 +853,39 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void * if ( !tab ) { - if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && col->merge_method!=MM_APPEND ) - error("Error: at the moment only the sum,avg,min,max,append options are supported with --merge-logic for INFO type=Float\n"); + if ( col->merge_method!=MM_SUM && col->merge_method!=MM_AVG && + col->merge_method!=MM_MIN && col->merge_method!=MM_MAX && + col->merge_method!=MM_APPEND && + col->merge_method!=MM_APPEND_MISSING ) + error("Error: at the moment only the sum,avg,min,max,append,append-missing options are supported with --merge-logic for INFO type=Float\n"); } int i,ntmpf = 0; if ( tab ) { char *str = tab->cols[col->icol], *end = str; - if ( str[0]=='.' && str[1]==0 ) return 0; + if ( str[0]=='.' && str[1]==0 && col->merge_method!=MM_APPEND_MISSING ) return 1; while ( *end ) { - double val = strtod(str, &end); - if ( end==str ) - error("Could not parse %s at %s:%"PRId64" .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); ntmpf++; hts_expand(float,ntmpf,args->mtmpf,args->tmpf); - args->tmpf[ntmpf-1] = val; - str = end+1; + if ( str[0]=='.' && (str[1]==0 || str[1]==',') ) + { + if ( col->merge_method==MM_APPEND_MISSING ) + bcf_float_set_missing(args->tmpf[ntmpf-1]); + else + ntmpf--; + if ( str[1]==0 ) end = str+1; + str += 2; + } + else + { + args->tmpf[ntmpf-1] = strtod(str, &end); + if ( end==str ) + error("Could not parse %s at %s:%"PRId64" .. 
[%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1,tab->cols[col->icol]); + str = end+1; + } } if ( col->merge_method!=MM_FIRST ) { @@ -864,17 +894,27 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void * col->mm_dbl_nused = ntmpf; hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl); for (i=0; imm_dbl[i] = args->tmpf[i]; + { + if ( bcf_float_is_missing(args->tmpf[i]) ) + bcf_double_set_missing(col->mm_dbl[i]); + else + col->mm_dbl[i] = args->tmpf[i]; + } } else { - if ( col->merge_method==MM_APPEND ) + if ( col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING ) { int nori = col->mm_dbl_nused; col->mm_dbl_nused += ntmpf; hts_expand(double,col->mm_dbl_nused,col->mm_dbl_nalloc,col->mm_dbl); for (i=0; imm_dbl[i+nori] = args->tmpf[i]; + { + if ( bcf_float_is_missing(args->tmpf[i]) ) + bcf_double_set_missing(col->mm_dbl[i+nori]); + else + col->mm_dbl[i+nori] = args->tmpf[i]; + } } else { @@ -888,13 +928,20 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void * } } col->mm_dbl_ndat++; + return 1; } } - else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND ) + else if ( col->merge_method==MM_SUM || col->merge_method==MM_MIN || col->merge_method==MM_MAX || col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING ) { ntmpf = col->mm_dbl_nused; hts_expand(int32_t,ntmpf,args->mtmpf,args->tmpf); - for (i=0; itmpf[i] = col->mm_dbl[i]; + for (i=0; imm_dbl[i]) ) + bcf_float_set_missing(args->tmpf[i]); + else + args->tmpf[i] = col->mm_dbl[i]; + } col->mm_dbl_nused = col->mm_dbl_ndat = 0; } else if ( col->merge_method==MM_AVG ) @@ -914,8 +961,7 @@ static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void * if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0; } - bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf); - return 0; + 
return bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf); } static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { @@ -932,8 +978,7 @@ static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, vo if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0; } - bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf); - return 0; + return bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf); } int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als) @@ -990,8 +1035,7 @@ static int setter_ARinfo_string(args_t *args, bcf1_t *line, annot_col_t *col, in int ret = copy_string_field(args->tmps,map[i],lsrc,&args->tmpks,i); if ( ret!=0 ) error("[%s:%d %s] Failed to copy a string field\n", __FILE__,__LINE__,__func__); } - bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s); - return 0; + return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s); } void khash_str2int_clear_free(void *_hash) { @@ -1021,7 +1065,7 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d { len = strlen(tab->cols[col->icol]); if ( !len ) return 0; - if ( len==1 && tab->cols[col->icol][0]=='.' ) return 0; + if ( len==1 && tab->cols[col->icol][0]=='.' 
&& col->merge_method!=MM_APPEND_MISSING ) return 1; } if ( col->merge_method!=MM_FIRST ) @@ -1031,17 +1075,17 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d if ( data ) { - assert( col->merge_method==MM_APPEND || col->merge_method==MM_UNIQUE ); + assert( col->merge_method==MM_APPEND || col->merge_method==MM_APPEND_MISSING || col->merge_method==MM_UNIQUE ); if ( col->merge_method==MM_UNIQUE ) { if ( !col->mm_str_hash ) col->mm_str_hash = (khash_t(str2int)*)khash_str2int_init(); - if ( khash_str2int_has_key(col->mm_str_hash, tab->cols[col->icol]) ) return 0; + if ( khash_str2int_has_key(col->mm_str_hash, tab->cols[col->icol]) ) return 1; khash_str2int_inc(col->mm_str_hash, strdup(tab->cols[col->icol])); } if ( col->mm_kstr.l ) kputc(',',&col->mm_kstr); kputs(tab->cols[col->icol], &col->mm_kstr); - return 0; + return 1; } if ( col->mm_kstr.l ) @@ -1052,12 +1096,10 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d else return 0; - if ( !data ) // flush the line - { - if ( col->merge_method==MM_UNIQUE ) - khash_str2int_clear_free(col->mm_str_hash); - col->mm_kstr.l = 0; - } + // flush the line + if ( col->merge_method==MM_UNIQUE ) + khash_str2int_clear_free(col->mm_str_hash); + col->mm_kstr.l = 0; } else { @@ -1069,8 +1111,7 @@ static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *d return setter_ARinfo_string(args,line,col,tab->nals,tab->als); } - bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps); - return 0; + return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps); } static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) { @@ -1093,8 +1134,7 @@ static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, voi if ( ret>0 && (args->tmps2[0]!='.' 
|| args->tmps2[1]!=0) ) return 0; } - bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps); - return 0; + return bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps); } static int genotypes_to_string(args_t *args, int nsrc1, int32_t *src, int nsmpl_dst, kstring_t *str) { @@ -1764,7 +1804,6 @@ static int vcf_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, } } return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,nsmpl_dst*ndst1); - } static int vcf_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data) @@ -1970,11 +2009,45 @@ static void init_columns(args_t *args) int need_sample_map = 0; int sample_map_ok = init_sample_map(args, args->tgts_is_vcf?args->files->readers[1].header:NULL, args->hdr); + kstring_t tmp = {0,0,0}; + if ( args->columns_is_file ) + { + int i,n; + char **str = hts_readlist(args->columns, args->columns_is_file, &n); + if ( !str ) error("Could not parse %s\n", args->columns); + for (i=0; imerge_method_str.l ) kputc(',',&args->merge_method_str); + kputs(str[i],&args->merge_method_str); + kputc(':',&args->merge_method_str); + kputs(ptr,&args->merge_method_str); + } + } + if ( tmp.l ) kputc(',',&tmp); + kputs(str[i],&tmp); + free(str[i]); + } + free(str); + free(args->columns); + args->columns = tmp.s; + tmp.l = tmp.m = 0; + tmp.s = NULL; + } + void *skip_fmt = NULL, *skip_info = NULL; if ( args->tgts_is_vcf ) args->columns = columns_complement(args->columns, &skip_info, &skip_fmt); - kstring_t str = {0,0,0}, tmp = {0,0,0}; + kstring_t str = {0,0,0}; char *ss = args->columns, *se = ss; args->ncols = 0; int icol = -1, has_fmt_str = 0; @@ -2393,6 +2466,11 @@ static void init_merge_method(args_t *args) int mm_type = MM_FIRST; if ( !strcasecmp("unique",mm_type_str) ) mm_type = MM_UNIQUE; else if ( !strcasecmp("append",mm_type_str) ) mm_type = MM_APPEND; + else if ( !strcasecmp("append-missing",mm_type_str) ) + { + mm_type = MM_APPEND_MISSING; + if ( 
args->ref_idx!=-1 ) args->has_append_mode = 1; + } else if ( !strcasecmp("sum",mm_type_str) ) mm_type = MM_SUM; else if ( !strcasecmp("avg",mm_type_str) ) mm_type = MM_AVG; else if ( !strcasecmp("min",mm_type_str) ) mm_type = MM_MIN; @@ -2401,7 +2479,7 @@ static void init_merge_method(args_t *args) for (i=0; incols; i++) { if ( strcmp(args->cols[i].hdr_key_dst,args->tmpks.s) ) continue; - if ( mm_type==MM_APPEND && args->cols[i].number!=BCF_VL_VAR ) + if ( (mm_type==MM_APPEND || mm_type==MM_APPEND_MISSING) && args->cols[i].number!=BCF_VL_VAR ) error("Error: --merge-logic append can be requested only for tags of variable length (Number=.)\n"); args->cols[i].merge_method = mm_type; break; @@ -2409,6 +2487,20 @@ static void init_merge_method(args_t *args) if ( i==args->ncols ) error("No such tag in the destination file: %s\n", args->tmpks.s); sb = *se ? se + 1 : se; } + if ( args->has_append_mode ) + { + // create a missing line to insert missing values when VCF ALT finds no match in the annotation file + args->aline_missing = (annot_line_t*)calloc(1,sizeof(*args->aline_missing)); + int ncol = 0; + for (i=0; incols; i++) + if ( ncol < args->cols[i].icol + 1 ) ncol = args->cols[i].icol + 1; + if ( ncol < args->ref_idx + 1 ) ncol = args->ref_idx + 1; + args->aline_missing->mcols = ncol; + args->aline_missing->ncols = ncol; + args->aline_missing->cols = (char**) malloc(ncol*sizeof(char*)); + for (i=0; ialine_missing->cols[i] = strdup("."); + } } static void rename_chrs(args_t *args, char *fname) @@ -2531,6 +2623,12 @@ static void destroy_data(args_t *args) free(args->cols[i].ptr); } free(args->cols); + if ( args->aline_missing ) + { + for (i=0; ialine_missing->ncols; i++) free(args->aline_missing->cols[i]); + free(args->aline_missing->cols); + free(args->aline_missing); + } for (i=0; imalines; i++) { free(args->alines[i].cols); @@ -2538,6 +2636,7 @@ static void destroy_data(args_t *args) free(args->alines[i].line.s); } free(args->alines); + free(args->srt_alines); if ( 
args->tgt_idx ) { regidx_destroy(args->tgt_idx); @@ -2627,7 +2726,6 @@ static void buffer_annot_lines(args_t *args, bcf1_t *line, int start_pos, int en } else i++; } - if ( args->ref_idx==-1 && args->nalines ) return; while ( !bcf_sr_regions_overlap(args->tgts, bcf_seqname(args->hdr,line), start_pos,end_pos) ) @@ -2657,6 +2755,7 @@ static void annotate(args_t *args, bcf1_t *line) int has_overlap = 0; if ( args->tgt_idx ) { + for (j=0; jncols; j++) args->cols[j].done = 0; if ( regidx_overlap(args->tgt_idx, bcf_seqname(args->hdr,line),line->pos,line->pos+line->rlen-1, args->tgt_itr) ) { while ( regitr_overlap(args->tgt_itr) ) @@ -2667,47 +2766,148 @@ static void annotate(args_t *args, bcf1_t *line) tmp->end = args->tgt_itr->end; parse_annot_line(args, regitr_payload(args->tgt_itr,char*), tmp); for (j=0; jncols; j++) - if ( args->cols[j].setter(args,line,&args->cols[j],tmp) ) + { + if ( args->cols[j].done==1 ) continue; + int ret = args->cols[j].setter(args,line,&args->cols[j],tmp); + if ( ret < 0 ) error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + if ( ret==0 ) + args->cols[j].done = 1; + } } has_overlap = 1; } for (j=0; jncols; j++) - if ( args->cols[j].merge_method != MM_FIRST ) - args->cols[j].setter(args,line,&args->cols[j],NULL); + { + if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue; + if ( args->cols[j].setter(args,line,&args->cols[j],NULL) < 0 ) + error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + } } else if ( args->tgts ) { - // Buffer annotation lines. When multiple ALT alleles are present in the - // annotation file, at least one must match one of the VCF alleles. + // Buffer annotation lines. When multiple ALT alleles are present in the annotation file, at least one + // must match some of the VCF alleles. 
If the append-missing mode is set (and REF+ALT is requested), the + // buffered lines will annotate the VCF respecting the order in ALT and when no matching line is found + // for an ALT, missing value is appended instead. int len = 0; bcf_get_variant_types(line); for (i=1; in_allele; i++) if ( len > line->d.var[i].n ) len = line->d.var[i].n; int end_pos = len<0 ? line->pos - len : line->pos; buffer_annot_lines(args, line, line->pos, end_pos); + + args->nsrt_alines = 0; + hts_expand(uint32_t,args->nalines,args->msrt_alines,args->srt_alines); + if ( args->nalines >= 0xffff || line->n_allele >= 0xffff ) + error("Error: too many alleles or annotation lines in the buffer at %s:%"PRId64" (todo:skip?)\n",bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + + // Find matching lines for (i=0; inalines; i++) { if ( line->pos > args->alines[i].end || end_pos < args->alines[i].start ) continue; - if ( args->ref_idx != -1 ) + if ( args->ref_idx != -1 ) // REF+ALT matching requested { - if ( vcmp_set_ref(args->vcmp, line->d.allele[0], args->alines[i].als[0]) < 0 ) continue; // refs not compatible + if ( line->pos!=args->alines[i].start || vcmp_set_ref(args->vcmp, line->d.allele[0], args->alines[i].als[0]) < 0 ) continue; // refs are not compatible for (j=1; jalines[i].nals; j++) { - if ( line->n_allele==1 && args->alines[i].als[j][0]=='.' && args->alines[i].als[j][1]==0 ) break; // no ALT allele in VCF and annot file has "." - if ( vcmp_find_allele(args->vcmp, line->d.allele+1, line->n_allele - 1, args->alines[i].als[j]) >= 0 ) break; + int ialt; + if ( line->n_allele==1 && args->alines[i].als[j][0]=='.' && args->alines[i].als[j][1]==0 ) // match: no ALT allele in VCF and annot file has "." 
+ ialt = 0; + else + { + ialt = vcmp_find_allele(args->vcmp, line->d.allele+1, line->n_allele - 1, args->alines[i].als[j]); + if ( ialt < 0 ) continue; + ialt++; + } + args->srt_alines[args->nsrt_alines++] = (ialt<<16) | i; + has_overlap = 1; + break; + } + } + else // overlap, REF+ALT matching not requested + { + args->srt_alines[args->nsrt_alines++] = (0xffff<<16) | i; + has_overlap = 1; + } + } + // Sort lines if needed + if ( args->has_append_mode ) + { + // insertion sort by VCF ALT index (top bits) and alines index (low bits) + uint32_t tmp; + for (i=1; insrt_alines; i++) + for (j=i; j>0 && args->srt_alines[j] < args->srt_alines[j-1]; j--) + tmp = args->srt_alines[j], args->srt_alines[j] = args->srt_alines[j-1], args->srt_alines[j-1] = tmp; + } + // Annotate + for (j=0; jncols; j++) args->cols[j].done = 0; + int ialt_exp = 1; + for (i=0; insrt_alines; i++) + { + int ialt = args->srt_alines[i] >> 16; + int ilin = args->srt_alines[i] & 0xffff; + if ( args->has_append_mode ) + { + if ( ialt_exp > ialt ) continue; // multiple annotation lines for the same position + if ( ialt_exp < ialt ) + { + // REF+ALT matching requested, append-missing mode: insert "." 
if no annotation line was found for the ALT + while ( ialt_exp++ < ialt ) + { + for (j=0; jncols; j++) + { + if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue; + if ( args->cols[j].done==1 ) continue; + int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing); + if ( ret < 0 ) + error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + if ( ret==0 ) + args->cols[j].done = 1; + } + } } - if ( j==args->alines[i].nals ) continue; // none of the annot alleles present in VCF's ALT } - // there is a matching line - has_overlap = 1; for (j=0; jncols; j++) - if ( args->cols[j].setter(args,line,&args->cols[j],&args->alines[i]) ) + { + if ( args->cols[j].done==1 ) continue; + int ret = args->cols[j].setter(args,line,&args->cols[j],&args->alines[ilin]); + if ( ret < 0 ) error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + if ( ret==0 ) + args->cols[j].done = 1; + } + ialt_exp = ialt + 1; + } + if ( args->nsrt_alines ) + { + // In the append-missing mode fill missing values to all trailing ALTs, but only if at least one + // record was found. Otherwise the row will be left without annotation.
+ if ( args->has_append_mode && ialt_exp < line->n_allele ) + { + while ( ialt_exp++ < line->n_allele ) + { + for (j=0; jncols; j++) + { + if ( args->cols[j].merge_method != MM_APPEND_MISSING ) continue; + if ( args->cols[j].done==1 ) continue; + int ret = args->cols[j].setter(args,line,&args->cols[j],args->aline_missing); + if ( ret < 0 ) + error("fixme: Could not set missing %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + if ( ret==0 ) + args->cols[j].done = 1; + } + } + } + // Flush + for (j=0; jncols; j++) + { + if ( args->cols[j].done==1 || args->cols[j].merge_method == MM_FIRST ) continue; + int ret = args->cols[j].setter(args,line,&args->cols[j],NULL); + if ( ret < 0 ) + error("fixme: Could not set %s at %s:%"PRId64"\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),(int64_t) line->pos+1); + } } - for (j=0; jncols; j++) - if ( args->cols[j].merge_method != MM_FIRST ) - args->cols[j].setter(args,line,&args->cols[j],NULL); } else if ( args->files->nreaders == 2 ) { @@ -2752,28 +2952,29 @@ static void usage(args_t *args) fprintf(stderr, "Usage: bcftools annotate [options] \n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -a, --annotations VCF file or tabix-indexed file with annotations: CHR\\tPOS[\\tVALUE]+\n"); - fprintf(stderr, " --collapse matching records by , see man page for details [some]\n"); - fprintf(stderr, " -c, --columns list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. 
See man page for details\n"); - fprintf(stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); - fprintf(stderr, " --force continue despite parsing error (at your own risk!)\n"); - fprintf(stderr, " -h, --header-lines lines which should be appended to the VCF header\n"); - fprintf(stderr, " -I, --set-id [+] set ID column, see man page for details\n"); - fprintf(stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); - fprintf(stderr, " -k, --keep-sites leave -i/-e sites unchanged instead of discarding them\n"); - fprintf(stderr, " -l, --merge-logic merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n"); - fprintf(stderr, " -m, --mark-sites [+-] add INFO/tag flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n"); - fprintf(stderr, " --no-version do not append version and command line to the header\n"); - fprintf(stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(stderr, " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); - fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(stderr, " --rename-chrs rename sequences according to map file: from\\tto\n"); - fprintf(stderr, " -s, --samples [^] comma separated list of samples to annotate (or exclude with \"^\" prefix)\n"); - fprintf(stderr, " -S, --samples-file [^] file of samples to annotate (or exclude with \"^\" prefix)\n"); - fprintf(stderr, " --single-overlaps keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n"); - fprintf(stderr, " -x, --remove list of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). 
See man page for details\n"); - fprintf(stderr, " --threads number of extra output compression threads [0]\n"); + fprintf(stderr, " -a, --annotations FILE VCF file or tabix-indexed FILE with annotations: CHR\\tPOS[\\tVALUE]+\n"); + fprintf(stderr, " --collapse STR matching records by , see man page for details [some]\n"); + fprintf(stderr, " -c, --columns LIST list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n"); + fprintf(stderr, " -C, --columns-file FILE read -c columns from FILE, one name per row, with optional --merge-logic TYPE: NAME[ TYPE]\n"); + fprintf(stderr, " -e, --exclude EXPR exclude sites for which the expression is true (see man page for details)\n"); + fprintf(stderr, " --force continue despite parsing error (at your own risk!)\n"); + fprintf(stderr, " -h, --header-lines FILE lines which should be appended to the VCF header\n"); + fprintf(stderr, " -I, --set-id [+]FORMAT set ID column using a `bcftools query`-like expression, see man page for details\n"); + fprintf(stderr, " -i, --include EXPR select sites for which the expression is true (see man page for details)\n"); + fprintf(stderr, " -k, --keep-sites leave -i/-e sites unchanged instead of discarding them\n"); + fprintf(stderr, " -l, --merge-logic TAG:TYPE merge logic for multiple overlapping regions (see man page for details), EXPERIMENTAL\n"); + fprintf(stderr, " -m, --mark-sites [+-]TAG add INFO/TAG flag to sites which are (\"+\") or are not (\"-\") listed in the -a file\n"); + fprintf(stderr, " --no-version do not append version and command line to the header\n"); + fprintf(stderr, " -o, --output FILE write output to a file [standard output]\n"); + fprintf(stderr, " -O, --output-type [b|u|z|v] b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); + fprintf(stderr, " -r, --regions REGION restrict to comma-separated list of regions\n"); + fprintf(stderr, " -R, --regions-file FILE restrict to regions listed in 
FILE\n"); + fprintf(stderr, " --rename-chrs FILE rename sequences according to the mapping: old\\tnew\n"); + fprintf(stderr, " -s, --samples [^]LIST comma separated list of samples to annotate (or exclude with \"^\" prefix)\n"); + fprintf(stderr, " -S, --samples-file [^]FILE file of samples to annotate (or exclude with \"^\" prefix)\n"); + fprintf(stderr, " --single-overlaps keep memory low by avoiding complexities arising from handling multiple overlapping intervals\n"); + fprintf(stderr, " -x, --remove LIST list of annotations (e.g. ID,INFO/DP,FORMAT/DP,FILTER) to remove (or keep with \"^\" prefix). See man page for details\n"); + fprintf(stderr, " --threads INT number of extra output compression threads [0]\n"); fprintf(stderr, "\n"); exit(1); } @@ -2808,6 +3009,7 @@ int main_vcfannotate(int argc, char *argv[]) {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, {"remove",required_argument,NULL,'x'}, + {"columns-file",required_argument,NULL,'C'}, {"columns",required_argument,NULL,'c'}, {"rename-chrs",required_argument,NULL,1}, {"header-lines",required_argument,NULL,'h'}, @@ -2818,7 +3020,7 @@ int main_vcfannotate(int argc, char *argv[]) {"force",no_argument,NULL,'f'}, {NULL,0,NULL,0} }; - while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "h:?o:O:r:R:a:x:c:C:i:e:S:s:I:m:kl:f",loptions,NULL)) >= 0) { switch (c) { case 'f': args->force = 1; break; @@ -2837,6 +3039,7 @@ int main_vcfannotate(int argc, char *argv[]) case 's': args->sample_names = optarg; break; case 'S': args->sample_names = optarg; args->sample_is_file = 1; break; case 'c': args->columns = strdup(optarg); break; + case 'C': args->columns = strdup(optarg); args->columns_is_file = 1; break; case 'o': args->output_fname = optarg; break; case 'O': switch (optarg[0]) { From a2dc76ebc18387e7b95116691f5421d41cc44328 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 5 Nov 2020 14:29:03 
+0000 Subject: [PATCH 08/81] Add Cirrus-CI integration --- .cirrus.yml | 159 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100644 .cirrus.yml diff --git a/.cirrus.yml b/.cirrus.yml new file mode 100644 index 000000000..3e0d8e27f --- /dev/null +++ b/.cirrus.yml @@ -0,0 +1,159 @@ +# Note we have a maximum of 16 CPUs available, so adjust our +# builds so we can start all concurrently without needing to schedule. + +# Sadly though there is still a finite limit to macOS of one instance. +# Can we cull our Mac test to just one instance? + +timeout_in: 10m + +#-------------------------------------------------- +# Template: htslib clone & build +# +# We try to clone htslib using the same branch name and owner as this +# bcftools branch. If it exists, it's likely the user is making a +# joint bcftools+htslib PR and wants both checked in unison. +# Failing that we use samtools/htslib:develop. +# Note this only works on the user's own forks. Once in the samtools +# organisation the branch name becomes pull/<num>. + +# Logic for choosing which to use is in the .travis/clone script. +# Note we could also use "clone_script" if we want to replace the bcftools +# clone with our own commands too. +clone_template: &HTSLIB_CLONE + htslib_clone_script: | + .travis/clone "git://github.com/${CIRRUS_REPO_OWNER}/htslib" "${HTSDIR}" "${CIRRUS_BRANCH}" + + +#-------------------------------------------------- +# Template: bcftools compile and test + +compile_template: &COMPILE + << : *HTSLIB_CLONE + + compile_script: | + if test "$USE_CONFIG" = "yes"; then + (cd $HTSDIR && autoreconf) + autoreconf + ./configure || (cat config.log; /bin/false) + make -j3 + else + make -j3 plugindir=$CIRRUS_WORKING_DIR/plugins -e + fi + +test_template: &TEST + test_script: | + make -e test + + +#-------------------------------------------------- +# Task: linux builds.
+ +# Debian + latest GCC +gcc_task: + name: debian-gcc + container: + image: gcc:latest + cpu: 1 + memory: 1G + + environment: + LC_ALL: C + CIRRUS_CLONE_DEPTH: 1 + HTSDIR: ./htslib + + matrix: + - environment: + USE_CONFIG: no + - environment: + USE_CONFIG: yes + CFLAGS: -std=gnu99 -O0 + + << : *COMPILE + << : *TEST + + +# Ubuntu + Clang +ubuntu_task: + name: ubuntu-clang + container: + image: ubuntu:devel + cpu: 2 + memory: 1G + + environment: + CC: clang + LC_ALL: C + CIRRUS_CLONE_DEPTH: 1 + HTSDIR: ./htslib + + matrix: + - environment: + USE_CONFIG: no + - environment: + USE_CONFIG: yes + CFLAGS: -g -Wall -O3 -fsanitize=address + LDFLAGS: -fsanitize=address -Wl,-rpath,`pwd`/inst/lib + + # NB: we could consider building a docker image with these + # preinstalled and specifying that instead, to speed up testing. + install_script: | + apt-get update + apt-get install -y --no-install-suggests --no-install-recommends \ + ca-certificates clang git autoconf automake \ + make zlib1g-dev libbz2-dev liblzma-dev libcurl4-gnutls-dev \ + libssl-dev libdeflate-dev libncurses5-dev + + << : *COMPILE + << : *TEST + + +# CentOS +centos_task: + name: centos-gcc + container: + image: centos:latest + cpu: 2 + memory: 1G + + environment: + LC_ALL: C + CIRRUS_CLONE_DEPTH: 1 + HTSDIR: ./htslib + USE_CONFIG: yes + + # NB: we could consider building a docker image with these + # preinstalled and specifying that instead, to speed up testing. 
+ install_script: | + yum install -y autoconf automake make gcc perl-Data-Dumper zlib-devel \ + bzip2 bzip2-devel xz-devel curl-devel openssl-devel ncurses-devel \ + git diffutils + + << : *COMPILE + << : *TEST + + +#-------------------------------------------------- +# Task: macOS builds + +macosx_task: + name: macosx + clang + osx_instance: + image: catalina-base + + environment: + CC: clang + LC_ALL: C + CIRRUS_CLONE_DEPTH: 1 + HTSDIR: ./htslib + + matrix: + - environment: + USE_CONFIG: no + - environment: + USE_CONFIG: yes + + package_install_script: + - HOMEBREW_NO_AUTO_UPDATE=1 brew install autoconf automake libtool xz + + << : *COMPILE + << : *TEST From 761021b4ebe30c3480bcf0dea0e5103b948d513d Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 5 Nov 2020 14:54:45 +0000 Subject: [PATCH 09/81] Update .travis/clone to match Samtools. The change is if we checkout OWNER/bcftools:BRANCH and OWNER/htslib:BRANCH is absent, then checkout SAMTOOLS/htslib:BRANCH instead of OWNER/htslib:BRANCH. The original method works, but only if the owner is keeping their own htslib develop up to date instead of using their fork purely for making PRs. --- .travis/clone | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.travis/clone b/.travis/clone index d97840207..f4b823c23 100755 --- a/.travis/clone +++ b/.travis/clone @@ -2,13 +2,16 @@ # Usage: .travis/clone REPOSITORY [DIR] [BRANCH] # # Creates a shallow clone, checking out the specified branch. If BRANCH is -# omitted or if there is no branch with that name, checks out origin/HEAD. +# omitted or if there is no branch with that name, checks out origin/HEAD +# from the samtools/htslib repository. 
repository=$1 localdir=$2 branch=$3 -[ -n "$branch" ] && ref=$(git ls-remote --heads $repository $branch) +ref='' +[ -n "$branch" ] && ref=$(git ls-remote --heads "$repository" "$branch" 2>/dev/null) +[ -z "$ref" ] && repository='git://github.com/samtools/htslib.git' set -x -git clone --depth=1 ${ref:+--branch=$branch} $repository $localdir +git clone --depth=1 ${ref:+--branch="$branch"} "$repository" "$localdir" From eec09685edb6baf98e5c2011e4c6cfb0cec3e298 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 10 Nov 2020 15:44:33 +0000 Subject: [PATCH 10/81] Clarify how --n-matches works, allow to choose all samples 1. --n-matches is sorted by average discordance score, i.e. divided by the number of compared sites, not by the total score 2. Add support for matching a single sample against all other samples in the file by giving -s qry:sample -s gt:- This was not possible before, either full cross-check mode had to be run or a list of pairs/samples had to be created explicitly --- test/gtcheck.ntop.1.out | 6 ++++ test/gtcheck.ntop.2.out | 2 ++ test/gtcheck.ntop.gts.vcf | 11 ++++++ test/gtcheck.ntop.vcf | 11 ++++++ vcfgtcheck.c | 75 +++++++++++++++++++-------------------- 5 files changed, 67 insertions(+), 38 deletions(-) create mode 100644 test/gtcheck.ntop.1.out create mode 100644 test/gtcheck.ntop.2.out create mode 100644 test/gtcheck.ntop.gts.vcf create mode 100644 test/gtcheck.ntop.vcf diff --git a/test/gtcheck.ntop.1.out b/test/gtcheck.ntop.1.out new file mode 100644 index 000000000..2092aa6b6 --- /dev/null +++ b/test/gtcheck.ntop.1.out @@ -0,0 +1,6 @@ +DC smpl x1 4.951814e+01 2.197225e+00 6 +DC smpl x2 9.904588e+00 1.075056e+01 6 +DC smpl x3 1.000050e-03 1.075056e+01 5 +DC smpl x4 2.971136e+01 7.613325e+00 6 +DC smpl x5 1.200060e-03 1.258314e+01 6 +DC smpl x6 3.961475e+01 5.416100e+00 6 diff --git a/test/gtcheck.ntop.2.out b/test/gtcheck.ntop.2.out new file mode 100644 index 000000000..43843ea7c --- /dev/null +++ b/test/gtcheck.ntop.2.out @@ -0,0 +1,2 @@ 
+DC smpl x5 1.200060e-03 1.258314e+01 6 +DC smpl x3 1.000050e-03 1.075056e+01 5 diff --git a/test/gtcheck.ntop.gts.vcf b/test/gtcheck.ntop.gts.vcf new file mode 100644 index 000000000..da84ea7b7 --- /dev/null +++ b/test/gtcheck.ntop.gts.vcf @@ -0,0 +1,11 @@ +##fileformat=VCFv4.2 +##reference=ref.fa +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT x1 x2 x3 x4 x5 x6 +1 10 . C A . . . GT 0/0 0/0 ./. 1/1 1/1 1/1 +1 11 . C A . . . GT 0/0 1/1 1/1 1/1 1/1 1/1 +1 12 . C A . . . GT 0/0 1/1 1/1 1/1 1/1 0/0 +1 13 . C A . . . GT 0/0 1/1 1/1 0/0 1/1 0/0 +1 14 . C A . . . GT 0/0 1/1 1/1 0/0 1/1 0/0 +1 15 . C A . . . GT 1/1 1/1 1/1 0/0 1/1 0/0 diff --git a/test/gtcheck.ntop.vcf b/test/gtcheck.ntop.vcf new file mode 100644 index 000000000..7937fc74b --- /dev/null +++ b/test/gtcheck.ntop.vcf @@ -0,0 +1,11 @@ +##fileformat=VCFv4.2 +##reference=ref.fa +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT smpl +1 10 . C A . . . GT 1/1 +1 11 . C A . . . GT 1/1 +1 12 . C A . . . GT 1/1 +1 13 . C A . . . GT 1/1 +1 14 . C A . . . GT 1/1 +1 15 . C A . . . 
GT 1/1 diff --git a/vcfgtcheck.c b/vcfgtcheck.c index 19c139d42..1f0657d3e 100644 --- a/vcfgtcheck.c +++ b/vcfgtcheck.c @@ -213,6 +213,35 @@ static inline int diff_sites_shift(args_t *args, int *ndiff, int *rid, int *pos) return 1; } +static void init_samples(char *list, int list_is_file, int **smpl, int *nsmpl, bcf_hdr_t *hdr, char *vcf_fname) +{ + int i; + if ( !strcmp(list,"-") ) + { + *nsmpl = bcf_hdr_nsamples(hdr); + *smpl = (int*) malloc(sizeof(**smpl)*(*nsmpl)); + for (i=0; i<*nsmpl; i++) (*smpl)[i] = i; + return; + } + + char **tmp = hts_readlist(list, list_is_file, nsmpl); + if ( !tmp || !*nsmpl ) error("Failed to parse %s\n", list); + *smpl = (int*) malloc(sizeof(**smpl)*(*nsmpl)); + for (i=0; i<*nsmpl; i++) + { + int idx = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, tmp[i]); + if ( idx<0 ) error("No such sample in %s: [%s]\n",vcf_fname,tmp[i]); + (*smpl)[i] = idx; + free(tmp[i]); + } + free(tmp); + qsort(*smpl,*nsmpl,sizeof(**smpl),cmp_int); + // check for duplicates + for (i=1; i<*nsmpl; i++) + if ( (*smpl)[i-1]==(*smpl)[i] ) + error("Error: the sample \"%s\" is listed twice in %s\n", hdr->samples[(*smpl)[i]],list); +} + static void init_data(args_t *args) { args->files = bcf_sr_init(); @@ -277,43 +306,13 @@ static void init_data(args_t *args) args->nqry_smpl = bcf_hdr_nsamples(args->qry_hdr); if ( args->qry_samples ) { - char **tmp = hts_readlist(args->qry_samples, args->qry_samples_is_file, &args->nqry_smpl); - if ( !tmp || !args->nqry_smpl ) error("Failed to parse %s\n", args->qry_samples); - args->qry_smpl = (int*) malloc(sizeof(*args->qry_smpl)*args->nqry_smpl); - for (i=0; inqry_smpl; i++) - { - int idx = bcf_hdr_id2int(args->qry_hdr, BCF_DT_SAMPLE, tmp[i]); - if ( idx<0 ) error("No such sample in %s: [%s]\n",args->qry_fname,tmp[i]); - args->qry_smpl[i] = idx; - free(tmp[i]); - } - free(tmp); - qsort(args->qry_smpl,args->nqry_smpl,sizeof(*args->qry_smpl),cmp_int); - // check for duplicates - for (i=1; inqry_smpl; i++) - if ( 
args->qry_smpl[i-1]==args->qry_smpl[i] ) - error("Error: the sample \"%s\" is listed twice in %s\n",args->qry_hdr->samples[args->qry_smpl[i]],args->qry_fname); + init_samples(args->qry_samples, args->qry_samples_is_file, &args->qry_smpl, &args->nqry_smpl, args->qry_hdr, args->qry_fname); } if ( args->gt_samples ) - { - char **tmp = hts_readlist(args->gt_samples, args->gt_samples_is_file, &args->ngt_smpl); - if ( !tmp || !args->ngt_smpl ) error("Failed to parse %s\n", args->gt_samples); - args->gt_smpl = (int*) malloc(sizeof(*args->gt_smpl)*args->ngt_smpl); - for (i=0; ingt_smpl; i++) - { - int idx = bcf_hdr_id2int(args->gt_hdr ? args->gt_hdr : args->qry_hdr, BCF_DT_SAMPLE, tmp[i]); - if ( idx<0 ) error("No such sample in %s: [%s]\n",args->gt_fname ? args->gt_fname : args->qry_fname,tmp[i]); - args->gt_smpl[i] = idx; - free(tmp[i]); - } - free(tmp); - qsort(args->gt_smpl,args->ngt_smpl,sizeof(*args->gt_smpl),cmp_int); - // check for duplicates - for (i=1; ingt_smpl; i++) - if ( args->gt_smpl[i-1]==args->gt_smpl[i] ) - error("Error: the sample \"%s\" is listed twice in %s\n", - args->qry_hdr ? args->qry_hdr->samples[args->gt_smpl[i]] : args->qry_hdr->samples[args->gt_smpl[i]], - args->gt_fname ? args->gt_fname : args->qry_fname); + { + init_samples(args->gt_samples, args->gt_samples_is_file, &args->gt_smpl, &args->ngt_smpl, + args->gt_hdr ? args->gt_hdr : args->qry_hdr, + args->gt_fname ? args->gt_fname : args->qry_fname); } else if ( args->pair_samples ) { @@ -959,14 +958,14 @@ static void usage(void) fprintf(stderr, " -e, --error-probability INT Phred-scaled probability of genotyping error, 0 for faster but less accurate results [40]\n"); fprintf(stderr, " -g, --genotypes FILE Genotypes to compare against\n"); fprintf(stderr, " -H, --homs-only Homozygous genotypes only, useful with low coverage data (requires -g)\n"); - fprintf(stderr, " --n-matches INT Print only top INT matches for each sample, 0 for unlimited. 
Use negative value\n"); - fprintf(stderr, " to sort by HWE probability rather than the number of discordant sites [0]\n"); + fprintf(stderr, " --n-matches INT Print only top INT matches for each sample (sorted by average score), 0 for unlimited.\n"); + fprintf(stderr, " Use negative value to sort by HWE probability rather than by discordance [0]\n"); fprintf(stderr, " --no-HWE-prob Disable calculation of HWE probability\n"); fprintf(stderr, " -p, --pairs LIST Comma-separated sample pairs to compare (qry,gt[,qry,gt..] with -g or qry,qry[,qry,qry..] w/o)\n"); fprintf(stderr, " -P, --pairs-file FILE File with tab-delimited sample pairs to compare (qry,gt with -g or qry,qry w/o)\n"); fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); - fprintf(stderr, " -s, --samples [qry|gt]:LIST List of query or -g samples (by default all samples are compared)\n"); + fprintf(stderr, " -s, --samples [qry|gt]:LIST List of query or -g samples, \"-\" to select all samples (by default all samples are compared)\n"); fprintf(stderr, " -S, --samples-file [qry|gt]:FILE File with the query or -g samples to compare\n"); fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); From 3dd8b82584be49432edeeebe3a97dc3de8036e78 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 12 Nov 2020 10:59:17 +0000 Subject: [PATCH 11/81] Make F_MISSING work for sites with many ALT alleles. 
Fixes #1343 --- filter.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/filter.c b/filter.c index 54f17d322..fc89c129a 100644 --- a/filter.c +++ b/filter.c @@ -1063,15 +1063,24 @@ static void filters_set_nmissing(filter_t *flt, bcf1_t *line, token_t *tok) if ( fmt->type!=BCF_BT_INT8 ) error("TODO: the GT fmt_type is not int8\n"); int j,nmissing = 0; - for (i=0; i<line->n_sample; i++) - { - int8_t *ptr = (int8_t*) (fmt->p + i*fmt->size); - for (j=0; j<fmt->n; j++) - { - if ( ptr[j]==bcf_int8_vector_end ) break; - if ( ptr[j]==bcf_gt_missing ) { nmissing++; break; } - } + #define BRANCH(type_t, is_vector_end) { \ + for (i=0; i<line->n_sample; i++) \ + { \ + type_t *ptr = (type_t *) (fmt->p + i*fmt->size); \ + for (j=0; j<fmt->n; j++) \ + { \ + if ( ptr[j]==is_vector_end ) break; \ + if ( ptr[j]==bcf_gt_missing ) { nmissing++; break; } \ + } \ + } \ } + switch (fmt->type) { + case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_vector_end); break; + case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break; + case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break; + default: fprintf(stderr,"todo: type %d\n", fmt->type); exit(1); break; + } + #undef BRANCH tok->nvalues = 1; tok->values[0] = tok->tag[0]=='N' ? nmissing : (double)nmissing / line->n_sample; } From c454e5fcfc0700397f285464b8653e0ec0e50137 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 18 Nov 2020 10:08:37 +0000 Subject: [PATCH 12/81] Do not phase genotypes that are not phased with `concat -l`.
Fixes #1346 --- test/concat.4.a.vcf | 6 ++++++ test/concat.4.b.vcf | 7 +++++++ test/concat.5.out | 10 ++++++++++ test/test.pl | 2 ++ vcfconcat.c | 1 + 5 files changed, 26 insertions(+) create mode 100644 test/concat.4.a.vcf create mode 100644 test/concat.4.b.vcf create mode 100644 test/concat.5.out diff --git a/test/concat.4.a.vcf b/test/concat.4.a.vcf new file mode 100644 index 000000000..70131980d --- /dev/null +++ b/test/concat.4.a.vcf @@ -0,0 +1,6 @@ +##fileformat=VCFv4.2 +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE +chr1 1 . A C . . . GT 0|1 +chr1 2 . C G . . . GT 0/1 diff --git a/test/concat.4.b.vcf b/test/concat.4.b.vcf new file mode 100644 index 000000000..36bb1c726 --- /dev/null +++ b/test/concat.4.b.vcf @@ -0,0 +1,7 @@ +##fileformat=VCFv4.2 +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE +chr1 1 . A C . . . GT 1|0 +chr1 2 . C G . . . GT 0/1 +chr1 3 . G T . . . GT 0|1 diff --git a/test/concat.5.out b/test/concat.5.out new file mode 100644 index 000000000..7fb5eb38f --- /dev/null +++ b/test/concat.5.out @@ -0,0 +1,10 @@ +##fileformat=VCFv4.2 +##FILTER= +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE +chr1 1 . A C . . . GT:PS 0|1:1 +chr1 2 . C G . . . GT:PQ:PS 0/1:99:1 +chr1 3 . G T . . . 
GT:PS 1|0:1 diff --git a/test/test.pl b/test/test.pl index 20da7fa3e..9b02f9b57 100755 --- a/test/test.pl +++ b/test/test.pl @@ -538,6 +538,8 @@ test_vcf_concat($opts,in=>['concat.3.a','concat.3.b','concat.3.0','concat.3.c','concat.3.d','concat.3.e','concat.3.f'],out=>'concat.3.vcf.out',do_bcf=>0,args=>'-l'); test_vcf_concat($opts,in=>['concat.3.a','concat.3.b','concat.3.0','concat.3.c','concat.3.d','concat.3.e','concat.3.f'],out=>'concat.3.bcf.out',do_bcf=>1,args=>'-l'); test_naive_concat($opts,name=>'naive_concat',max_hdr_lines=>10000,max_body_lines=>10000,nfiles=>10); +test_vcf_concat($opts,in=>['concat.4.a','concat.4.b'],out=>'concat.5.out',do_bcf=>0,args=>'-l'); +test_vcf_concat($opts,in=>['concat.4.a','concat.4.b'],out=>'concat.5.out',do_bcf=>1,args=>'-l'); test_vcf_reheader($opts,in=>'reheader',out=>'reheader.1.out',header=>'reheader.hdr'); test_vcf_reheader($opts,in=>'reheader',out=>'reheader.2.out',samples=>'reheader.samples'); test_vcf_reheader($opts,in=>'reheader',out=>'reheader.2.out',samples=>'reheader.samples2'); diff --git a/vcfconcat.c b/vcfconcat.c index a3007ffe7..4bdfb51d3 100644 --- a/vcfconcat.c +++ b/vcfconcat.c @@ -234,6 +234,7 @@ static void phase_update(args_t *args, bcf_hdr_t *hdr, bcf1_t *rec) if ( !args->swap_phase[i] ) continue; int *gt = &args->GTa[i*2]; if ( bcf_gt_is_missing(gt[0]) || gt[1]==bcf_int32_vector_end ) continue; + if ( !bcf_gt_is_phased(gt[1]) ) continue; SWAP(int, gt[0], gt[1]); gt[1] |= 1; } From a923acc5ad212926bac38b2c57517dba816a653f Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 19 Nov 2020 09:40:36 +0000 Subject: [PATCH 13/81] Make an assumption about start when strlen(REF)!=1. 
Resolves #1319 --- consensus.c | 14 +++++++++++--- test/consensus.13.out | 2 +- test/consensus.13.vcf | 1 + 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/consensus.c b/consensus.c index ed50c0c5c..48d0e015f 100644 --- a/consensus.c +++ b/consensus.c @@ -642,12 +642,20 @@ static void apply_variant(args_t *args, bcf1_t *rec) error("Symbolic alleles other than <DEL>, <*> or <NON_REF> are currently not supported, e.g. %s at %s:%"PRId64".\n" "Please use filtering expressions to exclude such sites, for example by running with: -e 'ALT~\"<.*>\"'\n", rec->d.allele[ialt],bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - assert( rec->d.allele[0][1]==0 ); // todo: for now expecting strlen(REF) = 1 if ( !strcasecmp(rec->d.allele[ialt],"<DEL>") ) { + static int multibase_ref_del_warned = 0; + if ( rec->d.allele[0][1]!=0 && !multibase_ref_del_warned ) + { + fprintf(stderr, + "Warning: one REF base is expected with <DEL>, assuming the actual deletion starts at POS+1 at %s:%"PRId64".\n" + " (This warning is printed only once.)\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); + multibase_ref_del_warned = 1; + } + len_diff = 1-rec->rlen; - rec->d.allele[ialt] = rec->d.allele[0]; // according to VCF spec, REF must precede the event - alen = strlen(rec->d.allele[ialt]); + rec->d.allele[ialt] = rec->d.allele[0]; // according to VCF spec, the first REF base must precede the event + alen = 1; } else { diff --git a/test/consensus.13.out b/test/consensus.13.out index 016c99e34..aacf66c60 100644 --- a/test/consensus.13.out +++ b/test/consensus.13.out @@ -1,2 +1,2 @@ >2 -ACGTACGTAAGTACGTACGTACGTACGTACGT +ACGTACGTAAGTGTACGTACGTACGTACGT diff --git a/test/consensus.13.vcf b/test/consensus.13.vcf index ef31b351d..ed6c6fbf9 100644 --- a/test/consensus.13.vcf +++ b/test/consensus.13.vcf @@ -8,3 +8,4 @@ 2 10 . C A . . . GT 0/1 2 10 . C <DEL> . . END=12 GT 0/1 2 12 . T <DEL> . . END=14 GT 0/1 +2 16 . TAC <DEL> . . .
GT 0/1 From ec5eb612ed020bc5d1ae98dcc487ba306259e0b6 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 19 Nov 2020 11:18:51 +0000 Subject: [PATCH 14/81] Add new `annotate --rename-annots` option to help fix broken VCFs. Resolves #1335 --- doc/bcftools.txt | 6 ++++++ test/annotate21.txt | 11 +++++++++++ test/annotate21.vcf | 21 +++++++++++++++++++++ test/annotate29.out | 21 +++++++++++++++++++++ test/test.pl | 4 +++- vcfannotate.c | 42 +++++++++++++++++++++++++++++++++++++++++- 6 files changed, 103 insertions(+), 2 deletions(-) create mode 100644 test/annotate21.txt create mode 100644 test/annotate21.vcf create mode 100644 test/annotate29.out diff --git a/doc/bcftools.txt b/doc/bcftools.txt index b26aa5bc7..a0a0d3ac2 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -413,6 +413,12 @@ Add or remove annotations. *-R, --regions-file* 'file':: see *<>* +*--rename-annots* 'file':: + rename annotations according to the map in 'file', with + "old_name new_name\n" pairs separated by whitespaces, each on a separate + line. The old name must be prefixed with the annotation type: + INFO, FORMAT, or FILTER. 
+ *--rename-chrs* 'file':: rename chromosomes according to the map in 'file', with "old_name new_name\n" pairs separated by whitespaces, each on a separate diff --git a/test/annotate21.txt b/test/annotate21.txt new file mode 100644 index 000000000..9efc6d12b --- /dev/null +++ b/test/annotate21.txt @@ -0,0 +1,11 @@ +FORMAT/X-X XX +FORMAT/Y-Y YY +FORMAT/A-A AA +INFO/A-A AA +INFO/B-B BB +INFO/X-X XX +INFO/Y-Y YY +FILTER/flt-A fltA +FILTER/flt-B fltB +FILTER/flt-X fltX +FILTER/flt-Y fltY diff --git a/test/annotate21.vcf b/test/annotate21.vcf new file mode 100644 index 000000000..3369a4426 --- /dev/null +++ b/test/annotate21.vcf @@ -0,0 +1,21 @@ +##fileformat=VCFv4.1 +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##contig= +##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B +1 3000000 id C . 20 . A-A=1;B-B=2;X-X=3;Y-Y=4 GT:X-X:PL:Y-Y:A-A 0/1:1:2:3:1 0/1:1:2:3:1 +1 3000001 id C . 20 PASS A-A=1;B-B=2;X-X=3;Y-Y=4 GT:X-X:PL:Y-Y:A-A 0/1:1:2:3:1 0/1:1:2:3:1 +1 3000002 id C . 20 flt-Y;flt-A;flt-B;flt-X B-B=2;X-X=3;Y-Y=4;A-A=1 GT:Y-Y:X-X:PL:A-A 0/1:3:1:2:1 0/1:3:1:2:1 diff --git a/test/annotate29.out b/test/annotate29.out new file mode 100644 index 000000000..240e655c7 --- /dev/null +++ b/test/annotate29.out @@ -0,0 +1,21 @@ +##fileformat=VCFv4.1 +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##contig= +##reference=file:///lustre/scratch105/projects/g1k/ref/main_project/human_g1k_v37.fasta +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B +1 3000000 id C . 20 . AA=1;BB=2;XX=3;YY=4 GT:XX:PL:YY:AA 0/1:1:2:3:1 0/1:1:2:3:1 +1 3000001 id C . 20 PASS AA=1;BB=2;XX=3;YY=4 GT:XX:PL:YY:AA 0/1:1:2:3:1 0/1:1:2:3:1 +1 3000002 id C . 
20 fltY;fltA;fltB;fltX BB=2;XX=3;YY=4;AA=1 GT:YY:XX:PL:AA 0/1:3:1:2:1 0/1:3:1:2:1 diff --git a/test/test.pl b/test/test.pl index 9b02f9b57..98c8d61bd 100755 --- a/test/test.pl +++ b/test/test.pl @@ -413,12 +413,13 @@ test_vcf_annotate($opts,in=>'annotate20.dst',vcf=>'annotate20.src',out=>'annotate20.3.out',args=>'-c -FMT/GT'); test_vcf_annotate($opts,in=>'annotate.multi',tab=>'annotate.multi',out=>'annotate.multi.1.out',args=>'-c CHROM,POS,REF,ALT,ANN -l ANN:append'); test_vcf_annotate($opts,in=>'annotate.missing-append',tab=>'annotate.missing-append',out=>'annotate.missing-append.1.out',args=>'-c CHROM,POS,REF,ALT,STR,INT,FLT -l STR:append-missing,INT:append-missing,FLT:append-missing'); +test_vcf_annotate($opts,in=>'annotate9',tab=>'annots9',out=>'annotate9.out',args=>'-c CHROM,POS,REF,ALT,+ID'); +test_vcf_annotate($opts,in=>'annotate21',out=>'annotate29.out',args=>'--rename-annots {PATH}/annotate21.txt'); test_vcf_plugin($opts,in=>'plugin1',out=>'missing2ref.out',cmd=>'+missing2ref --no-version'); test_vcf_plugin($opts,in=>'plugin1',out=>'missing2ref.out',cmd=>'+setGT --no-version',args=>'-- -t . -n 0'); test_vcf_plugin($opts,in=>'setGT',out=>'setGT.1.out',cmd=>'+setGT --no-version',args=>'-- -t q -n 0 -i \'GT~"." && FMT/DP=30 && GQ=150\''); test_vcf_plugin($opts,in=>'setGT.2',out=>'setGT.2.out',cmd=>'+setGT --no-version',args=>'-- -t q -n . -i \'GT[@{QPATH}/setGT.samples.txt]="het"\''); test_vcf_plugin($opts,in=>'setGT.2',out=>'setGT.3.out',cmd=>'+setGT --no-version',args=>'-- -t q -n . 
-i \'GT[@{QPATH}/setGT.samples.txt]="het" & binom(AD[@{QPATH}/setGT.samples.txt])<0.1\''); -test_vcf_annotate($opts,in=>'annotate9',tab=>'annots9',out=>'annotate9.out',args=>'-c CHROM,POS,REF,ALT,+ID'); test_vcf_plugin($opts,in=>'plugin1',out=>'fill-AN-AC.out',cmd=>'+fill-AN-AC --no-version'); test_vcf_plugin($opts,in=>'dosage',out=>'dosage.1.out',cmd=>'+dosage',args=>'-- -t PL'); test_vcf_plugin($opts,in=>'dosage',out=>'dosage.2.out',cmd=>'+dosage',args=>'-- -t GL'); @@ -1321,6 +1322,7 @@ sub test_vcf_annotate { my ($opts,%args) = @_; my ($annot_fname,$in_fname,$hdr); + $args{args} =~ s/{PATH}/$$opts{path}/g; if ( exists($args{tab}) ) { bgzip_tabix($opts,file=>$args{tab},suffix=>'tab',args=>'-s1 -b2 -e2'); diff --git a/vcfannotate.c b/vcfannotate.c index 15a23bf35..ddb5a88f6 100644 --- a/vcfannotate.c +++ b/vcfannotate.c @@ -154,7 +154,7 @@ typedef struct _args_t kstring_t tmpks; char **argv, *output_fname, *targets_fname, *regions_list, *header_fname; - char *remove_annots, *columns, *rename_chrs, *sample_names, *mark_sites; + char *remove_annots, *columns, *rename_chrs, *rename_annots, *sample_names, *mark_sites; kstring_t merge_method_str; int argc, drop_header, record_cmd_line, tgts_is_vcf, mark_sites_logic, force, single_overlaps; int columns_is_file, has_append_mode; @@ -2532,6 +2532,42 @@ static void rename_chrs(args_t *args, char *fname) free(map); } +static void rename_annots(args_t *args, char *fname) +{ + int n, i; + char **map = hts_readlist(fname, 1, &n); + if ( !map ) error("Could not read: %s\n", fname); + for (i=0; ihdr_out, BCF_DT_ID, sb); + if ( id<0 ) continue; + bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, type, "ID", sb, NULL); + if ( !hrec ) continue; // the sequence not present + int j = bcf_hrec_find_key(hrec, "ID"); + assert( j>=0 ); + free(hrec->vals[j]); + ss++; + while ( *ss && isspace(*ss) ) ss++; + char *se = ss; + while ( *se && !isspace(*se) ) se++; + *se = 0; + hrec->vals[j] = strdup(ss); + 
args->hdr_out->id[BCF_DT_ID][id].key = hrec->vals[j]; + } + for (i=0; ihdr = args->files->readers[0].header; @@ -2597,6 +2633,7 @@ static void init_data(args_t *args) if ( !args->drop_header ) { if ( args->rename_chrs ) rename_chrs(args, args->rename_chrs); + if ( args->rename_annots ) rename_annots(args, args->rename_annots); args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); if ( args->out_fh == NULL ) error("[%s] Error: cannot write to \"%s\": %s\n", __func__,args->output_fname, strerror(errno)); @@ -2969,6 +3006,7 @@ static void usage(args_t *args) fprintf(stderr, " -O, --output-type [b|u|z|v] b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n"); fprintf(stderr, " -r, --regions REGION restrict to comma-separated list of regions\n"); fprintf(stderr, " -R, --regions-file FILE restrict to regions listed in FILE\n"); + fprintf(stderr, " --rename-annots FILE rename annotations: TYPE/old\\tnew, where TYPE is one of FILTER,INFO,FORMAT\n"); fprintf(stderr, " --rename-chrs FILE rename sequences according to the mapping: old\\tnew\n"); fprintf(stderr, " -s, --samples [^]LIST comma separated list of samples to annotate (or exclude with \"^\" prefix)\n"); fprintf(stderr, " -S, --samples-file [^]FILE file of samples to annotate (or exclude with \"^\" prefix)\n"); @@ -3011,6 +3049,7 @@ int main_vcfannotate(int argc, char *argv[]) {"remove",required_argument,NULL,'x'}, {"columns-file",required_argument,NULL,'C'}, {"columns",required_argument,NULL,'c'}, + {"rename-annots",required_argument,NULL,11}, {"rename-chrs",required_argument,NULL,1}, {"header-lines",required_argument,NULL,'h'}, {"samples",required_argument,NULL,'s'}, @@ -3071,6 +3110,7 @@ int main_vcfannotate(int argc, char *argv[]) case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case 10 : args->single_overlaps = 1; break; + case 11 : args->rename_annots = optarg; break; case '?': usage(args); break; default: 
error("Unknown argument: %s\n", optarg); } From 929d9ad6291354cb96ea9d5c63b5636e7858f3bc Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Fri, 20 Nov 2020 12:30:19 +0000 Subject: [PATCH 15/81] More informative error message. Resolves #1128 --- filter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/filter.c b/filter.c index fc89c129a..e375fb2e3 100644 --- a/filter.c +++ b/filter.c @@ -2755,7 +2755,8 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) { int is_info = bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_INFO,tok->hdr_id) ? 1 : 0; is_fmt = bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FMT,tok->hdr_id) ? 1 : 0; - if ( is_info && is_fmt ) error("Both INFO/%s and FORMAT/%s exist, which one do you want?\n", tmp.s,tmp.s); + if ( is_info && is_fmt ) + error("Error: ambiguous filtering expression, both INFO/%s and FORMAT/%s are defined in the VCF header.\n" , tmp.s,tmp.s); } if ( is_fmt==-1 ) is_fmt = 0; } From 0987715cd8700a9db398166ce055783054e8d193 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Fri, 20 Nov 2020 13:32:45 +0000 Subject: [PATCH 16/81] Prevent internal dropping of samples on VCF input and VCF/BCF output VCF parsing skips FORMAT fields when instructed by setting max_unpack. 
That's a desirable speed optimization when they are not needed, but in this case the caller requested the feature by a programming mistake which led to a cryptic error message [E::bcf_write] Broken VCF record, the number of columns at XY does not match the number of samples Fixes #1349 --- plugins/split-vep.c | 3 ++- test/split-vep.20.out | 1 + test/split-vep.7.vcf | 6 ++++++ test/test.pl | 1 + 4 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 test/split-vep.20.out create mode 100644 test/split-vep.7.vcf diff --git a/plugins/split-vep.c b/plugins/split-vep.c index 4b76cb20e..b84ac3413 100644 --- a/plugins/split-vep.c +++ b/plugins/split-vep.c @@ -573,8 +573,9 @@ static void init_data(args_t *args) int max_unpack = args->convert ? convert_max_unpack(args->convert) : 0; args->filter = filter_init(args->hdr_out, args->filter_str); max_unpack |= filter_max_unpack(args->filter); + if ( !args->format_str ) max_unpack |= BCF_UN_FMT; // don't drop FMT fields on VCF input when VCF/BCF is output args->sr->max_unpack = max_unpack; - if ( max_unpack & BCF_UN_FMT ) + if ( args->convert && (max_unpack & BCF_UN_FMT) ) convert_set_option(args->convert, subset_samples, &args->smpl_pass); } diff --git a/test/split-vep.20.out b/test/split-vep.20.out new file mode 100644 index 000000000..8fde3819d --- /dev/null +++ b/test/split-vep.20.out @@ -0,0 +1 @@ +chr1 817186 .
G A 50 PASS ANN=A|upstream_gene_variant|MODIFIER|FAM87B|ENSG00000177757|Transcript|ENST00000326734|lncRNA||||||||||rs3094315|185|1||SNV|HGNC|HGNC:32236|YES||2||||||||||||||0.3873|0.804|0.8839|0.84|0.8088||||||||||||0.8839|EAS||||21159730&21492446&23664118&19096721|||||,A|intron_variant&non_coding_transcript_variant|MODIFIER|AL669831.4|ENSG00000230092|Transcript|ENST00000447500|processed_transcript||1/4|ENST00000447500.4:n.340+187C>T|||||||rs3094315||-1||SNV|Clone_based_ensembl_gene||YES||5||||||||||||||0.3873|0.804|0.8839|0.84|0.8088||||||||||||0.8839|EAS||||21159730&21492446&23664118&19096721|||||,A|intron_variant&non_coding_transcript_variant|MODIFIER|AL669831.3|ENSG00000230021|Transcript|ENST00000634337|processed_transcript||1/4|ENST00000634337.2:n.127+10484C>T|||||||rs3094315||-1||SNV|Clone_based_ensembl_gene||||5||||||||||||||0.3873|0.804|0.8839|0.84|0.8088||||||||||||0.8839|EAS||||21159730&21492446&23664118&19096721|||||,A|intron_variant&non_coding_transcript_variant|MODIFIER|AL669831.3|ENSG00000230021|Transcript|ENST00000635509|processed_transcript||1/3|ENST00000635509.2:n.100+10484C>T|||||||rs3094315||-1||SNV|Clone_based_ensembl_gene||||5||||||||||||||0.3873|0.804|0.8839|0.84|0.8088||||||||||||0.8839|EAS||||21159730&21492446&23664118&19096721|||||,A|regulatory_region_variant|MODIFIER|||RegulatoryFeature|ENSR00000000085|promoter||||||||||rs3094315||||SNV|||||||||||||||||||0.3873|0.804|0.8839|0.84|0.8088||||||||||||0.8839|EAS||||21159730&21492446&23664118&19096721|||||;IMPACT=MODIFIER,MODIFIER,MODIFIER,MODIFIER,MODIFIER GT 1|1 diff --git a/test/split-vep.7.vcf b/test/split-vep.7.vcf new file mode 100644 index 000000000..9ca2601c3 --- /dev/null +++ b/test/split-vep.7.vcf @@ -0,0 +1,6 @@ +##fileformat=VCFv4.2 +##contig= +##FORMAT= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG001 +chr1 817186 . 
G A 50 PASS ANN=A|upstream_gene_variant|MODIFIER|FAM87B|ENSG00000177757|Transcript|ENST00000326734|lncRNA||||||||||rs3094315|185|1||SNV|HGNC|HGNC:32236|YES||2||||||||||||||0.3873|0.804|0.8839|0.84|0.8088||||||||||||0.8839|EAS||||21159730&21492446&23664118&19096721|||||,A|intron_variant&non_coding_transcript_variant|MODIFIER|AL669831.4|ENSG00000230092|Transcript|ENST00000447500|processed_transcript||1/4|ENST00000447500.4:n.340+187C>T|||||||rs3094315||-1||SNV|Clone_based_ensembl_gene||YES||5||||||||||||||0.3873|0.804|0.8839|0.84|0.8088||||||||||||0.8839|EAS||||21159730&21492446&23664118&19096721|||||,A|intron_variant&non_coding_transcript_variant|MODIFIER|AL669831.3|ENSG00000230021|Transcript|ENST00000634337|processed_transcript||1/4|ENST00000634337.2:n.127+10484C>T|||||||rs3094315||-1||SNV|Clone_based_ensembl_gene||||5||||||||||||||0.3873|0.804|0.8839|0.84|0.8088||||||||||||0.8839|EAS||||21159730&21492446&23664118&19096721|||||,A|intron_variant&non_coding_transcript_variant|MODIFIER|AL669831.3|ENSG00000230021|Transcript|ENST00000635509|processed_transcript||1/3|ENST00000635509.2:n.100+10484C>T|||||||rs3094315||-1||SNV|Clone_based_ensembl_gene||||5||||||||||||||0.3873|0.804|0.8839|0.84|0.8088||||||||||||0.8839|EAS||||21159730&21492446&23664118&19096721|||||,A|regulatory_region_variant|MODIFIER|||RegulatoryFeature|ENSR00000000085|promoter||||||||||rs3094315||||SNV|||||||||||||||||||0.3873|0.804|0.8839|0.84|0.8088||||||||||||0.8839|EAS||||21159730&21492446&23664118&19096721||||| GT 1|1 diff --git a/test/test.pl b/test/test.pl index 98c8d61bd..a4cc3cf5c 100755 --- a/test/test.pl +++ b/test/test.pl @@ -509,6 +509,7 @@ test_vcf_plugin($opts,in=>'split-vep.6',out=>'split-vep.17.out',cmd=>'+split-vep',args=>qq[-c SAS_AF | grep ID=SAS_AF]); test_vcf_plugin($opts,in=>'split-vep.6',out=>'split-vep.18.out',cmd=>'+split-vep',args=>qq[-c - | grep -v ^#]); test_vcf_plugin($opts,in=>'split-vep.6',out=>'split-vep.19.out',cmd=>'+split-vep',args=>qq[-c - -s worst | grep -v ^#]); 
+test_vcf_plugin($opts,in=>'split-vep.7',out=>'split-vep.20.out',cmd=>'+split-vep',args=>qq[--annotation 'ANN' -c IMPACT -i 'INFO/IMPACT[*] ~ "MODIFIER"' | grep -v ^#]); test_vcf_plugin($opts,in=>'parental-origin',out=>'parental-origin.1.out',cmd=>'+parental-origin',args=>qq[-r 20:100 -p proband,father,mother -t del | grep -v ^#]); test_vcf_plugin($opts,in=>'parental-origin',out=>'parental-origin.2.out',cmd=>'+parental-origin',args=>qq[-r 20:101 -p proband,father,mother -t del | grep -v ^#]); test_vcf_plugin($opts,in=>'parental-origin',out=>'parental-origin.3.out',cmd=>'+parental-origin',args=>qq[-r 20:102 -p proband,father,mother -t del | grep -v ^#]); From 77bd35da169fb1809cded3c031f143f0e6069d87 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Fri, 20 Nov 2020 22:03:40 +0000 Subject: [PATCH 17/81] Various user-visible typo fixes --- filter.c | 2 +- plugins/contrast.c | 2 +- plugins/indel-stats.c | 2 +- plugins/parental-origin.c | 2 +- plugins/split-vep.c | 2 +- test/contrast.out | 2 +- test/indel-stats.1.out | 2 +- test/indel-stats.2.out | 2 +- test/indel-stats.3.out | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/filter.c b/filter.c index e375fb2e3..56f555d63 100644 --- a/filter.c +++ b/filter.c @@ -1086,7 +1086,7 @@ static void filters_set_nmissing(filter_t *flt, bcf1_t *line, token_t *tok) } static int func_npass(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack) { - if ( nstack==0 ) error("Error parsing the expresion\n"); + if ( nstack==0 ) error("Error parsing the expression\n"); token_t *tok = stack[nstack - 1]; if ( !tok->nsamples ) error("The function %s works with FORMAT fields\n", rtok->tag); diff --git a/plugins/contrast.c b/plugins/contrast.c index 88a45cf73..bdc30483f 100644 --- a/plugins/contrast.c +++ b/plugins/contrast.c @@ -197,7 +197,7 @@ static void init_data(args_t *args) args->hdr = bcf_sr_get_header(args->sr,0); args->hdr_out = bcf_hdr_dup(args->hdr); if ( args->annots & PRINT_PASSOC ) - 
bcf_hdr_append(args->hdr_out, "##INFO="); + bcf_hdr_append(args->hdr_out, "##INFO="); if ( args->annots & PRINT_FASSOC ) bcf_hdr_append(args->hdr_out, "##INFO="); if ( args->annots & PRINT_NASSOC ) diff --git a/plugins/indel-stats.c b/plugins/indel-stats.c index ab75619ca..fd0a009ed 100644 --- a/plugins/indel-stats.c +++ b/plugins/indel-stats.c @@ -355,7 +355,7 @@ static void report_stats(args_t *args) fprintf(fh,"# DLEN* lines report indel length distribution for every threshold. When genotype fields are available,\n"); fprintf(fh,"# the counts correspond to the number of genotypes, otherwise the number of sites are given.\n"); fprintf(fh,"# The k-th bin corresponds to the indel size k-MAX_LEN, negative for deletions, positive for insertions.\n"); - fprintf(fh,"# The firt/last bin contains also all deletions/insertions larger than MAX_LEN:\n"); + fprintf(fh,"# The first/last bin contains also all deletions/insertions larger than MAX_LEN:\n"); fprintf(fh,"# %d) DLEN*, filter id\n", ++i); fprintf(fh,"# %d) maximum indel length\n", ++i); fprintf(fh,"# %d-%d) counts of indel lengths (-max,..,0,..,max), all unique alleles in a genotype are recorded (alt hets increase the counters 2x, alt homs 1x)\n", i+1, i+MAX_LEN*2+1); diff --git a/plugins/parental-origin.c b/plugins/parental-origin.c index 944e58f26..7bbc02b82 100644 --- a/plugins/parental-origin.c +++ b/plugins/parental-origin.c @@ -95,7 +95,7 @@ static const char *usage_text(void) " -b, --min-binom-prob FLOAT exclude parental HETs with skewed ALT allele fraction [1e-2]\n" " -d, --debug list informative sites\n" " -e, --exclude EXPR exclude sites and samples for which the expression is true\n" - " -g, --greedy use also ambigous sites, e.g. het+hom parents for deletions\n" + " -g, --greedy use also ambiguous sites, e.g. 
het+hom parents for deletions\n" " -i, --include EXPR include sites and samples for which the expression is true\n" " -p, --pfm P,F,M sample names of proband, father, and mother\n" " -r, --region REGION chr:beg-end\n" diff --git a/plugins/split-vep.c b/plugins/split-vep.c index b84ac3413..73dee9559 100644 --- a/plugins/split-vep.c +++ b/plugins/split-vep.c @@ -435,7 +435,7 @@ static void init_data(args_t *args) ptr[str.l] = 0; int tag_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, ptr+1); if ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,tag_id) ) - fprintf(stderr,"Note: ambigous key %s, using the %s subfield of %s, not the INFO/%s tag\n", ptr,ptr+1,args->vep_tag,ptr+1); + fprintf(stderr,"Note: ambiguous key %s, using the %s subfield of %s, not the INFO/%s tag\n", ptr,ptr+1,args->vep_tag,ptr+1); int olen = args->column_str ? strlen(args->column_str) : 0; int nlen = strlen(ptr) - 1; diff --git a/test/contrast.out b/test/contrast.out index 760fb60b8..c08a57367 100644 --- a/test/contrast.out +++ b/test/contrast.out @@ -2,7 +2,7 @@ ##FILTER= ##FORMAT= ##contig= -##INFO= +##INFO= ##INFO= ##INFO= ##INFO= diff --git a/test/indel-stats.1.out b/test/indel-stats.1.out index f48638387..635bea380 100644 --- a/test/indel-stats.1.out +++ b/test/indel-stats.1.out @@ -20,7 +20,7 @@ # DLEN* lines report indel length distribution for every threshold. When genotype fields are available, # the counts correspond to the number of genotypes, otherwise the number of sites are given. # The k-th bin corresponds to the indel size k-MAX_LEN, negative for deletions, positive for insertions. 
-# The firt/last bin contains also all deletions/insertions larger than MAX_LEN: +# The first/last bin contains also all deletions/insertions larger than MAX_LEN: # 1) DLEN*, filter id # 2) maximum indel length # 3-43) counts of indel lengths (-max,..,0,..,max), all unique alleles in a genotype are recorded (alt hets increase the counters 2x, alt homs 1x) diff --git a/test/indel-stats.2.out b/test/indel-stats.2.out index c0ce40c8a..551caa4e6 100644 --- a/test/indel-stats.2.out +++ b/test/indel-stats.2.out @@ -20,7 +20,7 @@ # DLEN* lines report indel length distribution for every threshold. When genotype fields are available, # the counts correspond to the number of genotypes, otherwise the number of sites are given. # The k-th bin corresponds to the indel size k-MAX_LEN, negative for deletions, positive for insertions. -# The firt/last bin contains also all deletions/insertions larger than MAX_LEN: +# The first/last bin contains also all deletions/insertions larger than MAX_LEN: # 1) DLEN*, filter id # 2) maximum indel length # 3-43) counts of indel lengths (-max,..,0,..,max), all unique alleles in a genotype are recorded (alt hets increase the counters 2x, alt homs 1x) diff --git a/test/indel-stats.3.out b/test/indel-stats.3.out index f75bb3995..3d041e444 100644 --- a/test/indel-stats.3.out +++ b/test/indel-stats.3.out @@ -20,7 +20,7 @@ # DLEN* lines report indel length distribution for every threshold. When genotype fields are available, # the counts correspond to the number of genotypes, otherwise the number of sites are given. # The k-th bin corresponds to the indel size k-MAX_LEN, negative for deletions, positive for insertions. 
-# The firt/last bin contains also all deletions/insertions larger than MAX_LEN: +# The first/last bin contains also all deletions/insertions larger than MAX_LEN: # 1) DLEN*, filter id # 2) maximum indel length # 3-43) counts of indel lengths (-max,..,0,..,max), all unique alleles in a genotype are recorded (alt hets increase the counters 2x, alt homs 1x) From c22939f6ea5b1035a9f5cb76cde42dd7158b2932 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Sat, 21 Nov 2020 22:45:23 +0000 Subject: [PATCH 18/81] Remove generated test/*.fa.fai files These files are created as necessary by the relevant test command. Clean and ignore them, and clean all the variantkey test output files. --- .gitignore | 1 + Makefile | 4 ++-- test/23andme.fa.fai | 3 --- test/aa.fa.fai | 8 -------- test/csq.fa.fai | 3 --- test/fixref.3.fa.fai | 8 -------- test/gvcf.fa.fai | 1 - test/merge.gvcf.10.fa.fai | 1 - test/norm.2.fa.fai | 1 - test/norm.fa.fai | 8 -------- test/norm.iupac.fa.fai | 1 - 11 files changed, 3 insertions(+), 36 deletions(-) delete mode 100644 test/23andme.fa.fai delete mode 100644 test/aa.fa.fai delete mode 100644 test/csq.fa.fai delete mode 100644 test/fixref.3.fa.fai delete mode 100644 test/gvcf.fa.fai delete mode 100644 test/merge.gvcf.10.fa.fai delete mode 100644 test/norm.2.fa.fai delete mode 100644 test/norm.fa.fai delete mode 100644 test/norm.iupac.fa.fai diff --git a/.gitignore b/.gitignore index ce3fcfce4..37dd60585 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,7 @@ configure /TAGS +test/*.fa.fai test/vkrs.unsorted.hex test/rsvk.unsorted.hex test/nrvk.unsorted.tsv diff --git a/Makefile b/Makefile index a4fe437c0..a60b409c8 100644 --- a/Makefile +++ b/Makefile @@ -342,8 +342,8 @@ clean-plugins: -rm -rf plugins/*.dSYM testclean: - -rm -f test/*.o test/*~ $(TEST_PROGRAMS) - -rm -f test/*.hex + -rm -f test/*.o test/*.fa.fai test/*~ $(TEST_PROGRAMS) + -rm -f test/*.unsorted.hex test/*.unsorted.tsv distclean: clean -rm -f config.cache config.h config.log config.mk 
config.status diff --git a/test/23andme.fa.fai b/test/23andme.fa.fai deleted file mode 100644 index 6489fa3ad..000000000 --- a/test/23andme.fa.fai +++ /dev/null @@ -1,3 +0,0 @@ -1 150 3 26 27 -2 77 162 26 27 -Y 40 245 26 27 diff --git a/test/aa.fa.fai b/test/aa.fa.fai deleted file mode 100644 index b718af4e1..000000000 --- a/test/aa.fa.fai +++ /dev/null @@ -1,8 +0,0 @@ -20 301 4 60 61 -1 301 314 60 61 -2 215 642 60 61 -3 60 864 60 61 -4 60 928 60 61 -5 31 992 31 32 -21 60 1028 60 61 -22 30 1093 30 31 diff --git a/test/csq.fa.fai b/test/csq.fa.fai deleted file mode 100644 index 12a5aa601..000000000 --- a/test/csq.fa.fai +++ /dev/null @@ -1,3 +0,0 @@ -1 600 3 60 61 -2 600 616 60 61 -3 600 1229 60 61 diff --git a/test/fixref.3.fa.fai b/test/fixref.3.fa.fai deleted file mode 100644 index 78a62b19e..000000000 --- a/test/fixref.3.fa.fai +++ /dev/null @@ -1,8 +0,0 @@ -1 21 3 21 22 -2 21 28 21 22 -3 21 53 21 22 -4 21 78 21 22 -5 21 103 21 22 -6 21 128 21 22 -7 21 153 21 22 -8 21 178 21 22 diff --git a/test/gvcf.fa.fai b/test/gvcf.fa.fai deleted file mode 100644 index 4f07c212e..000000000 --- a/test/gvcf.fa.fai +++ /dev/null @@ -1 +0,0 @@ -22 460 4 60 61 diff --git a/test/merge.gvcf.10.fa.fai b/test/merge.gvcf.10.fa.fai deleted file mode 100644 index 45750de43..000000000 --- a/test/merge.gvcf.10.fa.fai +++ /dev/null @@ -1 +0,0 @@ -chr1 4 6 4 5 diff --git a/test/norm.2.fa.fai b/test/norm.2.fa.fai deleted file mode 100644 index 9419d0da4..000000000 --- a/test/norm.2.fa.fai +++ /dev/null @@ -1 +0,0 @@ -1 20 25 20 21 diff --git a/test/norm.fa.fai b/test/norm.fa.fai deleted file mode 100644 index da3fb859e..000000000 --- a/test/norm.fa.fai +++ /dev/null @@ -1,8 +0,0 @@ -20 301 23 60 61 -1 300 347 60 61 -2 215 673 60 61 -3 60 902 60 61 -4 60 985 60 61 -5 31 1070 31 32 -21 60 1106 60 61 -22 30 1171 30 31 diff --git a/test/norm.iupac.fa.fai b/test/norm.iupac.fa.fai deleted file mode 100644 index 36c6427b5..000000000 --- a/test/norm.iupac.fa.fai +++ /dev/null @@ -1 +0,0 @@ -1 28 25 
20 21 From 3200bffffe30e61f6085130aa732bc7cb6f86c6f Mon Sep 17 00:00:00 2001 From: John Marshall Date: Tue, 24 Nov 2020 15:44:54 +0000 Subject: [PATCH 19/81] Include <strings.h> directly where needed Other source files that use strcase*() functions #include <strings.h> themselves, so do so for these two source files too. (<strings.h> is often a byproduct of <string.h> but POSIX doesn't require that.) --- vcffilter.c | 1 + vcfgtcheck.c | 1 + 2 files changed, 2 insertions(+) diff --git a/vcffilter.c b/vcffilter.c index f9a9e9bea..b725ed087 100644 --- a/vcffilter.c +++ b/vcffilter.c @@ -28,6 +28,7 @@ THE SOFTWARE. */ #include #include #include +#include <strings.h> #include #include #include diff --git a/vcfgtcheck.c b/vcfgtcheck.c index 1f0657d3e..75dc026ec 100644 --- a/vcfgtcheck.c +++ b/vcfgtcheck.c @@ -29,6 +29,7 @@ THE SOFTWARE. */ #include #include #include +#include <strings.h> #include #include #include From dc1182b5987312c35c364bc3627647f2e71f1dfa Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 25 Nov 2020 09:59:33 +0000 Subject: [PATCH 20/81] Add missing gtcheck tests --- test/test.pl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test.pl b/test/test.pl index a4cc3cf5c..d59770a3c 100755 --- a/test/test.pl +++ b/test/test.pl @@ -665,6 +665,8 @@ test_gtcheck($opts,in=>'gtcheck.1',gts=>'gtcheck.1.gts',out=>'gtcheck.10.out',args=>q[-u GT -e 30 -p s1,s1]); test_gtcheck($opts,in=>'gtcheck.1',gts=>'gtcheck.1.gts',out=>'gtcheck.11.out',args=>q[-u GT -e 300]); test_gtcheck($opts,in=>'gtcheck.3',out=>'gtcheck.12.out',args=>q[-u PL -e 30]); +test_gtcheck($opts,in=>'gtcheck.ntop',gts=>'gtcheck.ntop.gts',out=>'gtcheck.ntop.1.out',args=>q[]); +test_gtcheck($opts,in=>'gtcheck.ntop',gts=>'gtcheck.ntop.gts',out=>'gtcheck.ntop.2.out',args=>q[--n-matches 2]); print "\nNumber of tests:\n"; printf " total .. 
%d\n", $$opts{nok}+$$opts{nfailed}; From 09f2dbdaf1462ba9f6814d6a8ce1223930a21282 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 25 Nov 2020 10:03:26 +0000 Subject: [PATCH 21/81] Changes to QUAL and ts/tv plotting stats - avoid capping QUAL to predefined bins, use an open-range logarithmic binning instead (stable but experimental) - plot dual ts/tv stats: per quality bin and cumulative as if threshold applied the whole dataset --- Makefile | 5 +- dist.c | 124 ++++++++++++++++++++++++++++++++ dist.h | 98 ++++++++++++++++++++++++++ misc/plot-vcfstats | 97 +++++++++++++++++++++++-- test/stats.B.chk | 2 +- test/stats.chk | 2 +- test/stats.counts.chk | 2 +- vcfstats.c | 160 ++++++++++++++++++++++++++---------------- 8 files changed, 419 insertions(+), 71 deletions(-) create mode 100644 dist.c create mode 100644 dist.h diff --git a/Makefile b/Makefile index a4fe437c0..4d292a72d 100644 --- a/Makefile +++ b/Makefile @@ -41,7 +41,7 @@ OBJS = main.o vcfindex.o tabix.o \ vcfcnv.o HMM.o consensus.o ploidy.o bin.o hclust.o version.o \ regidx.o smpl_ilist.o csq.o vcfbuf.o \ mpileup.o bam2bcf.o bam2bcf_indel.o bam_sample.o \ - vcfsort.o cols.o extsort.o \ + vcfsort.o cols.o extsort.o dist.o \ ccall.o em.o prob1.o kmin.o # the original samtools calling PLUGIN_OBJS = vcfplugin.o @@ -249,7 +249,7 @@ vcfroh.o: vcfroh.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstrin vcfcnv.o: vcfcnv.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(htslib_khash_str2int_h) $(bcftools_h) HMM.h rbuf.h vcfsom.o: vcfsom.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcftools_h) vcfsort.o: vcfsort.c $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_hts_os_h) kheap.h $(bcftools_h) -vcfstats.o: vcfstats.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) $(bcftools_h) $(filter_h) bin.h +vcfstats.o: vcfstats.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) 
$(htslib_faidx_h) $(bcftools_h) $(filter_h) bin.h dist.h vcfview.o: vcfview.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(htslib_khash_str2int_h) reheader.o: reheader.c $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_kseq_h) $(htslib_thread_pool_h) $(htslib_faidx_h) $(htslib_khash_str2int_h) $(bcftools_h) $(khash_str2str_h) tabix.o: tabix.c $(htslib_bgzf_h) $(htslib_tbx_h) @@ -268,6 +268,7 @@ ploidy.o: ploidy.c $(htslib_khash_str2int_h) $(htslib_kseq_h) $(htslib_hts_h) $( polysomy.o: polysomy.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(bcftools_h) peakfit.h peakfit.o: peakfit.c peakfit.h $(htslib_hts_h) $(htslib_kstring_h) bin.o: bin.c $(bcftools_h) bin.h +dist.o: dist.c dist.h cols.o: cols.c cols.h regidx.o: regidx.c $(htslib_hts_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_khash_str2int_h) regidx.h consensus.o: consensus.c $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_bgzf_h) regidx.h $(bcftools_h) rbuf.h $(filter_h) diff --git a/dist.c b/dist.c new file mode 100644 index 000000000..094fc73d2 --- /dev/null +++ b/dist.c @@ -0,0 +1,124 @@ +/* The MIT License + + Copyright (c) 2016-2020 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <math.h>
+#include "dist.h"
+
+extern void error(const char *format, ...);
+
+struct _dist_t
+{
+    uint64_t *bins, nvalues;
+    int nbins;
+    int npow;       // the number of orders of magnitude to represent exactly
+    int nexact;     // pow(10,npow)
+    int nlevel;     // number of bins in each order of magnitude above nexact
+};
+
+dist_t *dist_init(int npow)
+{
+    dist_t *dist = (dist_t*) calloc(1,sizeof(dist_t));
+    dist->npow   = npow;
+    dist->nexact = pow(10,npow);
+    dist->nlevel = dist->nexact - pow(10,npow-1);
+    return dist;
+}
+
+void dist_destroy(dist_t *dist)
+{
+    if ( !dist ) return;
+    free(dist->bins);
+    free(dist);
+}
+
+int dist_nbins(dist_t *dist)
+{
+    return dist->nbins;
+}
+
+int dist_nvalues(dist_t *dist)
+{
+    return dist->nvalues;
+}
+
+uint32_t dist_insert(dist_t *dist, uint32_t value)
+{
+    int ibin;
+
+    if ( value <= dist->nexact )
+        ibin = value;   // exact range: the bin index is the value itself
+    else
+    {
+        int npow  = (int) log10(value);
+        int level = npow - dist->npow + 1;
+        uint32_t step = pow(10, level);     // bin width, tenfold wider with each order of magnitude
+        ibin = dist->nexact + dist->nlevel*(level-1) + (value - pow(10,npow)) / step;
+    }
+
+    if ( ibin >= dist->nbins )
+    {
+        dist->bins = (uint64_t*) realloc(dist->bins, sizeof(*dist->bins)*(ibin+1));
+        memset(dist->bins + dist->nbins, 0, (ibin+1 - dist->nbins)*sizeof(*dist->bins));   // zero only the newly added bins
+        dist->nbins = ibin+1;
+    }
+    dist->bins[ibin]++;
+    dist->nvalues++;
+    return ibin;
+}
+uint32_t dist_insert_n(dist_t *dist, uint32_t value, uint32_t cnt)
+{
+    if ( !cnt ) return 0;   // nothing to insert, the distribution is left untouched
+    int ibin = dist_insert(dist, value);
+    dist->bins[ibin] += cnt - 1;    // dist_insert() above has already added one
+    dist->nvalues += cnt - 1;       // likewise: one of the cnt values was counted by dist_insert()
+    return ibin;
+}
+
+uint64_t dist_get(dist_t *dist, uint32_t idx, uint32_t *beg, uint32_t *end)
+{
+    if ( idx < dist->nexact )
+    {
+        if ( beg ) *beg = idx;
+        if ( end ) *end = idx + 1;  // exact bins cover [idx,idx+1)
+    }
+    else
+    {
+        int level = (idx - dist->nexact) / dist->nlevel + 1;
+        int bin   = idx - dist->nexact - dist->nlevel*(level-1);
+
+        uint32_t step  = pow(10, level);
+        uint32_t value = pow(10, level + dist->npow - 1) + step*bin;
+
+        if ( beg ) *beg = value;
+        if ( end ) *end = value + step;
+    }
+    return dist->bins[idx];
+}
+
diff --git a/dist.h b/dist.h
new file mode 100644
index 000000000..5c9c57181
--- /dev/null
+++ b/dist.h
@@ -0,0 +1,98 @@
+/* The MIT License
+
+   Copyright (c) 2016-2020 Genome Research Ltd.
+
+   Author: Petr Danecek
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+ + */ +/* + Logarithmic binning + + Example of usage: + + // Initialize, make the binning exact up to 10^4, then add a log-step + dist_t *dist = dist_init(4); + + // Insert values + int i; + for (i=0; i<1e6; i++) + dist_insert(dist, i); + + // Number of bins used + int n = dist_nbins(dist); + + // Now print the distribution + uint32_t beg, end; + for (i=0; i +#include +#include + +typedef struct _dist_t dist_t; + +/* + * dist_init() - init bins + */ +dist_t *dist_init(int npow); +void dist_destroy(dist_t *dist); + +/* + dist_nbins() - get the number of bins + */ +int dist_nbins(dist_t *dist); + +/* + dist_nvalues() - get the total number of values inserted + */ +int dist_nvalues(dist_t *dist); + +/* + dist_insert() - insert new value + dist_insert_n() - insert new value n times + */ +uint32_t dist_insert(dist_t *dist, uint32_t value); +uint32_t dist_insert_n(dist_t *dist, uint32_t value, uint32_t cnt); + +/* + dist_get() + @idx: from the interval [0,dist_nbins-1] + @beg,end: [beg,end) + */ +uint64_t dist_get(dist_t *dist, uint32_t idx, uint32_t *beg, uint32_t *end); + +#endif + diff --git a/misc/plot-vcfstats b/misc/plot-vcfstats index f944eb9ed..58e8a3bd2 100755 --- a/misc/plot-vcfstats +++ b/misc/plot-vcfstats @@ -60,6 +60,7 @@ if ( $$opts{make_plots} ) { plot_tstv_by_AF($opts,$id); plot_tstv_by_QUAL($opts,$id); + plot_tstv_by_usr($opts,$id); plot_indel_distribution($opts,$id); plot_indel_vaf_distribution($opts,$id); plot_substitutions($opts,$id); @@ -119,6 +120,7 @@ sub parse_params $0 =~ s{^.+/}{}; my $opts = { + usr_plots => [], use_sample_names => 0, verbose => 1, make_pdf => 1, @@ -763,6 +765,7 @@ sub init_plots plot_indel_dist = 1 plot_indel_vaf = 1 plot_tstv_by_qual = 1 + plot_tstv_by_usr = 1 plot_substitutions = 1 @@ -828,8 +831,8 @@ sub get_values if ( !exists($$opts{dat}{$key}) ) { return (); } if ( !exists($$opts{dat}{$key}{$id}) ) { return (); } my $fields_ok = 1; - if ( !exists($$opts{exp}{$key}) ) { error("todo: sanity check for $key\n"); } - if ( 
exists($$opts{def_line}{$key}) && $$opts{def_line}{$key} ne $$opts{exp}{$key} && !$$opts{def_line_warned}{$key} ) + if ( !exists($$opts{exp}{$key}) && !($key=~/^USR:/) ) { error("todo: sanity check for $key\n"); } + if ( exists($$opts{def_line}{$key}) && !($key=~/^USR:/) && $$opts{def_line}{$key} ne $$opts{exp}{$key} && !$$opts{def_line_warned}{$key} ) { warn("Warning: Possible version mismatch, the definition line differs\n\texpected: $$opts{exp}{$key}\n\tfound: $$opts{def_line}{$key}\n"); $$opts{def_line_warned}{$key} = 1; @@ -1209,6 +1212,9 @@ sub plot_tstv_by_AF "; } +# Sort calls by quality and plot ts/tv in bins of reasonable size (1%); +# Plot both cumulative ts/tv (as if quality threshold was applied at that point) +# and per bin. sub plot_tstv_by_QUAL { my ($opts,$id) = @_; @@ -1225,21 +1231,24 @@ sub plot_tstv_by_QUAL my $ntot = 0; for my $val (@vals) { + if ( $$val[0] eq '.' ) { next; } push @dat, [ $$val[0], $$val[2], $$val[3] ]; # qual, nts, ntv $ntot += $$val[2] + $$val[3]; } my @sdat = sort { $$b[0] <=> $$a[0] } @dat; push @sdat, [-1]; - my $dn = $ntot*0.05; + my $dn = $ntot*0.01; my $qprev = $sdat[0][0]; + my $nout = 0; my $nts = 0; my $ntv = 0; - my $nout = 0; - for my $rec (@sdat) + my $nts_tot = 0; + my $ntv_tot = 0; + for my $rec (@sdat) # sorted in descendent order, biggest qualities come first { if ( $$rec[0]==-1 or $nts+$ntv > $dn ) { - if ( $ntv ) { printf $tfh "$qprev\t%d\t%f\n", $nts+$ntv+$nout,$nts/$ntv; } + if ( $ntv ) { printf $tfh "$qprev\t%d\t%f\t%f\n", $nts+$ntv+$nout,$nts/$ntv,$nts_tot/$ntv_tot; } if ( $$rec[0]==-1 ) { last; } $nout += $nts+$ntv; $nts = 0; @@ -1248,6 +1257,8 @@ sub plot_tstv_by_QUAL } $nts += $$rec[1]; $ntv += $$rec[2]; + $nts_tot += $$rec[1]; + $ntv_tot += $$rec[2]; } close($tfh) or error("close $img.dat"); @@ -1262,12 +1273,77 @@ sub plot_tstv_by_QUAL if plot_tstv_by_qual and len(dat)>2: fig = plt.figure(figsize=($$opts{img_width},$$opts{img_height})) ax1 = fig.add_subplot(111) - ax1.plot([row[1] for row in dat], 
[row[2] for row in dat], '^-', ms=3, mec='$$opts{id2col}[$id]', color='$$opts{id2col}[$id]') + ax1.plot([row[1] for row in dat], [row[3] for row in dat], '-', ms=1, mec='$$opts{id2col}[$id]', color='$$opts{id2col}[$id]', label='Cumulative ts/tv') + ax1.plot([row[1] for row in dat], [row[2] for row in dat], '--', ms=1, mec='$$opts{id2col}[$id]', color='$$opts{id2col}[$id]', label='Per 1% bins') ax1.set_ylabel('Ts/Tv',fontsize=10) ax1.set_xlabel('Number of sites\\n(sorted by QUAL, descending)',fontsize=10) ax1.ticklabel_format(style='sci', scilimits=(-3,2), axis='x') ax1.set_ylim(min(2,min(row[2] for row in dat))-0.3,0.3+max(2.2,max(row[2] for row in dat))) + plt.legend(numpoints=1,markerscale=2,loc='best',prop={'size':9},frameon=False) + plt.subplots_adjust(right=0.88,left=0.15,bottom=0.15) + plt.title('$$opts{title}{$id}') + plt.savefig('$img.png') + if img_fmt != 'png': plt.savefig('$img.' + img_fmt) + plt.close() + + "; +} + +sub plot_tstv_by_usr +{ + my ($opts,$id) = @_; + for my $key (keys %{$$opts{dat}}) + { + if ( !($key=~/^USR:/) ) { next; } + plot_tstv_by_usr1($opts,$id,$key); + } +} +sub plot_tstv_by_usr1 +{ + my ($opts,$id,$name) = @_; + my @vals = get_values($opts,$id,$name); + if ( !@vals ) { return; } + + my $fname = $name; + $fname =~ s{/}{_}g; + push @{$$opts{usr_plots}},{fname=>"tstv_by_$fname",uname=>$name}; + + my $fh = $$opts{plt_fh}; + my $img = "tstv_by_$fname.$id"; + + open(my $tfh,'>',"$img.dat") or error("$img.dat: $!"); + print $tfh "# [1]$name\t[2]Number of sites\t[3]Marginal Ts/Tv\n"; + for my $rec (@vals) + { + my ($val,$nts,$ntv,undef) = @$rec; + if ( $nts+$ntv==0 ) { next; } + printf $tfh "$val\t%d\t%f\n", $nts+$ntv,$ntv?$nts/$ntv:0; + } + close($tfh) or error("close $img.dat"); + + tprint $fh, " + + dat = [] + with open('$img.dat', 'r') as f: + reader = csv.reader(f, 'tab') + for row in reader: + if row[0][0] != '#': dat.append([float(x) for x in row]) + + if plot_tstv_by_usr and len(dat)>2: + fig = 
plt.figure(figsize=($$opts{img_width},$$opts{img_height})) + ax1 = fig.add_subplot(111) + ax2 = ax1.twinx() + plots = ax1.plot([row[0] for row in dat], [row[2] for row in dat], 'o', mec='$$opts{id2col}[$id]', color='$$opts{id2col}[$id]') + plots += ax2.plot([row[0] for row in dat], [row[1] for row in dat], 'o', mec='grey', color='grey') + ax1.set_ylabel('Ts/Tv',fontsize=10) + ax2.set_ylabel('Number of sites',fontsize=10) + ax1.set_xlabel('$name',fontsize=10) + ax1.ticklabel_format(style='sci', scilimits=(-3,2), axis='x') + ax2.ticklabel_format(style='sci', scilimits=(-2,2), axis='y') + ax1.set_ylim(min(2,min(row[2] for row in dat))-0.3,0.3+max(2.2,max(row[2] for row in dat))) + labels = ['ts/tv','Number of sites'] + plt.legend(plots,labels,numpoints=1,loc='best',prop={'size':9},frameon=False) plt.subplots_adjust(right=0.88,left=0.15,bottom=0.15) plt.title('$$opts{title}{$id}') plt.savefig('$img.png') @@ -1277,6 +1353,7 @@ sub plot_tstv_by_QUAL "; } + sub rebin_values { my ($vals,$bin_size,$col,%args) = @_; @@ -2105,6 +2182,12 @@ sub create_pdf } tprint $tex, fmt_slide3h($opts, "tstv_by_af", 'Ts/Tv by AF'); tprint $tex, fmt_slide3h($opts, "tstv_by_qual", 'Ts/Tv stratified by QUAL'); + for my $plot (@{$$opts{usr_plots}}) + { + my $uname = $$plot{uname}; + my $fname = $$plot{fname}; + tprint $tex, fmt_slide3h($opts, $fname, 'Ts/Tv stratified by '.$uname); + } tprint $tex, fmt_slide3h($opts, "indels", 'Indel distribution'); tprint $tex, fmt_slide3v($opts, "indel_vaf", 'Fraction of alternate indel allele'); tprint $tex, fmt_slide3h($opts, "depth", 'Depth distribution'); diff --git a/test/stats.B.chk b/test/stats.B.chk index 36725f40b..3e9ebe780 100644 --- a/test/stats.B.chk +++ b/test/stats.B.chk @@ -32,7 +32,7 @@ SiS 1 1 0 0 0 0 0 0 0 SiS 2 1 1 0 1 0 0 0 0 AF 2 0.000000 1 0 1 0 0 0 0 AF 2 0.490000 3 3 0 0 0 0 0 -QUAL 2 998 4 3 1 0 +QUAL 2 . 
4 3 1 0 ST 0 A>C 0 ST 0 A>G 0 ST 0 A>T 0 diff --git a/test/stats.chk b/test/stats.chk index 2e4b4f1ef..37b022921 100644 --- a/test/stats.chk +++ b/test/stats.chk @@ -32,7 +32,7 @@ SiS 1 1 0 0 0 0 0 0 0 SiS 2 1 1 0 1 0 0 0 0 AF 2 0.000000 1 0 1 0 0 0 0 AF 2 0.490000 3 3 0 0 0 0 0 -QUAL 2 998 4 3 1 0 +QUAL 2 . 4 3 1 0 ST 0 A>C 0 ST 0 A>G 0 ST 0 A>T 0 diff --git a/test/stats.counts.chk b/test/stats.counts.chk index b8a1d4a85..ee798a03b 100644 --- a/test/stats.counts.chk +++ b/test/stats.counts.chk @@ -12,7 +12,7 @@ SiS 0 1 2 1 1 1 0 0 1 AF 0 0.000000 3 1 2 1 0 0 1 AF 0 0.330000 2 1 1 0 0 0 0 AF 0 0.490000 3 2 1 0 0 0 0 -QUAL 0 998 7 3 3 1 +QUAL 0 . 6 3 3 1 IDD 0 1 1 0 . ST 0 A>C 0 ST 0 A>G 0 diff --git a/vcfstats.c b/vcfstats.c index c8c8cb856..3cd551176 100644 --- a/vcfstats.c +++ b/vcfstats.c @@ -1,6 +1,6 @@ /* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats. - Copyright (C) 2012-2017 Genome Research Ltd. + Copyright (C) 2012-2020 Genome Research Ltd. Author: Petr Danecek @@ -41,6 +41,7 @@ THE SOFTWARE. */ #include "bcftools.h" #include "filter.h" #include "bin.h" +#include "dist.h" // Logic of the filters: include or exclude sites which match the filters? 
#define FLT_INCLUDE 1 @@ -58,7 +59,7 @@ typedef struct float min, max; uint64_t *vals_ts, *vals_tv; void *val; - int nbins, type, m_val; + int nbins, type, m_val, idx; } user_stats_t; @@ -82,7 +83,9 @@ typedef struct #endif int ts_alt1, tv_alt1; #if QUAL_STATS - int *qual_ts, *qual_tv, *qual_snps, *qual_indels; + // Values are rounded to one significant digit and 1 is added (Q*10+1); missing and negative values go in the first bin + // Only SNPs that are the 1st alternate allele are counted + dist_t *qual_ts, *qual_tv, *qual_indels; #endif int *insertions, *deletions, m_indel; // maximum indel length int in_frame, out_frame, na_frame, in_frame_alt1, out_frame_alt1, na_frame_alt1; @@ -187,13 +190,6 @@ static inline int idist_i2bin(idist_t *d, int i) return i-1+d->min; } -static inline int clip_nonnegative(float x, int limit) -{ - if (x >= limit || isnan(x)) return limit - 1; - else if (x <= 0.0) return 0; - else return (int) x; -} - #define IC_DBG 0 #if IC_DBG static void _indel_ctx_print1(_idc1_t *idc) @@ -350,12 +346,29 @@ static void add_user_stats(args_t *args, char *str) args->usr = (user_stats_t*) realloc(args->usr,sizeof(user_stats_t)*args->nusr); user_stats_t *usr = &args->usr[args->nusr-1]; memset(usr,0,sizeof(*usr)); - usr->min = 0; - usr->max = 1; + usr->min = 0; + usr->max = 1; usr->nbins = 100; + usr->idx = 0; char *tmp = str; while ( *tmp && *tmp!=':' ) tmp++; + + // Tag with an index or just tag? (e.g. 
PV4[1] vs DP) + if ( tmp > str && tmp[-1]==']' ) + { + char *ptr = tmp; + while ( ptr>str && *ptr!='[' ) ptr--; + if ( *ptr=='[' ) + { + char *ptr2; + usr->idx = strtol(ptr+1, &ptr2, 10); + if ( ptr+1==ptr2 || ptr2 != tmp-1 ) error("Could not parse the index in \"%s\" (ptr=%s;ptr2=%s(%p),tmp=%s(%p),idx=%d)\n", str,ptr,ptr2,ptr2,tmp,tmp,usr->idx); + if ( usr->idx<0 ) error("Error: negative index is not allowed: \"%s\"\n", str); + *ptr = 0; + } + } + usr->tag = (char*)calloc(tmp-str+2,sizeof(char)); memcpy(usr->tag,str,tmp-str); @@ -466,10 +479,9 @@ static void init_stats(args_t *args) int j; for (j=0; j<3; j++) stats->af_repeats[j] = (int*) calloc(args->m_af,sizeof(int)); #if QUAL_STATS - stats->qual_ts = (int*) calloc(args->m_qual,sizeof(int)); - stats->qual_tv = (int*) calloc(args->m_qual,sizeof(int)); - stats->qual_snps = (int*) calloc(args->m_qual,sizeof(int)); - stats->qual_indels = (int*) calloc(args->m_qual,sizeof(int)); + stats->qual_ts = dist_init(5); + stats->qual_tv = dist_init(5); + stats->qual_indels = dist_init(5); #endif if ( args->files->n_smpl ) { @@ -549,10 +561,9 @@ static void destroy_stats(args_t *args) for (j=0; j<3; j++) if (stats->af_repeats[j]) free(stats->af_repeats[j]); #if QUAL_STATS - if (stats->qual_ts) free(stats->qual_ts); - if (stats->qual_tv) free(stats->qual_tv); - if (stats->qual_snps) free(stats->qual_snps); - if (stats->qual_indels) free(stats->qual_indels); + if (stats->qual_ts) dist_destroy(stats->qual_ts); + if (stats->qual_tv) dist_destroy(stats->qual_tv); + if (stats->qual_indels) dist_destroy(stats->qual_indels); #endif #if HWE_STATS free(stats->af_hwe); @@ -679,8 +690,8 @@ static void do_indel_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) bcf1_t *line = reader->buffer[0]; #if QUAL_STATS - int iqual = clip_nonnegative(line->qual, args->m_qual); - stats->qual_indels[iqual]++; + int iqual = (isnan(line->qual) || line->qual<0) ? 
0 : 1 + (int)(line->qual*10); + dist_insert(stats->qual_indels, iqual); #endif // Check if the indel is near an exon for the frameshift statistics @@ -781,7 +792,7 @@ static void do_indel_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) static void do_user_stats(stats_t *stats, bcf_sr_t *reader, int is_ts) { - int i; + int i, nval; for (i=0; inusr; i++) { user_stats_t *usr = &stats->usr[i]; @@ -789,13 +800,15 @@ static void do_user_stats(stats_t *stats, bcf_sr_t *reader, int is_ts) float val; if ( usr->type==BCF_HT_REAL ) { - if ( bcf_get_info_float(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val)<=0 ) continue; - val = ((float*)usr->val)[0]; + if ( (nval=bcf_get_info_float(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val))<=0 ) continue; + if ( usr->idx >= nval ) continue; + val = ((float*)usr->val)[usr->idx]; } else { - if ( bcf_get_info_int32(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val)<=0 ) continue; - val = ((int32_t*)usr->val)[0]; + if ( (nval=bcf_get_info_int32(reader->header,reader->buffer[0],usr->tag,&usr->val,&usr->m_val))<=0 ) continue; + if ( usr->idx >= nval ) continue; + val = ((int32_t*)usr->val)[usr->idx]; } int idx; if ( val<=usr->min ) idx = 0; @@ -814,8 +827,7 @@ static void do_snp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) if ( ref<0 ) return; #if QUAL_STATS - int iqual = clip_nonnegative(line->qual, args->m_qual); - stats->qual_snps[iqual]++; + int iqual = (isnan(line->qual) || line->qual<0) ? 
0 : 1 + (int)(line->qual*10); #endif int i; @@ -834,7 +846,7 @@ static void do_snp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) { stats->ts_alt1++; #if QUAL_STATS - stats->qual_ts[iqual]++; + dist_insert(stats->qual_ts,iqual); #endif do_user_stats(stats, reader, 1); } @@ -846,7 +858,7 @@ static void do_snp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader) { stats->tv_alt1++; #if QUAL_STATS - stats->qual_tv[iqual]++; + dist_insert(stats->qual_tv,iqual); #endif do_user_stats(stats, reader, 0); } @@ -1355,21 +1367,50 @@ static void print_stats(args_t *args) } } #if QUAL_STATS - printf("# QUAL, Stats by quality:\n# QUAL\t[2]id\t[3]Quality\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\t[7]number of indels\n"); + printf("# QUAL, Stats by quality\n# QUAL\t[2]id\t[3]Quality\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\t[7]number of indels\n"); for (id=0; idnstats; id++) { stats_t *stats = &args->stats[id]; - for (i=0; im_qual; i++) + int ndist_ts = dist_nbins(stats->qual_ts); + int ndist_tv = dist_nbins(stats->qual_tv); + int ndist_in = dist_nbins(stats->qual_indels); + int ndist_max = ndist_ts; + if ( ndist_max < ndist_tv ) ndist_max = ndist_tv; + if ( ndist_max < ndist_in ) ndist_max = ndist_in; + uint32_t beg, end; + uint32_t nts, ntv, nin; + for (i=0; iqual_snps[i]+stats->qual_ts[i]+stats->qual_tv[i]+stats->qual_indels[i] == 0 ) continue; - printf("QUAL\t%d\t%d\t%d\t%d\t%d\t%d\n", id,i,stats->qual_snps[i],stats->qual_ts[i],stats->qual_tv[i],stats->qual_indels[i]); + nts = ntv = nin = 0; + float qval = -1; + if ( i < ndist_ts ) + { + nts = dist_get(stats->qual_ts, i, &beg, &end); + qval = beg>0 ? 0.1*(beg - 1) : -1; + } + if ( i < ndist_tv ) + { + ntv = dist_get(stats->qual_tv, i, &beg, &end); + if ( qval==-1 ) qval = beg > 0 ? 0.1*(beg - 1) : -1; + } + if ( i < ndist_in ) + { + nin = dist_get(stats->qual_indels, i, &beg, &end); + if ( qval==-1 ) qval = beg > 0 ? 
0.1*(beg - 1) : -1; + } + if ( nts+ntv+nin==0 ) continue; + + printf("QUAL\t%d\t",id); + if ( qval==-1 ) printf("."); + else printf("%.1f",qval); + printf("\t%d\t%d\t%d\t%d\n",nts+ntv,nts,ntv,nin); } } #endif for (i=0; inusr; i++) { - printf("# USR:%s, Stats by %s:\n# USR:%s\t[2]id\t[3]%s\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\n", - args->usr[i].tag,args->usr[i].tag,args->usr[i].tag,args->usr[i].tag); + printf("# USR:%s/%d\t[2]id\t[3]%s/%d\t[4]number of SNPs\t[5]number of transitions (1st ALT)\t[6]number of transversions (1st ALT)\n", + args->usr[i].tag,args->usr[i].idx,args->usr[i].tag,args->usr[i].idx); for (id=0; idnstats; id++) { user_stats_t *usr = &args->stats[id].usr[i]; @@ -1378,8 +1419,8 @@ static void print_stats(args_t *args) { if ( usr->vals_ts[j]+usr->vals_tv[j] == 0 ) continue; // skip empty bins float val = usr->min + (usr->max - usr->min)*j/(usr->nbins-1); - const char *fmt = usr->type==BCF_HT_REAL ? "USR:%s\t%d\t%e\t%d\t%d\t%d\n" : "USR:%s\t%d\t%.0f\t%d\t%d\t%d\n"; - printf(fmt,usr->tag,id,val,usr->vals_ts[j]+usr->vals_tv[j],usr->vals_ts[j],usr->vals_tv[j]); + const char *fmt = usr->type==BCF_HT_REAL ? 
"USR:%s/%d\t%d\t%e\t%d\t%d\t%d\n" : "USR:%s/%d\t%d\t%.0f\t%d\t%d\t%d\n"; + printf(fmt,usr->tag,usr->idx,id,val,usr->vals_ts[j]+usr->vals_tv[j],usr->vals_ts[j],usr->vals_tv[j]); } } } @@ -1705,26 +1746,27 @@ static void usage(void) fprintf(stderr, "Usage: bcftools stats [options] []\n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " --af-bins allele frequency bins, a list (0.1,0.5,1) or a file (0.1\\n0.5\\n1)\n"); - fprintf(stderr, " --af-tag allele frequency tag to use, by default estimated from AN,AC or GT\n"); - fprintf(stderr, " -1, --1st-allele-only include only 1st allele at multiallelic sites\n"); - fprintf(stderr, " -c, --collapse treat as identical records with , see man page for details [none]\n"); - fprintf(stderr, " -d, --depth depth distribution: min,max,bin size [0,500,1]\n"); - fprintf(stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); - fprintf(stderr, " -E, --exons tab-delimited file with exons for indel frameshifts (chr,from,to; 1-based, inclusive, bgzip compressed)\n"); - fprintf(stderr, " -f, --apply-filters require at least one of the listed FILTER strings (e.g. 
\"PASS,.\")\n"); - fprintf(stderr, " -F, --fasta-ref faidx indexed reference sequence file to determine INDEL context\n"); - fprintf(stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); - fprintf(stderr, " -I, --split-by-ID collect stats for sites with ID separately (known vs novel)\n"); - fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(stderr, " -s, --samples list of samples for sample stats, \"-\" to include all samples\n"); - fprintf(stderr, " -S, --samples-file file of samples to include\n"); - fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(stderr, " -u, --user-tstv collect Ts/Tv stats for any tag using the given binning [0:1:100]\n"); - fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); - fprintf(stderr, " -v, --verbose produce verbose per-site and per-sample output\n"); + fprintf(stderr, " --af-bins LIST Allele frequency bins, a list (0.1,0.5,1) or a file (0.1\\n0.5\\n1)\n"); + fprintf(stderr, " --af-tag STRING Allele frequency tag to use, by default estimated from AN,AC or GT\n"); + fprintf(stderr, " -1, --1st-allele-only Include only 1st allele at multiallelic sites\n"); + fprintf(stderr, " -c, --collapse STRING Treat as identical records with , see man page for details [none]\n"); + fprintf(stderr, " -d, --depth INT,INT,INT Depth distribution: min,max,bin size [0,500,1]\n"); + fprintf(stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n"); + fprintf(stderr, " -E, --exons FILE.gz Tab-delimited file with exons for indel frameshifts (chr,beg,end; 1-based, inclusive, bgzip compressed)\n"); + fprintf(stderr, " -f, --apply-filters LIST Require at least one of the listed 
FILTER strings (e.g. \"PASS,.\")\n"); + fprintf(stderr, " -F, --fasta-ref FILE Faidx indexed reference sequence file to determine INDEL context\n"); + fprintf(stderr, " -i, --include EXPR Select sites for which the expression is true (see man page for details)\n"); + fprintf(stderr, " -I, --split-by-ID Collect stats for sites with ID separately (known vs novel)\n"); + fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(stderr, " -s, --samples LIST List of samples for sample stats, \"-\" to include all samples\n"); + fprintf(stderr, " -S, --samples-file FILE File of samples to include\n"); + fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); + fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(stderr, " -u, --user-tstv TAG[:min:max:n] Collect Ts/Tv stats for any tag using the given binning [0:1:100]\n"); + fprintf(stderr, " A subfield can be selected as e.g. 'PV4[0]', here the first value of the PV4 tag\n"); + fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); + fprintf(stderr, " -v, --verbose Produce verbose per-site and per-sample output\n"); fprintf(stderr, "\n"); exit(1); } From 9718479aca7b7250141778c6ad13a20029e19a45 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 25 Nov 2020 10:12:29 +0000 Subject: [PATCH 22/81] Add new optional tag by `mpileup -a FORMAT/QS` --- bam2bcf.c | 37 +++++++++++++++++++++++++++---------- bam2bcf.h | 10 +++++----- mpileup.c | 12 ++++++++++-- 3 files changed, 42 insertions(+), 17 deletions(-) diff --git a/bam2bcf.c b/bam2bcf.c index d080917aa..9087b99f5 100644 --- a/bam2bcf.c +++ b/bam2bcf.c @@ -1,7 +1,7 @@ /* bam2bcf.c -- variant calling. Copyright (C) 2010-2012 Broad Institute. - Copyright (C) 2012-2014 Genome Research Ltd. + Copyright (C) 2012-2020 Genome Research Ltd. 
Author: Heng Li @@ -126,13 +126,14 @@ void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call) if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES); if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES); if ( call->SCR ) memset(call->SCR,0,sizeof(*call->SCR)*(call->n+1)); + memset(call->QS,0,sizeof(*call->QS)*call->n*B2B_MAX_ALLELES); } /* Notes: - - Called from bam_plcmd.c by mpileup. Amongst other things, sets the bcf_callret1_t.qsum frequencies - which are carried over via bcf_call_combine and bcf_call2bcf to the output BCF as the QS annotation. - Later it's used for multiallelic calling by bcftools -m + - Called from bam_plcmd.c by mpileup. Amongst other things, sets the bcf_callret1_t.QS frequencies + which are carried over via bcf_call_combine and bcf_call2bcf to the output BCF as the INFO/QS and FMT/QS annotations. + Later it's used for multiallelic calling by `call -m`, `call -mG` and `+trio-dnm`. - ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel. 
*/ /* @@ -150,7 +151,6 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t // clean from previous run r->ori_depth = 0; r->mq0 = 0; - memset(r->qsum,0,sizeof(float)*4); memset(r->anno,0,sizeof(double)*16); memset(r->p,0,sizeof(float)*25); r->SCR = 0; @@ -205,7 +205,7 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t // collect annotations if (b < 4) { - r->qsum[b] += q; + r->QS[b] += q; if ( r->ADF ) { if ( bam_is_rev(p->b) ) @@ -558,7 +558,7 @@ void calc_SegBias(const bcf_callret1_t *bcr, bcf_call_t *call) int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call) { int ref4, i, j; - float qsum[5] = {0,0,0,0,0}; + float qsum[B2B_MAX_ALLELES] = {0,0,0,0,0}; if (ref_base >= 0) { call->ori_ref = ref4 = seq_nt16_int[ref_base]; if (ref4 > 4) ref4 = 4; @@ -569,9 +569,9 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int for (i = 0; i < n; ++i) { float sum = 0; - for (j = 0; j < 4; ++j) sum += calls[i].qsum[j]; + for (j = 0; j < 4; ++j) sum += calls[i].QS[j]; if ( sum ) - for (j = 0; j < 4; j++) qsum[j] += calls[i].qsum[j] / sum; + for (j = 0; j < 4; j++) qsum[j] += (float)calls[i].QS[j] / sum; } // sort qsum in ascending order (insertion sort) @@ -583,7 +583,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int // Set the reference allele and alternative allele(s) for (i=0; i<5; i++) call->a[i] = -1; - for (i=0; i<5; i++) call->qsum[i] = 0; + for (i=0; iqsum[i] = 0; call->unseen = -1; call->a[0] = ref4; for (i=3, j=1; i>=0; i--) // i: alleles sorted by QS; j, a[j]: output allele ordering @@ -695,6 +695,21 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int adf += B2B_MAX_ALLELES; } } + if ( bca->fmt_flag & B2B_FMT_QS ) + { + assert( call->n_alleles<=B2B_MAX_ALLELES ); // this is always true for SNPs and so far for indels as well + + // reorder QS to 
match the allele ordering at this site + int32_t tmp[B2B_MAX_ALLELES]; + int32_t *qs = call->QS, *qs_out = call->QS; + for (i=0; in_alleles; j++) tmp[j] = qs[ call->a[j] ]; + for (j=0; jn_alleles; j++) qs_out[j] = tmp[j] < BCF_MAX_BT_INT32 ? tmp[j] : BCF_MAX_BT_INT32; + qs_out += call->n_alleles; + qs += B2B_MAX_ALLELES; + } + } // if (ref_base < 0) fprintf(stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen); call->shift = (int)(sum_min + .499); @@ -884,6 +899,8 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, } if ( fmt_flag&B2B_FMT_SCR ) bcf_update_format_int32(hdr, rec, "SCR", bc->SCR+1, rec->n_sample); + if ( fmt_flag&B2B_FMT_QS ) + bcf_update_format_int32(hdr, rec, "QS", bc->QS, rec->n_sample*rec->n_allele); return 0; } diff --git a/bam2bcf.h b/bam2bcf.h index 4b3fe72af..4019ca74a 100644 --- a/bam2bcf.h +++ b/bam2bcf.h @@ -1,7 +1,7 @@ /* bam2bcf.h -- variant calling. Copyright (C) 2010-2012 Broad Institute. - Copyright (C) 2012-2014,2016 Genome Research Ltd. + Copyright (C) 2012-2020 Genome Research Ltd. Author: Heng Li @@ -59,6 +59,7 @@ DEALINGS IN THE SOFTWARE. */ #define B2B_FMT_SCR (1<<13) #define B2B_INFO_VDB (1<<14) #define B2B_INFO_RPB (1<<15) +#define B2B_FMT_QS (1<<16) #define B2B_MAX_ALLELES 5 @@ -89,8 +90,7 @@ typedef struct __bcf_callaux_t { typedef struct { uint32_t ori_depth; // ori_depth = anno[0..3] but before --min-BQ is applied unsigned int mq0; - int32_t *ADF, *ADR, SCR; - float qsum[4]; + int32_t *ADF, *ADR, SCR, *QS; // FMT/QS // The fields are: // depth fwd .. ref (0) and non-ref (2) // depth rev .. 
ref (1) and non-ref (3) @@ -112,12 +112,12 @@ typedef struct { int tid, pos; bcf_hdr_t *bcf_hdr; int a[5]; // alleles: ref, alt, alt2, alt3 - float qsum[5]; // for the QS tag + float qsum[B2B_MAX_ALLELES]; // INFO/QS tag int n, n_alleles, shift, ori_ref, unseen; int n_supp; // number of supporting non-reference reads double anno[16]; unsigned int depth, ori_depth, mq0; - int32_t *PL, *DP4, *ADR, *ADF, *SCR; + int32_t *PL, *DP4, *ADR, *ADF, *SCR, *QS; uint8_t *fmt_arr; float vdb; // variant distance bias float mwu_pos, mwu_mq, mwu_bq, mwu_mqs; diff --git a/mpileup.c b/mpileup.c index 28bddfa54..324ef577f 100644 --- a/mpileup.c +++ b/mpileup.c @@ -1,6 +1,6 @@ /* mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools - Copyright (C) 2008-2018 Genome Research Ltd. + Copyright (C) 2008-2020 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. Author: Heng Li @@ -580,6 +580,8 @@ static int mpileup(mplp_conf_t *conf) bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); if ( conf->fmt_flag&B2B_FMT_ADR ) bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); + if ( conf->fmt_flag&B2B_FMT_QS ) + bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); if ( conf->fmt_flag&B2B_INFO_AD ) bcf_hdr_append(conf->bcf_hdr,"##INFO="); if ( conf->fmt_flag&B2B_INFO_ADF ) @@ -610,6 +612,9 @@ static int mpileup(mplp_conf_t *conf) conf->bc.bcf_hdr = conf->bcf_hdr; conf->bc.n = nsmpl; conf->bc.PL = (int32_t*) malloc(15 * nsmpl * sizeof(*conf->bc.PL)); + conf->bc.QS = (int32_t*) malloc(nsmpl*sizeof(*conf->bc.QS)*B2B_MAX_ALLELES); + for (i=0; ibcr[i].QS = conf->bc.QS + i*B2B_MAX_ALLELES; if (conf->fmt_flag) { assert( sizeof(float)==sizeof(int32_t) ); @@ -694,6 +699,7 @@ static int mpileup(mplp_conf_t *conf) free(conf->bc.ADR); free(conf->bc.ADF); free(conf->bc.SCR); + free(conf->bc.QS); free(conf->bc.fmt_arr); free(conf->bcr); } @@ -797,6 +803,7 @@ int parse_format_flag(const char *str) else if ( !strcasecmp(tags[i],"ADF") || !strcasecmp(tags[i],"FORMAT/ADF") || !strcasecmp(tags[i],"FMT/ADF") ) 
flag |= B2B_FMT_ADF; else if ( !strcasecmp(tags[i],"ADR") || !strcasecmp(tags[i],"FORMAT/ADR") || !strcasecmp(tags[i],"FMT/ADR") ) flag |= B2B_FMT_ADR; else if ( !strcasecmp(tags[i],"SCR") || !strcasecmp(tags[i],"FORMAT/SCR") || !strcasecmp(tags[i],"FMT/SCR") ) flag |= B2B_FMT_SCR; + else if ( !strcasecmp(tags[i],"QS") || !strcasecmp(tags[i],"FORMAT/QS") || !strcasecmp(tags[i],"FMT/QS") ) flag |= B2B_FMT_QS; else if ( !strcasecmp(tags[i],"INFO/SCR") ) flag |= B2B_INFO_SCR; else if ( !strcasecmp(tags[i],"INFO/AD") ) flag |= B2B_INFO_AD; else if ( !strcasecmp(tags[i],"INFO/ADF") ) flag |= B2B_INFO_ADF; @@ -825,6 +832,7 @@ static void list_annotations(FILE *fp) " FORMAT/ADF .. Allelic depths on the forward strand (Number=R,Type=Integer)\n" " FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n" " FORMAT/DP .. Number of high-quality bases (Number=1,Type=Integer)\n" +" FORMAT/QS .. Allele phred-score quality sum for use with `call -mG` and +trio-dnm (Number=R,Type=Integer)\n" " FORMAT/SP .. Phred-scaled strand bias P-value (Number=1,Type=Integer)\n" " FORMAT/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n" "\n" @@ -881,7 +889,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " -x, --ignore-overlaps disable read-pair overlap detection\n" "\n" "Output options:\n" -" -a, --annotate LIST optional tags to output; '?' to list []\n" +" -a, --annotate LIST optional tags to output; '?' to list available tags []\n" " -g, --gvcf INT[,...] group non-variant sites into gVCF blocks according\n" " to minimum per-sample DP\n" " --no-version do not append version and command line to the header\n" From 07c4308f4bdea2f40d7691ba79f37601efe89ce9 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 25 Nov 2020 16:00:46 +0000 Subject: [PATCH 23/81] [NEWS] Major revamp of `bcftools +trio-dnm` The original trio-dnm calling model used genotype likelihoods (PLs) as the input for calling. 
However, that is flawed because PLs make assumptions which are unsuitable for de novo calling: PL(RR) can become bigger than PL(RA) even when the ALT allele is present in the parents. Note that this is true also for other programs such as DeNovoGear which rely on the same samtools calculation. The new recommended workflow is bcftools mpileup -a AD,QS -f ref.fa -Ou proband.bam father.bam mother.bam | bcftools call -mv -Ou | bcftools +trio-dnm -p proband,father,mother -Oz -o output.vcf.gz This commit also implements the DeNovoGear model. The original behavior of trio-dnm is no longer supported. For more details see http://samtools.github.io/bcftools/trio-dnm.pdf --- plugins/trio-dnm.c | 865 ++++++++++++++++++++++++++++++--- plugins/trio-stats.c | 6 +- test/test.pl | 16 +- test/trio-dnm.1.out | 18 - test/trio-dnm.2.out | 18 - test/trio-dnm/trio-dnm.1.out | 18 + test/trio-dnm/trio-dnm.1.vcf | 31 ++ test/trio-dnm/trio-dnm.2.vcf | 31 ++ test/trio-dnm/trio-dnm.4.1.out | 1 + test/trio-dnm/trio-dnm.4.2.out | 1 + test/trio-dnm/trio-dnm.4.vcf | 10 + test/trio-dnm/trio-dnm.5.1.out | 1 + test/trio-dnm/trio-dnm.5.vcf | 10 + test/trio-dnm/trio-dnm.6.1.out | 1 + test/trio-dnm/trio-dnm.6.2.out | 1 + test/trio-dnm/trio-dnm.6.vcf | 11 + test/trio-dnm/trio-dnm.7.1.out | 1 + test/trio-dnm/trio-dnm.7.vcf | 11 + test/trio-dnm/trio-dnm.8.vcf | 11 + test/trio-dnm/trio-dnm.9.vcf | 11 + 20 files changed, 967 insertions(+), 106 deletions(-) delete mode 100644 test/trio-dnm.1.out delete mode 100644 test/trio-dnm.2.out create mode 100644 test/trio-dnm/trio-dnm.1.out create mode 100644 test/trio-dnm/trio-dnm.1.vcf create mode 100644 test/trio-dnm/trio-dnm.2.vcf create mode 100644 test/trio-dnm/trio-dnm.4.1.out create mode 100644 test/trio-dnm/trio-dnm.4.2.out create mode 100644 test/trio-dnm/trio-dnm.4.vcf create mode 100644 test/trio-dnm/trio-dnm.5.1.out create mode 100644 test/trio-dnm/trio-dnm.5.vcf create mode 100644 test/trio-dnm/trio-dnm.6.1.out create mode 100644 
test/trio-dnm/trio-dnm.6.2.out create mode 100644 test/trio-dnm/trio-dnm.6.vcf create mode 100644 test/trio-dnm/trio-dnm.7.1.out create mode 100644 test/trio-dnm/trio-dnm.7.vcf create mode 100644 test/trio-dnm/trio-dnm.8.vcf create mode 100644 test/trio-dnm/trio-dnm.9.vcf diff --git a/plugins/trio-dnm.c b/plugins/trio-dnm.c index 548a814bf..e127ecac2 100644 --- a/plugins/trio-dnm.c +++ b/plugins/trio-dnm.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2018-2019 Genome Research Ltd. + Copyright (c) 2018-2020 Genome Research Ltd. Author: Petr Danecek @@ -35,45 +35,71 @@ #include #include #include +#include #include #include +#include #include #include "bcftools.h" +#include "regidx.h" #include "filter.h" +#define USE_DNG 1 // DeNovoGear model +#define USE_ACM 2 // the new "allele-centric model" which combines fixed DNG priors with allele centric approach // Logic of the filters: include or exclude sites which match the filters? #define FLT_INCLUDE 1 #define FLT_EXCLUDE 2 -#define iCHILD 0 -#define iFATHER 1 -#define iMOTHER 2 +#define iFATHER 0 // don't modify, QS calculations depend on this order! +#define iMOTHER 1 +#define iCHILD 2 typedef struct { int idx[3]; // VCF sample index for child, father, mother - int pass; // do all three pass the filters? + int pass, // do all three pass the filters? + is_male; // male pattern of chrX inheritance? 
} trio_t; +typedef struct +{ + // combines priors, mutation rates, genotype transmission probability; see init_priors() + double pprob[10][10][10]; // prior probability; the order is father,mother,child + uint8_t denovo[10][10][10]; // is the GT combination not compatible with normal inheritence (0) or is de novo (1) +} +priors_t; + typedef struct { int argc, filter_logic, regions_is_file, targets_is_file, output_type; char *filter_str; + filter_t *filter; char **argv, *ped_fname, *pfm, *output_fname, *fname, *regions, *targets; htsFile *out_fh; bcf_srs_t *sr; bcf_hdr_t *hdr, *hdr_out; + char *chrX_list_str; + regidx_t *chrX_idx; trio_t *trio; int has_fmt_ad; int ntrio, mtrio; - int32_t *pl, *ad, *dnm_qual, *vaf; // input FMT/PL and AD values, output DNM and VAF - int mpl, mad; + int32_t *pl, *ad, *qs, *dnm_qual_int, *vaf; // input FMT/PL, AD, QS values, output DNM and VAF + float *dnm_qual_float; + int mpl, mad, mqs; double min_score; double *aprob; // proband's allele probabilities - double *pl3; // normalized PLs converted to probs for proband,father,mother - int maprob, mpl3, midx, *idx, force_ad; + double *pl3; // normalized PLs converted to probs for iFATHER,iMOTHER,iCHILD + double *qs3; // QS converted to probs for iFATHER,iMOTHER,iCHILD + int maprob, mpl3, mqs3, midx, *idx, force_ad, use_model; + char *dnm_score_tag; // the argument of --use tag, by default DNM:int + int dnm_score_is_float; // given by e.g. 
--use tag DNM:float + double mrate; // --use mrate, mutation rate + double pnoise_abs,pnoise_frac; // --use pn|pnoise + int use_ppl, use_ppl_qs; // --use ppl and --use ppl-qs + int use_dng_priors; // --use dng-priors + priors_t priors, priors_X, priors_XX; } args_t; @@ -91,18 +117,28 @@ static const char *usage_text(void) "About: Screen variants for possible de-novo mutations in trios\n" "Usage: bcftools +trio-dnm [Plugin Options]\n" "Plugin options:\n" - " -e, --exclude EXPR exclude sites and samples for which the expression is true\n" + " -e, --exclude EXPR exclude trios for which the expression is true (one matching sample invalidates a trio)\n" " --force-AD calculate VAF even if the number of FMT/AD fields is incorrect. Use at your own risk!\n" - " -i, --include EXPR include sites and samples for which the expression is true\n" + " -i, --include EXPR include trios for which the expression is true (one failing samples invalidates a trio)\n" " -m, --min-score NUM do not add FMT/DNM annotation if the score is smaller than NUM\n" " -o, --output FILE output file name [stdout]\n" " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n" - " -p, --pfm P,F,M sample names of proband, father, and mother\n" - " -P, --ped FILE PED file\n" + " -p, --pfm [1X:|2X:]P,F,M sample names of child (the proband), father, mother; \"1X:\" for male pattern of chrX inheritance [2X:]\n" + " -P, --ped FILE PED file with the columns: ,proband,father,mother,sex(1:male,2:female)\n" " -r, --regions REG restrict to comma-separated list of regions\n" " -R, --regions-file FILE restrict to regions listed in a file\n" " -t, --targets REG similar to -r but streams rather than index-jumps\n" " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" + " -u, --use OPTION[=VALUE] various options to tweak:\n" + " DNG use the original DeNovoGear model\n" + " dng-priors use the original DeNovoGear priors\n" + " mrate=NUM mutation rate 
for DNG and AC-DNG models [-u mrate=1e-8]\n" + " pn|pnoise=FRAC[,NUM] noise tolerance (or mosaicity) in parents, given as fraction of QS or number of reads [-u pn=0.045,0]\n" + " ppl use parental genotype likelihoods (FMT/PL rather than FMT/QS)\n" + " tag=TAG[:phred|log] annotation to add, either as phred quality (int) or log-scaled (float) [-u tag=DNM:phred]\n" + " -X, --chrX LIST regions with the chr X inheritance pattern or one of the predefined lists, exclude PARs [GRCh37]\n" + " GRCh37 .. X:1-60000,chrX:1-60000,X:2699521-154931043,chrX:2699521-154931043\n" + " GRCh38 .. X:1-9999,chrX:1-9999,X:2781480-155701381,chrX:2781480-155701381\n" "\n" "Example:\n" " # Annotate VCF with FORMAT/DNM, run for a single trio\n" @@ -113,6 +149,13 @@ static const char *usage_text(void) "\n" " # Same as above plus extract a list of significant DNMs using the bcftools/query command\n" " bcftools +trio-dnm -P file.ped file.bcf -Ou | bcftools query -i'DNM>10' -f'[%CHROM:%POS %SAMPLE %DNM\\n]'\n" + "\n" + " # A complete example with a variant calling step. Note that this is one long\n" + " # command and should be on a single line. Also note that a filtering step is\n" + " # recommended, e.g. 
by depth and VAF (not shown here):\n" + " bcftools mpileup -a AD,QS -f ref.fa -Ou proband.bam father.bam mother.bam |\n" + " bcftools call -mv -Ou |\n" + " bcftools +trio-dnm -p proband,father,mother -Oz -o output.vcf.gz\n" "\n"; } @@ -154,12 +197,22 @@ static void parse_ped(args_t *args, char *fname) int child = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,&str.s[off[1]]); if ( child<0 ) continue; + int sex = 0; + if ( ncols>=5 ) + { + char *tmp; + sex = strtol(&str.s[off[4]],&tmp,10); + if ( tmp==&str.s[off[4]] || *tmp ) error("Could not parse the PED file, the 5th column should be numeric: %s\n",str.s); + if ( sex!=1 && sex!=2 ) sex = 0; + } + args->ntrio++; hts_expand0(trio_t,args->ntrio,args->mtrio,args->trio); trio_t *trio = &args->trio[args->ntrio-1]; trio->idx[iFATHER] = father; trio->idx[iMOTHER] = mother; trio->idx[iCHILD] = child; + trio->is_male = sex==1 ? 1 : 0; } while ( hts_getline(fp, KS_SEP_LINE, &str)>=0 ); @@ -172,8 +225,345 @@ static void parse_ped(args_t *args, char *fname) free(off); if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,fname); } + + +static const uint8_t seq1[10] = {0,1,1,2,2,2,3,3,3,3}; +static const uint8_t seq2[10] = {0,0,1,0,1,2,0,1,2,3}; +typedef enum { include_ref, only_alts } count_unique_t; +static int count_unique_alleles(int ngt, int gt[3], count_unique_t count) +{ + int i, als[4] = {0,0,0,0}; + for (i=0; i3 ) // 4 different alleles in the trio + gt_prior = 1e-26; + else if ( nals_mf>=3 ) // 3 different alleles in parents, + gt_prior = 0.002 * 0.002 / 414; // split equally amongst all triallelic cases + else if ( nals_mfc==3 ) // 3rd allele in the child + gt_prior = 1e-3 * 1e-3; // This is what g_PolyRate evaluates in DNG code + else if ( nref_mf==4 ) + gt_prior = 0.995 * 0.998; // 4 copies of ref in parents + else if ( nref_mf==3 ) + gt_prior = 0.995 * 0.002 * (3.0/5.0) * (4.0/5.0) * 0.5; // 3 copies of ref in parents + else if ( nref_mf==2 && fa==fb && ma==mb ) + gt_prior = 0.995 * 0.002 * (2.0/5.0) * (1.0/5.0) * 0.5; // 2 copies of ref in parents, both homs + else if ( nref_mf==2 ) + gt_prior = 0.995 * 0.002 * (2.0/5.0) * (2.0/5.0); // 2 copies of ref in parents, both hets + else if ( nref_mf==1 ) + { + assert( nals_mf==2 && nals_mfc==2 ); + gt_prior = 0.995 * 0.002 * (2.0/5.0) * (2.0/5.0) * 0.5; // 1 copy of ref in parents + } + else if ( nref_mf==0 ) + { + if ( nals_mf==1 ) + gt_prior = 0.995 * 0.002 * (3.0/5.0) * (1.0/5.0); // 1 alt allele in the trio + else if ( nals_mf==2 ) + { + assert( ca!=0 && cb!=0 ); + gt_prior = 0.002 * 0.002 / 414; // 2 alt alleles and 0 refs in the trio + } + else + error("Fixme: %s:%d\n",__FILE__,__LINE__); + } + else + error("Fixme: %s:%d\n",__FILE__,__LINE__); + return gt_prior; +} +// Parent genotype probability L(GM,GF), with DNG bugs fixed +static double init_mf_priors(args_t *args, int fi, int mi) +{ + double gt_prior = 0; // parent genotype probability L(GM,GF) + int fa = seq1[fi]; + int fb = seq2[fi]; + int ma = seq1[mi]; + int mb = seq2[mi]; + int gts[3]; gts[0] = fi; gts[1] = mi; gts[2] = 0; + int nalt_mf = 
count_unique_alleles(2,gts,only_alts); + int nref_mf = (fa==0 ? 1 : 0) + (fb==0 ? 1 : 0) + (ma==0 ? 1 : 0) + (mb==0 ? 1 : 0); + + const double p_homref = 0.998; // this assumes bi-allelic sites + const double p_poly = (1 - p_homref) * (1 - p_homref); // p of this occurring twice for a different allele + const double p_nonref = 1 - p_homref - p_poly; + + if ( nalt_mf>=3 ) // penalize heavily sites with 3 unique ALTs + gt_prior = 1e-26; + else if ( nalt_mf>=2 ) // 2 unique ALTs, 19*3 = 57 cases + gt_prior = p_poly / 57.; + else if ( nref_mf==4 ) // 0 ALTs; 00,00 + gt_prior = p_homref; + else if ( nref_mf==3 ) // this and all remaining have 1 unique ALT allele; 00,0x + gt_prior = p_nonref * (4.0/15.0) * (1.0/3.0); + else if ( nref_mf==2 && ma==mb ) // hom alt; 00,xx + gt_prior = p_nonref * (2.0/15.0) * (1.0/3.0); + else if ( nref_mf==2 ) // two hets; 0x,0x + gt_prior = p_nonref * (4.0/15.0) * (1.0/3.0); + else if ( nref_mf==1 ) // single ref; 0x,xx + gt_prior = p_nonref * (4.0/15.0) * (1.0/3.0); + else if ( nref_mf==0 ) // no ref; xx,xx + gt_prior = p_nonref * (1.0/15.0) * (1.0/3.0); + else + error("Fixme: %s:%d\n",__FILE__,__LINE__); + return gt_prior; +} +static double init_mf_priors_chrX(args_t *args, int mi) +{ + double gt_prior = 0; // parent genotype probability L(GM) + int ma = seq1[mi]; + int mb = seq2[mi]; + int gts[3]; gts[0] = mi; gts[1] = 0; gts[2] = 0; + int nalt_m = count_unique_alleles(1,gts,only_alts); + int nref_m = (ma==0 ? 1 : 0) + (mb==0 ? 
1 : 0); + + const double p_homref = 0.999; // this assumes bi-allelic sites + const double p_poly = (1 - p_homref) * (1 - p_homref); // p of this occurring twice for a different allele + const double p_nonref = 1 - p_homref - p_poly; + + if ( nalt_m>=2 ) // 2 unique ALTs, 3 cases + gt_prior = p_poly / 3.; + else if ( nref_m==2 ) // 00 + gt_prior = p_homref; + else if ( nref_m==1 ) // single ref; 0x and x0 + gt_prior = p_nonref * (2.0/3.0) * (1.0/3.0); + else if ( nref_m==0 ) // no ref; xx + gt_prior = p_nonref * (1.0/3.0) * (1.0/3.0); + else + error("Fixme: %s:%d\n",__FILE__,__LINE__); + return gt_prior; +} +static double init_mf_priors_chrXX(args_t *args, int fi, int mi) +{ + double gt_prior = 0; // parent genotype probability L(GM,GF) + int fa = seq1[fi]; + int fb = seq2[fi]; + int ma = seq1[mi]; + int mb = seq2[mi]; + int gts[3]; gts[0] = fi; gts[1] = mi; gts[2] = 0; + int nalt_mf = count_unique_alleles(2,gts,only_alts); + int nref_mf = (fa==0 ? 1 : 0) + (fb==0 ? 1 : 0) + (ma==0 ? 1 : 0) + (mb==0 ? 
1 : 0); + if ( fa!=fb ) return 0; // father can't be a het + if ( fa==0 ) nref_mf--; + else nalt_mf--; + + const double p_homref = 0.998; // this assumes bi-allelic sites + const double p_poly = (1 - p_homref) * (1 - p_homref); // p of this occurring twice for a different allele + const double p_nonref = 1 - p_homref - p_poly; + + if ( nalt_mf>=3 ) // 3 unique ALTs + gt_prior = 1e-26; + else if ( nalt_mf>=2 ) // 2 unique ALTs + gt_prior = p_poly * (1.0/9.0) * (1.0/3.0); + else if ( nref_mf==3 ) // 00,0 + gt_prior = p_homref; + else if ( nref_mf==2 ) // 00,x; 0x,0; x0,0 + gt_prior = p_nonref * (3.0/7.0) * (1.0/3.0); + else if ( nref_mf==1 ) // 0x,x; x0,x; xx,0 + gt_prior = p_nonref * (3.0/7.0) * (1.0/3.0); + else if ( nref_mf==0 ) // no ref; xx,x + gt_prior = p_nonref * (1.0/7.0) * (1.0/3.0); + else + error("Fixme: %s:%d\n",__FILE__,__LINE__); + return gt_prior; +} +static void init_DNG_tprob_mprob(args_t *args, int fi, int mi, int ci, double *tprob, double *mprob) +{ + int fa = seq1[fi]; + int fb = seq2[fi]; + int ma = seq1[mi]; + int mb = seq2[mi]; + int gts[3]; gts[0] = fi; gts[1] = mi; gts[2] = 0; + int ca = seq1[ci]; + int cb = seq2[ci]; + gts[0] = fi; gts[1] = mi; gts[2] = ci; + int nals_mfc = count_unique_alleles(3,gts,include_ref); + *tprob = 1; // genotype transmission likelihood L(GC|GM,GF), 0 if not compatible with Mendelian inheritance + *mprob = 1 - args->mrate; // probability of mutation + + if ( nals_mfc==4 ) + *tprob = 0; // 4 unique alleles + else if ( nals_mfc==3 ) // 3 alleles + { + if ( ((ca==fa || ca==fb) && (cb==ma || cb==mb)) || + ((cb==fa || cb==fb) && (ca==ma || ca==mb)) ) + { + if ( ca==cb ) *tprob = 0.25; + else if ( fa==fb || ma==mb ) *tprob = 0.5; // one parent is homozygous + else *tprob = 0.25; + } + else + { + if ( ca!=fa && ca!=fb && ca!=ma && ca!=mb && + cb!=fa && cb!=fb && cb!=ma && cb!=mb ) *mprob = args->mrate * args->mrate; // two mutations + else + *mprob = args->mrate; + *tprob = 0; + } + } + else if ( nals_mfc==2 ) // 2 
alleles + { + if ( fa!=fb && ma!=mb ) *tprob = 0.25; // both parents are hets + else if ( fa==fb && ma==mb ) // both parents are homs + { + if ( fa==ma && ca==cb ) *tprob = 0, *mprob = args->mrate * args->mrate; // parents same homs, child a hom, two alleles mutated + else if ( fa==ma ) *tprob = 0, *mprob = args->mrate; // parents same homs, child a het, one allele mutated + else if ( ca==cb ) *tprob = 0, *mprob = args->mrate; // parents diff homs, child a hom, one allele mutated + } + else if ( ca==cb && ((fa==fb && fa!=ca) || (ma==mb && ma!=ca)) ) + *tprob = 0, *mprob = args->mrate; // child is (wrong) hom and one parent is hom + else + *tprob = 0.5; + } +} +static void init_tprob_mprob(args_t *args, int fi, int mi, int ci, double *tprob, double *mprob) +{ + int fa = seq1[fi]; + int fb = seq2[fi]; + int ma = seq1[mi]; + int mb = seq2[mi]; + int ca = seq1[ci]; + int cb = seq2[ci]; + + // tprob .. genotype transmission probability L(GC|GM,GF), 0 if not compatible with Mendelian inheritance + // mprob .. 
probability of mutation + + if ( ((ca==fa||ca==fb) && (cb==ma||cb==mb)) || ((ca==ma||ca==mb) && (cb==fa||cb==fb)) ) + { + if ( fa==fb && ma==mb ) *tprob = 1; + else if ( fa==fb || ma==mb ) *tprob = 0.5; + else *tprob = 0.25; + *mprob = 1 - args->mrate; + } + else + { + *tprob = 0; + if ( (ca==fa||ca==fb) || (ca==ma||ca==mb) || (cb==fa||cb==fb) || (cb==ma||cb==mb) ) *mprob = args->mrate; + else *mprob = args->mrate * args->mrate; + } +} +static void init_tprob_mprob_chrX(args_t *args, int mi, int ci, double *tprob, double *mprob) +{ + int ma = seq1[mi]; + int mb = seq2[mi]; + int ca = seq1[ci]; + int cb = seq2[ci]; + + if ( ca!=cb ) // male cannot be heterozygous in X + *mprob = 0, *tprob = 0; + else if ( ca==ma || ca==mb ) // inherited + { + if ( ma==mb ) *tprob = 1; + else *tprob = 0.5; + *mprob = 1 - args->mrate; + } + else // de novo + *mprob = args->mrate, *tprob = 0; +} +static void init_tprob_mprob_chrXX(args_t *args, int fi, int mi, int ci, double *tprob, double *mprob) +{ + int fa = seq1[fi]; + int fb = seq2[fi]; + int ma = seq1[mi]; + int mb = seq2[mi]; + int ca = seq1[ci]; + int cb = seq2[ci]; + + if ( fa!=fb ) // father cannot be heterozygous in X + *mprob = 0, *tprob = 0; + else if ( (ca==fa && (cb==ma||cb==mb)) || (cb==fa && (ca==ma||ca==mb)) ) + { + if ( ma==mb ) *tprob = 1; + else *tprob = 0.5; + *mprob = 1 - args->mrate; + } + else + { + *tprob = 0; + if ( (ca==fa || (ca==ma||ca==mb)) || (cb==fa || (cb==ma||cb==mb)) ) *mprob = args->mrate, *tprob = 0; + else *mprob = args->mrate * args->mrate; + } +} +typedef enum { autosomal, chrX, chrXX } init_priors_t; +static void init_priors(args_t *args, priors_t *priors, init_priors_t type) +{ + // Based on the FIGL model from the supplement "Variation in genome-wide mutation rates within and between human families" + int fi,mi,ci; + for (fi=0; fi<10; fi++) + { + for (mi=0; mi<10; mi++) + { + for (ci=0; ci<10; ci++) + { + double gt_prior; // parent genotype probability L(GM,GF) + double tprob; // genotype 
transmission likelihood L(GC|GM,GF), 0 if not compatible with Mendelian inheritance + double mprob; // probability of mutation + if ( args->use_dng_priors ) + gt_prior = init_DNG_mf_priors(args,fi,mi,ci); + else if ( type==autosomal ) + gt_prior = init_mf_priors(args,fi,mi); + else if ( type==chrX ) + gt_prior = init_mf_priors_chrX(args,mi); + else if ( type==chrXX ) + gt_prior = init_mf_priors_chrXX(args,fi,mi); + else + error("Can't happen\n"); + + if ( args->use_dng_priors ) + init_DNG_tprob_mprob(args,fi,mi,ci,&tprob,&mprob); + else if ( type==autosomal ) + init_tprob_mprob(args,fi,mi,ci,&tprob,&mprob); + else if ( type==chrX ) + init_tprob_mprob_chrX(args,mi,ci,&tprob,&mprob); + else if ( type==chrXX ) + init_tprob_mprob_chrXX(args,fi,mi,ci,&tprob,&mprob); + else + error("Can't happen\n"); + + priors->denovo[fi][mi][ci] = tprob==0 ? 1 : 0; + priors->pprob[fi][mi][ci] = log(gt_prior * mprob * (tprob==0 ? 1 : tprob)); + } + } + } +} static void init_data(args_t *args) { + char *ptr = strchr(args->dnm_score_tag,':'); + if ( ptr ) // strchr() returns NULL when the tag has no ':' separator, so test the pointer, not the character + { + if ( ptr==args->dnm_score_tag ) error("Error: could not parse --use tag=%s\n",ptr); + *ptr = 0; + if ( !strcasecmp(ptr+1,"log") ) args->dnm_score_is_float = 1; + else if ( strcasecmp(ptr+1,"phred") ) error("Error: the type \"%s\" is not supported --use tag\n",ptr+1); + } + args->sr = bcf_sr_init(); if ( args->regions ) { @@ -184,16 +574,29 @@ static void init_data(args_t *args) if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); args->hdr = bcf_sr_get_header(args->sr,0); + if ( args->filter_str ) args->filter = filter_init(args->hdr, args->filter_str); + int id; if ( (id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PL"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) error("Error: the tag FORMAT/PL is not present in %s\n", args->fname); + if ( (args->use_model&USE_ACM) && ((id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "QS"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id)) && 
!args->use_ppl ) + error( + "Error:\n" + " The FORMAT/QS tag is not present. If you want to proceed anyway, run with the `--use ppl`\n" + " option at the cost of inflated false discovery rate. The QS annotation can be generated\n" + " at the mpileup step together with the AD annotation using the command\n" + " bcftools mpileup -a AD,QS -f ref.fa file.bam\n"); // Possible future todo: use AD as a proxy for QS? if ( (id=bcf_hdr_id2int(args->hdr, BCF_DT_ID, "AD"))<0 || !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,id) ) fprintf(stderr, "Warning: the tag FORMAT/AD is not present in %s, the output tag FORMAT/VAF will not be added\n", args->fname); else args->has_fmt_ad = 1; + init_priors(args,&args->priors,autosomal); + init_priors(args,&args->priors_X,chrX); + init_priors(args,&args->priors_XX,chrXX); + args->hdr_out = bcf_hdr_dup(args->hdr); - bcf_hdr_append(args->hdr_out, "##FORMAT="); + bcf_hdr_printf(args->hdr_out, "##FORMAT=",args->dnm_score_tag,args->dnm_score_is_float?"Float":"Integer"); if ( args->has_fmt_ad ) bcf_hdr_append(args->hdr_out, "##FORMAT="); @@ -208,6 +611,16 @@ static void init_data(args_t *args) args->trio[0].idx[iCHILD] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[0]); args->trio[0].idx[iFATHER] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[1]); args->trio[0].idx[iMOTHER] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[2]); + if ( args->trio[0].idx[iCHILD] < 0 ) + { + if ( strlen(list[0])>3 && !strncasecmp(list[0],"1X:",3) ) + { + args->trio[0].idx[iCHILD] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[0]+3); + args->trio[0].is_male = 1; + } + else if ( strlen(list[0])>3 && !strncasecmp(list[0],"2X:",3) ) + args->trio[0].idx[iCHILD] = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, list[0]+3); + } for (i=0; itrio[0].idx[i] < 0 ) error("The sample is not present: %s\n", list[i]); @@ -220,81 +633,284 @@ static void init_data(args_t *args) parse_ped(args,args->ped_fname); if ( !args->ntrio ) error("No complete trio present\n"); } + if ( 
!args->chrX_list_str || !strcasecmp("GRCh37",args->chrX_list_str) ) + args->chrX_list_str = "X:1-60000,chrX:1-60000,X:2699521-154931043,chrX:2699521-154931043"; + else if ( !strcasecmp("GRCh38",args->chrX_list_str) ) + args->chrX_list_str = "X:1-9999,chrX:1-9999,X:2781480-155701381,chrX:2781480-155701381"; + char *rmme = strdup(args->chrX_list_str), *tmp = rmme; + while ( *tmp ) + { + if ( *tmp==',' ) *tmp = '\n'; + tmp++; + } + args->chrX_idx = regidx_init_string(rmme, regidx_parse_reg, NULL, 0, NULL); + free(rmme); args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); - args->dnm_qual = (int32_t*) malloc(sizeof(*args->dnm_qual)*bcf_hdr_nsamples(args->hdr)); - args->vaf = (int32_t*) malloc(sizeof(*args->vaf)*bcf_hdr_nsamples(args->hdr)); + if ( args->dnm_score_is_float ) + args->dnm_qual_float = (float*) malloc(sizeof(*args->dnm_qual_float)*bcf_hdr_nsamples(args->hdr)); + else + args->dnm_qual_int = (int32_t*) malloc(sizeof(*args->dnm_qual_int)*bcf_hdr_nsamples(args->hdr)); + args->vaf = (int32_t*) malloc(sizeof(*args->vaf)*bcf_hdr_nsamples(args->hdr)); } static void destroy_data(args_t *args) { + if ( args->filter ) filter_destroy(args->filter); + regidx_destroy(args->chrX_idx); + free(args->dnm_score_tag); free(args->pl3); free(args->aprob); free(args->idx); - free(args->dnm_qual); + free(args->dnm_qual_int); + free(args->dnm_qual_float); free(args->vaf); free(args->trio); free(args->pl); free(args->ad); + free(args->qs); + free(args->qs3); if ( hts_close(args->out_fh)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->output_fname); bcf_hdr_destroy(args->hdr_out); bcf_sr_destroy(args->sr); free(args); } -static float process_trio(args_t *args, int nals, double *pl[3], int npl, int *al0, int *al1) + +static inline double phred2num(double phred) +{ + return pow(10,-0.1*phred); +} +static inline double log2phred(double num) +{ + return fabs(4.3429 * num); +} +static inline double phred2log(double phred) +{ + return -phred/4.3429; +} +static inline double subtract_num_log(double a_num, double b_log) +{ + return log(a_num - exp(b_log)); +} +static inline double subtract_log(double a_log, double b_log) { - assert( nals>1 ); + if ( b_log==-HUGE_VAL ) return a_log; + return log(exp(a_log - b_log) - 1) + b_log; +} +static inline double sum_log(double a, double b) // log(exp(a)+exp(b)) +{ + if ( a==-HUGE_VAL && b==-HUGE_VAL ) return -HUGE_VAL; + if ( a>b ) + return log(1 + exp(b-a)) + a; + else + return log(1 + exp(a-b)) + b; +} +static double process_trio_ACM(args_t *args, priors_t *priors, int nals, double *pl[3], int npl, double *qs[3], int *al0, int *al1) +{ + assert( nals>1 && nals<=4 ); - // determine the two most likely proband's alleles - int i,j,k = 0,tmp; + *al0 = *al1 = 0; - hts_expand(int,nals,args->midx,args->idx); - hts_expand(double,nals,args->maprob,args->aprob); - for (i=0; iaprob[i] = 0; - for (i=0; iaprob[i] += pl[iCHILD][k]; - args->aprob[j] += pl[iCHILD][k]; - k++; + int cals = (1<use_ppl || args->use_ppl_qs ) fpl = pl[iFATHER][fi]; + else + { + fpl = 0; + for (i=0; i<4; i++) + { + if ( fals&(1<use_ppl || args->use_ppl_qs ) mpl = pl[iMOTHER][mi]; + else + { + mpl = 0; + for (i=0; i<4; i++) + { + if ( mals&(1<pprob[fi][mi][ci]; + sum = sum_log(sum,val); +#if DEBUG + if(val!=-HUGE_VAL) + fprintf(stderr,"m,f,c: %d%d+%d%d=%d%d dn=%d (%d,%d,%d) mpl,fpl,cpl: %+e %+e %+e \t prior:%+e \t pval=%+e sum=%+e %c\n", + mb,ma,fb,fa,cb,ca,priors->denovo[fi][mi][ci],fi,mi,ci,mpl,fpl,cpl,priors->pprob[fi][mi][ci], val,sum,(priors->denovo[fi][mi][ci] && max < 
val)?'*':'-'); +#endif + if ( priors->denovo[fi][mi][ci] && max < val ) + { + max = val; + *al0 = cb; + *al1 = ca; + } + mi++; + } + } + fi++; + } + } + ci++; } } +#if DEBUG + fprintf(stderr,"max=%e sum=%e ret=%e\n",max,sum,max-sum); +#endif + return log2phred(subtract_log(0,max-sum)); +} +static double process_trio_DNG(args_t *args, priors_t *priors, int nals, double *pl[3], int npl, int *al0, int *al1) +{ + assert( nals>1 && nals<=4 ); - // sort in descendent order - double *arr = args->aprob; - int *idx = args->idx; - for (i=0; i0 && arr[idx[j]] > arr[idx[j-1]]; j--) - tmp = idx[j], idx[j] = idx[j-1], idx[j-1] = tmp; - - if ( idx[0] < idx[1] ) { *al0 = idx[0]; *al1 = idx[1]; } - else { *al0 = idx[1]; *al1 = idx[0]; } - - // Calculate the probability of inheriting the 00, 01, and 11 genotype. For DNM they all will be small - int k00 = bcf_alleles2gt(idx[0],idx[0]); - int k01 = bcf_alleles2gt(idx[0],idx[1]); - int k11 = bcf_alleles2gt(idx[1],idx[1]); - double pd00 = pl[iCHILD][k00] * (pl[iFATHER][k00] + 0.5*pl[iFATHER][k01]) * (pl[iMOTHER][k00] + 0.5*pl[iMOTHER][k01]); - double pd11 = pl[iCHILD][k11] * (pl[iFATHER][k11] + 0.5*pl[iFATHER][k01]) * (pl[iMOTHER][k11] + 0.5*pl[iMOTHER][k01]); - double pd01 = pl[iCHILD][k01] * (pl[iFATHER][k00] * (pl[iMOTHER][k11] + 0.5*pl[iMOTHER][k01]) + pl[iFATHER][k11] * (pl[iMOTHER][k00] + 0.5*pl[iMOTHER][k01]) - + 0.5*pl[iFATHER][k01] * (pl[iMOTHER][k00] + pl[iMOTHER][k01] + pl[iMOTHER][k11])); + *al0 = *al1 = 0; - double max = pd01; - if ( max < pd00 ) max = pd00; - if ( max < pd11 ) max = pd11; - return fabs(4.3429 * log(max)); + double sum = -HUGE_VAL, max = -HUGE_VAL; + int ca,cb, fa,fb, ma,mb, ci=0; + for (ca=0; capprob[fi][mi][ci]; + sum = sum_log(val,sum); +#if DEBUG + if(val!=-HUGE_VAL) + fprintf(stderr,"m,f,c: %d%d+%d%d=%d%d dn=%d (%d,%d,%d) mpl,fpl,cpl: %+e %+e %+e \t prior:%+e \t pval=%+e sum=%+e %c\n", + 
mb,ma,fb,fa,cb,ca,priors->denovo[fi][mi][ci],fi,mi,ci,pl[iMOTHER][mi],pl[iFATHER][fi],pl[iCHILD][ci],priors->pprob[fi][mi][ci], val,sum,(priors->denovo[fi][mi][ci] && max < val)?'*':'-'); +#endif + if ( priors->denovo[fi][mi][ci] && max < val ) + { + max = val; + *al0 = cb; + *al1 = ca; + } + mi++; + } + } + fi++; + } + } + ci++; + } + } +#if DEBUG + fprintf(stderr,"max=%e sum=%e ret=%e\n",max,sum,max-sum); +#endif + return log2phred(subtract_log(0,max-sum)); +} +static inline void qs_to_pl(args_t *args, double *qs, int nqs, double *pl, int npl) +{ + int i,j,k = 0; + double sum = 0; + for (i=0; ifilter, rec, (const uint8_t**) &smpl_pass); + if ( args->filter_logic & FLT_EXCLUDE ) + { + if ( pass_site ) + { + if ( !smpl_pass ) return 0; // no samples, -e mode, the expression failed + pass_site = 0; + for (i=0; intrio; i++) + { + int pass_trio = 1; + for (j=0; j<3; j++) + { + int idx = args->trio[i].idx[j]; + if ( smpl_pass[idx] ) { pass_trio = 0; break; } // with -e one sample passes, the whole trio fails + } + args->trio[i].pass = pass_trio; + if ( pass_trio ) pass_site = 1; + } + return pass_site; + } + for (i=0; intrio; i++) args->trio[i].pass = 1; + return 1; + } + if ( !pass_site ) return 0; + if ( smpl_pass ) + { + pass_site = 0; + for (i=0; intrio; i++) + { + int pass_trio = 1; + for (j=0; j<3; j++) + { + int idx = args->trio[i].idx[j]; + if ( !smpl_pass[idx] ) { pass_trio = 0; break; } + } + args->trio[i].pass = pass_trio; + if ( pass_trio ) pass_site = 1; + } + return pass_site; + } + for (i=0; intrio; i++) args->trio[i].pass = 1; + return 1; } static void process_record(args_t *args, bcf1_t *rec) { - if ( rec->n_allele==1 ) + int skip_site = 0; + if ( rec->n_allele==1 || bcf_get_variant_types(rec)==VCF_REF ) skip_site = 1; + else if ( args->filter && !test_filters(args,rec) ) skip_site = 1; + if ( skip_site ) { if ( bcf_write(args->out_fh, args->hdr_out, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); return; } + static 
int n_ad_warned = 0; int nret, nsmpl = bcf_hdr_nsamples(args->hdr), n_ad = args->has_fmt_ad; if ( n_ad ) @@ -315,30 +931,101 @@ static void process_record(args_t *args, bcf1_t *rec) } } } + nret = bcf_get_format_int32(args->hdr,rec,"PL",&args->pl,&args->mpl); if ( nret<=0 ) error("The FORMAT/PL tag not present at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); int npl1 = nret/nsmpl; if ( npl1!=rec->n_allele*(rec->n_allele+1)/2 ) - error("fixme: not a diploid site at %s:%"PRId64": %d alleles, %d PLs\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,rec->n_allele,npl1); + error("todo: not a diploid site at %s:%"PRId64": %d alleles, %d PLs\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,rec->n_allele,npl1); hts_expand(double,3*npl1,args->mpl3,args->pl3); + + int nqs1 = 0; + if ( args->use_model&USE_ACM && !args->use_ppl ) + { + nret = bcf_get_format_int32(args->hdr,rec,"QS",&args->qs,&args->mqs); + if ( nret<0 ) error("Error: the FMT/QS tag is not available at %s:%"PRId64".\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); + if ( nret != nsmpl * rec->n_allele ) error("Error: incorrect number of FMT/QS values at %s:%"PRId64".\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); + nqs1 = nret<=0 ? 
0 : nret/nsmpl; + hts_expand(double,3*nqs1,args->mqs3,args->qs3); + } + + int is_chrX = 0; + if ( regidx_overlap(args->chrX_idx,bcf_seqname(args->hdr,rec),rec->pos,rec->pos+rec->rlen,NULL) ) is_chrX = 1; + int i, j, k, al0, al1, write_dnm = 0, ad_set = 0; - for (i=0; idnm_qual[i] = bcf_int32_missing; + if ( args->dnm_score_is_float ) + for (i=0; idnm_qual_float[i]); + else + for (i=0; idnm_qual_int[i] = bcf_int32_missing; for (i=0; intrio; i++) { + if ( args->filter && !args->trio[i].pass ) continue; + + // Samples can be in any order in the VCF, set PL and QS to reflect the iFATHER,iMOTHER,iCHILD indices double *ppl[3]; - for (j=0; j<3; j++) + double *pqs[3]; + for (j=0; j<3; j++) // set trio PLs { - int32_t *src = args->pl + npl1 * args->trio[i].idx[j]; + int32_t *src = args->pl + npl1 * args->trio[i].idx[j]; // j loops over iFATHER,iMOTHER,iCHILD double *dst = ppl[j] = args->pl3 + j*npl1; double sum = 0; - for (k=0; kuse_model&USE_ACM ) // set trio QS + { + for (j=0; j<3; j++) + { + int32_t *ad = (args->pnoise_abs && args->ad ) ? 
args->ad + n_ad * args->trio[i].idx[j] : NULL; + int32_t *qs = args->qs + nqs1 * args->trio[i].idx[j]; + double *dst = pqs[j] = args->qs3 + j*nqs1; + double noise_tolerance = 0; + if ( j!=iCHILD ) + { + double sum_qs = 0, sum_ad = 0; + for (k=0; kpnoise_frac; + if ( ad ) + { + for (k=0; kpnoise_abs * sum_qs / sum_ad ) + noise_tolerance = args->pnoise_abs * sum_qs / sum_ad; + } + } + for (k=0; k 255 ) val = 255; + dst[k] = phred2log(val); + } + } + if ( args->use_ppl_qs ) + { + qs_to_pl(args, pqs[iMOTHER], nqs1, ppl[iMOTHER], npl1); + qs_to_pl(args, pqs[iFATHER], nqs1, ppl[iFATHER], npl1); + } } - int32_t score = process_trio(args, rec->n_allele, ppl, npl1, &al0, &al1); + priors_t *priors; + if ( !is_chrX ) priors = &args->priors; + else if ( args->trio[i].is_male ) priors = &args->priors_X; + else priors = &args->priors_XX; + + double score; + if ( args->use_model==USE_ACM ) score = process_trio_ACM(args, priors, rec->n_allele, ppl, npl1, pqs, &al0, &al1); + else if ( args->use_model==USE_DNG ) score = process_trio_DNG(args, priors, rec->n_allele, ppl, npl1, &al0, &al1); + else error("Uh, this should not happen\n"); + if ( score >= args->min_score ) { write_dnm = 1; - args->dnm_qual[ args->trio[i].idx[iCHILD] ] = score; + if ( args->dnm_score_is_float ) + args->dnm_qual_float[ args->trio[i].idx[iCHILD] ] = score==HUGE_VAL ? 
0 : subtract_log(0,phred2log(score)); + else + { + if ( score>255 ) score = 255; + args->dnm_qual_int[ args->trio[i].idx[iCHILD] ] = round(score); + } } if ( n_ad ) @@ -358,7 +1045,12 @@ static void process_record(args_t *args, bcf1_t *rec) } if ( write_dnm ) { - if ( bcf_update_format_int32(args->hdr_out,rec,"DNM",args->dnm_qual,nsmpl)!=0 ) + int ret; + if ( args->dnm_score_is_float ) + ret = bcf_update_format_float(args->hdr_out,rec,args->dnm_score_tag,args->dnm_qual_float,nsmpl); + else + ret = bcf_update_format_int32(args->hdr_out,rec,args->dnm_score_tag,args->dnm_qual_int,nsmpl); + if ( ret ) error("Failed to write FORMAT/DNM at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); if ( ad_set ) { @@ -369,13 +1061,56 @@ static void process_record(args_t *args, bcf1_t *rec) if ( bcf_write(args->out_fh, args->hdr_out, rec)!=0 ) error("[%s] Error: cannot write to %s at %s:%"PRId64"\n", __func__,args->output_fname,bcf_seqname(args->hdr,rec),(int64_t)rec->pos+1); } +static void set_option(args_t *args, char *optarg) +{ + char *tmp; + char *opt = strdup(optarg); + char *val = strchr(opt,'='); + if ( val ) { *val = 0; val++; } + if ( !strcasecmp(opt,"mrate") ) + { + if ( !val ) error("Error: expected value with -u mrate, e.g. -u mrate=1e-8\n"); + args->mrate = strtod(val,&tmp); + if ( *tmp ) error("Could not parse: -u %s\n", optarg); + } + else if ( !strcasecmp(opt,"pn") || !strcasecmp(opt,"pnoise") ) + { + if ( !val ) error("Error: expected value with -u pnoise, e.g. 
-u pnoise=0.05\n"); + args->pnoise_frac = strtod(val,&tmp); + if ( *tmp && *tmp==',' ) + { + args->pnoise_abs = strtod(tmp+1,&tmp); + if ( *tmp ) error("Could not parse: -u %s\n", optarg); + } + if ( args->pnoise_frac<0 || args->pnoise_frac>1 ) error("Error: expected value from the interval [0,1] for -u %s\n", optarg); + if ( args->pnoise_abs<0 ) error("Error: expected positive value for -u %s\n", optarg); + } + else if ( !strcasecmp(opt,"DNG") ) { args->use_model = USE_DNG; args->use_dng_priors = 1; } + else if ( !strcasecmp(opt,"dng-priors") ) args->use_dng_priors = 1; + else if ( !strcasecmp(opt,"ppl") ) args->use_ppl = 1; + else if ( !strcasecmp(opt,"tag") ) + { + if ( !val ) error("Error: expected value with -u tag, e.g. -u tag=ANN\n"); + free(args->dnm_score_tag); + args->dnm_score_tag = strdup(val); + } + else error("Error: the option \"-u %s\" is not recognised\n",optarg); + free(opt); +} int run(int argc, char **argv) { args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->argc = argc; args->argv = argv; args->output_fname = "-"; + args->dnm_score_tag = strdup("DNM:phred"); + args->mrate = 1e-8; + args->pnoise_frac = 0.045; + args->pnoise_abs = 0; + args->use_model = USE_ACM; static struct option loptions[] = { + {"chrX",required_argument,0,'X'}, + {"use",required_argument,0,'u'}, {"force-AD",no_argument,0,1}, {"min-score",required_argument,0,'m'}, {"include",required_argument,0,'i'}, @@ -392,10 +1127,12 @@ int run(int argc, char **argv) }; int c; char *tmp; - while ((c = getopt_long(argc, argv, "p:P:o:O:s:i:e:r:R:t:T:m:",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "p:P:o:O:s:i:e:r:R:t:T:m:au:X:",loptions,NULL)) >= 0) { switch (c) { + case 'X': args->chrX_list_str = optarg; break; + case 'u': set_option(args,optarg); break; case 1 : args->force_ad = 1; break; case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; @@ -416,7 +1153,7 @@ int
run(int argc, char **argv) case 'P': args->ped_fname = optarg; break; case 'p': args->pfm = optarg; break; case 'm': args->min_score = strtod(optarg,&tmp); - if ( *tmp ) error("Could not parse: --min-score %s\n", optarg); + if ( *tmp ) error("Could not parse: -M, --min-score %s\n", optarg); break; case 'h': case '?': diff --git a/plugins/trio-stats.c b/plugins/trio-stats.c index aabed2a11..a15757fc8 100644 --- a/plugins/trio-stats.c +++ b/plugins/trio-stats.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2018-2019 Genome Research Ltd. + Copyright (c) 2018-2020 Genome Research Ltd. Author: Petr Danecek @@ -133,8 +133,8 @@ static const char *usage_text(void) " -a, --alt-trios INT for transmission rate consider only sites with at most this\n" " many alternate trios, 0 for unlimited [0]\n" " -d, --debug TYPE comma-separted list of features: {mendel-errors,transmitted}\n" - " -e, --exclude EXPR exclude sites and samples for which the expression is true\n" - " -i, --include EXPR include sites and samples for which the expression is true\n" + " -e, --exclude EXPR exclude trios for which the expression is true (one matching sample invalidates a trio)\n" + " -i, --include EXPR include trios for which the expression is true (one failing sample invalidates a trio)\n" " -o, --output FILE output file name [stdout]\n" " -p, --ped FILE PED file\n" " -P, --pfm P,F,M sample names of proband, father, and mother\n" diff --git a/test/test.pl b/test/test.pl index d59770a3c..8fade275b 100755 --- a/test/test.pl +++ b/test/test.pl @@ -480,9 +480,18 @@ test_vcf_plugin($opts,in=>'contrast',out=>'contrast.out',cmd=>'+contrast',args=>'-a PASSOC,FASSOC,NOVELAL,NOVELGT -0 {PATH}/contrast0.txt -1 {PATH}/contrast1.txt'); test_vcf_plugin($opts,in=>'contrast.1',out=>'contrast.1.1.out',cmd=>'+contrast',args=>'-a NOVELAL,NOVELGT -0 A -1 B'); test_vcf_plugin($opts,in=>'contrast.1',out=>'contrast.1.2.out',cmd=>'+contrast',args=>'-a NOVELGT -0 A -1 B'); 
-test_vcf_plugin($opts,in=>'trio-dnm.1',out=>'trio-dnm.1.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother | $$opts{bin}/bcftools query -f'%CHROM[\\t%DNM]\\t[\\t%VAF]\\n'"); -test_vcf_plugin($opts,in=>'trio-dnm.2',out=>'trio-dnm.1.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother --force-AD | $$opts{bin}/bcftools query -f'%CHROM[\\t%DNM]\\t[\\t%VAF]\\n'"); -test_vcf_plugin($opts,in=>'trio-dnm.2',out=>'trio-dnm.2.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother | $$opts{bin}/bcftools query -f'%CHROM[\\t%DNM]\\t[\\t%VAF]\\n'"); +test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.1',out=>'trio-dnm/trio-dnm.1.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother -u ppl -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); +test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.2',out=>'trio-dnm/trio-dnm.1.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother -u ppl -u tag=DNM:log --force-AD | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); +test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.4',out=>'trio-dnm/trio-dnm.4.1.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother -u DNG | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); +test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.4',out=>'trio-dnm/trio-dnm.4.1.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); +test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.4',out=>'trio-dnm/trio-dnm.4.2.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother -u DNG -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); +test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.4',out=>'trio-dnm/trio-dnm.4.2.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); +test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.5',out=>'trio-dnm/trio-dnm.5.1.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother -u DNG -u tag=DNM:log | $$opts{bin}/bcftools query 
-f'[\\t%DNM]\\t[\\t%VAF]\\n'"); +test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.5',out=>'trio-dnm/trio-dnm.5.1.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); +test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.6',out=>'trio-dnm/trio-dnm.6.1.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother -u DNG -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); # incorrect miss by DNG +test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.6',out=>'trio-dnm/trio-dnm.6.2.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); +test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.7',out=>'trio-dnm/trio-dnm.7.1.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother -u DNG -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); # incorrect miss, low PL +test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.7',out=>'trio-dnm/trio-dnm.7.1.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); test_vcf_plugin($opts,in=>'gvcfz',out=>'gvcfz.1.out',cmd=>'+gvcfz',args=>qq[-g 'PASS:GT!="alt"' -a | $$opts{bin}/bcftools query -f'%POS\\t%REF\\t%ALT\\t%END[\\t%GT][\\t%DP][\\t%GQ][\\t%RGQ]\\n']); test_vcf_plugin($opts,in=>'gvcfz',out=>'gvcfz.2.out',cmd=>'+gvcfz',args=>qq[-g 'PASS:GQ>10; FLT:-' -a | $$opts{bin}/bcftools query -f'%POS\\t%REF\\t%ALT\\t%FILTER\\t%END[\\t%GT][\\t%DP][\\t%GQ][\\t%RGQ]\\n']); test_vcf_plugin($opts,in=>'gvcfz.2',out=>'gvcfz.2.1.out',cmd=>'+gvcfz',args=>qq[-g 'PASS:GT!="alt"' -a | $$opts{bin}/bcftools query -f'%POS\\t%REF\\t%ALT\\t%FILTER\\t%END[\\t%GT][\\t%DP]\\n']); @@ -905,6 +914,7 @@ sub bgzip_tabix { my ($opts,%args) = @_; my $file = "$args{file}.$args{suffix}"; + if ( $file=~m{/[^/]+} ) { cmd("mkdir -p $$opts{tmp}/$`"); } # create a subdirectory if necessary if ( $$opts{redo_outputs} or !-e "$$opts{tmp}/$file.gz" or 
is_file_newer("$$opts{path}/$file","$$opts{tmp}/$file.gz") ) { cmd("cat $$opts{path}/$file | $$opts{bgzip} -c > $$opts{tmp}/$file.gz"); diff --git a/test/trio-dnm.1.out b/test/trio-dnm.1.out deleted file mode 100644 index 2d21661ee..000000000 --- a/test/trio-dnm.1.out +++ /dev/null @@ -1,18 +0,0 @@ -1 100 . . 30 0 0 -1 98 . . 17 0 0 -1 99 . . 16 0 0 -1 98 . . 18 0 0 -1 92 . . 17 0 0 -1 98 . . 11 0 0 -2 5 . . 71 0 0 -2 6 . . 56 0 0 -2 5 . . 50 0 0 -3 3 . . 56 0 43 -3 3 . . 54 51 0 -3 3 . . 52 0 60 -3 0 . . 3 0 0 -3 3 . . 49 45 0 -3 3 . . 56 0 47 -3 3 . . 55 0 52 -3 3 . . 53 0 39 -3 3 . . 38 50 0 diff --git a/test/trio-dnm.2.out b/test/trio-dnm.2.out deleted file mode 100644 index f9d2242e5..000000000 --- a/test/trio-dnm.2.out +++ /dev/null @@ -1,18 +0,0 @@ -1 100 . . . . . -1 98 . . . . . -1 99 . . . . . -1 98 . . . . . -1 92 . . . . . -1 98 . . . . . -2 5 . . . . . -2 6 . . . . . -2 5 . . . . . -3 3 . . . . . -3 3 . . . . . -3 3 . . . . . -3 0 . . . . . -3 3 . . . . . -3 3 . . . . . -3 3 . . . . . -3 3 . . . . . -3 3 . . . . . diff --git a/test/trio-dnm/trio-dnm.1.out b/test/trio-dnm/trio-dnm.1.out new file mode 100644 index 000000000..73b07e225 --- /dev/null +++ b/test/trio-dnm/trio-dnm.1.out @@ -0,0 +1,18 @@ + -1.67989e-06 . . 30 0 0 + -2.23809e-06 . . 17 0 0 + -1.82511e-06 . . 16 0 0 + -2.23809e-06 . . 18 0 0 + -4.98174e-05 . . 17 0 0 + -2.23809e-06 . . 11 0 0 + -9.18309 . . 71 0 0 + -8.67535 . . 56 0 0 + -9.11616 . . 50 0 0 + -inf . . 56 0 43 + -inf . . 50 50 0 + -inf . . 50 0 50 + -inf . . 3 0 0 + -inf . . 50 50 0 + -inf . . 56 0 47 + -inf . . 50 0 50 + -inf . . 53 0 39 + -20.8357 . . 
38 50 0 diff --git a/test/trio-dnm/trio-dnm.1.vcf b/test/trio-dnm/trio-dnm.1.vcf new file mode 100644 index 000000000..7d15610dd --- /dev/null +++ b/test/trio-dnm/trio-dnm.1.vcf @@ -0,0 +1,31 @@ +##fileformat=VCFv4.2 +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##contig= +##contig= +##contig= +##reference=file:///lustre/scratch113/resources/ref/Homo_sapiens/1000Genomes_hs37d5/hs37d5.fa +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT proband father mother +1 1 . G A,T . . TP GT:AD:PL 0/2:62,0,27:733,919,3060,0,2141,2060 0/0:35,0,0:0,99,1485,99,1485,1485 0/0:38,0,0:0,102,1530,102,1530,1530 +1 2 . G T,A . . TP GT:AD:PL 0/1:67,14,0:257,0,2105,458,2146,2605 0/0:36,0,0:0,99,1485,99,1485,1485 0/0:36,0,0:0,99,1485,99,1485,1485 +1 3 . G A . . TP GT:AD:PL 0/1:71,14:241,0,2314 0/0:45,0:0,101,1530 0/0:43,0:0,99,1485 +1 4 . C T . . TP GT:AD:PL 0/1:111,24:504,0,3776 0/0:35,0:0,99,1485 0/0:36,0:0,99,1485 +1 5 . C A . . TP GT:AD:PL 0/1:30,6:124,0,981 0/0:37,0:0,99,1485 0/0:32,0:0,90,1350 +1 8 . A G . . TP GT:AD:PL 0/1:434,52:859,0,18086 0/0:38,0:0,99,1485 0/0:38,0:0,99,1485 +2 1 . A G . . UN GT:AD:PL 0/1:2,5:179,0,55 0/0:7,0:0,0,180 0/0:4,0:0,12,148 +2 2 . A G . . UN GT:AD:PL 0/1:4,5:159,0,126 0/0:1,0:0,3,39 0/0:4,0:0,9,135 +2 3 . A G . . UN GT:AD:PL 0/1:4,4:137,0,107 0/0:6,0:0,18,213 0/0:8,0:0,0,232 +3 1 . A G . . FP GT:AD:PL 0/1:7,9:357,0,408 0/0:15,0:0,39,585 0/1:4,3:114,0,550 +3 2 . C A . . FP GT:AD:PL 0/1:13,15:453,0,442 0/1:29,30:913,0,1011 0/0:39,0:0,99,1485 +3 3 . A G . . FP GT:AD:PL 0/1:11,12:361,0,358 0/0:21,0:0,51,765 0/1:10,15:538,0,292 +3 4 . A G,C . . FP GT:PL:AD 0/0:0,255,255,255,255,255:306,11,0 0/0:0,255,255,255,255,255:328,1,1 0/0:0,255,255,255,255,255:318,0,0 +3 5 . A G . . FP GT:AD:PL 0/1:33,32:890,0,963 0/1:56,45:1328,0,1809 0/0:36,0:0,99,1485 +3 6 . A G . . FP GT:AD:PL 0/1:19,24:737,0,649 0/0:48,0:0,108,1620 0/1:25,22:644,0,836 +3 7 . A G . . FP GT:AD:PL 0/1:73,90:2864,0,2197 0/0:42,0:0,99,1485 0/1:69,74:2395,0,2064 +3 8 . A G . . 
FP GT:AD:PL 0/1:115,128:4130,0,3542 0/0:34,0:0,99,1360 0/1:137,89:2571,0,4411 +3 9 . A G . . FP GT:AD:PL 0/1:18,11:311,0,627 0/1:3,3:51,0,105 0/0:19,0:0,57,764 diff --git a/test/trio-dnm/trio-dnm.2.vcf b/test/trio-dnm/trio-dnm.2.vcf new file mode 100644 index 000000000..a36f5d0ff --- /dev/null +++ b/test/trio-dnm/trio-dnm.2.vcf @@ -0,0 +1,31 @@ +##fileformat=VCFv4.2 +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##contig= +##contig= +##contig= +##reference=file:///lustre/scratch113/resources/ref/Homo_sapiens/1000Genomes_hs37d5/hs37d5.fa +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT proband father mother +1 1 . G A,T . . TP GT:AD:PL 0/2:62,0,27,0:733,919,3060,0,2141,2060 0/0:35,0,0:0,99,1485,99,1485,1485 0/0:38,0,0:0,102,1530,102,1530,1530 +1 2 . G T,A . . TP GT:AD:PL 0/1:67,14,0,0:257,0,2105,458,2146,2605 0/0:36,0,0:0,99,1485,99,1485,1485 0/0:36,0,0:0,99,1485,99,1485,1485 +1 3 . G A . . TP GT:AD:PL 0/1:71,14,0:241,0,2314 0/0:45,0:0,101,1530 0/0:43,0:0,99,1485 +1 4 . C T . . TP GT:AD:PL 0/1:111,24,0:504,0,3776 0/0:35,0:0,99,1485 0/0:36,0:0,99,1485 +1 5 . C A . . TP GT:AD:PL 0/1:30,6,0:124,0,981 0/0:37,0:0,99,1485 0/0:32,0:0,90,1350 +1 8 . A G . . TP GT:AD:PL 0/1:434,52,0:859,0,18086 0/0:38,0:0,99,1485 0/0:38,0:0,99,1485 +2 1 . A G . . UN GT:AD:PL 0/1:2,5,0:179,0,55 0/0:7,0:0,0,180 0/0:4,0:0,12,148 +2 2 . A G . . UN GT:AD:PL 0/1:4,5,0:159,0,126 0/0:1,0:0,3,39 0/0:4,0:0,9,135 +2 3 . A G . . UN GT:AD:PL 0/1:4,4,0:137,0,107 0/0:6,0:0,18,213 0/0:8,0:0,0,232 +3 1 . A G . . FP GT:AD:PL 0/1:7,9,0,0:357,0,408 0/0:15,0:0,39,585 0/1:4,3:114,0,550 +3 2 . C A . . FP GT:AD:PL 0/1:13,15,0:453,0,442 0/1:29,30:913,0,1011 0/0:39,0:0,99,1485 +3 3 . A G . . FP GT:AD:PL 0/1:11,12,0:361,0,358 0/0:21,0:0,51,765 0/1:10,15:538,0,292 +3 4 . A G,C . . FP GT:PL:AD 0/0:0,255,255,255,255,255:306,11,0,0 0/0:0,255,255,255,255,255:328,1,1 0/0:0,255,255,255,255,255:318,0,0 +3 5 . A G . . FP GT:AD:PL 0/1:33,32,0:890,0,963 0/1:56,45:1328,0,1809 0/0:36,0:0,99,1485 +3 6 . A G . 
. FP GT:AD:PL 0/1:19,24,0:737,0,649 0/0:48,0:0,108,1620 0/1:25,22:644,0,836 +3 7 . A G . . FP GT:AD:PL 0/1:73,90,0:2864,0,2197 0/0:42,0:0,99,1485 0/1:69,74:2395,0,2064 +3 8 . A G . . FP GT:AD:PL 0/1:115,128,0:4130,0,3542 0/0:34,0:0,99,1360 0/1:137,89:2571,0,4411 +3 9 . A G . . FP GT:AD:PL 0/1:18,11,0:311,0,627 0/1:3,3:51,0,105 0/0:19,0:0,57,764 diff --git a/test/trio-dnm/trio-dnm.4.1.out b/test/trio-dnm/trio-dnm.4.1.out new file mode 100644 index 000000000..817198513 --- /dev/null +++ b/test/trio-dnm/trio-dnm.4.1.out @@ -0,0 +1 @@ + 255 . . 55 0 0 diff --git a/test/trio-dnm/trio-dnm.4.2.out b/test/trio-dnm/trio-dnm.4.2.out new file mode 100644 index 000000000..77de513cb --- /dev/null +++ b/test/trio-dnm/trio-dnm.4.2.out @@ -0,0 +1 @@ + 0 . . 55 0 0 diff --git a/test/trio-dnm/trio-dnm.4.vcf b/test/trio-dnm/trio-dnm.4.vcf new file mode 100644 index 000000000..957773571 --- /dev/null +++ b/test/trio-dnm/trio-dnm.4.vcf @@ -0,0 +1,10 @@ +##fileformat=VCFv4.2 +##reference=file:///lustre/scratch116/vr/ref/human/GRCh37/hs37d5.fa +##contig= +##ALT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT proband mother father +1 94466455 . C T,<*> 0 . . PL:DP:AD:QS 255,0,255,255,255,255:87:39,48,0:1278,1649,0 0,196,255,196,255,255:65:65,0,0:2151,0,0 0,202,255,202,255,255:67:67,0,0:2211,0,0 diff --git a/test/trio-dnm/trio-dnm.5.1.out b/test/trio-dnm/trio-dnm.5.1.out new file mode 100644 index 000000000..641c007f5 --- /dev/null +++ b/test/trio-dnm/trio-dnm.5.1.out @@ -0,0 +1 @@ + -3.26266 . . 14 0 0 diff --git a/test/trio-dnm/trio-dnm.5.vcf b/test/trio-dnm/trio-dnm.5.vcf new file mode 100644 index 000000000..b31ac1ee6 --- /dev/null +++ b/test/trio-dnm/trio-dnm.5.vcf @@ -0,0 +1,10 @@ +##fileformat=VCFv4.2 +##reference=file:///lustre/scratch116/vr/ref/human/GRCh37/hs37d5.fa +##contig= +##ALT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT proband mother father +19 1393268 . C T,A,<*> 0 . . 
PL:DP:AD:QS 66,0,255,159,255,255,159,255,255,255:36:31,5,0,0:1063,218,0,0 0,202,255,173,255,255,202,255,255,255:68:67,0,1,0:2481,0,32,0 0,138,255,138,255,255,138,255,255,255:46:46,0,0,0:1707,0,0,0 diff --git a/test/trio-dnm/trio-dnm.6.1.out b/test/trio-dnm/trio-dnm.6.1.out new file mode 100644 index 000000000..318f7f294 --- /dev/null +++ b/test/trio-dnm/trio-dnm.6.1.out @@ -0,0 +1 @@ + -5.94778 . . 70 0 0 diff --git a/test/trio-dnm/trio-dnm.6.2.out b/test/trio-dnm/trio-dnm.6.2.out new file mode 100644 index 000000000..717b76aa5 --- /dev/null +++ b/test/trio-dnm/trio-dnm.6.2.out @@ -0,0 +1 @@ + -3.16223e-05 . . 70 0 0 diff --git a/test/trio-dnm/trio-dnm.6.vcf b/test/trio-dnm/trio-dnm.6.vcf new file mode 100644 index 000000000..6a88b1ddd --- /dev/null +++ b/test/trio-dnm/trio-dnm.6.vcf @@ -0,0 +1,11 @@ +##fileformat=VCFv4.2 +##reference=file:///lustre/scratch116/vr/ref/human/GRCh37/hs37d5.fa +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT proband mother father +19 33584985 . C G,<*> 0 . . PL:DP:ADF:ADR:AD:QS 125,0,54,134,75,189:10:2,7,0:1,0,0:3,7,0:85,243,0 0,18,160,18,160,160:6:5,0,0:1,0,0:6,0,0:213,0,0 0,45,231,45,231,231:15:13,0,0:2,0,0:15,0,0:487,0,0 diff --git a/test/trio-dnm/trio-dnm.7.1.out b/test/trio-dnm/trio-dnm.7.1.out new file mode 100644 index 000000000..0a01ada90 --- /dev/null +++ b/test/trio-dnm/trio-dnm.7.1.out @@ -0,0 +1 @@ + -15.1971 . . 19 0 0 diff --git a/test/trio-dnm/trio-dnm.7.vcf b/test/trio-dnm/trio-dnm.7.vcf new file mode 100644 index 000000000..79d544fc7 --- /dev/null +++ b/test/trio-dnm/trio-dnm.7.vcf @@ -0,0 +1,11 @@ +##fileformat=VCFv4.2 +##reference=file:///lustre/scratch116/vr/ref/human/GRCh37/hs37d5.fa +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT proband mother father +1 103484311 . T G,<*> 0 . . 
PL:DP:ADF:ADR:AD:QS 14,0,97,80,112,143:27:22,5,0:0,0,0:22,5,0:424,111,0 0,54,133,54,133,133:18:18,0,0:0,0,0:18,0,0:375,0,0 0,60,160,60,160,160:20:20,0,0:0,0,0:20,0,0:565,0,0 diff --git a/test/trio-dnm/trio-dnm.8.vcf b/test/trio-dnm/trio-dnm.8.vcf new file mode 100644 index 000000000..f4c0fb808 --- /dev/null +++ b/test/trio-dnm/trio-dnm.8.vcf @@ -0,0 +1,11 @@ +##fileformat=VCFv4.2 +##reference=file:///lustre/scratch116/vr/ref/human/GRCh37/hs37d5.fa +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT proband mother father +9 84010138 . C G,A,<*> 0 . . PL:DP:ADF:ADR:AD:AQ 255,0,255,255,255,255,255,255,255,255:193:38,40,0,0:51,64,0,0:89,104,0,0:3318,3653,0,0 0,255,255,255,255,255,255,255,255,255:192:79,0,0,0:112,0,1,0:191,0,1,0:7064,0,15,0 0,121,255,255,255,255,255,255,255,255:154:44,4,0,0:98,8,0,0:142,12,0,0:5369,416,0,0 diff --git a/test/trio-dnm/trio-dnm.9.vcf b/test/trio-dnm/trio-dnm.9.vcf new file mode 100644 index 000000000..f1eb849f8 --- /dev/null +++ b/test/trio-dnm/trio-dnm.9.vcf @@ -0,0 +1,11 @@ +##fileformat=VCFv4.2 +##reference=file:///lustre/scratch116/vr/ref/human/GRCh37/hs37d5.fa +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT proband mother father +8 609051 . G C,A,<*> 0 . . PL:DP:ADF:ADR:AD:AQ 82,0,66,112,81,159,112,81,159,159:15:6,2,0,0:4,3,0,0:10,5,0,0:136,136,0,0 17,47,91,0,78,63,47,91,78,91:15:7,0,5,0:3,0,0,0:10,0,5,0:133,0,62,0 0,20,125,27,128,125,27,128,125,125:10:5,0,0,0:4,1,0,0:9,1,0,0:167,4,0,0 From ff5dc521d91420e797aa745a8177763be13692b3 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Mon, 30 Nov 2020 21:57:33 +0000 Subject: [PATCH 24/81] merge_format_string: Enlarge array for BCF_VL_A/BCF_VL_R too Fixes #1353. Lengthen strings in merge.6.b.vcf to such an extent that merge_format_string() crashes on this test case without this fix. 
--- test/merge.6.a.vcf | 4 ++-- test/merge.6.b.vcf | 2 +- test/merge.6.out | 4 ++-- vcfmerge.c | 1 + 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/test/merge.6.a.vcf b/test/merge.6.a.vcf index a46641a8c..9367de25d 100644 --- a/test/merge.6.a.vcf +++ b/test/merge.6.a.vcf @@ -5,5 +5,5 @@ ##contig= ##reference=file:///home/dnanexus/genome.fa #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT t1 -1 11080563 . A C . . . GT:RFS:AFS 0/1:a,c:c -1 11080564 . A C . . . GT:RFS:AFS 0/1:a,c:c +1 11080563 . A C . . . GT:RFS:AFS 0/1:aa,c:c +1 11080564 . A C . . . GT:RFS:AFS 0/1:a,cc:c diff --git a/test/merge.6.b.vcf b/test/merge.6.b.vcf index fd760300e..3338c9258 100644 --- a/test/merge.6.b.vcf +++ b/test/merge.6.b.vcf @@ -7,4 +7,4 @@ ##reference=file:///home/dnanexus/genome.fa #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT t2 1 11080563 . A C . . . GT:FFS:AFS:RFS 0/1:xx,yy:C:A,C -1 11080564 . A T . . . GT:FFS:AFS:RFS 0/1:xx,yy:T:A,T +1 11080564 . A T . . . GT:FFS:AFS:RFS 0/1:xx,yy:TTTTATGCATGCATGCTTTTTTTTATGCATGCATGCTTTT:A,T diff --git a/test/merge.6.out b/test/merge.6.out index 23c536fb8..112d4b494 100644 --- a/test/merge.6.out +++ b/test/merge.6.out @@ -7,5 +7,5 @@ ##reference=file:///home/dnanexus/genome.fa ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT t1 t2 -1 11080563 . A C . . . GT:RFS:AFS:FFS 0/1:a,c:c:. 0/1:A,C:C:xx,yy -1 11080564 . A C,T . . . GT:RFS:AFS:FFS 0/1:a,c,.:c,.:. 0/2:A,.,T:.,T:xx,yy +1 11080563 . A C . . . GT:RFS:AFS:FFS 0/1:aa,c:c:. 0/1:A,C:C:xx,yy +1 11080564 . A C,T . . . GT:RFS:AFS:FFS 0/1:a,cc,.:c,.:. 0/2:A,.,T:.,TTTTATGCATGCATGCTTTTTTTTATGCATGCATGCTTTT:xx,yy diff --git a/vcfmerge.c b/vcfmerge.c index 7c79ada3a..fd8af8d3a 100644 --- a/vcfmerge.c +++ b/vcfmerge.c @@ -1726,6 +1726,7 @@ void merge_format_string(args_t *args, const char *key, bcf_fmt_t **fmt_map, bcf int ret = copy_string_field(src, iori - ifrom, fmt_ori->size, str, inew); if ( ret<-1 ) error("[E::%s] fixme: internal error at %s:%"PRId64" .. 
%d\n",__func__,bcf_seqname(hdr,line),(int64_t) line->pos+1,ret); } + if ( nmax < str->l ) nmax = str->l; src += fmt_ori->size; } continue; From 1fdc54162a99b3e433f633788042124a102a9db8 Mon Sep 17 00:00:00 2001 From: Nicholas Knoblauch Date: Mon, 30 Nov 2020 17:49:50 -0600 Subject: [PATCH 25/81] remove error before switch when fmt->type!=int8_t --- filter.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/filter.c b/filter.c index e375fb2e3..a8ee74367 100644 --- a/filter.c +++ b/filter.c @@ -1060,8 +1060,7 @@ static void filters_set_nmissing(filter_t *flt, bcf1_t *line, token_t *tok) tok->nvalues = 0; return; } - if ( fmt->type!=BCF_BT_INT8 ) error("TODO: the GT fmt_type is not int8\n"); - + int j,nmissing = 0; #define BRANCH(type_t, is_vector_end) { \ for (i=0; in_sample; i++) \ From 0f4aed2936fdfc9df7f81e94ea8f22071acf236d Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Mon, 7 Dec 2020 14:00:13 +0000 Subject: [PATCH 26/81] [NEWS] Revamp of `bcftools call -G` Sample grouping by population was not truly independent and could still be influenced by the presence of other sample groups. With this commit, -G should work as expected. 
Other changes are: - Optional addition of INFO/PV4 annotation with `call -a INFO/PV4` - Remove generation of useless HOB and ICB annotation; use +fill-tags -- -t HWE,ExcHet` instead - The `call -f` option was renamed to `-a` to (1) make it consistent with `mpileup` and (2) to indicate that it includes both INFO and FORMAT annotations, not just FORMAT as previously - Any sensible Number=R,Type=Integer annotation can be used with -G, such as AD or QS - Don't trim QUAL; although usefuleness of this change is questionable for true probabilistic interpretation (such high precision is unrealistic), using QUAL as a score rather than probability *is* helpful and permits more fine-grained filtering - Fix a suspected bug in `call -F` in the worst case, for certain improve readability - Note that `call -C trio` is temporarily disabled by this commit Fixes #1332 --- call.h | 34 +- ccall.c | 4 +- doc/bcftools.1 | 38 +- doc/bcftools.html | 34 +- doc/bcftools.txt | 10 +- mcall.c | 777 +++++++++++++++++------------------- test/call-G.1.out | 17 + test/call-G.2.1.out | 27 ++ test/call-G.2.out | 17 + test/call-G.2.vcf | 24 ++ test/call-G.vcf | 13 + test/call.af-fixation.1.out | 13 + test/call.af-fixation.2.out | 13 + test/call.af-fixation.3.out | 15 + test/call.af-fixation.txt | 35 ++ test/call.af-fixation.vcf | 9 + test/check.chk | 22 +- test/check_merge.chk | 20 +- test/mpileup.1.out | 24 +- test/mpileup.2.out | 24 +- test/mpileup.3.out | 24 +- test/mpileup.4.out | 24 +- test/mpileup.5.out | 24 +- test/mpileup.X.2.out | 24 +- test/mpileup.X.out | 24 +- test/mpileup.cAls.2.out | 12 +- test/mpileup.cAls.3.out | 4 +- test/mpileup.cAls.4.out | 6 +- test/mpileup.cAls.5.out | 4 +- test/mpileup.cAls.6.out | 6 +- test/mpileup.cAls.7.out | 8 +- test/mpileup.cAls.out | 20 +- test/mpileup.cals.8.out | 4 +- test/mpileup.cals.9.out | 2 - test/mpileup.hwe.1.out | 4 +- test/mpileup.hwe.1b.out | 25 ++ test/mpileup.hwe.2.out | 4 +- test/mpileup.hwe.3.out | 6 +- test/mpileup.hwe.4.out | 4 +- 
test/test.pl | 12 +- test/trio-dnm.1.vcf | 31 -- test/trio-dnm.2.vcf | 31 -- vcfcall.c | 90 +++-- 43 files changed, 852 insertions(+), 711 deletions(-) create mode 100644 test/call-G.1.out create mode 100644 test/call-G.2.1.out create mode 100644 test/call-G.2.out create mode 100644 test/call-G.2.vcf create mode 100644 test/call-G.vcf create mode 100644 test/call.af-fixation.1.out create mode 100644 test/call.af-fixation.2.out create mode 100644 test/call.af-fixation.3.out create mode 100644 test/call.af-fixation.txt create mode 100644 test/call.af-fixation.vcf create mode 100644 test/mpileup.hwe.1b.out delete mode 100644 test/trio-dnm.1.vcf delete mode 100644 test/trio-dnm.2.vcf diff --git a/call.h b/call.h index 50e4815ab..4ea5443c3 100644 --- a/call.h +++ b/call.h @@ -34,7 +34,7 @@ THE SOFTWARE. */ #define CALL_CONSTR_TRIO (1<<2) #define CALL_CONSTR_ALLELES (1<<3) // -// +#define CALL_FMT_PV4 (1<<5) #define CALL_FMT_GQ (1<<6) #define CALL_FMT_GP (1<<7) @@ -52,18 +52,13 @@ family_t; // For the single-sample and grouped -G calling typedef struct { + double ref_lk, max_lk, lk_sum; float *qsum; // QS(quality sum) values - int nqsum, dp; - double fa,fb,fc,fa2,fb2,fc2,fab,fac,fbc; -} -grp1_t; -typedef struct -{ - grp1_t *grp; - int ngrp; - int *smpl2grp; + int nqsum; + uint32_t *smpl, nsmpl; + uint32_t nals, als; } -grp_t; +smpl_grp_t; // For the `-C alleles -i` constrained calling typedef struct @@ -82,6 +77,7 @@ typedef struct int *pl_map, npl_map; // same as above for PLs, but reverse (new -> old) char **als; // array to hold the trimmed set of alleles to appear on output int nals; // size of the als array + int als_new, nals_new; // bitmask with final alleles and their number family_t *fams; // list of families and samples for trio calling int nfams, mfams; int ntrio[5][5]; // possible trio genotype combinations and their counts; first idx: @@ -96,18 +92,16 @@ typedef struct int32_t *ugts, *cgts; // unconstraind and constrained GTs uint32_t output_tags; char 
*prior_AN, *prior_AC; // reference panel AF tags (AF=AC/AN) - tgt_als_t *tgt_als; // for CALL_CONSTR_ALLELES - char *sample_groups; // for single-sample or grouped calling with -G - grp_t smpl_grp; - float *qsum; - int nqsum; + tgt_als_t *tgt_als; // for CALL_CONSTR_ALLELES + char *sample_groups; // for single-sample or grouped calling with -G + char *sample_groups_tag; // for -G [AD|QS:] + smpl_grp_t *smpl_grp; + int nsmpl_grp; // ccall only double indel_frac, min_perm_p, min_lrt; double prior_type, pref; - double ref_lk, lk_sum; int ngrp1_samples, n_perm; - int nhets, ndiploid; char *prior_file; ccall_t *cdat; @@ -149,7 +143,7 @@ void qcall_destroy(call_t *call); void call_init_pl2p(call_t *call); uint32_t *call_trio_prep(int is_x, int is_son); -void init_allele_trimming_maps(call_t *call, int als, int nals); -void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als); +void init_allele_trimming_maps(call_t *call, int nals_ori, int als_out); +void mcall_trim_and_update_numberR(call_t *call, bcf1_t *rec, int nals_ori, int nals_new); #endif diff --git a/ccall.c b/ccall.c index 126ed92d0..6bf987b69 100644 --- a/ccall.c +++ b/ccall.c @@ -303,8 +303,8 @@ static int update_bcf1(call_t *call, bcf1_t *rec, const bcf_p1rst_t *pr, double // trim Number=R tags int out_als = 0; for (i=0; i -.\" Date: 2020-09-22 +.\" Date: 2020-11-25 16:08 GMT .\" Manual: \ \& .\" Source: \ \& .\" Language: English .\" -.TH "BCFTOOLS" "1" "2020\-09\-22" "\ \&" "\ \&" +.TH "BCFTOOLS" "1" "2020\-11\-25 16:08 GMT" "\ \&" "\ \&" .\" ----------------------------------------------------------------- .\" * Define some portability stuff .\" ----------------------------------------------------------------- @@ -41,7 +41,7 @@ Most commands accept VCF, bgzipped VCF and BCF with filetype detected automatica BCFtools is designed to work on a stream\&. It regards an input file "\-" as the standard input (stdin) and outputs to the standard output (stdout)\&. 
Several commands can thus be combined with Unix pipes\&. .SS "VERSION" .sp -This manual page was last updated \fB2020\-09\-22\fR and refers to bcftools git version \fB1\&.11\fR\&. +This manual page was last updated \fB2020\-11\-25 16:08 GMT\fR and refers to bcftools git version \fB1\&.11\-24\-g9718479+\fR\&. .SS "BCF1" .sp The BCF1 format output by versions of samtools <= 0\&.1\&.19 is \fBnot\fR compatible with this version of bcftools\&. To read BCF1 files one can use the view command from old versions of bcftools packaged with samtools versions <= 0\&.1\&.19 to convert to VCF, which can then be read by this version of bcftools\&. @@ -664,6 +664,15 @@ See also the option\&. .RE .PP +\fB\-C, \-\-columns\-file\fR \fIfile\fR +.RS 4 +Read the list of columns from a file (normally given via the +\fB\-c, \-\-columns\fR +option)\&. "\-" to skip a column of the annotation file\&. One column name per row, an additional space\- or tab\-separated field can be present to indicate the merge logic (normally given via the +\fB\-l, \-\-merge\-logic\fR +option)\&. This is useful when many annotations are added at once\&. +.RE +.PP \fB\-e, \-\-exclude\fR \fIEXPRESSION\fR .RS 4 exclude sites for which @@ -730,11 +739,11 @@ and expressions instead of discarding them .RE .PP -\fB\-l, \-\-merge\-logic\fR \fItag\fR:\*(Aqfirst\*(Aq|\fIappend\fR|\fIunique\fR|\fIsum\fR|\fIavg\fR|\fImin\fR|\fImax\fR[,\&...] +\fB\-l, \-\-merge\-logic\fR \fItag\fR:\*(Aqfirst\*(Aq|\fIappend\fR|\fIappend\-missing\fR|\fIunique\fR|\fIsum\fR|\fIavg\fR|\fImin\fR|\fImax\fR[,\&...] 
.RS 4 if multiple regions overlap a single record, the option defines how to treat multiple annotation values when setting \fItag\fR -in the destination file: use the first encountered value ignoring the rest (\fIfirst\fR); append allowing duplicates (\fIappend\fR); append discarding duplicate values (\fIunique\fR); sum the values (\fIsum\fR, numeric fields only); average the values (\fIavg\fR); use the minimum value (\fImin\fR) or the maximum (\fImax\fR)\&. +in the destination file: use the first encountered value ignoring the rest (\fIfirst\fR); append allowing duplicates (\fIappend\fR); append even if the appended value is missing, i\&.e\&. is a dot (\fIappend\-missing\fR); append discarding duplicate values (\fIunique\fR); sum the values (\fIsum\fR, numeric fields only); average the values (\fIavg\fR); use the minimum value (\fImin\fR) or the maximum (\fImax\fR)\&. Note that this option is intended for use with BED or TAB\-delimited annotation files only\&. Moreover, it is effective only when either \fIREF\fR @@ -787,6 +796,12 @@ see \fBCommon Options\fR .RE .PP +\fB\-\-rename\-annots\fR \fIfile\fR +.RS 4 +rename annotations according to the map in +\fIfile\fR, with "old_name new_name\en" pairs separated by whitespaces, each on a separate line\&. The old name must be prefixed with the annotation type: INFO, FORMAT, or FILTER\&. +.RE +.PP \fB\-\-rename\-chrs\fR \fIfile\fR .RS 4 rename chromosomes according to the map in @@ -976,7 +991,7 @@ output all alternate alleles present in the alignments even if they do not appea .PP \fB\-f, \-\-format\-fields\fR \fIlist\fR .RS 4 -comma\-separated list of FORMAT fields to output for each sample\&. Currently GQ and GP fields are supported\&. For convenience, the fields can be given as lower case letters\&. +comma\-separated list of FORMAT fields to output for each sample\&. Currently GQ and GP fields are supported\&. For convenience, the fields can be given as lower case letters\&. 
Prefixed with "^" indicates a request for tag removal of auxiliary tags useful only for calling\&. .RE .PP \fB\-F, \-\-prior\-freqs\fR \fIAN\fR,\fIAC\fR @@ -1015,11 +1030,10 @@ is a tab\-delimited text file with sample names in the first column and group na \fI\-\fR is given instead, no HWE assumption is made at all and single\-sample calling is performed\&. (Note that in low coverage data this inflates the rate of false positives\&.) The \fB\-G\fR -option requires the presence of FORMAT/AD generated at the -\fBbcftools mpileup\fR -step by providing the -\fB\-a FMT/AD\fR -option\&. +option requires the presence of per\-sample FORMAT/QS or FORMAT/AD tag generated with +\fBbcftools mpileup \-a QS\fR +(or +\fB\-a AD\fR)\&. .RE .PP \fB\-g, \-\-gvcf\fR \fIINT\fR @@ -2279,7 +2293,7 @@ Homozygous genotypes only, useful with low coverage data (requires .PP \fB\-\-n\-matches\fR \fIINT\fR .RS 4 -Print only top INT matches for each sample, 0 for unlimited\&. Use negative value to sort by HWE probability rather than the number of discordant sites\&. +Print only top INT matches for each sample, 0 for unlimited\&. Use negative value to sort by HWE probability rather than the number of discordant sites\&. Note that average score is used to determine the top matches, not absolute values\&. .RE .PP \fB\-\-no\-HWE\-prob\fR diff --git a/doc/bcftools.html b/doc/bcftools.html index fd918301a..95b873b91 100644 --- a/doc/bcftools.html +++ b/doc/bcftools.html @@ -1,6 +1,6 @@ -bcftools

Name

bcftools — utilities for variant calling and manipulating VCFs and BCFs.

Synopsis

bcftools [--version|--version-only] [--help] [COMMAND] [OPTIONS]

DESCRIPTION

BCFtools is a set of utilities that manipulate variant calls in the Variant +bcftools

Name

bcftools — utilities for variant calling and manipulating VCFs and BCFs.

Synopsis

bcftools [--version|--version-only] [--help] [COMMAND] [OPTIONS]

DESCRIPTION

BCFtools is a set of utilities that manipulate variant calls in the Variant Call Format (VCF) and its binary counterpart BCF. All commands work transparently with both VCFs and BCFs, both uncompressed and BGZF-compressed.

Most commands accept VCF, bgzipped VCF and BCF with filetype detected automatically even when streaming from a pipe. Indexed VCF and BCF @@ -10,7 +10,7 @@ (Note that files with non-standard index names can be accessed as e.g. "bcftools view -r X:2928329 file.vcf.gz##idx##non-standard-index-name".)

BCFtools is designed to work on a stream. It regards an input file "-" as the standard input (stdin) and outputs to the standard output (stdout). Several -commands can thus be combined with Unix pipes.

VERSION

This manual page was last updated 2020-09-22 and refers to bcftools git version 1.11.

BCF1

The BCF1 format output by versions of samtools <= 0.1.19 is not +commands can thus be combined with Unix pipes.

VERSION

This manual page was last updated 2020-11-25 16:08 GMT and refers to bcftools git version 1.11-24-g9718479+.

BCF1

The BCF1 format output by versions of samtools <= 0.1.19 is not compatible with this version of bcftools. To read BCF1 files one can use the view command from old versions of bcftools packaged with samtools versions <= 0.1.19 to convert to VCF, which can then be read by @@ -286,6 +286,14 @@ See also the -l, --merge-logic option.

+-C, --columns-file file +
+ Read the list of columns from a file (normally given via the -c, --columns option). + Use "-" to skip a column of the annotation file. + Give one column name per row; an additional space- or tab-separated field can + be present to indicate the merge logic (normally given via the -l, --merge-logic option). + This is useful when many annotations are added at once. +
-e, --exclude EXPRESSION
exclude sites for which EXPRESSION is true. For valid expressions see @@ -318,11 +326,12 @@
keep sites which do not pass -i and -e expressions instead of discarding them
--l, --merge-logic tag:'first'|append|unique|sum|avg|min|max[,…] +-l, --merge-logic tag:'first'|append|append-missing|unique|sum|avg|min|max[,…]
if multiple regions overlap a single record, the option defines how to treat multiple annotation values when setting tag in the destination file: use the first encountered value ignoring - the rest (first); append allowing duplicates (append); append discarding duplicate values (unique); + the rest (first); append allowing duplicates (append); append even if the appended value is missing, + i.e. is a dot (append-missing); append discarding duplicate values (unique); sum the values (sum, numeric fields only); average the values (avg); use the minimum value (min) or the maximum (max). @@ -356,6 +365,13 @@
see Common Options
+--rename-annots file +
+ rename annotations according to the map in file, with + "old_name new_name\n" pairs separated by whitespace, each on a separate + line. The old name must be prefixed with the annotation type: + INFO, FORMAT, or FILTER. +
--rename-chrs file
rename chromosomes according to the map in file, with @@ -490,7 +506,8 @@
comma-separated list of FORMAT fields to output for each sample. Currently GQ and GP fields are supported. For convenience, the fields can be given - as lower case letters. + as lower case letters. Prefixing a field with "^" requests the + removal of auxiliary tags useful only for calling.
-F, --prior-freqs AN,AC
@@ -510,14 +527,14 @@ # Now before calling, stream the raw mpileup output through `bcftools annotate` to add the frequencies bcftools mpileup [...] -Ou | bcftools annotate -a AFs.tab.gz -h AFs.hdr -c CHROM,POS,REF,ALT,REF_AN,REF_AC -Ou | bcftools call -mv -F REF_AN,REF_AC [...]
--G, --group-samples FILE|- +-G, --group-samples FILE|-
by default, all samples are assumed to come from a single population. This option allows to group samples into populations and apply the HWE assumption within but not across the populations. FILE is a tab-delimited text file with sample names in the first column and group names in the second column. If - is given instead, no HWE assumption is made at all and single-sample calling is performed. (Note that in low coverage data this inflates the rate of false positives.) The -G option requires the presence of - FORMAT/AD generated at the bcftools mpileup step by providing the -a FMT/AD option. + a per-sample FORMAT/QS or FORMAT/AD tag generated with bcftools mpileup -a QS (or -a AD).
-g, --gvcf INT
@@ -1361,7 +1378,8 @@ --n-matches INT
Print only top INT matches for each sample, 0 for unlimited. Use negative value - to sort by HWE probability rather than the number of discordant sites. + to sort by HWE probability rather than the number of discordant sites. Note + that average score is used to determine the top matches, not absolute values.
--no-HWE-prob
diff --git a/doc/bcftools.txt b/doc/bcftools.txt index a0a0d3ac2..5ece9eb8f 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -552,7 +552,8 @@ demand. The original calling model can be invoked with the *-c* option. *-f, --format-fields* 'list':: comma-separated list of FORMAT fields to output for each sample. Currently GQ and GP fields are supported. For convenience, the fields can be given - as lower case letters. + as lower case letters. Prefixed with "^" indicates a request for tag + removal of auxiliary tags useful only for calling. *-F, --prior-freqs* 'AN','AC':: take advantage of prior knowledge of population allele frequencies. The @@ -574,13 +575,13 @@ demand. The original calling model can be invoked with the *-c* option. bcftools mpileup [...] -Ou | bcftools annotate -a AFs.tab.gz -h AFs.hdr -c CHROM,POS,REF,ALT,REF_AN,REF_AC -Ou | bcftools call -mv -F REF_AN,REF_AC [...] ---- -*-G, --group-samples* 'FILE'|'-':: +*-G, --group-samples* [TAG:]'FILE'|'-':: by default, all samples are assumed to come from a single population. This option allows to group samples into populations and apply the HWE assumption within but not across the populations. 'FILE' is a tab-delimited text file with sample names in the first column and group names in the second column. If '-' is given instead, no HWE assumption is made at all and single-sample calling is performed. (Note that in low coverage data this inflates the rate of false positives.) The *-G* option requires the presence of - FORMAT/AD generated at the *bcftools mpileup* step by providing the *-a FMT/AD* option. + per-sample FORMAT/QS or FORMAT/AD tag generated with *bcftools mpileup -a QS* (or *-a AD*). *-g, --gvcf* 'INT':: output also gVCF blocks of homozygous REF calls. The parameter 'INT' is the @@ -1398,7 +1399,8 @@ Without the *-g* option, multi-sample cross-check of samples in 'query.vcf.gz' i *--n-matches* 'INT':: Print only top INT matches for each sample, 0 for unlimited. 
Use negative value - to sort by HWE probability rather than the number of discordant sites. + to sort by HWE probability rather than the number of discordant sites. Note + that average score is used to determine the top matches, not absolute values. *--no-HWE-prob*:: Disable calculation of HWE probability to reduce memory requirements with diff --git a/mcall.c b/mcall.c index c24a5a751..e02157a85 100644 --- a/mcall.c +++ b/mcall.c @@ -1,6 +1,6 @@ /* mcall.c -- multiallelic and rare variant calling. - Copyright (C) 2012-2016 Genome Research Ltd. + Copyright (C) 2012-2020 Genome Research Ltd. Author: Petr Danecek @@ -25,9 +25,11 @@ THE SOFTWARE. */ #include #include #include +#include #include #include #include "call.h" +#include "prob1.h" // Using priors for GTs does not seem to be mathematically justified. Although // it seems effective in removing false calls, it also flips a significant @@ -39,6 +41,7 @@ THE SOFTWARE. */ // genotypes is reported instead. #define FLAT_PDG_FOR_MISSING 0 +int test16(float *anno16, anno16_t *a); void qcall_init(call_t *call) { return; } void qcall_destroy(call_t *call) { return; } @@ -250,60 +253,104 @@ static void init_sample_groups(call_t *call) if ( !call->sample_groups ) { // standard pooled calling, all samples in the same group - grp_t *grps = &call->smpl_grp; - grps->ngrp = 1; - grps->grp = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t)); - grps->smpl2grp = (int*)calloc(nsmpl,sizeof(int)); + call->nsmpl_grp = 1; + call->smpl_grp = (smpl_grp_t*)calloc(1,sizeof(*call->smpl_grp)); + call->smpl_grp[0].nsmpl = nsmpl; + call->smpl_grp[0].smpl = (uint32_t*)calloc(call->smpl_grp[0].nsmpl,sizeof(uint32_t)); + for (i=0; ismpl_grp[0].smpl[i] = i; + return; + } + + // Parse tag (optional) and file name + char *fname = call->sample_groups; + while ( *fname && *fname!=':' ) fname++; + if ( *fname ) + { + call->sample_groups_tag = call->sample_groups; + *fname = 0; + fname++; + + // Is the tag defined in the header? 
+ int tag_id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,call->sample_groups_tag); + if ( tag_id==-1 ) error("No such tag \"%s\"\n",call->sample_groups_tag); + if ( !bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,tag_id) ) error("No such FORMAT tag \"%s\"\n", call->sample_groups_tag); + } + else + { + int tag_id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,"QS"); + if ( tag_id >= 0 && bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,tag_id) ) call->sample_groups_tag = "QS"; + else + { + tag_id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,"AD"); + if ( tag_id >= 0 && bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,tag_id) ) call->sample_groups_tag = "AD"; + else error("Error: neither \"AD\" nor \"QS\" FORMAT tag exists and no alternative given with -G\n"); + } + fname = call->sample_groups; } - else if ( !strcmp("-",call->sample_groups) ) + + // Read samples/groups + if ( !strcmp("-",fname) ) { // single-sample calling, each sample creates its own group - grp_t *grps = &call->smpl_grp; - grps->ngrp = nsmpl; - grps->grp = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t)); - grps->smpl2grp = (int*)malloc(nsmpl*sizeof(int)); - for (i=0; ismpl2grp[i] = i; + call->nsmpl_grp = nsmpl; + call->smpl_grp = (smpl_grp_t*)calloc(nsmpl,sizeof(*call->smpl_grp)); + for (i=0; ismpl_grp[i].nsmpl = 1; + call->smpl_grp[i].smpl = (uint32_t*)calloc(call->smpl_grp[i].nsmpl,sizeof(uint32_t)); + call->smpl_grp[i].smpl[0] = i; + } } else { int nlines; - char **lines = hts_readlist(call->sample_groups, 1, &nlines); - if ( !lines ) error("Could not read the file: %s\n", call->sample_groups); + char **lines = hts_readlist(fname, 1, &nlines); + if ( !lines ) error("Could not read the file: %s\n", fname); - uint32_t *smpl2grp1 = (uint32_t*)calloc(nsmpl,sizeof(uint32_t)); + uint32_t *smpl2grp = (uint32_t*)calloc(nsmpl,sizeof(uint32_t)); + uint32_t *grp2n = (uint32_t*)calloc(nsmpl,sizeof(uint32_t)); void *grp2idx = khash_str2int_init(); - grp_t *grps = &call->smpl_grp; + call->nsmpl_grp = 0; for (i=0; isample_groups,lines[i]); - *ptr = 0; + 
while ( *ptr && !isspace(*ptr) ) ptr++; + if ( !*ptr ) error("Could not parse the line in %s, expected a sample name followed by tab and a population name: %s\n",fname,lines[i]); + char *tmp = ptr; + while ( *ptr && isspace(*ptr) ) ptr++; + if ( !*ptr ) error("Could not parse the line in %s, expected a sample name followed by tab and a population name: %s\n",fname,lines[i]); + *tmp = 0; int ismpl = bcf_hdr_id2int(call->hdr, BCF_DT_SAMPLE, lines[i]); if ( ismpl<0 ) continue; - if ( smpl2grp1[ismpl] ) error("Error: the sample \"%s\" is listed twice in %s\n", lines[i],call->sample_groups); + if ( smpl2grp[ismpl] ) error("Error: the sample \"%s\" is listed twice in %s\n", lines[i],fname); if ( !khash_str2int_has_key(grp2idx,ptr+1) ) { - khash_str2int_inc(grp2idx, ptr+1); - grps->ngrp++; + khash_str2int_set(grp2idx, ptr+1, call->nsmpl_grp); + call->nsmpl_grp++; } - int igrp; - if ( khash_str2int_get(grp2idx, ptr+1, &igrp)==0 ) - smpl2grp1[ismpl] = igrp+1; - else + int igrp = -1; + if ( khash_str2int_get(grp2idx, ptr+1, &igrp)!=0 ) error("This should not happen, fixme: %s\n",ptr+1); + grp2n[igrp]++; + smpl2grp[ismpl] = igrp+1; // +1 to distinguish unlisted samples } khash_str2int_destroy(grp2idx); + if ( !call->nsmpl_grp ) error("Could not parse the file, no matching samples found: %s\n", fname); - grps->grp = (grp1_t*)calloc(grps->ngrp, sizeof(grp1_t)); - grps->smpl2grp = (int*)malloc(nsmpl*sizeof(int)); + call->smpl_grp = (smpl_grp_t*)calloc(call->nsmpl_grp,sizeof(*call->smpl_grp)); for (i=0; ihdr->samples[i],call->sample_groups); - grps->smpl2grp[i] = smpl2grp1[i] - 1; + if ( !smpl2grp[i] ) error("Error: The sample \"%s\" is not listed in %s\n",call->hdr->samples[i],fname); + int igrp = smpl2grp[i] - 1; + if ( !call->smpl_grp[igrp].nsmpl ) + call->smpl_grp[igrp].smpl = (uint32_t*)calloc(grp2n[igrp],sizeof(uint32_t)); + call->smpl_grp[igrp].smpl[call->smpl_grp[igrp].nsmpl] = i; + call->smpl_grp[igrp].nsmpl++; } - free(smpl2grp1); + free(smpl2grp); + free(grp2n); for 
(i=0; ismpl_grp; - for (i=0; ingrp; i++) - free(grps->grp[i].qsum); - free(grps->grp); - free(grps->smpl2grp); + for (i=0; insmpl_grp; i++) + { + free(call->smpl_grp[i].qsum); + free(call->smpl_grp[i].smpl); + } + free(call->smpl_grp); } void mcall_init(call_t *call) { + init_sample_groups(call); call_init_pl2p(call); call->nals_map = 5; @@ -342,15 +391,15 @@ void mcall_init(call_t *call) if ( call->output_tags & CALL_FMT_GQ ) bcf_hdr_append(call->hdr,"##FORMAT="); if ( call->output_tags & CALL_FMT_GP ) - bcf_hdr_append(call->hdr,"##FORMAT="); + bcf_hdr_append(call->hdr,"##FORMAT="); if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) ) call->GQs = (int32_t*) malloc(sizeof(int32_t)*bcf_hdr_nsamples(call->hdr)); - bcf_hdr_append(call->hdr,"##INFO="); - bcf_hdr_append(call->hdr,"##INFO="); bcf_hdr_append(call->hdr,"##INFO="); bcf_hdr_append(call->hdr,"##INFO="); bcf_hdr_append(call->hdr,"##INFO="); bcf_hdr_append(call->hdr,"##INFO="); + if ( call->output_tags & CALL_FMT_PV4 ) + bcf_hdr_append(call->hdr,"##INFO=\n"); // init the prior if ( call->theta>0 ) @@ -373,8 +422,6 @@ void mcall_init(call_t *call) } call->theta = log(call->theta); } - - init_sample_groups(call); } void mcall_destroy(call_t *call) @@ -395,7 +442,6 @@ void mcall_destroy(call_t *call) free(call->pdg); free(call->als); free(call->ac); - free(call->qsum); return; } @@ -506,14 +552,14 @@ void set_pdg(double *pl2p, int *PLs, double *pdg, int n_smpl, int n_gt, int unse } // Create mapping between old and new (trimmed) alleles -void init_allele_trimming_maps(call_t *call, int als, int nals) +void init_allele_trimming_maps(call_t *call, int nals_ori, int als_out) { - int i, j; + int i, j, nout = 0; // als_map: old(i) -> new(j) - for (i=0, j=0; ials_map[i] = j++; + if ( als_out & (1<als_map[i] = nout++; else call->als_map[i] = -1; } @@ -521,85 +567,16 @@ void init_allele_trimming_maps(call_t *call, int als, int nals) // pl_map: new(k) -> old(l) int k = 0, l = 0; - for (i=0; ipl_map[k++] = l; + if ( 
(als_out & (1<pl_map[k++] = l; l++; } } } -double binom_dist(int N, double p, int k) -{ - int mean = (int) (N*p); - if ( mean==k ) return 1.0; - - double log_p = (k-mean)*log(p) + (mean-k)*log(1.0-p); - if ( k > N - k ) k = N - k; - if ( mean > N - mean ) mean = N - mean; - - if ( k < mean ) { int tmp = k; k = mean; mean = tmp; } - double diff = k - mean; - - double val = 1.0; - int i; - for (i=0; i10 && (1-q)*ndiploid>10 ) || ndiploid>200 ) - { - //fprintf(stderr,"out: mean=%e p=%e\n", mean,exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q)))); - return exp(-0.5*(nhets-mean)*(nhets-mean)/(mean*(1-q))); - } - - return binom_dist(ndiploid, q, nhets); -} - -float calc_HOB(int nref, int nalt, int nhets, int ndiploid) -{ - if ( !nref || !nalt || !ndiploid ) return HUGE_VAL; - - double fref = (double)nref/(nref+nalt); // fraction of reference allelels - double falt = (double)nalt/(nref+nalt); // non-ref als - return fabs((double)nhets/ndiploid - 2*fref*falt); -} - -/** - * log(sum_i exp(a_i)) - */ -// static inline double logsumexp(double *vals, int nvals) -// { -// int i; -// double max_exp = vals[0]; -// for (i=1; ihdr); + int nsmpl = grp->nsmpl; int ngts = nals*(nals+1)/2; // Single allele @@ -635,60 +611,45 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als) double lk_tot = 0; int lk_tot_set = 0; int iaa = (ia+1)*(ia+2)/2-1; // index in PL which corresponds to the homozygous "ia/ia" genotype - int isample; - double *pdg = call->pdg + iaa; - for (isample=0; isamplepdg + grp->smpl[ismpl]*ngts + iaa; if ( *pdg ) { lk_tot += log(*pdg); lk_tot_set = 1; } - pdg += ngts; } if ( ia==0 ) ref_lk = lk_tot; // likelihood of 0/0 for all samples else lk_tot += call->theta; // the prior UPDATE_MAX_LKs(1<0 && lk_tot_set); } - grp_t *grps = &call->smpl_grp; - // Two alleles if ( nals>1 ) { for (ia=0; iangrp==1 && grps->grp[0].qsum[ia]==0 ) continue; + if ( grp->qsum[ia]==0 ) continue; int iaa = (ia+1)*(ia+2)/2-1; for (ib=0; ibngrp==1 && grps->grp[0].qsum[ib]==0 ) 
continue; + if ( grp->qsum[ib]==0 ) continue; double lk_tot = 0; int lk_tot_set = 0; - int ia_cov = 0, ib_cov = 0; - for (j=0; jngrp; j++) + double fa = grp->qsum[ia]/(grp->qsum[ia] + grp->qsum[ib]); + double fb = grp->qsum[ib]/(grp->qsum[ia] + grp->qsum[ib]); + double fa2 = fa*fa; + double fb2 = fb*fb; + double fab = 2*fa*fb; + int is, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib; + for (is=0; isgrp[j]; - if ( grp->qsum[ia] ) ia_cov = 1; - if ( grp->qsum[ib] ) ib_cov = 1; - if ( !grp->qsum[ia] && !grp->qsum[ib] ) { grp->dp = 0; continue; } - grp->dp = 1; - grp->fa = grp->qsum[ia]/(grp->qsum[ia]+grp->qsum[ib]); - grp->fb = grp->qsum[ib]/(grp->qsum[ia]+grp->qsum[ib]); - grp->fa2 = grp->fa*grp->fa; - grp->fb2 = grp->fb*grp->fb; - grp->fab = 2*grp->fa*grp->fb; - } - if ( !ia_cov || !ib_cov ) continue; - int isample, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib; - double *pdg = call->pdg; - for (isample=0; isamplegrp[grps->smpl2grp[isample]]; - if ( !grp->dp ) continue; + int ismpl = grp->smpl[is]; + double *pdg = call->pdg + ismpl*ngts; double val = 0; - if ( !call->ploidy || call->ploidy[isample]==2 ) - val = grp->fa2*pdg[iaa] + grp->fb2*pdg[ibb] + grp->fab*pdg[iab]; - else if ( call->ploidy && call->ploidy[isample]==1 ) - val = grp->fa*pdg[iaa] + grp->fb*pdg[ibb]; + if ( !call->ploidy || call->ploidy[ismpl]==2 ) + val = fa2*pdg[iaa] + fb2*pdg[ibb] + fab*pdg[iab]; + else if ( call->ploidy && call->ploidy[ismpl]==1 ) + val = fa*pdg[iaa] + fb*pdg[ibb]; if ( val ) { lk_tot += log(val); lk_tot_set = 1; } - pdg += ngts; } if ( ia!=0 ) lk_tot += call->theta; // the prior if ( ib!=0 ) lk_tot += call->theta; @@ -702,50 +663,38 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als) { for (ia=0; iangrp==1 && grps->grp[0].qsum[ia]==0 ) continue; + if ( grp->qsum[ia]==0 ) continue; int iaa = (ia+1)*(ia+2)/2-1; for (ib=0; ibngrp==1 && grps->grp[0].qsum[ib]==0 ) continue; + if ( grp->qsum[ib]==0 ) continue; int ibb = (ib+1)*(ib+2)/2-1; int iab = iaa - ia + ib; 
for (ic=0; icngrp==1 && grps->grp[0].qsum[ic]==0 ) continue; + if ( grp->qsum[ic]==0 ) continue; double lk_tot = 0; - int lk_tot_set = 1; - int ia_cov = 0, ib_cov = 0, ic_cov = 0; - for (j=0; jngrp; j++) - { - grp1_t *grp = &grps->grp[j]; - if ( grp->qsum[ia] ) ia_cov = 1; - if ( grp->qsum[ib] ) ib_cov = 1; - if ( grp->qsum[ic] ) ic_cov = 1; - if ( !grp->qsum[ia] && !grp->qsum[ib] && !grp->qsum[ic] ) { grp->dp = 0; continue; } - grp->dp = 1; - grp->fa = grp->qsum[ia]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]); - grp->fb = grp->qsum[ib]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]); - grp->fc = grp->qsum[ic]/(grp->qsum[ia]+grp->qsum[ib]+grp->qsum[ic]); - grp->fa2 = grp->fa*grp->fa; - grp->fb2 = grp->fb*grp->fb; - grp->fc2 = grp->fc*grp->fc; - grp->fab = 2*grp->fa*grp->fb, grp->fac = 2*grp->fa*grp->fc, grp->fbc = 2*grp->fb*grp->fc; - } - if ( !ia_cov || !ib_cov || !ic_cov ) continue; - int isample, icc = (ic+1)*(ic+2)/2-1; + int lk_tot_set = 0; + + double fa = grp->qsum[ia]/(grp->qsum[ia] + grp->qsum[ib] + grp->qsum[ic]); + double fb = grp->qsum[ib]/(grp->qsum[ia] + grp->qsum[ib] + grp->qsum[ic]); + double fc = grp->qsum[ic]/(grp->qsum[ia] + grp->qsum[ib] + grp->qsum[ic]); + double fa2 = fa*fa; + double fb2 = fb*fb; + double fc2 = fc*fc; + double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc; + int is, icc = (ic+1)*(ic+2)/2-1; int iac = iaa - ia + ic, ibc = ibb - ib + ic; - double *pdg = call->pdg; - for (isample=0; isamplegrp[grps->smpl2grp[isample]]; - if ( !grp->dp ) continue; + int ismpl = grp->smpl[is]; + double *pdg = call->pdg + ismpl*ngts; double val = 0; - if ( !call->ploidy || call->ploidy[isample]==2 ) - val = grp->fa2*pdg[iaa] + grp->fb2*pdg[ibb] + grp->fc2*pdg[icc] + grp->fab*pdg[iab] + grp->fac*pdg[iac] + grp->fbc*pdg[ibc]; - else if ( call->ploidy && call->ploidy[isample]==1 ) - val = grp->fa*pdg[iaa] + grp->fb*pdg[ibb] + grp->fc*pdg[icc]; + if ( !call->ploidy || call->ploidy[ismpl]==2 ) + val = fa2*pdg[iaa] + fb2*pdg[ibb] + fc2*pdg[icc] + fab*pdg[iab] + 
fac*pdg[iac] + fbc*pdg[ibc]; + else if ( call->ploidy && call->ploidy[ismpl]==1 ) + val = fa*pdg[iaa] + fb*pdg[ibb] + fc*pdg[icc]; if ( val ) { lk_tot += log(val); lk_tot_set = 1; } - pdg += ngts; } if ( ia!=0 ) lk_tot += call->theta; // the prior if ( ib!=0 ) lk_tot += call->theta; // the prior @@ -756,25 +705,26 @@ static int mcall_find_best_alleles(call_t *call, int nals, int *out_als) } } - call->ref_lk = ref_lk; - call->lk_sum = lk_sum; - *out_als = max_als; - int i, n = 0; for (i=0; imax_lk = max_lk; + grp->ref_lk = ref_lk; + grp->lk_sum = lk_sum; + grp->als = max_als; + grp->nals = n; + return n; } -static void mcall_set_ref_genotypes(call_t *call, int nals) +// Sets GT=0/0 or GT=. if PL=0,0,0 +static void mcall_set_ref_genotypes(call_t *call, int nals_ori) { int i; - int ngts = nals*(nals+1)/2; + int ngts = nals_ori*(nals_ori+1)/2; // need this to distinguish between GT=0/0 vs GT=. int nsmpl = bcf_hdr_nsamples(call->hdr); - for (i=0; iac[i] = 0; - call->nhets = 0; - call->ndiploid = 0; + for (i=0; iac[i] = 0; // nals_new<=nals_ori, never mind setting extra 0's // Set all genotypes to 0/0 or 0 int *gts = call->gts; @@ -800,34 +750,27 @@ static void mcall_set_ref_genotypes(call_t *call, int nals) } } -static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als) +static void mcall_call_genotypes(call_t *call, int nals_ori, smpl_grp_t *grp) { int ia, ib, i; - int ngts = nals*(nals+1)/2; - int nsmpl = bcf_hdr_nsamples(call->hdr); - int nout_gts = nout_als*(nout_als+1)/2; - hts_expand(float,nout_gts*nsmpl,call->nGPs,call->GPs); - - for (i=0; iac[i] = 0; - call->nhets = 0; - call->ndiploid = 0; + int ngts_ori = nals_ori*(nals_ori+1)/2; + int ngts_new = call->nals_new*(call->nals_new+1)/2; + int nsmpl = grp->nsmpl; #if USE_PRIOR_FOR_GTS float prior = exp(call->theta); #endif - float *gps = call->GPs - nout_gts; - double *pdg = call->pdg - ngts; - int *gts = call->gts - 2; - int isample; - for (isample = 0; isample < nsmpl; 
isample++) + int is; + for (is = 0; is < nsmpl; is++) { - int ploidy = call->ploidy ? call->ploidy[isample] : 2; - assert( ploidy>=0 && ploidy<=2 ); + int ismpl = grp->smpl[is]; + double *pdg = call->pdg + ismpl*ngts_ori; + float *gps = call->GPs + ismpl*ngts_new; + int *gts = call->gts + ismpl*2; - pdg += ngts; - gts += 2; - gps += nout_gts; + int ploidy = call->ploidy ? call->ploidy[ismpl] : 2; + assert( ploidy>=0 && ploidy<=2 ); if ( !ploidy ) { @@ -839,8 +782,8 @@ static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_a #if !FLAT_PDG_FOR_MISSING // Skip samples with zero depth, they have all pdg's equal to 0 - for (i=0; indiploid++; - // Default fallback for the case all LKs are the same gts[0] = bcf_gt_unphased(0); gts[1] = ploidy==2 ? bcf_gt_unphased(0) : bcf_int32_vector_end; // Non-zero depth, determine the most likely genotype - grp1_t *grp = &call->smpl_grp.grp[call->smpl_grp.smpl2grp[isample]]; double best_lk = 0; - for (ia=0; iaals & 1<qsum[ia]*grp->qsum[ia] : pdg[iaa]*grp->qsum[ia]; #if USE_PRIOR_FOR_GTS if ( ia!=0 ) lk *= prior; @@ -877,13 +817,13 @@ static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_a if ( ploidy==2 ) { gts[1] = gts[0]; - for (ia=0; iaals & 1<als & 1<qsum[ia]*grp->qsum[ib]; #if USE_PRIOR_FOR_GTS @@ -900,7 +840,6 @@ static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_a } } } - if ( gts[0] != gts[1] ) call->nhets++; } else gts[1] = bcf_int32_vector_end; @@ -908,55 +847,50 @@ static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_a call->ac[ bcf_gt_allele(gts[0]) ]++; if ( gts[1]!=bcf_int32_vector_end ) call->ac[ bcf_gt_allele(gts[1]) ]++; } - if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) ) + if ( !(call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP)) ) return; + double max, sum; + for (is=0; isGPs + isample*nout_gts; + int ismpl = grp->smpl[is]; + float *gps = call->GPs + ismpl*ngts_new; - int nmax; - if ( call->ploidy ) - { - if ( 
call->ploidy[isample]==2 ) nmax = nout_gts; - else if ( call->ploidy[isample]==1 ) nmax = nout_als; - else nmax = 0; - } - else nmax = nout_gts; + int nmax; + if ( call->ploidy ) + { + if ( call->ploidy[ismpl]==2 ) nmax = ngts_new; + else if ( call->ploidy[ismpl]==1 ) nmax = grp->nals; + else nmax = 0; + } + else nmax = ngts_new; - max = gps[0]; - if ( max<0 || nmax==0 ) - { - // no call - if ( call->output_tags & CALL_FMT_GP ) - { - for (i=0; iGQs[isample] = 0; - continue; - } - sum = gps[0]; - for (i=1; iGQs[isample] = max<=INT8_MAX ? max : INT8_MAX; + max = gps[0]; + if ( max<0 || nmax==0 ) + { + // no call if ( call->output_tags & CALL_FMT_GP ) { - assert( max ); - for (i=0; iGQs[ismpl] = 0; + continue; + } + sum = gps[0]; + for (i=1; iGQs[ismpl] = max<=INT8_MAX ? max : INT8_MAX; + if ( call->output_tags & CALL_FMT_GP ) + { + assert( max ); + for (i=0; ioutput_tags & CALL_FMT_GP ) - bcf_update_format_float(call->hdr, rec, "GP", call->GPs, nsmpl*nout_gts); - if ( call->output_tags & CALL_FMT_GQ ) - bcf_update_format_int32(call->hdr, rec, "GQ", call->GQs, nsmpl); } @@ -979,12 +913,13 @@ static void mcall_call_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_a Individual qualities are calculated as GQ(F=i,M=j,K=k) = P(F=i,M=j,K=k) / \sum_{x,y} P(F=i,M=x,K=y) */ -static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als) +#if 0 +static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int nals_new, int als_new) { int ia, ib, i; int nsmpl = bcf_hdr_nsamples(call->hdr); int ngts = nals*(nals+1)/2; - int nout_gts = nout_als*(nout_als+1)/2; + int nout_gts = nals_new*(nals_new+1)/2; double *gls = call->GLs - nout_gts; double *pdg = call->pdg - ngts; @@ -1014,7 +949,7 @@ static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int n double best_lk = 0; for (ia=0; iaals_map[ia],call->als_map[ia]); double lk = ploidy==2 ? 
pdg[iaa]*grp->qsum[ia]*grp->qsum[ia] : pdg[iaa]*grp->qsum[ia]; @@ -1030,10 +965,10 @@ static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int n { for (ia=0; iaals_map[ia],call->als_map[ib]); double lk = 2*pdg[iab]*grp->qsum[ia]*grp->qsum[ib]; @@ -1077,8 +1012,8 @@ static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int n for (ifm=0; ifmnfams; ifm++) { family_t *fam = &call->fams[ifm]; - int ntrio = call->ntrio[fam->type][nout_als]; - uint16_t *trio = call->trio[fam->type][nout_als]; + int ntrio = call->ntrio[fam->type][nals_new]; + uint16_t *trio = call->trio[fam->type][nals_new]; // Unconstrained likelihood int uc_itr = 0; @@ -1226,11 +1161,12 @@ static void mcall_call_trio_genotypes(call_t *call, bcf1_t *rec, int nals, int n bcf_update_format_int32(call->hdr,rec,"CGT",call->cgts,nsmpl); } } +#endif -static void mcall_trim_PLs(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als) +static void mcall_trim_and_update_PLs(call_t *call, bcf1_t *rec, int nals_ori, int nals_new) { - int ngts = nals*(nals+1)/2; - int npls_src = ngts, npls_dst = nout_als*(nout_als+1)/2; // number of PL values in diploid samples, ori and new + int npls_src = nals_ori*(nals_ori+1)/2; + int npls_dst = nals_new*(nals_new+1)/2; // number of PL values in diploid samples, ori and new if ( call->all_diploid && npls_src == npls_dst ) return; int *pls_src = call->PLs, *pls_dst = call->PLs; @@ -1247,7 +1183,7 @@ static void mcall_trim_PLs(call_t *call, bcf1_t *rec, int nals, int nout_als, in } else if ( ploidy==1 ) { - for (ia=0; iapl_map[isrc] ]; @@ -1257,7 +1193,7 @@ static void mcall_trim_PLs(call_t *call, bcf1_t *rec, int nals, int nout_als, in else { pls_dst[0] = bcf_int32_missing; - pls_dst[1] = bcf_int32_vector_end; // relying on nout_als>1 in mcall() + pls_dst[1] = bcf_int32_vector_end; // relying on nals_new>1 in mcall() } pls_src += npls_src; pls_dst += npls_dst; @@ -1265,9 +1201,9 @@ static void mcall_trim_PLs(call_t *call, bcf1_t 
*rec, int nals, int nout_als, in bcf_update_format_int32(call->hdr, rec, "PL", call->PLs, npls_dst*nsmpl); } -void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als) +void mcall_trim_and_update_numberR(call_t *call, bcf1_t *rec, int nals_ori, int nals_new) { - if ( nals==nout_als ) return; + if ( nals_ori==nals_new ) return; int i,j, nret, size = sizeof(float); @@ -1286,17 +1222,17 @@ void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int o nret = bcf_get_info_values(call->hdr, rec, key, &tmp_ori, &ntmp_ori, type); if ( nret<=0 ) continue; - if ( nout_als==1 ) + if ( nals_new==1 ) bcf_update_info_int32(call->hdr, rec, key, tmp_ori, 1); // has to be the REF, the order could not change else { - for (j=0; jals_map[j]; if ( k==-1 ) continue; // to be dropped memcpy((char *)tmp_new+size*k, (char *)tmp_ori+size*j, size); } - bcf_update_info_int32(call->hdr, rec, key, tmp_new, nout_als); + bcf_update_info_int32(call->hdr, rec, key, tmp_new, nals_new); } } @@ -1313,21 +1249,21 @@ void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int o if (nret<=0) continue; int nsmpl = bcf_hdr_nsamples(call->hdr); - assert( nret==nals*nsmpl ); + assert( nret==nals_ori*nsmpl ); for (j=0; jals_map[k]; if ( l==-1 ) continue; // to be dropped memcpy(ptr_dst+size*l, ptr_src+size*k, size); } } - bcf_update_format_int32(call->hdr, rec, key, tmp_new, nout_als*nsmpl); + bcf_update_format_int32(call->hdr, rec, key, tmp_new, nals_new*nsmpl); } call->PLs = (int32_t*) tmp_new; @@ -1442,12 +1378,12 @@ static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen) } bcf_update_format_int32(call->hdr, rec, "PL", call->itmp, npls_new*nsmpl); - // update QS - int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp.grp[0].qsum, &call->smpl_grp.grp[0].nqsum); - hts_expand(float,nals,call->nqsum,call->qsum); + // update QS, use temporarily call->GPs to store the values + int nqs = 
bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp[0].qsum, &call->smpl_grp[0].nqsum); + hts_expand(float,nals,call->nGPs,call->GPs); for (i=0; iqsum[i] = call->als_map[i]smpl_grp.grp[0].qsum[call->als_map[i]] : 0; - bcf_update_info_float(call->hdr, rec, "QS", call->qsum, nals); + call->GPs[i] = call->als_map[i]smpl_grp[0].qsum[call->als_map[i]] : 0; + bcf_update_info_float(call->hdr, rec, "QS", call->GPs, nals); // update any Number=R tags void *tmp_ori = call->itmp, *tmp_new = call->PLs; // reusing PLs storage which is not used at this point @@ -1488,7 +1424,6 @@ static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen) call->itmp = (int32_t*) tmp_ori; call->n_itmp = ntmp_ori; - if ( *unseen ) *unseen = nals-1; return 0; } @@ -1507,203 +1442,229 @@ int mcall(call_t *call, bcf1_t *rec) // Force alleles when calling genotypes given alleles was requested if ( call->flag & CALL_CONSTR_ALLELES && mcall_constrain_alleles(call, rec, &unseen)!=0 ) return -2; - int nsmpl = bcf_hdr_nsamples(call->hdr); - int nals = rec->n_allele; - hts_expand(int,nals,call->nac,call->ac); - hts_expand(int,nals,call->nals_map,call->als_map); - hts_expand(int,nals*(nals+1)/2,call->npl_map,call->pl_map); + int nsmpl = bcf_hdr_nsamples(call->hdr); + int nals_ori = rec->n_allele; + hts_expand(int,nals_ori,call->nac,call->ac); + hts_expand(int,nals_ori,call->nals_map,call->als_map); + hts_expand(int,nals_ori*(nals_ori+1)/2,call->npl_map,call->pl_map); // Get the genotype likelihoods call->nPLs = bcf_get_format_int32(call->hdr, rec, "PL", &call->PLs, &call->mPLs); - if ( call->nPLs!=nsmpl*nals*(nals+1)/2 && call->nPLs!=nsmpl*nals ) // a mixture of diploid and haploid or haploid only - error("Wrong number of PL fields? nals=%d npl=%d\n", nals,call->nPLs); + if ( call->nPLs!=nsmpl*nals_ori*(nals_ori+1)/2 && call->nPLs!=nsmpl*nals_ori ) // a mixture of diploid and haploid or haploid only + error("Wrong number of PL fields? 
nals=%d npl=%d\n", nals_ori,call->nPLs); // Convert PLs to probabilities - int ngts = nals*(nals+1)/2; + int ngts_ori = nals_ori*(nals_ori+1)/2; hts_expand(double, call->nPLs, call->npdg, call->pdg); - set_pdg(call->pl2p, call->PLs, call->pdg, nsmpl, ngts, unseen); + set_pdg(call->pl2p, call->PLs, call->pdg, nsmpl, ngts_ori, unseen); // Get sum of qualities, serves as an AF estimate, f_x = QS/N in Eq. 1 in call-m math notes. - if ( call->smpl_grp.ngrp == 1 ) + if ( call->nsmpl_grp == 1 ) { - int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp.grp[0].qsum, &call->smpl_grp.grp[0].nqsum); + int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->smpl_grp[0].qsum, &call->smpl_grp[0].nqsum); if ( nqs<=0 ) error("The QS annotation not present at %s:%d\n", bcf_seqname(call->hdr,rec),rec->pos+1); - if ( nqs < nals ) + if ( nqs < nals_ori ) { // Some of the listed alleles do not have the corresponding QS field. This is // typically ref-only site with <*> in ALT. - hts_expand(float,nals,call->smpl_grp.grp[0].nqsum,call->smpl_grp.grp[0].qsum); - for (i=nqs; ismpl_grp.grp[0].qsum[i] = 0; + hts_expand(float,nals_ori,call->smpl_grp[0].nqsum,call->smpl_grp[0].qsum); + for (i=nqs; ismpl_grp[0].qsum[i] = 0; } } else { - for (j=0; jsmpl_grp.ngrp; j++) + for (j=0; jnsmpl_grp; j++) { - hts_expand(float,nals,call->smpl_grp.grp[j].nqsum,call->smpl_grp.grp[j].qsum); - memset(call->smpl_grp.grp[j].qsum, 0, sizeof(float)*nals); + hts_expand(float,nals_ori,call->smpl_grp[j].nqsum,call->smpl_grp[j].qsum); + memset(call->smpl_grp[j].qsum, 0, sizeof(float)*nals_ori); } - int nad = bcf_get_format_int32(call->hdr, rec, "AD", &call->ADs, &call->nADs); - if ( nad<1 ) error("Error: FORMAT/AD is required with the -G option, mpileup must be run with -a AD\n"); + // Use FORMAT/AD or FORMAT/QS + int nad = bcf_get_format_int32(call->hdr, rec, call->sample_groups_tag, &call->ADs, &call->nADs); + if ( nad<1 ) error("Error: FORMAT/%s is required with the -G option, mpileup must be run with 
\"-a AD\" or \"-a QS\"\n",call->sample_groups_tag); nad /= bcf_hdr_nsamples(call->hdr); - hts_expand(float,nals,call->nqsum,call->qsum); - float qsum = 0; - for (i=0; ihdr); i++) + for (i=0; insmpl_grp; i++) { - int32_t *ptr = call->ADs + i*nad; - for (j=0; jsmpl_grp[i]; + hts_expand(float,nals_ori,grp->nqsum,grp->qsum); + for (j=0; jqsum[j] = 0; + for (is=0; isnsmpl; is++) { - if ( ptr[j]==bcf_int32_vector_end ) break; - if ( ptr[j]==bcf_int32_missing ) call->qsum[j] = 0; - else { call->qsum[j] = ptr[j]; qsum += ptr[j]; } + int ismpl = grp->smpl[is]; + int32_t *ptr = call->ADs + ismpl*nad; + float sum = 0; + for (j=0; jqsum[j] += ptr[j]/sum; + } + } } - for (; jqsum[j] = 0; - if ( qsum ) - for (j=0; jqsum[j] /= qsum; - - grp1_t *grp = &call->smpl_grp.grp[call->smpl_grp.smpl2grp[i]]; - for (j=0; jqsum[j] += call->qsum[j]; } } // If available, take into account reference panel AFs if ( call->prior_AN && bcf_get_info_int32(call->hdr, rec, call->prior_AN ,&call->ac, &call->nac)==1 ) { - int an = call->ac[0]; - if ( bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals-1 ) + int an = call->ac[0]; // number of alleles total, procede only if not zero; reuse call->ac + if ( an > 0 && bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals_ori-1 ) // number of ALT alleles { - int ac0 = an; // number of alleles in the reference population - for (i=0; iac[i]==bcf_int32_vector_end ) break; if ( call->ac[i]==bcf_int32_missing ) continue; ac0 -= call->ac[i]; - for (j=0; jsmpl_grp.ngrp; j++) - call->smpl_grp.grp[j].qsum[i+1] += call->ac[i]*0.5; + + // here an*0.5 is the number of samples in the populatio and ac*0.5 is the AF weighted by the number of samples + for (j=0; jnsmpl_grp; j++) + call->smpl_grp[j].qsum[i+1] = (call->smpl_grp[j].qsum[i+1] + 0.5*call->ac[i]) / (call->smpl_grp[j].nsmpl + 0.5*an); } if ( ac0<0 ) error("Incorrect %s,%s values at %s:%d\n", call->prior_AN,call->prior_AC,bcf_seqname(call->hdr,rec),rec->pos+1); - 
for (j=0; jsmpl_grp.ngrp; j++) - call->smpl_grp.grp[j].qsum[0] += ac0*0.5; - for (i=0; ismpl_grp.ngrp; j++) - call->smpl_grp.grp[j].qsum[i] /= nsmpl + 0.5*an; - } + for (j=0; jnsmpl_grp; j++) + call->smpl_grp[j].qsum[0] = (call->smpl_grp[j].qsum[0] + 0.5*ac0) / (call->smpl_grp[j].nsmpl + 0.5*an); } } - for (j=0; jsmpl_grp.ngrp; j++) + // normalize so that QS sums to 1 for each group + for (j=0; jnsmpl_grp; j++) { - float qsum_tot = 0; - for (i=0; ismpl_grp.grp[j].qsum[i]; - if ( qsum_tot ) for (i=0; ismpl_grp.grp[j].qsum[i] /= qsum_tot; + float sum = 0; + for (i=0; ismpl_grp[j].qsum[i]; + if ( sum ) for (i=0; ismpl_grp[j].qsum[i] /= sum; } bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag - // Find the best combination of alleles - int out_als, nout; - if ( nals > 8*sizeof(out_als) ) + if ( nals_ori > 8*sizeof(call->als_new) ) { fprintf(stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1); return 0; } - nout = mcall_find_best_alleles(call, nals, &out_als); - // Make sure the REF allele is always present - if ( !(out_als&1) ) + // For each group find the best combination of alleles + call->als_new = 0; + double ref_lk = -HUGE_VAL, lk_sum = -HUGE_VAL, max_qual = -HUGE_VAL; + for (j=0; jnsmpl_grp; j++) { - out_als |= 1; - nout++; + smpl_grp_t *grp = &call->smpl_grp[j]; + mcall_find_best_alleles(call, nals_ori, grp); + call->als_new |= grp->als; + if ( grp->max_lk==-HUGE_VAL ) continue; + double qual = -4.343*(grp->ref_lk - logsumexp2(grp->lk_sum,grp->ref_lk)); + if ( max_qual < qual ) + { + max_qual = qual; + lk_sum = grp->lk_sum; + ref_lk = grp->ref_lk; + } } - int is_variant = out_als==1 ? 0 : 1; + + // Make sure the REF allele is always present + if ( !(call->als_new&1) ) call->als_new |= 1; + + int is_variant = call->als_new==1 ? 
0 : 1; if ( call->flag & CALL_VARONLY && !is_variant ) return 0; - // With -A, keep all ALTs except X - if ( call->flag & CALL_KEEPALT ) + call->nals_new = 0; + for (i=0; i0 && i==unseen ) continue; - out_als |= 1<0 && i==unseen ) continue; + if ( call->flag & CALL_KEEPALT ) call->als_new |= 1<als_new & (1<nals_new++; } + init_allele_trimming_maps(call,nals_ori,call->als_new); + int nAC = 0; - if ( out_als==1 ) // only REF allele on output + if ( call->als_new==1 ) // only REF allele on output { - init_allele_trimming_maps(call, 1, nals); - mcall_set_ref_genotypes(call,nals); + mcall_set_ref_genotypes(call,nals_ori); bcf_update_format_int32(call->hdr, rec, "PL", NULL, 0); // remove PL, useless now } + else if ( !is_variant ) + { + mcall_set_ref_genotypes(call,nals_ori); // running with -A, prevent mcall_call_genotypes from putting some ALT back + mcall_trim_and_update_PLs(call, rec, nals_ori, call->nals_new); + } else { // The most likely set of alleles includes non-reference allele (or was enforced), call genotypes. // Note that it is a valid outcome if the called genotypes exclude some of the ALTs. 
- init_allele_trimming_maps(call, out_als, nals); - if ( !is_variant ) - mcall_set_ref_genotypes(call,nals); // running with -A, prevent mcall_call_genotypes from putting some ALT back - else if ( call->flag & CALL_CONSTR_TRIO ) + int ngts_new = call->nals_new*(call->nals_new+1)/2; + hts_expand(float,ngts_new*nsmpl,call->nGPs,call->GPs); + for (i=0; inals_new; i++) call->ac[i] = 0; + + if ( call->flag & CALL_CONSTR_TRIO && call->nals_new>4 ) + { + fprintf(stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1); + return 0; + } + if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) ) { - if ( nout>4 ) - { - fprintf(stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1); - return 0; - } - mcall_call_trio_genotypes(call, rec, nals,nout,out_als); + memset(call->GPs,0,nsmpl*ngts_new*sizeof(*call->GPs)); + memset(call->GQs,0,nsmpl*sizeof(*call->GQs)); + } + for (i=0; insmpl_grp; i++) + { + if ( call->flag & CALL_CONSTR_TRIO ) + error("todo: constrained trio calling temporarily disabled\n"); //mcall_call_trio_genotypes(call,rec,nals,&call->smpl_grp[i]); + else + mcall_call_genotypes(call,nals_ori,&call->smpl_grp[i]); } - else - mcall_call_genotypes(call,rec,nals,nout,out_als); // Skip the site if all samples are 0/0. This can happen occasionally. 
- nAC = 0; - for (i=1; iac[i]; + for (i=1; inals_new; i++) nAC += call->ac[i]; if ( !nAC && call->flag & CALL_VARONLY ) return 0; - mcall_trim_PLs(call, rec, nals, nout, out_als); + + if ( call->output_tags & CALL_FMT_GP ) + bcf_update_format_float(call->hdr, rec, "GP", call->GPs, nsmpl*ngts_new); + if ( call->output_tags & CALL_FMT_GQ ) + bcf_update_format_int32(call->hdr, rec, "GQ", call->GQs, nsmpl); + + mcall_trim_and_update_PLs(call,rec,nals_ori,call->nals_new); } - if ( nals!=nout ) mcall_trim_numberR(call, rec, nals, nout, out_als); + if ( nals_ori!=call->nals_new ) + mcall_trim_and_update_numberR(call,rec,nals_ori,call->nals_new); - // Set QUAL and calculate HWE-related annotations + // Set QUAL if ( nAC ) { - float icb = calc_ICB(call->ac[0],nAC, call->nhets, call->ndiploid); - if ( icb != HUGE_VAL ) bcf_update_info_float(call->hdr, rec, "ICB", &icb, 1); - - float hob = calc_HOB(call->ac[0],nAC, call->nhets, call->ndiploid); - if ( hob != HUGE_VAL ) bcf_update_info_float(call->hdr, rec, "HOB", &hob, 1); - // Quality of a variant site. fabs() to avoid negative zeros in VCF output when CALL_KEEPALT is set - rec->qual = -4.343*(call->ref_lk - logsumexp2(call->lk_sum,call->ref_lk)); + rec->qual = max_qual; } else { // Set the quality of a REF site - if ( call->lk_sum==-HUGE_VAL ) // no support from (high quality) reads, so QUAL=1-prior + if ( lk_sum!=-HUGE_VAL ) // no support from (high quality) reads, so QUAL=1-prior + rec->qual = -4.343*(lk_sum - logsumexp2(lk_sum,ref_lk)); + else if ( call->ac[0] ) rec->qual = call->theta ? 
-4.343*call->theta : 0; else - rec->qual = -4.343*(call->lk_sum - logsumexp2(call->lk_sum,call->ref_lk)); + bcf_float_set_missing(rec->qual); } - if ( rec->qual>999 ) rec->qual = 999; - if ( rec->qual>50 ) rec->qual = rint(rec->qual); - // AC, AN - if ( nout>1 ) bcf_update_info_int32(call->hdr, rec, "AC", call->ac+1, nout-1); + if ( call->nals_new>1 ) bcf_update_info_int32(call->hdr, rec, "AC", call->ac+1, call->nals_new-1); nAC += call->ac[0]; bcf_update_info_int32(call->hdr, rec, "AN", &nAC, 1); // Remove unused alleles - hts_expand(char*,nout,call->nals,call->als); - for (i=0; inals_new,call->nals,call->als); + for (i=0; ials_map[i]>=0 ) call->als[call->als_map[i]] = rec->d.allele[i]; - bcf_update_alleles(call->hdr, rec, (const char**)call->als, nout); + bcf_update_alleles(call->hdr, rec, (const char**)call->als, call->nals_new); bcf_update_genotypes(call->hdr, rec, call->gts, nsmpl*2); - // DP4 tag + // DP4 and PV4 tags if ( bcf_get_info_float(call->hdr, rec, "I16", &call->anno16, &call->n16)==16 ) { int32_t dp[4]; dp[0] = call->anno16[0]; dp[1] = call->anno16[1]; dp[2] = call->anno16[2]; dp[3] = call->anno16[3]; @@ -1711,10 +1672,22 @@ int mcall(call_t *call, bcf1_t *rec) int32_t mq = (call->anno16[8]+call->anno16[10])/(call->anno16[0]+call->anno16[1]+call->anno16[2]+call->anno16[3]); bcf_update_info_int32(call->hdr, rec, "MQ", &mq, 1); + + if ( call->output_tags & CALL_FMT_PV4 ) + { + anno16_t a; + float tmpf[4]; + int is_tested = test16(call->anno16, &a) >= 0 && a.is_tested ? 
1 : 0; + if ( is_tested ) + { + for (i=0; i<4; i++) tmpf[i] = a.p[i]; + bcf_update_info_float(call->hdr, rec, "PV4", tmpf, 4); + } + } } bcf_update_info_int32(call->hdr, rec, "I16", NULL, 0); // remove I16 tag - return nout; + return call->nals_new; } diff --git a/test/call-G.1.out b/test/call-G.1.out new file mode 100644 index 000000000..171313184 --- /dev/null +++ b/test/call-G.1.out @@ -0,0 +1,17 @@ +##fileformat=VCFv4.2 +##FILTER= +##reference=test.fna +##contig= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT smpl1 smpl2 smpl3 smpl4 smpl5 smpl6 smpl7 smpl8 smpl9 smpl10 smpl11 smpl12 smpl13 smpl14 smpl15 smpl16 smpl17 smpl18 smpl19 smpl20 smpl21 smpl22 smpl23 smpl24 smpl25 smpl26 culprit smpl28 smpl29 smpl30 smpl31 smpl32 smpl33 +1 378 . G A,T 461.915 . DP=1418;AN_POP=1000;AC_POP=500,0,0;AC=5,5;AN=48;DP4=675,356,36,17;MQ=57 GT:PL:AD:QS 0/0:0,3,60,3,60,60:1,0,0:60,0,0 0/0:0,33,255,33,255,255:11,0,0:425,0,0 0/2:98,119,255,0,173,155:7,0,6:224,0,175 0/0:0,12,155,12,155,155:4,0,0:174,0,0 0/0:0,24,244,24,244,244:8,0,0:324,0,0 ./.:0,0,0,0,0,0:0,0,0:0,0,0 ./.:0,0,0,0,0,0:0,0,0:0,0,0 ./.:0,0,0,0,0,0:0,0,0:0,0,0 ./.:0,0,0,0,0,0:0,0,0:0,0,0 ./.:0,0,0,0,0,0:0,0,0:0,0,0 ./.:0,0,0,0,0,0:0,0,0:0,0,0 0/2:37,37,37,3,3,0:0,0,1:0,0,37 0/0:0,15,161,15,161,161:5,0,0:187,0,0 0/0:0,15,192,15,192,192:5,0,0:219,0,0 0/0:0,6,94,6,94,94:2,0,0:101,0,0 ./.:0,0,0,0,0,0:0,0,0:0,0,0 0/0:0,27,255,27,255,255:9,0,0:399,0,0 0/1:31,0,19,34,22,53:1,1,0:25,37,0 0/0:0,18,165,18,165,165:6,0,0:222,0,0 ./.:0,0,0,0,0,0:0,0,0:0,0,0 0/1:49,3,0,49,3,49:0,1,0:0,49,0 ./.:0,0,0,0,0,0:0,0,0:0,0,0 0/0:0,3,41,3,41,41:1,0,0:41,0,0 0/0:0,39,255,39,255,255:13,0,0:502,0,0 0/0:11,0,227,38,230,255:9,1,0:399,41,0 0/0:0,3,41,3,41,41:1,0,0:41,0,0 1/2:218,98,86,135,0,126:0,4,3:0,160,112 0/0:0,21,205,21,205,205:7,0,0:250,0,0 0/0:0,18,200,18,200,200:6,0,0:261,0,0 0/2:82,91,200,0,119,113:3,0,2:133,0,97 
0/0:0,9,137,9,137,137:3,0,0:161,0,0 0/1:73,38,35,73,38,73:0,1,0:0,41,0 1/2:107,19,47,57,0,88:1,3,1:41,123,60 diff --git a/test/call-G.2.1.out b/test/call-G.2.1.out new file mode 100644 index 000000000..1c5a720f7 --- /dev/null +++ b/test/call-G.2.1.out @@ -0,0 +1,27 @@ +##fileformat=VCFv4.2 +##FILTER= +##reference=file://hwe.fa +##contig= +##ALT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA07000 NA07056 NA12046 NA12144 NA12156 NA12234 NA12249 NA12282 NA12283 NA12286 NA12340 NA12341 NA12342 NA12399 NA12414 NA12546 NA12717 NA12718 NA12748 NA12749 NA12750 NA12776 NA12828 NA12843 NA18502 NA18867 NA18908 NA18933 NA18934 NA19108 NA19117 NA19119 NA19129 NA19131 NA19138 NA19147 NA19149 NA19159 NA19185 NA19189 NA19197 NA19198 NA19206 NA19213 NA19214 NA19223 NA19235 NA19236 NA19238 NA19239 NA19248 NA19256 +15 201 . A G 858.4 . 
DP=80;AN_POP=1000;AC_POP=1000,0;VDB=0.895839;SGB=-4.94122;RPB=0.999937;MQB=0.979662;MQSB=0.974642;BQB=0.739382;MQ0F=0;AC=79;AN=104;DP4=19,17,21,15;MQ=59 GT:PL:AD 0/1:0,3,36:1,0 0/1:0,6,93:2,0 0/1:0,3,33:1,0 0/1:0,3,32:1,0 0/1:0,3,36:1,0 0/1:0,3,32:1,0 0/1:0,3,40:1,0 0/1:0,6,79:2,0 0/1:0,3,42:1,0 0/1:0,6,64:2,0 0/1:0,6,67:2,0 0/1:0,6,66:2,0 0/1:0,9,96:3,0 0/1:0,3,32:1,0 0/1:0,3,36:1,0 0/1:0,3,31:1,0 0/1:0,3,44:1,0 0/1:0,3,44:1,0 0/1:0,6,61:2,0 0/1:0,3,60:1,0 0/1:0,6,60:2,0 0/1:0,6,57:2,0 0/1:0,3,31:1,0 0/1:0,6,68:2,0 0/1:0,3,60:1,0 1/1:39,3,0:0,1 1/1:31,3,0:0,1 1/1:37,3,0:0,1 1/1:60,3,0:0,1 1/1:38,3,0:0,1 1/1:36,3,0:0,1 1/1:37,3,0:0,1 1/1:85,9,0:0,3 1/1:39,3,0:0,1 1/1:31,3,0:0,1 1/1:63,6,0:0,2 1/1:29,3,0:0,1 1/1:32,3,0:0,1 1/1:43,3,0:0,1 1/1:27,3,0:0,1 1/1:43,3,0:0,1 1/1:34,3,0:0,1 1/1:46,3,0:0,1 1/1:144,12,0:0,4 1/1:59,3,0:0,1 1/1:16,3,0:0,1 1/1:37,3,0:0,1 1/1:57,6,0:0,2 1/1:91,6,0:0,2 1/1:21,3,0:0,1 1/1:52,6,0:0,2 1/1:26,3,0:0,1 diff --git a/test/call-G.2.out b/test/call-G.2.out new file mode 100644 index 000000000..8709e0515 --- /dev/null +++ b/test/call-G.2.out @@ -0,0 +1,17 @@ +##fileformat=VCFv4.2 +##FILTER= +##reference=test.fna +##contig= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT smpl1 smpl2 smpl3 smpl4 smpl5 smpl6 smpl7 smpl8 smpl9 smpl10 smpl11 smpl12 smpl13 smpl14 smpl15 smpl16 smpl17 smpl18 smpl19 smpl20 smpl21 smpl22 smpl23 smpl24 smpl25 smpl26 culprit smpl28 smpl29 smpl30 smpl31 smpl32 smpl33 +1 378 . G A,T,C 169.281 . 
DP=1418;AN_POP=1000;AC_POP=500,0,0;AC=6,5,1;AN=48;DP4=675,356,36,17;MQ=57 GT:PL:AD:QS 0/0:0,3,60,3,60,60,3,60,60,60:1,0,0,0:60,0,0,0 0/0:0,33,255,33,255,255,33,255,255,255:11,0,0,0:425,0,0,0 0/2:98,119,255,0,173,155,119,255,173,255:7,0,6,0:224,0,175,0 0/0:0,12,155,12,155,155,12,155,155,155:4,0,0,0:174,0,0,0 0/0:0,24,244,24,244,244,24,244,244,244:8,0,0,0:324,0,0,0 ./.:0,0,0,0,0,0,0,0,0,0:0,0,0,0:0,0,0,0 ./.:0,0,0,0,0,0,0,0,0,0:0,0,0,0:0,0,0,0 ./.:0,0,0,0,0,0,0,0,0,0:0,0,0,0:0,0,0,0 ./.:0,0,0,0,0,0,0,0,0,0:0,0,0,0:0,0,0,0 ./.:0,0,0,0,0,0,0,0,0,0:0,0,0,0:0,0,0,0 ./.:0,0,0,0,0,0,0,0,0,0:0,0,0,0:0,0,0,0 2/2:37,37,37,3,3,0,37,37,3,37:0,0,1,0:0,0,37,0 0/0:0,15,161,15,161,161,15,161,161,161:5,0,0,0:187,0,0,0 0/0:0,15,192,15,192,192,15,192,192,192:5,0,0,0:219,0,0,0 0/0:0,6,94,6,94,94,6,94,94,94:2,0,0,0:101,0,0,0 ./.:0,0,0,0,0,0,0,0,0,0:0,0,0,0:0,0,0,0 0/0:0,27,255,27,255,255,27,255,255,255:9,0,0,0:399,0,0,0 0/1:31,0,19,34,22,53,34,22,53,53:1,1,0,0:25,37,0,0 0/0:0,18,165,18,165,165,18,165,165,165:6,0,0,0:222,0,0,0 ./.:0,0,0,0,0,0,0,0,0,0:0,0,0,0:0,0,0,0 1/1:49,3,0,49,3,49,49,3,49,49:0,1,0,0:0,49,0,0 ./.:0,0,0,0,0,0,0,0,0,0:0,0,0,0:0,0,0,0 0/0:0,3,41,3,41,41,3,41,41,41:1,0,0,0:41,0,0,0 0/0:0,39,255,39,255,255,39,255,255,255:13,0,0,0:502,0,0,0 0/0:11,0,227,38,230,255,38,230,255,255:9,1,0,0:399,41,0,0 0/0:0,3,41,3,41,41,3,41,41,41:1,0,0,0:41,0,0,0 1/2:218,98,86,135,0,126,218,98,135,218:0,4,3,0:0,160,112,0 0/0:0,21,205,21,205,205,21,205,205,205:7,0,0,0:250,0,0,0 0/0:0,18,200,18,200,200,18,200,200,200:6,0,0,0:261,0,0,0 0/2:82,91,200,0,119,113,91,200,119,200:3,0,2,0:133,0,97,0 0/0:0,9,137,9,137,137,9,137,137,137:3,0,0,0:161,0,0,0 1/3:73,38,35,73,38,73,38,0,38,35:0,1,0,1:0,41,0,41 0/1:107,19,47,57,0,88,110,56,91,141:1,3,1,0:41,123,60,0 diff --git a/test/call-G.2.vcf b/test/call-G.2.vcf new file mode 100644 index 000000000..531e3bd23 --- /dev/null +++ b/test/call-G.2.vcf @@ -0,0 +1,24 @@ +##fileformat=VCFv4.2 +##FILTER= +##reference=file://hwe.fa +##contig= +##ALT= +##INFO= +##INFO= 
+##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA07000 NA07056 NA12046 NA12144 NA12156 NA12234 NA12249 NA12282 NA12283 NA12286 NA12340 NA12341 NA12342 NA12399 NA12414 NA12546 NA12717 NA12718 NA12748 NA12749 NA12750 NA12776 NA12828 NA12843 NA18502 NA18867 NA18908 NA18933 NA18934 NA19108 NA19117 NA19119 NA19129 NA19131 NA19138 NA19147 NA19149 NA19159 NA19185 NA19189 NA19197 NA19198 NA19206 NA19213 NA19214 NA19223 NA19235 NA19236 NA19238 NA19239 NA19248 NA19256 +15 201 . A G,<*> 0 . DP=80;AN_POP=1000;AC_POP=1000,0;I16=19,17,21,15,1380,57328,1308,52148,2160,129600,2137,127369,613,12599,514,10028;QS=25,27,0;VDB=0.895839;SGB=-4.94122;RPB=0.999937;MQB=0.979662;MQSB=0.974642;BQB=0.739382;MQ0F=0 PL:AD 0,3,36,3,36,36:1,0,0 0,6,93,6,93,93:2,0,0 0,3,33,3,33,33:1,0,0 0,3,32,3,32,32:1,0,0 0,3,36,3,36,36:1,0,0 0,3,32,3,32,32:1,0,0 0,3,40,3,40,40:1,0,0 0,6,79,6,79,79:2,0,0 0,3,42,3,42,42:1,0,0 0,6,64,6,64,64:2,0,0 0,6,67,6,67,67:2,0,0 0,6,66,6,66,66:2,0,0 0,9,96,9,96,96:3,0,0 0,3,32,3,32,32:1,0,0 0,3,36,3,36,36:1,0,0 0,3,31,3,31,31:1,0,0 0,3,44,3,44,44:1,0,0 0,3,44,3,44,44:1,0,0 0,6,61,6,61,61:2,0,0 0,3,60,3,60,60:1,0,0 0,6,60,6,60,60:2,0,0 0,6,57,6,57,57:2,0,0 0,3,31,3,31,31:1,0,0 0,6,68,6,68,68:2,0,0 0,3,60,3,60,60:1,0,0 39,3,0,39,3,39:0,1,0 31,3,0,31,3,31:0,1,0 37,3,0,37,3,37:0,1,0 60,3,0,60,3,60:0,1,0 38,3,0,38,3,38:0,1,0 36,3,0,36,3,36:0,1,0 37,3,0,37,3,37:0,1,0 85,9,0,85,9,85:0,3,0 39,3,0,39,3,39:0,1,0 31,3,0,31,3,31:0,1,0 63,6,0,63,6,63:0,2,0 29,3,0,29,3,29:0,1,0 32,3,0,32,3,32:0,1,0 43,3,0,43,3,43:0,1,0 27,3,0,27,3,27:0,1,0 43,3,0,43,3,43:0,1,0 34,3,0,34,3,34:0,1,0 46,3,0,46,3,46:0,1,0 144,12,0,144,12,144:0,4,0 59,3,0,59,3,59:0,1,0 16,3,0,16,3,16:0,1,0 37,3,0,37,3,37:0,1,0 57,6,0,57,6,57:0,2,0 91,6,0,91,6,91:0,2,0 21,3,0,21,3,21:0,1,0 52,6,0,52,6,52:0,2,0 26,3,0,26,3,26:0,1,0 diff --git a/test/call-G.vcf b/test/call-G.vcf new file mode 
100644 index 000000000..d90044735 --- /dev/null +++ b/test/call-G.vcf @@ -0,0 +1,13 @@ +##fileformat=VCFv4.2 +##reference=test.fna +##contig= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT smpl1 smpl2 smpl3 smpl4 smpl5 smpl6 smpl7 smpl8 smpl9 smpl10 smpl11 smpl12 smpl13 smpl14 smpl15 smpl16 smpl17 smpl18 smpl19 smpl20 smpl21 smpl22 smpl23 smpl24 smpl25 smpl26 culprit smpl28 smpl29 smpl30 smpl31 smpl32 smpl33 +1 378 . G A,T,C 0 . DP=1418;AN_POP=1000;AC_POP=500,0,0;I16=675,356,36,17,49274,2.62616e+06,2313,113051,58808,3.52801e+06,3000,180000,21513,499753,1170,27336;QS=247.89,7.1438,4.46622,0.5 PL:AD:QS 0,3,60,3,60,60,3,60,60,60:1,0,0,0:60,0,0,0 0,33,255,33,255,255,33,255,255,255:11,0,0,0:425,0,0,0 98,119,255,0,173,155,119,255,173,255:7,0,6,0:224,0,175,0 0,12,155,12,155,155,12,155,155,155:4,0,0,0:174,0,0,0 0,24,244,24,244,244,24,244,244,244:8,0,0,0:324,0,0,0 0,0,0,0,0,0,0,0,0,0:0,0,0,0:0,0,0,0 0,0,0,0,0,0,0,0,0,0:0,0,0,0:0,0,0,0 0,0,0,0,0,0,0,0,0,0:0,0,0,0:0,0,0,0 0,0,0,0,0,0,0,0,0,0:0,0,0,0:0,0,0,0 0,0,0,0,0,0,0,0,0,0:0,0,0,0:0,0,0,0 0,0,0,0,0,0,0,0,0,0:0,0,0,0:0,0,0,0 37,37,37,3,3,0,37,37,3,37:0,0,1,0:0,0,37,0 0,15,161,15,161,161,15,161,161,161:5,0,0,0:187,0,0,0 0,15,192,15,192,192,15,192,192,192:5,0,0,0:219,0,0,0 0,6,94,6,94,94,6,94,94,94:2,0,0,0:101,0,0,0 0,0,0,0,0,0,0,0,0,0:0,0,0,0:0,0,0,0 0,27,255,27,255,255,27,255,255,255:9,0,0,0:399,0,0,0 31,0,19,34,22,53,34,22,53,53:1,1,0,0:25,37,0,0 0,18,165,18,165,165,18,165,165,165:6,0,0,0:222,0,0,0 0,0,0,0,0,0,0,0,0,0:0,0,0,0:0,0,0,0 49,3,0,49,3,49,49,3,49,49:0,1,0,0:0,49,0,0 0,0,0,0,0,0,0,0,0,0:0,0,0,0:0,0,0,0 0,3,41,3,41,41,3,41,41,41:1,0,0,0:41,0,0,0 0,39,255,39,255,255,39,255,255,255:13,0,0,0:502,0,0,0 11,0,227,38,230,255,38,230,255,255:9,1,0,0:399,41,0,0 0,3,41,3,41,41,3,41,41,41:1,0,0,0:41,0,0,0 218,98,86,135,0,126,218,98,135,218:0,4,3,0:0,160,112,0 0,21,205,21,205,205,21,205,205,205:7,0,0,0:250,0,0,0 
0,18,200,18,200,200,18,200,200,200:6,0,0,0:261,0,0,0 82,91,200,0,119,113,91,200,119,200:3,0,2,0:133,0,97,0 0,9,137,9,137,137,9,137,137,137:3,0,0,0:161,0,0,0 73,38,35,73,38,73,38,0,38,35:0,1,0,1:0,41,0,41 107,19,47,57,0,88,110,56,91,141:1,3,1,0:41,123,60,0 diff --git a/test/call.af-fixation.1.out b/test/call.af-fixation.1.out new file mode 100644 index 000000000..ede414aa5 --- /dev/null +++ b/test/call.af-fixation.1.out @@ -0,0 +1,13 @@ +##fileformat=VCFv4.2 +##FILTER= +##reference=file:///lustre/scratch116/vr/ref/human/GRCh37/hs37d5.fa +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA07051_CEU NA12843_CEU NA18488_YRI NA18489_YRI NA18498_YRI NA18504_YRI NA18510_YRI NA18519_YRI NA18858_YRI NA18861_YRI NA18868_YRI NA18871_YRI NA18874_YRI NA18881_YRI NA18907_YRI NA18909_YRI NA18917_YRI NA18933_YRI NA18934_YRI NA19108_YRI NA19119_YRI NA19129_YRI NA19131_YRI NA19138_YRI NA19147_YRI NA19189_YRI NA19197_YRI NA19198_YRI NA19206_YRI NA19214_YRI NA19223_YRI NA19238_YRI NA19239_YRI NA19248_YRI NA19256_YRI +2 136608646 . G . 24.5121 . 
AN=70;DP4=95,24,45,11;MQ=59 GT:AD 0/0:0 0/0:1 0/0:2 0/0:1 0/0:2 0/0:2 0/0:2 0/0:1 0/0:7 0/0:8 0/0:2 0/0:1 0/0:2 0/0:6 0/0:7 0/0:4 0/0:7 0/0:3 0/0:2 0/0:5 0/0:1 0/0:4 0/0:1 0/0:4 0/0:2 0/0:6 0/0:1 0/0:2 0/0:1 0/0:1 0/0:1 0/0:4 0/0:2 0/0:1 0/0:2 diff --git a/test/call.af-fixation.2.out b/test/call.af-fixation.2.out new file mode 100644 index 000000000..de69f88c9 --- /dev/null +++ b/test/call.af-fixation.2.out @@ -0,0 +1,13 @@ +##fileformat=VCFv4.2 +##FILTER= +##reference=file:///lustre/scratch116/vr/ref/human/GRCh37/hs37d5.fa +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA07051_CEU NA12843_CEU NA18488_YRI NA18489_YRI NA18498_YRI NA18504_YRI NA18510_YRI NA18519_YRI NA18858_YRI NA18861_YRI NA18868_YRI NA18871_YRI NA18874_YRI NA18881_YRI NA18907_YRI NA18909_YRI NA18917_YRI NA18933_YRI NA18934_YRI NA19108_YRI NA19119_YRI NA19129_YRI NA19131_YRI NA19138_YRI NA19147_YRI NA19189_YRI NA19197_YRI NA19198_YRI NA19206_YRI NA19214_YRI NA19223_YRI NA19238_YRI NA19239_YRI NA19248_YRI NA19256_YRI +2 136608646 . G A 4.92443 . 
AC=2;AN=70;DP4=95,24,45,11;MQ=59 GT:PL:AD 0/1:32,3,0:0,1 0/1:0,3,37:1,0 0/0:0,6,77:2,0 0/0:0,3,33:1,0 0/0:0,6,67:2,0 0/0:0,6,67:2,0 0/0:0,6,67:2,0 0/0:0,3,60:1,0 0/0:0,21,147:7,0 0/0:0,24,161:8,0 0/0:0,6,52:2,0 0/0:0,3,38:1,0 0/0:0,6,72:2,0 0/0:0,18,197:6,0 0/0:0,21,175:7,0 0/0:0,12,129:4,0 0/0:0,21,220:7,0 0/0:0,9,111:3,0 0/0:0,6,76:2,0 0/0:0,15,125:5,0 0/0:0,3,32:1,0 0/0:0,12,110:4,0 0/0:0,3,41:1,0 0/0:0,12,124:4,0 0/0:0,6,70:2,0 0/0:0,18,140:6,0 0/0:0,3,22:1,0 0/0:0,6,63:2,0 0/0:0,3,46:1,0 0/0:0,3,39:1,0 0/0:0,3,42:1,0 0/0:0,12,142:4,0 0/0:0,6,67:2,0 0/0:0,3,39:1,0 0/0:0,6,54:2,0 diff --git a/test/call.af-fixation.3.out b/test/call.af-fixation.3.out new file mode 100644 index 000000000..37644417b --- /dev/null +++ b/test/call.af-fixation.3.out @@ -0,0 +1,15 @@ +##fileformat=VCFv4.2 +##FILTER= +##reference=file:///lustre/scratch116/vr/ref/human/GRCh37/hs37d5.fa +##contig= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA07051_CEU NA12843_CEU NA18488_YRI NA18489_YRI NA18498_YRI NA18504_YRI NA18510_YRI NA18519_YRI NA18858_YRI NA18861_YRI NA18868_YRI NA18871_YRI NA18874_YRI NA18881_YRI NA18907_YRI NA18909_YRI NA18917_YRI NA18933_YRI NA18934_YRI NA19108_YRI NA19119_YRI NA19129_YRI NA19131_YRI NA19138_YRI NA19147_YRI NA19189_YRI NA19197_YRI NA19198_YRI NA19206_YRI NA19214_YRI NA19223_YRI NA19238_YRI NA19239_YRI NA19248_YRI NA19256_YRI +2 136608646 . G A 4.92443 . 
AC=2;AN=70;DP4=95,24,45,11;MQ=59 GT:PL:AD:GP:GQ 0/1:32,3,0:0,1:0.000315005,0.500435,0.49925:3 0/1:0,3,37:1,0:0.499357,0.500543,9.96349e-05:3 0/0:0,6,77:2,0:1,0,0:127 0/0:0,3,33:1,0:1,0,0:127 0/0:0,6,67:2,0:1,0,0:127 0/0:0,6,67:2,0:1,0,0:127 0/0:0,6,67:2,0:1,0,0:127 0/0:0,3,60:1,0:1,0,0:127 0/0:0,21,147:7,0:1,0,0:127 0/0:0,24,161:8,0:1,0,0:127 0/0:0,6,52:2,0:1,0,0:127 0/0:0,3,38:1,0:1,0,0:127 0/0:0,6,72:2,0:1,0,0:127 0/0:0,18,197:6,0:1,0,0:127 0/0:0,21,175:7,0:1,0,0:127 0/0:0,12,129:4,0:1,0,0:127 0/0:0,21,220:7,0:1,0,0:127 0/0:0,9,111:3,0:1,0,0:127 0/0:0,6,76:2,0:1,0,0:127 0/0:0,15,125:5,0:1,0,0:127 0/0:0,3,32:1,0:1,0,0:127 0/0:0,12,110:4,0:1,0,0:127 0/0:0,3,41:1,0:1,0,0:127 0/0:0,12,124:4,0:1,0,0:127 0/0:0,6,70:2,0:1,0,0:127 0/0:0,18,140:6,0:1,0,0:127 0/0:0,3,22:1,0:1,0,0:127 0/0:0,6,63:2,0:1,0,0:127 0/0:0,3,46:1,0:1,0,0:127 0/0:0,3,39:1,0:1,0,0:127 0/0:0,3,42:1,0:1,0,0:127 0/0:0,12,142:4,0:1,0,0:127 0/0:0,6,67:2,0:1,0,0:127 0/0:0,3,39:1,0:1,0,0:127 0/0:0,6,54:2,0:1,0,0:127 diff --git a/test/call.af-fixation.txt b/test/call.af-fixation.txt new file mode 100644 index 000000000..03892bb55 --- /dev/null +++ b/test/call.af-fixation.txt @@ -0,0 +1,35 @@ +NA07051_CEU CEU +NA12843_CEU CEU +NA18488_YRI YRI +NA18489_YRI YRI +NA18498_YRI YRI +NA18504_YRI YRI +NA18510_YRI YRI +NA18519_YRI YRI +NA18858_YRI YRI +NA18861_YRI YRI +NA18868_YRI YRI +NA18871_YRI YRI +NA18874_YRI YRI +NA18881_YRI YRI +NA18907_YRI YRI +NA18909_YRI YRI +NA18917_YRI YRI +NA18933_YRI YRI +NA18934_YRI YRI +NA19108_YRI YRI +NA19119_YRI YRI +NA19129_YRI YRI +NA19131_YRI YRI +NA19138_YRI YRI +NA19147_YRI YRI +NA19189_YRI YRI +NA19197_YRI YRI +NA19198_YRI YRI +NA19206_YRI YRI +NA19214_YRI YRI +NA19223_YRI YRI +NA19238_YRI YRI +NA19239_YRI YRI +NA19248_YRI YRI +NA19256_YRI YRI diff --git a/test/call.af-fixation.vcf b/test/call.af-fixation.vcf new file mode 100644 index 000000000..700178c7e --- /dev/null +++ b/test/call.af-fixation.vcf @@ -0,0 +1,9 @@ +##fileformat=VCFv4.2 
+##reference=file:///lustre/scratch116/vr/ref/human/GRCh37/hs37d5.fa +##contig= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA07051_CEU NA12843_CEU NA18488_YRI NA18489_YRI NA18498_YRI NA18504_YRI NA18510_YRI NA18519_YRI NA18858_YRI NA18861_YRI NA18868_YRI NA18871_YRI NA18874_YRI NA18881_YRI NA18907_YRI NA18909_YRI NA18917_YRI NA18933_YRI NA18934_YRI NA19108_YRI NA19119_YRI NA19129_YRI NA19131_YRI NA19138_YRI NA19147_YRI NA19189_YRI NA19197_YRI NA19198_YRI NA19206_YRI NA19214_YRI NA19223_YRI NA19238_YRI NA19239_YRI NA19248_YRI NA19256_YRI +2 136608646 . G A,T,<*> 0 . I16=95,24,45,11,4466,172210,1917,71261,7086,423410,3283,194379,2130,47014,1038,22600;QS=40.8826,20.7918,0.325581,0 PL:AD 32,3,0,32,3,32,32,3,32,32:0,1,0,0 0,3,37,3,37,37,3,37,37,37:1,0,0,0 0,6,77,6,77,77,6,77,77,77:2,0,0,0 0,3,33,3,33,33,3,33,33,33:1,0,0,0 0,6,67,6,67,67,6,67,67,67:2,0,0,0 0,6,67,6,67,67,6,67,67,67:2,0,0,0 0,6,67,6,67,67,6,67,67,67:2,0,0,0 0,3,60,3,60,60,3,60,60,60:1,0,0,0 0,21,147,21,147,147,21,147,147,147:7,0,0,0 0,24,161,24,161,161,24,161,161,161:8,0,0,0 0,6,52,6,52,52,6,52,52,52:2,0,0,0 0,3,38,3,38,38,3,38,38,38:1,0,0,0 0,6,72,6,72,72,6,72,72,72:2,0,0,0 0,18,197,18,197,197,18,197,197,197:6,0,0,0 0,21,175,21,175,175,21,175,175,175:7,0,0,0 0,12,129,12,129,129,12,129,129,129:4,0,0,0 0,21,220,21,220,220,21,220,220,220:7,0,0,0 0,9,111,9,111,111,9,111,111,111:3,0,0,0 0,6,76,6,76,76,6,76,76,76:2,0,0,0 0,15,125,15,125,125,15,125,125,125:5,0,0,0 0,3,32,3,32,32,3,32,32,32:1,0,0,0 0,12,110,12,110,110,12,110,110,110:4,0,0,0 0,3,41,3,41,41,3,41,41,41:1,0,0,0 0,12,124,12,124,124,12,124,124,124:4,0,0,0 0,6,70,6,70,70,6,70,70,70:2,0,0,0 0,18,140,18,140,140,18,140,140,140:6,0,0,0 0,3,22,3,22,22,3,22,22,22:1,0,0,0 0,6,63,6,63,63,6,63,63,63:2,0,0,0 0,3,46,3,46,46,3,46,46,46:1,0,0,0 0,3,39,3,39,39,3,39,39,39:1,0,0,0 0,3,42,3,42,42,3,42,42,42:1,0,0,0 0,12,142,12,142,142,12,142,142,142:4,0,0,0 0,6,67,6,67,67,6,67,67,67:2,0,0,0 0,3,39,3,39,39,3,39,39,39:1,0,0,0 
0,6,54,6,54,54,6,54,54,54:2,0,0,0 diff --git a/test/check.chk b/test/check.chk index 8a2ce22f6..31e4eb35b 100644 --- a/test/check.chk +++ b/test/check.chk @@ -38,17 +38,19 @@ AF 0 0.000000 3 1 2 2 0 0 2 AF 0 0.490000 0 0 0 12 0 0 12 AF 0 0.740000 1 1 0 0 0 0 0 AF 0 0.990000 1 1 0 0 0 0 0 -# QUAL, Stats by quality: +# QUAL, Stats by quality # QUAL [2]id [3]Quality [4]number of SNPs [5]number of transitions (1st ALT) [6]number of transversions (1st ALT) [7]number of indels -QUAL 0 12 1 0 1 1 -QUAL 0 45 0 0 0 1 -QUAL 0 59 2 1 0 3 -QUAL 0 60 1 1 0 0 -QUAL 0 61 0 0 0 1 -QUAL 0 79 0 0 0 1 -QUAL 0 82 0 0 0 1 -QUAL 0 90 1 1 0 0 -QUAL 0 342 0 0 0 1 +QUAL 0 12.6 1 0 1 0 +QUAL 0 12.9 0 0 0 1 +QUAL 0 45.0 0 0 0 1 +QUAL 0 59.2 1 1 0 0 +QUAL 0 59.9 0 0 0 3 +QUAL 0 60.2 1 1 0 0 +QUAL 0 61.5 0 0 0 1 +QUAL 0 79.0 0 0 0 1 +QUAL 0 82.7 0 0 0 1 +QUAL 0 90.6 1 1 0 0 +QUAL 0 342.0 0 0 0 1 # IDD, InDel distribution: # IDD [2]id [3]length (deletions negative) [4]number of sites [5]number of genotypes [6]mean VAF IDD 0 -10 1 0 . 
diff --git a/test/check_merge.chk b/test/check_merge.chk index 72fab0929..2f54579e4 100644 --- a/test/check_merge.chk +++ b/test/check_merge.chk @@ -54,12 +54,14 @@ DP 0 60 0 0.000000 1 33.333333 DP 0 62 0 0.000000 2 66.666667 # QUAL, Stats by quality # QUAL [2]id [3]Quality [4]number of SNPs [5]number of transitions (1st ALT) [6]number of transversions (1st ALT) [7]number of indels -QUAL 0 12 1 0 1 1 -QUAL 0 45 0 0 0 1 -QUAL 0 59 2 1 0 3 -QUAL 0 60 1 1 0 0 -QUAL 0 61 0 0 0 1 -QUAL 0 79 0 0 0 1 -QUAL 0 82 0 0 0 1 -QUAL 0 90 1 1 0 0 -QUAL 0 342 0 0 0 1 +QUAL 0 12.6 1 0 1 0 +QUAL 0 12.9 0 0 0 1 +QUAL 0 45.0 0 0 0 1 +QUAL 0 59.2 1 1 0 0 +QUAL 0 59.9 0 0 0 3 +QUAL 0 60.2 1 1 0 0 +QUAL 0 61.5 0 0 0 1 +QUAL 0 79.0 0 0 0 1 +QUAL 0 82.7 0 0 0 1 +QUAL 0 90.6 1 1 0 0 +QUAL 0 342.0 0 0 0 1 diff --git a/test/mpileup.1.out b/test/mpileup.1.out index 8cedcb346..c586fb4d6 100644 --- a/test/mpileup.1.out +++ b/test/mpileup.1.out @@ -20,21 +20,19 @@ ##FORMAT= ##FORMAT= ##FORMAT= -##INFO= -##INFO= ##INFO= ##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00100 HG00101 HG00102 -17 302 . T TA 488 . INDEL;IDV=7;IMF=1;DP=25;VDB=0.27613;SGB=-4.22417;MQSB=0.0443614;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=2,4,8,11;MQ=49 GT:PL:DP:DV 0/1:167,0,96:11:6 0/1:157,0,9:7:6 1/1:201,21,0:7:7 -17 828 . T C 409 . DP=25;VDB=0.842082;SGB=-4.20907;RPB=0.950652;MQB=1;MQSB=1;BQB=0.929717;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=2,4,8,11;MQ=60 GT:PL:DP:DV 0/1:211,0,35:12:10 0/1:116,0,91:9:5 1/1:120,12,0:4:4 -17 834 . G A 364 . DP=25;VDB=0.788006;SGB=-4.01214;RPB=0.999233;MQB=1;MQSB=1;BQB=0.821668;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=2,3,7,10;MQ=60 GT:PL:DP:DV 0/1:185,0,46:11:9 0/1:128,0,59:8:5 1/1:89,9,0:3:3 -17 1665 . T C 3.10665 . DP=20;VDB=0.1;SGB=0.346553;RPB=0.222222;MQB=0.611111;MQSB=0.988166;BQB=0.944444;MQ0F=0;ICB=0.128205;HOB=0.0555556;AC=1;AN=6;DP4=7,11,1,1;MQ=55 GT:PL:DP:DV 0/0:0,21,185:7:0 0/0:0,27,222:9:0 0/1:35,0,51:4:2 -17 1869 . A T 138 . 
DP=24;VDB=0.928022;SGB=-11.9537;RPB=0.984127;MQB=0.96464;MQSB=0.931547;BQB=0.359155;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=6,9,5,4;MQ=58 GT:PL:DP:DV 0/1:115,0,224:18:7 0/1:16,0,104:5:1 1/1:42,3,0:1:1 -17 2041 . G A 447 . DP=31;VDB=0.816435;SGB=-4.18892;RPB=0.88473;MQB=0.972375;MQSB=0.968257;BQB=0.311275;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=6,5,12,7;MQ=58 GT:PL:DP:DV 0/1:229,0,212:21:11 0/1:32,0,24:2:1 1/1:223,21,0:7:7 -17 2220 . G A 303 . DP=21;VDB=0.532753;SGB=-3.51597;RPB=0.964198;MQB=0.898397;MQSB=0.875769;BQB=0.0354359;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=6,2,1,11;MQ=58 GT:PL:DP:DV 0/1:139,0,130:12:6 0/1:69,0,46:4:2 1/1:131,12,0:4:4 -17 2564 . A G 233 . DP=15;VDB=0.690812;SGB=-3.20711;RPB=0.197899;MQB=1;MQSB=1;BQB=0.965069;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=1,4,4,5;MQ=60 GT:PL:DP:DV 0/1:88,0,78:6:3 0/1:57,0,56:4:2 1/1:124,12,0:4:4 -17 3104 . C T 24.2837 . DP=25;VDB=0.8;SGB=0.346553;RPB=0.717391;MQB=0.956522;MQSB=0.962269;BQB=0.978261;MQ0F=0;ICB=0.128205;HOB=0.0555556;AC=1;AN=6;DP4=8,15,2,0;MQ=58 GT:PL:DP:DV 0/0:0,48,255:16:0 0/0:0,12,144:4:0 0/1:59,0,93:5:2 -17 3587 . G A 358 . DP=29;VDB=0.902044;SGB=-3.91326;RPB=0.800999;MQB=1;MQSB=1;BQB=0.156944;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=4,7,10,6;MQ=60 GT:PL:DP:DV 0/1:161,0,184:14:7 0/1:22,0,118:5:1 1/1:212,24,0:8:8 -17 3936 . A G 469 . DP=37;VDB=0.0574114;SGB=-4.60123;RPB=0.741697;MQB=0.812605;MQSB=0.143788;BQB=0.883831;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=5,6,6,17;MQ=56 GT:PL:DP:DV 0/1:233,0,206:20:11 0/1:77,0,58:6:4 1/1:196,24,0:8:8 +17 302 . T TA 487.586 . INDEL;IDV=7;IMF=1;DP=25;VDB=0.27613;SGB=-4.22417;MQSB=0.0443614;MQ0F=0;AC=4;AN=6;DP4=2,4,8,11;MQ=49 GT:PL:DP:DV 0/1:167,0,96:11:6 0/1:157,0,9:7:6 1/1:201,21,0:7:7 +17 828 . T C 409.29 . DP=25;VDB=0.842082;SGB=-4.20907;RPB=0.950652;MQB=1;MQSB=1;BQB=0.929717;MQ0F=0;AC=4;AN=6;DP4=2,4,8,11;MQ=60 GT:PL:DP:DV 0/1:211,0,35:12:10 0/1:116,0,91:9:5 1/1:120,12,0:4:4 +17 834 . G A 363.72 . 
DP=25;VDB=0.788006;SGB=-4.01214;RPB=0.999233;MQB=1;MQSB=1;BQB=0.821668;MQ0F=0;AC=4;AN=6;DP4=2,3,7,10;MQ=60 GT:PL:DP:DV 0/1:185,0,46:11:9 0/1:128,0,59:8:5 1/1:89,9,0:3:3 +17 1665 . T C 3.10665 . DP=20;VDB=0.1;SGB=0.346553;RPB=0.222222;MQB=0.611111;MQSB=0.988166;BQB=0.944444;MQ0F=0;AC=1;AN=6;DP4=7,11,1,1;MQ=55 GT:PL:DP:DV 0/0:0,21,185:7:0 0/0:0,27,222:9:0 0/1:35,0,51:4:2 +17 1869 . A T 138.104 . DP=24;VDB=0.928022;SGB=-11.9537;RPB=0.984127;MQB=0.96464;MQSB=0.931547;BQB=0.359155;MQ0F=0;AC=4;AN=6;DP4=6,9,5,4;MQ=58 GT:PL:DP:DV 0/1:115,0,224:18:7 0/1:16,0,104:5:1 1/1:42,3,0:1:1 +17 2041 . G A 447.444 . DP=31;VDB=0.816435;SGB=-4.18892;RPB=0.88473;MQB=0.972375;MQSB=0.968257;BQB=0.311275;MQ0F=0;AC=4;AN=6;DP4=6,5,12,7;MQ=58 GT:PL:DP:DV 0/1:229,0,212:21:11 0/1:32,0,24:2:1 1/1:223,21,0:7:7 +17 2220 . G A 302.575 . DP=21;VDB=0.532753;SGB=-3.51597;RPB=0.964198;MQB=0.898397;MQSB=0.875769;BQB=0.0354359;MQ0F=0;AC=4;AN=6;DP4=6,2,1,11;MQ=58 GT:PL:DP:DV 0/1:139,0,130:12:6 0/1:69,0,46:4:2 1/1:131,12,0:4:4 +17 2564 . A G 232.697 . DP=15;VDB=0.690812;SGB=-3.20711;RPB=0.197899;MQB=1;MQSB=1;BQB=0.965069;MQ0F=0;AC=4;AN=6;DP4=1,4,4,5;MQ=60 GT:PL:DP:DV 0/1:88,0,78:6:3 0/1:57,0,56:4:2 1/1:124,12,0:4:4 +17 3104 . C T 24.2837 . DP=25;VDB=0.8;SGB=0.346553;RPB=0.717391;MQB=0.956522;MQSB=0.962269;BQB=0.978261;MQ0F=0;AC=1;AN=6;DP4=8,15,2,0;MQ=58 GT:PL:DP:DV 0/0:0,48,255:16:0 0/0:0,12,144:4:0 0/1:59,0,93:5:2 +17 3587 . G A 357.834 . DP=29;VDB=0.902044;SGB=-3.91326;RPB=0.800999;MQB=1;MQSB=1;BQB=0.156944;MQ0F=0;AC=4;AN=6;DP4=4,7,10,6;MQ=60 GT:PL:DP:DV 0/1:161,0,184:14:7 0/1:22,0,118:5:1 1/1:212,24,0:8:8 +17 3936 . A G 469.356 . 
DP=37;VDB=0.0574114;SGB=-4.60123;RPB=0.741697;MQB=0.812605;MQSB=0.143788;BQB=0.883831;MQ0F=0;AC=4;AN=6;DP4=5,6,6,17;MQ=56 GT:PL:DP:DV 0/1:233,0,206:20:11 0/1:77,0,58:6:4 1/1:196,24,0:8:8 diff --git a/test/mpileup.2.out b/test/mpileup.2.out index d6571607a..43e022fd4 100644 --- a/test/mpileup.2.out +++ b/test/mpileup.2.out @@ -22,33 +22,31 @@ ##INFO= ##INFO= ##FORMAT= -##INFO= -##INFO= ##INFO= ##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00100 HG00101 HG00102 17 1 . A . . . END=301;MinDP=1 GT:DP ./.:5 ./.:1 ./.:3 -17 302 . T TA 488 . INDEL;IDV=7;IMF=1;DP=25;VDB=0.27613;SGB=-4.22417;MQSB=0.0443614;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=2,4,8,11;MQ=49 GT:PL:DP:DV 0/1:167,0,96:11:6 0/1:157,0,9:7:6 1/1:201,21,0:7:7 +17 302 . T TA 487.586 . INDEL;IDV=7;IMF=1;DP=25;VDB=0.27613;SGB=-4.22417;MQSB=0.0443614;MQ0F=0;AC=4;AN=6;DP4=2,4,8,11;MQ=49 GT:PL:DP:DV 0/1:167,0,96:11:6 0/1:157,0,9:7:6 1/1:201,21,0:7:7 17 303 . G . . . END=827;MinDP=2 GT:DP 0/0:9 0/0:2 0/0:3 -17 828 . T C 409 . DP=25;VDB=0.842082;SGB=-4.20907;RPB=0.950652;MQB=1;MQSB=1;BQB=0.929717;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=2,4,8,11;MQ=60 GT:PL:DP:DV 0/1:211,0,35:12:10 0/1:116,0,91:9:5 1/1:120,12,0:4:4 +17 828 . T C 409.29 . DP=25;VDB=0.842082;SGB=-4.20907;RPB=0.950652;MQB=1;MQSB=1;BQB=0.929717;MQ0F=0;AC=4;AN=6;DP4=2,4,8,11;MQ=60 GT:PL:DP:DV 0/1:211,0,35:12:10 0/1:116,0,91:9:5 1/1:120,12,0:4:4 17 829 . T . . . END=833;MinDP=4 GT:DP 0/0:11 0/0:8 0/0:4 -17 834 . G A 364 . DP=25;VDB=0.788006;SGB=-4.01214;RPB=0.999233;MQB=1;MQSB=1;BQB=0.821668;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=2,3,7,10;MQ=60 GT:PL:DP:DV 0/1:185,0,46:11:9 0/1:128,0,59:8:5 1/1:89,9,0:3:3 +17 834 . G A 363.72 . DP=25;VDB=0.788006;SGB=-4.01214;RPB=0.999233;MQB=1;MQSB=1;BQB=0.821668;MQ0F=0;AC=4;AN=6;DP4=2,3,7,10;MQ=60 GT:PL:DP:DV 0/1:185,0,46:11:9 0/1:128,0,59:8:5 1/1:89,9,0:3:3 17 835 . T . . . END=1664;MinDP=1 GT:DP 0/0:5 0/0:2 0/0:1 -17 1665 . T C 3.10665 . 
DP=20;VDB=0.1;SGB=0.346553;RPB=0.222222;MQB=0.611111;MQSB=0.988166;BQB=0.944444;MQ0F=0;ICB=0.128205;HOB=0.0555556;AC=1;AN=6;DP4=7,11,1,1;MQ=55 GT:PL:DP:DV 0/0:0,21,185:7:0 0/0:0,27,222:9:0 0/1:35,0,51:4:2 +17 1665 . T C 3.10665 . DP=20;VDB=0.1;SGB=0.346553;RPB=0.222222;MQB=0.611111;MQSB=0.988166;BQB=0.944444;MQ0F=0;AC=1;AN=6;DP4=7,11,1,1;MQ=55 GT:PL:DP:DV 0/0:0,21,185:7:0 0/0:0,27,222:9:0 0/1:35,0,51:4:2 17 1666 . G . . . END=1868;MinDP=0 GT:DP 0/0:6 0/0:0 0/0:1 -17 1869 . A T 138 . DP=24;VDB=0.928022;SGB=-11.9537;RPB=0.984127;MQB=0.96464;MQSB=0.931547;BQB=0.359155;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=6,9,5,4;MQ=58 GT:PL:DP:DV 0/1:115,0,224:18:7 0/1:16,0,104:5:1 1/1:42,3,0:1:1 +17 1869 . A T 138.104 . DP=24;VDB=0.928022;SGB=-11.9537;RPB=0.984127;MQB=0.96464;MQSB=0.931547;BQB=0.359155;MQ0F=0;AC=4;AN=6;DP4=6,9,5,4;MQ=58 GT:PL:DP:DV 0/1:115,0,224:18:7 0/1:16,0,104:5:1 1/1:42,3,0:1:1 17 1870 . C . . . END=2040;MinDP=1 GT:DP 0/0:13 0/0:2 0/0:1 -17 2041 . G A 447 . DP=31;VDB=0.816435;SGB=-4.18892;RPB=0.88473;MQB=0.972375;MQSB=0.968257;BQB=0.311275;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=6,5,12,7;MQ=58 GT:PL:DP:DV 0/1:229,0,212:21:11 0/1:32,0,24:2:1 1/1:223,21,0:7:7 +17 2041 . G A 447.444 . DP=31;VDB=0.816435;SGB=-4.18892;RPB=0.88473;MQB=0.972375;MQSB=0.968257;BQB=0.311275;MQ0F=0;AC=4;AN=6;DP4=6,5,12,7;MQ=58 GT:PL:DP:DV 0/1:229,0,212:21:11 0/1:32,0,24:2:1 1/1:223,21,0:7:7 17 2042 . G . . . END=2219;MinDP=1 GT:DP 0/0:8 0/0:1 0/0:3 -17 2220 . G A 303 . DP=21;VDB=0.532753;SGB=-3.51597;RPB=0.964198;MQB=0.898397;MQSB=0.875769;BQB=0.0354359;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=6,2,1,11;MQ=58 GT:PL:DP:DV 0/1:139,0,130:12:6 0/1:69,0,46:4:2 1/1:131,12,0:4:4 +17 2220 . G A 302.575 . DP=21;VDB=0.532753;SGB=-3.51597;RPB=0.964198;MQB=0.898397;MQSB=0.875769;BQB=0.0354359;MQ0F=0;AC=4;AN=6;DP4=6,2,1,11;MQ=58 GT:PL:DP:DV 0/1:139,0,130:12:6 0/1:69,0,46:4:2 1/1:131,12,0:4:4 17 2221 . G . . . END=2563;MinDP=0 GT:DP 0/0:5 0/0:0 0/0:2 -17 2564 . A G 233 . 
DP=15;VDB=0.690812;SGB=-3.20711;RPB=0.197899;MQB=1;MQSB=1;BQB=0.965069;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=1,4,4,5;MQ=60 GT:PL:DP:DV 0/1:88,0,78:6:3 0/1:57,0,56:4:2 1/1:124,12,0:4:4 +17 2564 . A G 232.697 . DP=15;VDB=0.690812;SGB=-3.20711;RPB=0.197899;MQB=1;MQSB=1;BQB=0.965069;MQ0F=0;AC=4;AN=6;DP4=1,4,4,5;MQ=60 GT:PL:DP:DV 0/1:88,0,78:6:3 0/1:57,0,56:4:2 1/1:124,12,0:4:4 17 2565 . A . . . END=3103;MinDP=0 GT:DP 0/0:6 0/0:0 0/0:1 -17 3104 . C T 24.2837 . DP=25;VDB=0.8;SGB=0.346553;RPB=0.717391;MQB=0.956522;MQSB=0.962269;BQB=0.978261;MQ0F=0;ICB=0.128205;HOB=0.0555556;AC=1;AN=6;DP4=8,15,2,0;MQ=58 GT:PL:DP:DV 0/0:0,48,255:16:0 0/0:0,12,144:4:0 0/1:59,0,93:5:2 +17 3104 . C T 24.2837 . DP=25;VDB=0.8;SGB=0.346553;RPB=0.717391;MQB=0.956522;MQSB=0.962269;BQB=0.978261;MQ0F=0;AC=1;AN=6;DP4=8,15,2,0;MQ=58 GT:PL:DP:DV 0/0:0,48,255:16:0 0/0:0,12,144:4:0 0/1:59,0,93:5:2 17 3105 . T . . . END=3586;MinDP=2 GT:DP 0/0:5 0/0:2 0/0:3 -17 3587 . G A 358 . DP=29;VDB=0.902044;SGB=-3.91326;RPB=0.800999;MQB=1;MQSB=1;BQB=0.156944;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=4,7,10,6;MQ=60 GT:PL:DP:DV 0/1:161,0,184:14:7 0/1:22,0,118:5:1 1/1:212,24,0:8:8 +17 3587 . G A 357.834 . DP=29;VDB=0.902044;SGB=-3.91326;RPB=0.800999;MQB=1;MQSB=1;BQB=0.156944;MQ0F=0;AC=4;AN=6;DP4=4,7,10,6;MQ=60 GT:PL:DP:DV 0/1:161,0,184:14:7 0/1:22,0,118:5:1 1/1:212,24,0:8:8 17 3588 . A . . . END=3935;MinDP=2 GT:DP 0/0:10 0/0:2 0/0:3 -17 3936 . A G 469 . DP=37;VDB=0.0574114;SGB=-4.60123;RPB=0.741697;MQB=0.812605;MQSB=0.143788;BQB=0.883831;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=5,6,6,17;MQ=56 GT:PL:DP:DV 0/1:233,0,206:20:11 0/1:77,0,58:6:4 1/1:196,24,0:8:8 +17 3936 . A G 469.356 . DP=37;VDB=0.0574114;SGB=-4.60123;RPB=0.741697;MQB=0.812605;MQSB=0.143788;BQB=0.883831;MQ0F=0;AC=4;AN=6;DP4=5,6,6,17;MQ=56 GT:PL:DP:DV 0/1:233,0,206:20:11 0/1:77,0,58:6:4 1/1:196,24,0:8:8 17 3937 . C . . . 
END=4101;MinDP=0 GT:DP 0/0:1 0/0:0 0/0:0 diff --git a/test/mpileup.3.out b/test/mpileup.3.out index 8cedcb346..c586fb4d6 100644 --- a/test/mpileup.3.out +++ b/test/mpileup.3.out @@ -20,21 +20,19 @@ ##FORMAT= ##FORMAT= ##FORMAT= -##INFO= -##INFO= ##INFO= ##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00100 HG00101 HG00102 -17 302 . T TA 488 . INDEL;IDV=7;IMF=1;DP=25;VDB=0.27613;SGB=-4.22417;MQSB=0.0443614;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=2,4,8,11;MQ=49 GT:PL:DP:DV 0/1:167,0,96:11:6 0/1:157,0,9:7:6 1/1:201,21,0:7:7 -17 828 . T C 409 . DP=25;VDB=0.842082;SGB=-4.20907;RPB=0.950652;MQB=1;MQSB=1;BQB=0.929717;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=2,4,8,11;MQ=60 GT:PL:DP:DV 0/1:211,0,35:12:10 0/1:116,0,91:9:5 1/1:120,12,0:4:4 -17 834 . G A 364 . DP=25;VDB=0.788006;SGB=-4.01214;RPB=0.999233;MQB=1;MQSB=1;BQB=0.821668;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=2,3,7,10;MQ=60 GT:PL:DP:DV 0/1:185,0,46:11:9 0/1:128,0,59:8:5 1/1:89,9,0:3:3 -17 1665 . T C 3.10665 . DP=20;VDB=0.1;SGB=0.346553;RPB=0.222222;MQB=0.611111;MQSB=0.988166;BQB=0.944444;MQ0F=0;ICB=0.128205;HOB=0.0555556;AC=1;AN=6;DP4=7,11,1,1;MQ=55 GT:PL:DP:DV 0/0:0,21,185:7:0 0/0:0,27,222:9:0 0/1:35,0,51:4:2 -17 1869 . A T 138 . DP=24;VDB=0.928022;SGB=-11.9537;RPB=0.984127;MQB=0.96464;MQSB=0.931547;BQB=0.359155;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=6,9,5,4;MQ=58 GT:PL:DP:DV 0/1:115,0,224:18:7 0/1:16,0,104:5:1 1/1:42,3,0:1:1 -17 2041 . G A 447 . DP=31;VDB=0.816435;SGB=-4.18892;RPB=0.88473;MQB=0.972375;MQSB=0.968257;BQB=0.311275;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=6,5,12,7;MQ=58 GT:PL:DP:DV 0/1:229,0,212:21:11 0/1:32,0,24:2:1 1/1:223,21,0:7:7 -17 2220 . G A 303 . DP=21;VDB=0.532753;SGB=-3.51597;RPB=0.964198;MQB=0.898397;MQSB=0.875769;BQB=0.0354359;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=6,2,1,11;MQ=58 GT:PL:DP:DV 0/1:139,0,130:12:6 0/1:69,0,46:4:2 1/1:131,12,0:4:4 -17 2564 . A G 233 . 
DP=15;VDB=0.690812;SGB=-3.20711;RPB=0.197899;MQB=1;MQSB=1;BQB=0.965069;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=1,4,4,5;MQ=60 GT:PL:DP:DV 0/1:88,0,78:6:3 0/1:57,0,56:4:2 1/1:124,12,0:4:4 -17 3104 . C T 24.2837 . DP=25;VDB=0.8;SGB=0.346553;RPB=0.717391;MQB=0.956522;MQSB=0.962269;BQB=0.978261;MQ0F=0;ICB=0.128205;HOB=0.0555556;AC=1;AN=6;DP4=8,15,2,0;MQ=58 GT:PL:DP:DV 0/0:0,48,255:16:0 0/0:0,12,144:4:0 0/1:59,0,93:5:2 -17 3587 . G A 358 . DP=29;VDB=0.902044;SGB=-3.91326;RPB=0.800999;MQB=1;MQSB=1;BQB=0.156944;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=4,7,10,6;MQ=60 GT:PL:DP:DV 0/1:161,0,184:14:7 0/1:22,0,118:5:1 1/1:212,24,0:8:8 -17 3936 . A G 469 . DP=37;VDB=0.0574114;SGB=-4.60123;RPB=0.741697;MQB=0.812605;MQSB=0.143788;BQB=0.883831;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=5,6,6,17;MQ=56 GT:PL:DP:DV 0/1:233,0,206:20:11 0/1:77,0,58:6:4 1/1:196,24,0:8:8 +17 302 . T TA 487.586 . INDEL;IDV=7;IMF=1;DP=25;VDB=0.27613;SGB=-4.22417;MQSB=0.0443614;MQ0F=0;AC=4;AN=6;DP4=2,4,8,11;MQ=49 GT:PL:DP:DV 0/1:167,0,96:11:6 0/1:157,0,9:7:6 1/1:201,21,0:7:7 +17 828 . T C 409.29 . DP=25;VDB=0.842082;SGB=-4.20907;RPB=0.950652;MQB=1;MQSB=1;BQB=0.929717;MQ0F=0;AC=4;AN=6;DP4=2,4,8,11;MQ=60 GT:PL:DP:DV 0/1:211,0,35:12:10 0/1:116,0,91:9:5 1/1:120,12,0:4:4 +17 834 . G A 363.72 . DP=25;VDB=0.788006;SGB=-4.01214;RPB=0.999233;MQB=1;MQSB=1;BQB=0.821668;MQ0F=0;AC=4;AN=6;DP4=2,3,7,10;MQ=60 GT:PL:DP:DV 0/1:185,0,46:11:9 0/1:128,0,59:8:5 1/1:89,9,0:3:3 +17 1665 . T C 3.10665 . DP=20;VDB=0.1;SGB=0.346553;RPB=0.222222;MQB=0.611111;MQSB=0.988166;BQB=0.944444;MQ0F=0;AC=1;AN=6;DP4=7,11,1,1;MQ=55 GT:PL:DP:DV 0/0:0,21,185:7:0 0/0:0,27,222:9:0 0/1:35,0,51:4:2 +17 1869 . A T 138.104 . DP=24;VDB=0.928022;SGB=-11.9537;RPB=0.984127;MQB=0.96464;MQSB=0.931547;BQB=0.359155;MQ0F=0;AC=4;AN=6;DP4=6,9,5,4;MQ=58 GT:PL:DP:DV 0/1:115,0,224:18:7 0/1:16,0,104:5:1 1/1:42,3,0:1:1 +17 2041 . G A 447.444 . 
DP=31;VDB=0.816435;SGB=-4.18892;RPB=0.88473;MQB=0.972375;MQSB=0.968257;BQB=0.311275;MQ0F=0;AC=4;AN=6;DP4=6,5,12,7;MQ=58 GT:PL:DP:DV 0/1:229,0,212:21:11 0/1:32,0,24:2:1 1/1:223,21,0:7:7 +17 2220 . G A 302.575 . DP=21;VDB=0.532753;SGB=-3.51597;RPB=0.964198;MQB=0.898397;MQSB=0.875769;BQB=0.0354359;MQ0F=0;AC=4;AN=6;DP4=6,2,1,11;MQ=58 GT:PL:DP:DV 0/1:139,0,130:12:6 0/1:69,0,46:4:2 1/1:131,12,0:4:4 +17 2564 . A G 232.697 . DP=15;VDB=0.690812;SGB=-3.20711;RPB=0.197899;MQB=1;MQSB=1;BQB=0.965069;MQ0F=0;AC=4;AN=6;DP4=1,4,4,5;MQ=60 GT:PL:DP:DV 0/1:88,0,78:6:3 0/1:57,0,56:4:2 1/1:124,12,0:4:4 +17 3104 . C T 24.2837 . DP=25;VDB=0.8;SGB=0.346553;RPB=0.717391;MQB=0.956522;MQSB=0.962269;BQB=0.978261;MQ0F=0;AC=1;AN=6;DP4=8,15,2,0;MQ=58 GT:PL:DP:DV 0/0:0,48,255:16:0 0/0:0,12,144:4:0 0/1:59,0,93:5:2 +17 3587 . G A 357.834 . DP=29;VDB=0.902044;SGB=-3.91326;RPB=0.800999;MQB=1;MQSB=1;BQB=0.156944;MQ0F=0;AC=4;AN=6;DP4=4,7,10,6;MQ=60 GT:PL:DP:DV 0/1:161,0,184:14:7 0/1:22,0,118:5:1 1/1:212,24,0:8:8 +17 3936 . A G 469.356 . DP=37;VDB=0.0574114;SGB=-4.60123;RPB=0.741697;MQB=0.812605;MQSB=0.143788;BQB=0.883831;MQ0F=0;AC=4;AN=6;DP4=5,6,6,17;MQ=56 GT:PL:DP:DV 0/1:233,0,206:20:11 0/1:77,0,58:6:4 1/1:196,24,0:8:8 diff --git a/test/mpileup.4.out b/test/mpileup.4.out index 9166dfa13..b8ded5132 100644 --- a/test/mpileup.4.out +++ b/test/mpileup.4.out @@ -20,21 +20,19 @@ ##FORMAT= ##FORMAT= ##FORMAT= -##INFO= -##INFO= ##INFO= ##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00102 HG00101 HG00100 -17 302 . T TA 488 . INDEL;IDV=7;IMF=1;DP=25;VDB=0.27613;SGB=-4.22417;MQSB=0.0443614;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=2,4,8,11;MQ=49 GT:PL:DP:DV 1/1:201,21,0:7:7 0/1:157,0,9:7:6 0/1:167,0,96:11:6 -17 828 . T C 409 . DP=25;VDB=0.842082;SGB=-4.20907;RPB=0.950652;MQB=1;MQSB=1;BQB=0.929717;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=2,4,8,11;MQ=60 GT:PL:DP:DV 1/1:120,12,0:4:4 0/1:116,0,91:9:5 0/1:211,0,35:12:10 -17 834 . G A 364 . 
DP=25;VDB=0.788006;SGB=-4.01214;RPB=0.999233;MQB=1;MQSB=1;BQB=0.821668;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=2,3,7,10;MQ=60 GT:PL:DP:DV 1/1:89,9,0:3:3 0/1:128,0,59:8:5 0/1:185,0,46:11:9 -17 1665 . T C 3.10665 . DP=20;VDB=0.1;SGB=0.346553;RPB=0.222222;MQB=0.611111;MQSB=0.988166;BQB=0.944444;MQ0F=0;ICB=0.128205;HOB=0.0555556;AC=1;AN=6;DP4=7,11,1,1;MQ=55 GT:PL:DP:DV 0/1:35,0,51:4:2 0/0:0,27,222:9:0 0/0:0,21,185:7:0 -17 1869 . A T 138 . DP=24;VDB=0.928022;SGB=-11.9537;RPB=0.984127;MQB=0.96464;MQSB=0.931547;BQB=0.359155;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=6,9,5,4;MQ=58 GT:PL:DP:DV 1/1:42,3,0:1:1 0/1:16,0,104:5:1 0/1:115,0,224:18:7 -17 2041 . G A 447 . DP=31;VDB=0.816435;SGB=-4.18892;RPB=0.88473;MQB=0.972375;MQSB=0.968257;BQB=0.311275;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=6,5,12,7;MQ=58 GT:PL:DP:DV 1/1:223,21,0:7:7 0/1:32,0,24:2:1 0/1:229,0,212:21:11 -17 2220 . G A 303 . DP=21;VDB=0.532753;SGB=-3.51597;RPB=0.964198;MQB=0.898397;MQSB=0.875769;BQB=0.0354359;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=6,2,1,11;MQ=58 GT:PL:DP:DV 1/1:131,12,0:4:4 0/1:69,0,46:4:2 0/1:139,0,130:12:6 -17 2564 . A G 233 . DP=15;VDB=0.690812;SGB=-3.20711;RPB=0.197899;MQB=1;MQSB=1;BQB=0.965069;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=1,4,4,5;MQ=60 GT:PL:DP:DV 1/1:124,12,0:4:4 0/1:57,0,56:4:2 0/1:88,0,78:6:3 -17 3104 . C T 24.2837 . DP=25;VDB=0.8;SGB=0.346553;RPB=0.717391;MQB=0.956522;MQSB=0.962269;BQB=0.978261;MQ0F=0;ICB=0.128205;HOB=0.0555556;AC=1;AN=6;DP4=8,15,2,0;MQ=58 GT:PL:DP:DV 0/1:59,0,93:5:2 0/0:0,12,144:4:0 0/0:0,48,255:16:0 -17 3587 . G A 358 . DP=29;VDB=0.902044;SGB=-3.91326;RPB=0.800999;MQB=1;MQSB=1;BQB=0.156944;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=4,7,10,6;MQ=60 GT:PL:DP:DV 1/1:212,24,0:8:8 0/1:22,0,118:5:1 0/1:161,0,184:14:7 -17 3936 . A G 469 . DP=37;VDB=0.0574114;SGB=-4.60123;RPB=0.741697;MQB=0.812605;MQSB=0.143788;BQB=0.883831;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=5,6,6,17;MQ=56 GT:PL:DP:DV 1/1:196,24,0:8:8 0/1:77,0,58:6:4 0/1:233,0,206:20:11 +17 302 . 
T TA 487.586 . INDEL;IDV=7;IMF=1;DP=25;VDB=0.27613;SGB=-4.22417;MQSB=0.0443614;MQ0F=0;AC=4;AN=6;DP4=2,4,8,11;MQ=49 GT:PL:DP:DV 1/1:201,21,0:7:7 0/1:157,0,9:7:6 0/1:167,0,96:11:6 +17 828 . T C 409.29 . DP=25;VDB=0.842082;SGB=-4.20907;RPB=0.950652;MQB=1;MQSB=1;BQB=0.929717;MQ0F=0;AC=4;AN=6;DP4=2,4,8,11;MQ=60 GT:PL:DP:DV 1/1:120,12,0:4:4 0/1:116,0,91:9:5 0/1:211,0,35:12:10 +17 834 . G A 363.72 . DP=25;VDB=0.788006;SGB=-4.01214;RPB=0.999233;MQB=1;MQSB=1;BQB=0.821668;MQ0F=0;AC=4;AN=6;DP4=2,3,7,10;MQ=60 GT:PL:DP:DV 1/1:89,9,0:3:3 0/1:128,0,59:8:5 0/1:185,0,46:11:9 +17 1665 . T C 3.10665 . DP=20;VDB=0.1;SGB=0.346553;RPB=0.222222;MQB=0.611111;MQSB=0.988166;BQB=0.944444;MQ0F=0;AC=1;AN=6;DP4=7,11,1,1;MQ=55 GT:PL:DP:DV 0/1:35,0,51:4:2 0/0:0,27,222:9:0 0/0:0,21,185:7:0 +17 1869 . A T 138.104 . DP=24;VDB=0.928022;SGB=-11.9537;RPB=0.984127;MQB=0.96464;MQSB=0.931547;BQB=0.359155;MQ0F=0;AC=4;AN=6;DP4=6,9,5,4;MQ=58 GT:PL:DP:DV 1/1:42,3,0:1:1 0/1:16,0,104:5:1 0/1:115,0,224:18:7 +17 2041 . G A 447.444 . DP=31;VDB=0.816435;SGB=-4.18892;RPB=0.88473;MQB=0.972375;MQSB=0.968257;BQB=0.311275;MQ0F=0;AC=4;AN=6;DP4=6,5,12,7;MQ=58 GT:PL:DP:DV 1/1:223,21,0:7:7 0/1:32,0,24:2:1 0/1:229,0,212:21:11 +17 2220 . G A 302.575 . DP=21;VDB=0.532753;SGB=-3.51597;RPB=0.964198;MQB=0.898397;MQSB=0.875769;BQB=0.0354359;MQ0F=0;AC=4;AN=6;DP4=6,2,1,11;MQ=58 GT:PL:DP:DV 1/1:131,12,0:4:4 0/1:69,0,46:4:2 0/1:139,0,130:12:6 +17 2564 . A G 232.697 . DP=15;VDB=0.690812;SGB=-3.20711;RPB=0.197899;MQB=1;MQSB=1;BQB=0.965069;MQ0F=0;AC=4;AN=6;DP4=1,4,4,5;MQ=60 GT:PL:DP:DV 1/1:124,12,0:4:4 0/1:57,0,56:4:2 0/1:88,0,78:6:3 +17 3104 . C T 24.2837 . DP=25;VDB=0.8;SGB=0.346553;RPB=0.717391;MQB=0.956522;MQSB=0.962269;BQB=0.978261;MQ0F=0;AC=1;AN=6;DP4=8,15,2,0;MQ=58 GT:PL:DP:DV 0/1:59,0,93:5:2 0/0:0,12,144:4:0 0/0:0,48,255:16:0 +17 3587 . G A 357.834 . DP=29;VDB=0.902044;SGB=-3.91326;RPB=0.800999;MQB=1;MQSB=1;BQB=0.156944;MQ0F=0;AC=4;AN=6;DP4=4,7,10,6;MQ=60 GT:PL:DP:DV 1/1:212,24,0:8:8 0/1:22,0,118:5:1 0/1:161,0,184:14:7 +17 3936 . 
A G 469.356 . DP=37;VDB=0.0574114;SGB=-4.60123;RPB=0.741697;MQB=0.812605;MQSB=0.143788;BQB=0.883831;MQ0F=0;AC=4;AN=6;DP4=5,6,6,17;MQ=56 GT:PL:DP:DV 1/1:196,24,0:8:8 0/1:77,0,58:6:4 0/1:233,0,206:20:11 diff --git a/test/mpileup.5.out b/test/mpileup.5.out index 7606e647b..489408504 100644 --- a/test/mpileup.5.out +++ b/test/mpileup.5.out @@ -20,21 +20,19 @@ ##FORMAT= ##FORMAT= ##FORMAT= -##INFO= -##INFO= ##INFO= ##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00102 HG00101 -17 302 . T TA 327 . INDEL;IDV=7;IMF=1;DP=25;VDB=0.27613;SGB=-4.22417;MQSB=0.0443614;MQ0F=0;ICB=0.3;HOB=0.125;AC=3;AN=4;DP4=2,4,8,11;MQ=49 GT:PL:DP:DV 1/1:201,21,0:7:7 0/1:157,0,9:7:6 -17 828 . T C 202 . DP=25;VDB=0.842082;SGB=-4.20907;RPB=0.950652;MQB=1;MQSB=1;BQB=0.929717;MQ0F=0;ICB=0.3;HOB=0.125;AC=3;AN=4;DP4=2,4,8,11;MQ=60 GT:PL:DP:DV 1/1:120,12,0:4:4 0/1:116,0,91:9:5 -17 834 . G A 183 . DP=25;VDB=0.788006;SGB=-4.01214;RPB=0.999233;MQB=1;MQSB=1;BQB=0.821668;MQ0F=0;ICB=0.3;HOB=0.125;AC=3;AN=4;DP4=2,3,7,10;MQ=60 GT:PL:DP:DV 1/1:89,9,0:3:3 0/1:128,0,59:8:5 -17 1665 . T C 3.26671 . DP=20;VDB=0.1;SGB=0.346553;RPB=0.222222;MQB=0.611111;MQSB=0.988166;BQB=0.944444;MQ0F=0;ICB=0.3;HOB=0.125;AC=1;AN=4;DP4=7,11,1,1;MQ=55 GT:PL:DP:DV 0/1:35,0,51:4:2 0/0:0,27,222:9:0 -17 1869 . A T 25.1744 . DP=24;VDB=0.928022;SGB=-11.9537;RPB=0.984127;MQB=0.96464;MQSB=0.931547;BQB=0.359155;MQ0F=0;ICB=0.3;HOB=0.125;AC=3;AN=4;DP4=6,9,5,4;MQ=58 GT:PL:DP:DV 1/1:42,3,0:1:1 0/1:16,0,104:5:1 -17 2041 . G A 221 . DP=31;VDB=0.816435;SGB=-4.18892;RPB=0.88473;MQB=0.972375;MQSB=0.968257;BQB=0.311275;MQ0F=0;ICB=0.3;HOB=0.125;AC=3;AN=4;DP4=6,5,12,7;MQ=58 GT:PL:DP:DV 1/1:223,21,0:7:7 0/1:32,0,24:2:1 -17 2220 . G A 166 . DP=21;VDB=0.532753;SGB=-3.51597;RPB=0.964198;MQB=0.898397;MQSB=0.875769;BQB=0.0354359;MQ0F=0;ICB=0.3;HOB=0.125;AC=3;AN=4;DP4=6,2,1,11;MQ=58 GT:PL:DP:DV 1/1:131,12,0:4:4 0/1:69,0,46:4:2 -17 2564 . A G 147 . 
DP=15;VDB=0.690812;SGB=-3.20711;RPB=0.197899;MQB=1;MQSB=1;BQB=0.965069;MQ0F=0;ICB=0.3;HOB=0.125;AC=3;AN=4;DP4=1,4,4,5;MQ=60 GT:PL:DP:DV 1/1:124,12,0:4:4 0/1:57,0,56:4:2 -17 3104 . C T 24.6136 . DP=25;VDB=0.8;SGB=0.346553;RPB=0.717391;MQB=0.956522;MQSB=0.962269;BQB=0.978261;MQ0F=0;ICB=0.3;HOB=0.125;AC=1;AN=4;DP4=8,15,2,0;MQ=58 GT:PL:DP:DV 0/1:59,0,93:5:2 0/0:0,12,144:4:0 -17 3587 . G A 199 . DP=29;VDB=0.902044;SGB=-3.91326;RPB=0.800999;MQB=1;MQSB=1;BQB=0.156944;MQ0F=0;ICB=0.3;HOB=0.125;AC=3;AN=4;DP4=4,7,10,6;MQ=60 GT:PL:DP:DV 1/1:212,24,0:8:8 0/1:22,0,118:5:1 -17 3936 . A G 239 . DP=37;VDB=0.0574114;SGB=-4.60123;RPB=0.741697;MQB=0.812605;MQSB=0.143788;BQB=0.883831;MQ0F=0;ICB=0.3;HOB=0.125;AC=3;AN=4;DP4=5,6,6,17;MQ=56 GT:PL:DP:DV 1/1:196,24,0:8:8 0/1:77,0,58:6:4 +17 302 . T TA 326.859 . INDEL;IDV=7;IMF=1;DP=25;VDB=0.27613;SGB=-4.22417;MQSB=0.0443614;MQ0F=0;AC=3;AN=4;DP4=2,4,8,11;MQ=49 GT:PL:DP:DV 1/1:201,21,0:7:7 0/1:157,0,9:7:6 +17 828 . T C 202.292 . DP=25;VDB=0.842082;SGB=-4.20907;RPB=0.950652;MQB=1;MQSB=1;BQB=0.929717;MQ0F=0;AC=3;AN=4;DP4=2,4,8,11;MQ=60 GT:PL:DP:DV 1/1:120,12,0:4:4 0/1:116,0,91:9:5 +17 834 . G A 183.191 . DP=25;VDB=0.788006;SGB=-4.01214;RPB=0.999233;MQB=1;MQSB=1;BQB=0.821668;MQ0F=0;AC=3;AN=4;DP4=2,3,7,10;MQ=60 GT:PL:DP:DV 1/1:89,9,0:3:3 0/1:128,0,59:8:5 +17 1665 . T C 3.26671 . DP=20;VDB=0.1;SGB=0.346553;RPB=0.222222;MQB=0.611111;MQSB=0.988166;BQB=0.944444;MQ0F=0;AC=1;AN=4;DP4=7,11,1,1;MQ=55 GT:PL:DP:DV 0/1:35,0,51:4:2 0/0:0,27,222:9:0 +17 1869 . A T 25.1744 . DP=24;VDB=0.928022;SGB=-11.9537;RPB=0.984127;MQB=0.96464;MQSB=0.931547;BQB=0.359155;MQ0F=0;AC=3;AN=4;DP4=6,9,5,4;MQ=58 GT:PL:DP:DV 1/1:42,3,0:1:1 0/1:16,0,104:5:1 +17 2041 . G A 221.292 . DP=31;VDB=0.816435;SGB=-4.18892;RPB=0.88473;MQB=0.972375;MQSB=0.968257;BQB=0.311275;MQ0F=0;AC=3;AN=4;DP4=6,5,12,7;MQ=58 GT:PL:DP:DV 1/1:223,21,0:7:7 0/1:32,0,24:2:1 +17 2220 . G A 166.425 . 
DP=21;VDB=0.532753;SGB=-3.51597;RPB=0.964198;MQB=0.898397;MQSB=0.875769;BQB=0.0354359;MQ0F=0;AC=3;AN=4;DP4=6,2,1,11;MQ=58 GT:PL:DP:DV 1/1:131,12,0:4:4 0/1:69,0,46:4:2 +17 2564 . A G 147.291 . DP=15;VDB=0.690812;SGB=-3.20711;RPB=0.197899;MQB=1;MQSB=1;BQB=0.965069;MQ0F=0;AC=3;AN=4;DP4=1,4,4,5;MQ=60 GT:PL:DP:DV 1/1:124,12,0:4:4 0/1:57,0,56:4:2 +17 3104 . C T 24.6136 . DP=25;VDB=0.8;SGB=0.346553;RPB=0.717391;MQB=0.956522;MQSB=0.962269;BQB=0.978261;MQ0F=0;AC=1;AN=4;DP4=8,15,2,0;MQ=58 GT:PL:DP:DV 0/1:59,0,93:5:2 0/0:0,12,144:4:0 +17 3587 . G A 198.948 . DP=29;VDB=0.902044;SGB=-3.91326;RPB=0.800999;MQB=1;MQSB=1;BQB=0.156944;MQ0F=0;AC=3;AN=4;DP4=4,7,10,6;MQ=60 GT:PL:DP:DV 1/1:212,24,0:8:8 0/1:22,0,118:5:1 +17 3936 . A G 239.224 . DP=37;VDB=0.0574114;SGB=-4.60123;RPB=0.741697;MQB=0.812605;MQSB=0.143788;BQB=0.883831;MQ0F=0;AC=3;AN=4;DP4=5,6,6,17;MQ=56 GT:PL:DP:DV 1/1:196,24,0:8:8 0/1:77,0,58:6:4 diff --git a/test/mpileup.X.2.out b/test/mpileup.X.2.out index ab82d4629..a727f707f 100644 --- a/test/mpileup.X.2.out +++ b/test/mpileup.X.2.out @@ -20,21 +20,19 @@ ##FORMAT= ##FORMAT= ##FORMAT= -##INFO= -##INFO= ##INFO= ##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00100 HG00101 HG00102 -X 302 . T TA 482 . INDEL;IDV=7;IMF=1;DP=25;VDB=0.27613;SGB=-4.22417;MQSB=0.0443614;MQ0F=0;ICB=0.235294;HOB=0.18;AC=4;AN=5;DP4=2,4,8,11;MQ=49 GT:PL:DP:DV 0/1:167,0,96:11:6 1:157,9:7:6 1/1:201,21,0:7:7 -X 828 . T C 322 . DP=25;VDB=0.842082;SGB=-4.20907;RPB=0.950652;MQB=1;MQSB=1;BQB=0.929717;MQ0F=0;ICB=0.235294;HOB=0.18;AC=4;AN=5;DP4=2,4,8,11;MQ=60 GT:PL:DP:DV 0/1:211,0,35:12:10 1:116,91:9:5 1/1:120,12,0:4:4 -X 834 . G A 309 . DP=25;VDB=0.788006;SGB=-4.01214;RPB=0.999233;MQB=1;MQSB=1;BQB=0.821668;MQ0F=0;ICB=0.235294;HOB=0.18;AC=4;AN=5;DP4=2,3,7,10;MQ=60 GT:PL:DP:DV 0/1:185,0,46:11:9 1:128,59:8:5 1/1:89,9,0:3:3 -X 1665 . T C 3.44176 . 
DP=20;VDB=0.1;SGB=0.346553;RPB=0.222222;MQB=0.611111;MQSB=0.988166;BQB=0.944444;MQ0F=0;ICB=0.235294;HOB=0.18;AC=1;AN=5;DP4=7,11,1,1;MQ=55 GT:PL:DP:DV 0/0:0,21,185:7:0 0:0,222:9:0 0/1:35,0,51:4:2 -X 1869 . A T 122 . DP=24;VDB=0.928022;SGB=-11.9537;RPB=0.984127;MQB=0.96464;MQSB=0.931547;BQB=0.359155;MQ0F=0;ICB=0.461538;HOB=0.02;AC=3;AN=5;DP4=6,9,5,4;MQ=58 GT:PL:DP:DV 0/1:115,0,224:18:7 0:16,104:5:1 1/1:42,3,0:1:1 -X 2041 . G A 426 . DP=31;VDB=0.816435;SGB=-4.18892;RPB=0.88473;MQB=0.972375;MQSB=0.968257;BQB=0.311275;MQ0F=0;ICB=0.235294;HOB=0.18;AC=4;AN=5;DP4=6,5,12,7;MQ=58 GT:PL:DP:DV 0/1:229,0,212:21:11 1:32,24:2:1 1/1:223,21,0:7:7 -X 2220 . G A 259 . DP=21;VDB=0.532753;SGB=-3.51597;RPB=0.964198;MQB=0.898397;MQSB=0.875769;BQB=0.0354359;MQ0F=0;ICB=0.235294;HOB=0.18;AC=4;AN=5;DP4=6,2,1,11;MQ=58 GT:PL:DP:DV 0/1:139,0,130:12:6 1:69,46:4:2 1/1:131,12,0:4:4 -X 2564 . A G 180 . DP=15;VDB=0.690812;SGB=-3.20711;RPB=0.197899;MQB=1;MQSB=1;BQB=0.965069;MQ0F=0;ICB=0.235294;HOB=0.18;AC=4;AN=5;DP4=1,4,4,5;MQ=60 GT:PL:DP:DV 0/1:88,0,78:6:3 1:57,56:4:2 1/1:124,12,0:4:4 -X 3104 . C T 24.8375 . DP=25;VDB=0.8;SGB=0.346553;RPB=0.717391;MQB=0.956522;MQSB=0.962269;BQB=0.978261;MQ0F=0;ICB=0.235294;HOB=0.18;AC=1;AN=5;DP4=8,15,2,0;MQ=58 GT:PL:DP:DV 0/0:0,48,255:16:0 0:0,144:4:0 0/1:59,0,93:5:2 -X 3587 . G A 335 . DP=29;VDB=0.902044;SGB=-3.91326;RPB=0.800999;MQB=1;MQSB=1;BQB=0.156944;MQ0F=0;ICB=0.461538;HOB=0.02;AC=3;AN=5;DP4=4,7,10,6;MQ=60 GT:PL:DP:DV 0/1:161,0,184:14:7 0:22,118:5:1 1/1:212,24,0:8:8 -X 3936 . A G 414 . DP=37;VDB=0.0574114;SGB=-4.60123;RPB=0.741697;MQB=0.812605;MQSB=0.143788;BQB=0.883831;MQ0F=0;ICB=0.235294;HOB=0.18;AC=4;AN=5;DP4=5,6,6,17;MQ=56 GT:PL:DP:DV 0/1:233,0,206:20:11 1:77,58:6:4 1/1:196,24,0:8:8 +X 302 . T TA 482.1 . INDEL;IDV=7;IMF=1;DP=25;VDB=0.27613;SGB=-4.22417;MQSB=0.0443614;MQ0F=0;AC=4;AN=5;DP4=2,4,8,11;MQ=49 GT:PL:DP:DV 0/1:167,0,96:11:6 1:157,9:7:6 1/1:201,21,0:7:7 +X 828 . T C 322.296 . 
DP=25;VDB=0.842082;SGB=-4.20907;RPB=0.950652;MQB=1;MQSB=1;BQB=0.929717;MQ0F=0;AC=4;AN=5;DP4=2,4,8,11;MQ=60 GT:PL:DP:DV 0/1:211,0,35:12:10 1:116,91:9:5 1/1:120,12,0:4:4 +X 834 . G A 309.32 . DP=25;VDB=0.788006;SGB=-4.01214;RPB=0.999233;MQB=1;MQSB=1;BQB=0.821668;MQ0F=0;AC=4;AN=5;DP4=2,3,7,10;MQ=60 GT:PL:DP:DV 0/1:185,0,46:11:9 1:128,59:8:5 1/1:89,9,0:3:3 +X 1665 . T C 3.44176 . DP=20;VDB=0.1;SGB=0.346553;RPB=0.222222;MQB=0.611111;MQSB=0.988166;BQB=0.944444;MQ0F=0;AC=1;AN=5;DP4=7,11,1,1;MQ=55 GT:PL:DP:DV 0/0:0,21,185:7:0 0:0,222:9:0 0/1:35,0,51:4:2 +X 1869 . A T 121.973 . DP=24;VDB=0.928022;SGB=-11.9537;RPB=0.984127;MQB=0.96464;MQSB=0.931547;BQB=0.359155;MQ0F=0;AC=3;AN=5;DP4=6,9,5,4;MQ=58 GT:PL:DP:DV 0/1:115,0,224:18:7 0:16,104:5:1 1/1:42,3,0:1:1 +X 2041 . G A 425.853 . DP=31;VDB=0.816435;SGB=-4.18892;RPB=0.88473;MQB=0.972375;MQSB=0.968257;BQB=0.311275;MQ0F=0;AC=4;AN=5;DP4=6,5,12,7;MQ=58 GT:PL:DP:DV 0/1:229,0,212:21:11 1:32,24:2:1 1/1:223,21,0:7:7 +X 2220 . G A 258.867 . DP=21;VDB=0.532753;SGB=-3.51597;RPB=0.964198;MQB=0.898397;MQSB=0.875769;BQB=0.0354359;MQ0F=0;AC=4;AN=5;DP4=6,2,1,11;MQ=58 GT:PL:DP:DV 0/1:139,0,130:12:6 1:69,46:4:2 1/1:131,12,0:4:4 +X 2564 . A G 179.94 . DP=15;VDB=0.690812;SGB=-3.20711;RPB=0.197899;MQB=1;MQSB=1;BQB=0.965069;MQ0F=0;AC=4;AN=5;DP4=1,4,4,5;MQ=60 GT:PL:DP:DV 0/1:88,0,78:6:3 1:57,56:4:2 1/1:124,12,0:4:4 +X 3104 . C T 24.8375 . DP=25;VDB=0.8;SGB=0.346553;RPB=0.717391;MQB=0.956522;MQSB=0.962269;BQB=0.978261;MQ0F=0;AC=1;AN=5;DP4=8,15,2,0;MQ=58 GT:PL:DP:DV 0/0:0,48,255:16:0 0:0,144:4:0 0/1:59,0,93:5:2 +X 3587 . G A 335.348 . DP=29;VDB=0.902044;SGB=-3.91326;RPB=0.800999;MQB=1;MQSB=1;BQB=0.156944;MQ0F=0;AC=3;AN=5;DP4=4,7,10,6;MQ=60 GT:PL:DP:DV 0/1:161,0,184:14:7 0:22,118:5:1 1/1:212,24,0:8:8 +X 3936 . A G 413.695 . 
DP=37;VDB=0.0574114;SGB=-4.60123;RPB=0.741697;MQB=0.812605;MQSB=0.143788;BQB=0.883831;MQ0F=0;AC=4;AN=5;DP4=5,6,6,17;MQ=56 GT:PL:DP:DV 0/1:233,0,206:20:11 1:77,58:6:4 1/1:196,24,0:8:8 diff --git a/test/mpileup.X.out b/test/mpileup.X.out index 0e63b8efa..abce4a154 100644 --- a/test/mpileup.X.out +++ b/test/mpileup.X.out @@ -20,21 +20,19 @@ ##FORMAT= ##FORMAT= ##FORMAT= -##INFO= -##INFO= ##INFO= ##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00100 HG00101 HG00102 -X 302 . T TA 482 . INDEL;IDV=7;IMF=1;DP=25;VDB=0.27613;SGB=-4.22417;MQSB=0.0443614;MQ0F=0;ICB=0.235294;HOB=0.18;AC=4;AN=5;DP4=2,4,8,11;MQ=49 GT:PL:DP:DV 0/1:167,0,96:11:6 1:157,9:7:6 1/1:201,21,0:7:7 -X 828 . T C 322 . DP=25;VDB=0.842082;SGB=-4.20907;RPB=0.950652;MQB=1;MQSB=1;BQB=0.929717;MQ0F=0;ICB=0.235294;HOB=0.18;AC=4;AN=5;DP4=2,4,8,11;MQ=60 GT:PL:DP:DV 0/1:211,0,35:12:10 1:116,91:9:5 1/1:120,12,0:4:4 -X 834 . G A 309 . DP=25;VDB=0.788006;SGB=-4.01214;RPB=0.999233;MQB=1;MQSB=1;BQB=0.821668;MQ0F=0;ICB=0.235294;HOB=0.18;AC=4;AN=5;DP4=2,3,7,10;MQ=60 GT:PL:DP:DV 0/1:185,0,46:11:9 1:128,59:8:5 1/1:89,9,0:3:3 -X 1665 . T C 3.10665 . DP=20;VDB=0.1;SGB=0.346553;RPB=0.222222;MQB=0.611111;MQSB=0.988166;BQB=0.944444;MQ0F=0;ICB=0.128205;HOB=0.0555556;AC=1;AN=6;DP4=7,11,1,1;MQ=55 GT:PL:DP:DV 0/0:0,21,185:7:0 0/0:0,27,222:9:0 0/1:35,0,51:4:2 -X 1869 . A T 138 . DP=24;VDB=0.928022;SGB=-11.9537;RPB=0.984127;MQB=0.96464;MQSB=0.931547;BQB=0.359155;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=6,9,5,4;MQ=58 GT:PL:DP:DV 0/1:115,0,224:18:7 0/1:16,0,104:5:1 1/1:42,3,0:1:1 -X 2041 . G A 447 . DP=31;VDB=0.816435;SGB=-4.18892;RPB=0.88473;MQB=0.972375;MQSB=0.968257;BQB=0.311275;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=6,5,12,7;MQ=58 GT:PL:DP:DV 0/1:229,0,212:21:11 0/1:32,0,24:2:1 1/1:223,21,0:7:7 -X 2220 . G A 303 . 
DP=21;VDB=0.532753;SGB=-3.51597;RPB=0.964198;MQB=0.898397;MQSB=0.875769;BQB=0.0354359;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=6,2,1,11;MQ=58 GT:PL:DP:DV 0/1:139,0,130:12:6 0/1:69,0,46:4:2 1/1:131,12,0:4:4 -X 2564 . A G 233 . DP=15;VDB=0.690812;SGB=-3.20711;RPB=0.197899;MQB=1;MQSB=1;BQB=0.965069;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=1,4,4,5;MQ=60 GT:PL:DP:DV 0/1:88,0,78:6:3 0/1:57,0,56:4:2 1/1:124,12,0:4:4 -X 3104 . C T 24.8375 . DP=25;VDB=0.8;SGB=0.346553;RPB=0.717391;MQB=0.956522;MQSB=0.962269;BQB=0.978261;MQ0F=0;ICB=0.235294;HOB=0.18;AC=1;AN=5;DP4=8,15,2,0;MQ=58 GT:PL:DP:DV 0/0:0,48,255:16:0 0:0,144:4:0 0/1:59,0,93:5:2 -X 3587 . G A 335 . DP=29;VDB=0.902044;SGB=-3.91326;RPB=0.800999;MQB=1;MQSB=1;BQB=0.156944;MQ0F=0;ICB=0.461538;HOB=0.02;AC=3;AN=5;DP4=4,7,10,6;MQ=60 GT:PL:DP:DV 0/1:161,0,184:14:7 0:22,118:5:1 1/1:212,24,0:8:8 -X 3936 . A G 414 . DP=37;VDB=0.0574114;SGB=-4.60123;RPB=0.741697;MQB=0.812605;MQSB=0.143788;BQB=0.883831;MQ0F=0;ICB=0.235294;HOB=0.18;AC=4;AN=5;DP4=5,6,6,17;MQ=56 GT:PL:DP:DV 0/1:233,0,206:20:11 1:77,58:6:4 1/1:196,24,0:8:8 +X 302 . T TA 482.1 . INDEL;IDV=7;IMF=1;DP=25;VDB=0.27613;SGB=-4.22417;MQSB=0.0443614;MQ0F=0;AC=4;AN=5;DP4=2,4,8,11;MQ=49 GT:PL:DP:DV 0/1:167,0,96:11:6 1:157,9:7:6 1/1:201,21,0:7:7 +X 828 . T C 322.296 . DP=25;VDB=0.842082;SGB=-4.20907;RPB=0.950652;MQB=1;MQSB=1;BQB=0.929717;MQ0F=0;AC=4;AN=5;DP4=2,4,8,11;MQ=60 GT:PL:DP:DV 0/1:211,0,35:12:10 1:116,91:9:5 1/1:120,12,0:4:4 +X 834 . G A 309.32 . DP=25;VDB=0.788006;SGB=-4.01214;RPB=0.999233;MQB=1;MQSB=1;BQB=0.821668;MQ0F=0;AC=4;AN=5;DP4=2,3,7,10;MQ=60 GT:PL:DP:DV 0/1:185,0,46:11:9 1:128,59:8:5 1/1:89,9,0:3:3 +X 1665 . T C 3.10665 . DP=20;VDB=0.1;SGB=0.346553;RPB=0.222222;MQB=0.611111;MQSB=0.988166;BQB=0.944444;MQ0F=0;AC=1;AN=6;DP4=7,11,1,1;MQ=55 GT:PL:DP:DV 0/0:0,21,185:7:0 0/0:0,27,222:9:0 0/1:35,0,51:4:2 +X 1869 . A T 138.104 . 
DP=24;VDB=0.928022;SGB=-11.9537;RPB=0.984127;MQB=0.96464;MQSB=0.931547;BQB=0.359155;MQ0F=0;AC=4;AN=6;DP4=6,9,5,4;MQ=58 GT:PL:DP:DV 0/1:115,0,224:18:7 0/1:16,0,104:5:1 1/1:42,3,0:1:1 +X 2041 . G A 447.444 . DP=31;VDB=0.816435;SGB=-4.18892;RPB=0.88473;MQB=0.972375;MQSB=0.968257;BQB=0.311275;MQ0F=0;AC=4;AN=6;DP4=6,5,12,7;MQ=58 GT:PL:DP:DV 0/1:229,0,212:21:11 0/1:32,0,24:2:1 1/1:223,21,0:7:7 +X 2220 . G A 302.575 . DP=21;VDB=0.532753;SGB=-3.51597;RPB=0.964198;MQB=0.898397;MQSB=0.875769;BQB=0.0354359;MQ0F=0;AC=4;AN=6;DP4=6,2,1,11;MQ=58 GT:PL:DP:DV 0/1:139,0,130:12:6 0/1:69,0,46:4:2 1/1:131,12,0:4:4 +X 2564 . A G 232.697 . DP=15;VDB=0.690812;SGB=-3.20711;RPB=0.197899;MQB=1;MQSB=1;BQB=0.965069;MQ0F=0;AC=4;AN=6;DP4=1,4,4,5;MQ=60 GT:PL:DP:DV 0/1:88,0,78:6:3 0/1:57,0,56:4:2 1/1:124,12,0:4:4 +X 3104 . C T 24.8375 . DP=25;VDB=0.8;SGB=0.346553;RPB=0.717391;MQB=0.956522;MQSB=0.962269;BQB=0.978261;MQ0F=0;AC=1;AN=5;DP4=8,15,2,0;MQ=58 GT:PL:DP:DV 0/0:0,48,255:16:0 0:0,144:4:0 0/1:59,0,93:5:2 +X 3587 . G A 335.348 . DP=29;VDB=0.902044;SGB=-3.91326;RPB=0.800999;MQB=1;MQSB=1;BQB=0.156944;MQ0F=0;AC=3;AN=5;DP4=4,7,10,6;MQ=60 GT:PL:DP:DV 0/1:161,0,184:14:7 0:22,118:5:1 1/1:212,24,0:8:8 +X 3936 . A G 413.695 . DP=37;VDB=0.0574114;SGB=-4.60123;RPB=0.741697;MQB=0.812605;MQSB=0.143788;BQB=0.883831;MQ0F=0;AC=4;AN=5;DP4=5,6,6,17;MQ=56 GT:PL:DP:DV 0/1:233,0,206:20:11 1:77,58:6:4 1/1:196,24,0:8:8 diff --git a/test/mpileup.cAls.2.out b/test/mpileup.cAls.2.out index 994bf4291..b27a92c8a 100644 --- a/test/mpileup.cAls.2.out +++ b/test/mpileup.cAls.2.out @@ -18,16 +18,14 @@ ##FORMAT= ##FORMAT= ##FORMAT= -##INFO= -##INFO= ##INFO= ##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample1 sample2 -chr1 212740 . A G 483 . DP=73;VDB=0.520868;SGB=-1.38232;MQSB=1;MQ0F=0;AC=4;AN=4;DP4=0,0,39,4;MQ=60 GT:PL:DP:AD 1/1:255,72,0:24:0,24 1/1:255,57,0:19:0,19 -chr1 320055 . A G 534 . 
DP=101;MQSB=1;MQ0F=0;AC=0;AN=4;DP4=52,9,0,0;MQ=60 GT:PL:DP:AD 0/0:0,87,255:29:29,0 0/0:0,96,255:32:32,0 -chr1 486173 . A T 106 . DP=13;VDB=0.074936;SGB=0.620439;RPB=0.810265;MQB=1.01283;MQSB=1;BQB=0.810265;MQ0F=0;ICB=0.3;HOB=0.125;AC=1;AN=4;DP4=3,1,3,0;MQ=60 GT:PL:DP:AD 0/0:0,9,151:3:3,0 0/1:140,0,48:4:1,3 -chr1 511277 . A G 483 . DP=50;VDB=0.0722735;SGB=-1.26186;MQSB=1;MQ0F=0;AC=4;AN=4;DP4=0,0,25,4;MQ=60 GT:PL:DP:AD 1/1:255,30,0:10:0,10 1/1:255,57,0:19:0,19 +chr1 212740 . A G 483.052 . DP=73;VDB=0.520868;SGB=-1.38232;MQSB=1;MQ0F=0;AC=4;AN=4;DP4=0,0,39,4;MQ=60 GT:PL:DP:AD 1/1:255,72,0:24:0,24 1/1:255,57,0:19:0,19 +chr1 320055 . A G 533.95 . DP=101;MQSB=1;MQ0F=0;AC=0;AN=4;DP4=52,9,0,0;MQ=60 GT:PL:DP:AD 0/0:0,87,255:29:29,0 0/0:0,96,255:32:32,0 +chr1 486173 . A T 106.286 . DP=13;VDB=0.074936;SGB=0.620439;RPB=0.810265;MQB=1.01283;MQSB=1;BQB=0.810265;MQ0F=0;AC=1;AN=4;DP4=3,1,3,0;MQ=60 GT:PL:DP:AD 0/0:0,9,151:3:3,0 0/1:140,0,48:4:1,3 +chr1 511277 . A G 483.052 . DP=50;VDB=0.0722735;SGB=-1.26186;MQSB=1;MQ0F=0;AC=4;AN=4;DP4=0,0,25,4;MQ=60 GT:PL:DP:AD 1/1:255,30,0:10:0,10 1/1:255,57,0:19:0,19 chr1 602567 . A G 7.04355 . DP=9;SGB=-0.516033;RPB=1;MQB=1;MQSB=1;BQB=1;MQ0F=0;AC=0;AN=4;DP4=3,1,1,0;MQ=60 GT:PL:DP:AD 0/0:0,3,60:1:1,0 0/0:29,0,140:4:3,1 -chr1 639707 . T A 483 . DP=50;VDB=0.563111;SGB=-1.37269;MQSB=1;MQ0F=0;AC=4;AN=4;DP4=0,0,23,8;MQ=60 GT:PL:DP:AD 1/1:255,42,0:14:0,14 1/1:255,51,0:17:0,17 +chr1 639707 . T A 483.052 . DP=50;VDB=0.563111;SGB=-1.37269;MQSB=1;MQ0F=0;AC=4;AN=4;DP4=0,0,23,8;MQ=60 GT:PL:DP:AD 1/1:255,42,0:14:0,14 1/1:255,51,0:17:0,17 diff --git a/test/mpileup.cAls.3.out b/test/mpileup.cAls.3.out index 3550a5b88..66d58b6b2 100644 --- a/test/mpileup.cAls.3.out +++ b/test/mpileup.cAls.3.out @@ -7,14 +7,12 @@ ##INFO= ##FORMAT= ##FORMAT= -##INFO= -##INFO= ##INFO= ##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample -20 69066 . C G 60 . DP=1;MQ0F=0;AC=0;AN=2;DP4=0,1,0,0;MQ=60 GT:PL 0/0:0,3,33 +20 69066 . C G 59.5765 . 
DP=1;MQ0F=0;AC=0;AN=2;DP4=0,1,0,0;MQ=60 GT:PL 0/0:0,3,33 20 69093 . G A . . . GT . 20 69094 . G A . . . GT . 20 69408 . C T . . . GT . diff --git a/test/mpileup.cAls.4.out b/test/mpileup.cAls.4.out index 8a5124a63..ef884a104 100644 --- a/test/mpileup.cAls.4.out +++ b/test/mpileup.cAls.4.out @@ -7,15 +7,13 @@ ##INFO= ##FORMAT= ##FORMAT= -##INFO= -##INFO= ##INFO= ##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample -20 69066 . C G 60 . DP=1;MQ0F=0;AC=0;AN=2;DP4=0,1,0,0;MQ=60 GT:PL 0/0:0,3,33 -20 69076 . A G 55 . DP=1;MQ0F=0;AC=0;AN=2;DP4=0,1,0,0;MQ=60 GT:PL 0/0:0,3,28 +20 69066 . C G 59.5765 . DP=1;MQ0F=0;AC=0;AN=2;DP4=0,1,0,0;MQ=60 GT:PL 0/0:0,3,33 +20 69076 . A G 54.5765 . DP=1;MQ0F=0;AC=0;AN=2;DP4=0,1,0,0;MQ=60 GT:PL 0/0:0,3,28 20 69093 . G A . . . GT . 20 69094 . G A . . . GT . 20 69408 . C T . . . GT . diff --git a/test/mpileup.cAls.5.out b/test/mpileup.cAls.5.out index 084486ef1..3eeb85c51 100644 --- a/test/mpileup.cAls.5.out +++ b/test/mpileup.cAls.5.out @@ -7,8 +7,6 @@ ##INFO= ##FORMAT= ##FORMAT= -##INFO= -##INFO= ##INFO= ##INFO= ##INFO= @@ -17,6 +15,6 @@ 20 68799 . T C . . . GT . 20 68800 . A G . . . GT . 20 68810 . G A . . . GT . -20 69066 . C G 60 . DP=1;MQ0F=0;AC=0;AN=2;DP4=0,1,0,0;MQ=60 GT:PL 0/0:0,3,33 +20 69066 . C G 59.5765 . DP=1;MQ0F=0;AC=0;AN=2;DP4=0,1,0,0;MQ=60 GT:PL 0/0:0,3,33 20 69094 . G A . . . GT . 20 69408 . C T . . . GT . diff --git a/test/mpileup.cAls.6.out b/test/mpileup.cAls.6.out index 857d0663b..1a507325b 100644 --- a/test/mpileup.cAls.6.out +++ b/test/mpileup.cAls.6.out @@ -11,18 +11,16 @@ ##INFO= ##FORMAT= ##FORMAT= -##INFO= -##INFO= ##INFO= ##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample -1 1368828 . GT G 62 . DP=1;MQ0F=0;AC=0;AN=2;DP4=1,0,0,0;MQ=60 GT:PL 0/0:0,3,35 +1 1368828 . GT G 61.5766 . DP=1;MQ0F=0;AC=0;AN=2;DP4=1,0,0,0;MQ=60 GT:PL 0/0:0,3,35 1 1368833 . T C 39.5768 . DP=1;MQ0F=0;AC=0;AN=2;DP4=0,1,0,0;MQ=60 GT:PL 0/0:0,3,13 1 1368833 . 
TAAAAAAAAAAAAAAAA TAAAAAAAAAAAAAA 24.1741 . DP=2;AC=0;AN=2;DP4=0,1,1,0;MQ=60 GT:PL 0/0:6,0,6 1 1368833 . T G . . . GT . -1 1368834 . A T 60 . DP=1;MQ0F=0;AC=0;AN=2;DP4=1,0,0,0;MQ=60 GT:PL 0/0:0,3,33 +1 1368834 . A T 59.5765 . DP=1;MQ0F=0;AC=0;AN=2;DP4=1,0,0,0;MQ=60 GT:PL 0/0:0,3,33 16 60288 . C A . . . GT . 17 355 . G A . . . GT . 20 58799 . T C . . . GT . diff --git a/test/mpileup.cAls.7.out b/test/mpileup.cAls.7.out index 594df92b5..7168edf56 100644 --- a/test/mpileup.cAls.7.out +++ b/test/mpileup.cAls.7.out @@ -37,25 +37,23 @@ ##INFO= ##FORMAT= ##FORMAT= -##INFO= -##INFO= ##INFO= ##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A 1 790667 . C T . . . GT . -1 116844268 . GA G 59 . DP=1;MQ0F=0;AC=0;AN=2;DP4=1,0,0,0;MQ=60 GT:PL 0/0:0,3,32 +1 116844268 . GA G 58.5765 . DP=1;MQ0F=0;AC=0;AN=2;DP4=1,0,0,0;MQ=60 GT:PL 0/0:0,3,32 1 116844279 . GAA G 39.5768 . DP=1;MQ0F=0;AC=0;AN=2;DP4=0,1,0,0;MQ=60 GT:PL 0/0:0,3,13 1 116844450 . T A . . . GT . 20 68799 . T C . . . GT . 20 68810 . G A . . . GT . -20 69066 . C G 60 . DP=1;MQ0F=0;AC=0;AN=2;DP4=0,1,0,0;MQ=60 GT:PL 0/0:0,3,33 +20 69066 . C G 59.5765 . DP=1;MQ0F=0;AC=0;AN=2;DP4=0,1,0,0;MQ=60 GT:PL 0/0:0,3,33 20 69094 . G A . . . GT . 20 69408 . C T . . . GT . 21 9411409 . T C . . . GT . 21 9411485 . C A . . . GT . 21 9411497 . A G . . . GT . -21 9412485 . C G 79 . DP=2;MQ0F=0;AC=0;AN=2;DP4=1,0,0,0;MQ=52 GT:PL 0/0:0,3,52 +21 9412485 . C G 78.5768 . DP=2;MQ0F=0;AC=0;AN=2;DP4=1,0,0,0;MQ=52 GT:PL 0/0:0,3,52 16 60288 . C A . . . GT . 17 355 . G A . . . GT . diff --git a/test/mpileup.cAls.out b/test/mpileup.cAls.out index 65979687e..b267c70f2 100644 --- a/test/mpileup.cAls.out +++ b/test/mpileup.cAls.out @@ -20,21 +20,19 @@ ##FORMAT= ##FORMAT= ##FORMAT= -##INFO= -##INFO= ##INFO= ##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00100 HG00101 HG00102 -17 1 . A G,T 52 . DP=11;MQ0F=0;AC=0,0;AN=0;DP4=11,0,0,0;MQ=29 GT:PL:DP:DV ./.:0,0,0,0,0,0:5:0 ./.:.:3:0 ./.:.:3:0 -17 2 . A T,G 52 . 
DP=11;MQ0F=0;AC=0,0;AN=0;DP4=11,0,0,0;MQ=29 GT:PL:DP:DV ./.:0,0,0,0,0,0:5:0 ./.:.:3:0 ./.:.:3:0 -17 3 . A C 26.0007 . DP=11;MQ0F=0;AC=0;AN=0;DP4=11,0,0,0;MQ=29 GT:PL:DP:DV ./.:0,0,0:5:0 ./.:.:3:0 ./.:.:3:0 +17 1 . A G,T . . DP=11;MQ0F=0;AC=0,0;AN=0;DP4=11,0,0,0;MQ=29 GT:PL:DP:DV ./.:0,0,0,0,0,0:5:0 ./.:.:3:0 ./.:.:3:0 +17 2 . A T,G . . DP=11;MQ0F=0;AC=0,0;AN=0;DP4=11,0,0,0;MQ=29 GT:PL:DP:DV ./.:0,0,0,0,0,0:5:0 ./.:.:3:0 ./.:.:3:0 +17 3 . A C . . DP=11;MQ0F=0;AC=0;AN=0;DP4=11,0,0,0;MQ=29 GT:PL:DP:DV ./.:0,0,0:5:0 ./.:.:3:0 ./.:.:3:0 17 4 . A G,T,C 21.815 . DP=11;MQ0F=0;AC=0,0,0;AN=2;DP4=11,0,0,0;MQ=29 GT:PL:DP:DV 0/0:1,2,3,7,8,10,11,12,14,15:5:0 ./.:.:3:0 ./.:.:3:0 -17 5 . A G,T 26.0007 . DP=11;MQ0F=0;AC=0,0;AN=0;DP4=11,0,0,0;MQ=29 GT:PL:DP:DV ./.:0,0,0,0,0,0:5:0 ./.:.:3:0 ./.:.:3:0 -17 6 . A T,G 26.0007 . DP=11;MQ0F=0;AC=0,0;AN=0;DP4=11,0,0,0;MQ=29 GT:PL:DP:DV ./.:0,0,0,0,0,0:5:0 ./.:.:3:0 ./.:.:3:0 +17 5 . A G,T . . DP=11;MQ0F=0;AC=0,0;AN=0;DP4=11,0,0,0;MQ=29 GT:PL:DP:DV ./.:0,0,0,0,0,0:5:0 ./.:.:3:0 ./.:.:3:0 +17 6 . A T,G . . DP=11;MQ0F=0;AC=0,0;AN=0;DP4=11,0,0,0;MQ=29 GT:PL:DP:DV ./.:0,0,0,0,0,0:5:0 ./.:.:3:0 ./.:.:3:0 17 7 . A T,G,C 21.5769 . DP=11;MQ0F=0;AC=0,0,0;AN=2;DP4=11,0,0,0;MQ=29 GT:PL:DP:DV 0/0:1,2,3,4,5,6,2,3,5,3:5:0 ./.:.:3:0 ./.:.:3:0 -17 828 . T C 409 . DP=25;VDB=0.842082;SGB=-4.20907;RPB=0.950652;MQB=1;MQSB=1;BQB=0.929717;MQ0F=0;ICB=0.8;HOB=0.222222;AC=4;AN=6;DP4=2,4,8,11;MQ=60 GT:PL:DP:DV 0/1:211,0,35:12:10 0/1:116,0,91:9:5 1/1:120,12,0:4:4 -17 1665 . T C 3.10665 . DP=20;VDB=0.1;SGB=0.346553;RPB=0.222222;MQB=0.611111;MQSB=0.988166;BQB=0.944444;MQ0F=0;ICB=0.128205;HOB=0.0555556;AC=1;AN=6;DP4=7,11,1,1;MQ=55 GT:PL:DP:DV 0/0:0,21,185:7:0 0/0:0,27,222:9:0 0/1:35,0,51:4:2 -17 2220 . G C 189 . DP=21;VDB=0.532753;SGB=-3.51597;RPB=0.964198;MQB=0.898397;MQSB=0.875769;BQB=0.0354359;MQ0F=0;AC=0;AN=6;DP4=6,2,1,11;MQ=58 GT:PL:DP:DV 0/0:139,157,255:12:6 0/0:69,75,119:4:2 0/0:131,131,131:4:4 -17 2564 . A AG 166 . 
DP=15;VDB=0.690812;SGB=-3.20711;RPB=0.197899;MQB=1;MQSB=1;BQB=0.965069;MQ0F=0;AC=0;AN=6;DP4=1,4,4,5;MQ=60 GT:PL:DP:DV 0/0:88,98,171:6:3 0/0:57,63,117:4:2 0/0:124,124,124:4:4 +17 828 . T C 409.29 . DP=25;VDB=0.842082;SGB=-4.20907;RPB=0.950652;MQB=1;MQSB=1;BQB=0.929717;MQ0F=0;AC=4;AN=6;DP4=2,4,8,11;MQ=60 GT:PL:DP:DV 0/1:211,0,35:12:10 0/1:116,0,91:9:5 1/1:120,12,0:4:4 +17 1665 . T C 3.10665 . DP=20;VDB=0.1;SGB=0.346553;RPB=0.222222;MQB=0.611111;MQSB=0.988166;BQB=0.944444;MQ0F=0;AC=1;AN=6;DP4=7,11,1,1;MQ=55 GT:PL:DP:DV 0/0:0,21,185:7:0 0/0:0,27,222:9:0 0/1:35,0,51:4:2 +17 2220 . G C 188.992 . DP=21;VDB=0.532753;SGB=-3.51597;RPB=0.964198;MQB=0.898397;MQSB=0.875769;BQB=0.0354359;MQ0F=0;AC=0;AN=6;DP4=6,2,1,11;MQ=58 GT:PL:DP:DV 0/0:139,157,255:12:6 0/0:69,75,119:4:2 0/0:131,131,131:4:4 +17 2564 . A AG 165.992 . DP=15;VDB=0.690812;SGB=-3.20711;RPB=0.197899;MQB=1;MQSB=1;BQB=0.965069;MQ0F=0;AC=0;AN=6;DP4=1,4,4,5;MQ=60 GT:PL:DP:DV 0/0:88,98,171:6:3 0/0:57,63,117:4:2 0/0:124,124,124:4:4 diff --git a/test/mpileup.cals.8.out b/test/mpileup.cals.8.out index 169c8f78a..e07af78b9 100644 --- a/test/mpileup.cals.8.out +++ b/test/mpileup.cals.8.out @@ -21,11 +21,9 @@ ##FORMAT= ##FORMAT= ##FORMAT= -##INFO= -##INFO= ##INFO= ##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 -6 263708 . CTT C,CT 280 . DP=211;MQSB=0.901445;MQ0F=0;AC=0,0;AN=2;DP4=117,27,0,0;MQ=57 GT:PL:DP:SP:ADF:ADR:AD 0/0:0,255,255,255,255,255:144:0:117,0,0:27,0,0:144,0,0 +6 263708 . CTT C,CT 279.818 . 
DP=211;MQSB=0.901445;MQ0F=0;AC=0,0;AN=2;DP4=117,27,0,0;MQ=57 GT:PL:DP:SP:ADF:ADR:AD 0/0:0,255,255,255,255,255:144:0:117,0,0:27,0,0:144,0,0 diff --git a/test/mpileup.cals.9.out b/test/mpileup.cals.9.out index 35871e794..da7d723a9 100644 --- a/test/mpileup.cals.9.out +++ b/test/mpileup.cals.9.out @@ -11,8 +11,6 @@ ##INFO= ##FORMAT= ##FORMAT= -##INFO= -##INFO= ##INFO= ##INFO= ##INFO= diff --git a/test/mpileup.hwe.1.out b/test/mpileup.hwe.1.out index 684a18e65..2bc7803d3 100644 --- a/test/mpileup.hwe.1.out +++ b/test/mpileup.hwe.1.out @@ -19,9 +19,7 @@ ##INFO= ##INFO= ##FORMAT= -##INFO= -##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA19213 NA19129 -15 201 . A G 202 . DP=80;VDB=0.895839;SGB=-4.94122;RPB=0.999937;MQB=0.979662;MQSB=0.974642;BQB=0.739382;MQ0F=0;AC=4;AN=4;DP4=19,17,21,15;MQ=59 GT:PL:AD 1/1:144,12,0:0,4 1/1:85,9,0:0,3 +15 201 . A G 202.049 . DP=80;VDB=0.895839;SGB=-4.94122;RPB=0.999937;MQB=0.979662;MQSB=0.974642;BQB=0.739382;MQ0F=0;AC=4;AN=4;DP4=19,17,21,15;MQ=59 GT:PL:AD 1/1:144,12,0:0,4 1/1:85,9,0:0,3 diff --git a/test/mpileup.hwe.1b.out b/test/mpileup.hwe.1b.out new file mode 100644 index 000000000..56032579d --- /dev/null +++ b/test/mpileup.hwe.1b.out @@ -0,0 +1,25 @@ +##fileformat=VCFv4.2 +##FILTER= +##reference=file://hwe.fa +##contig= +##ALT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##FORMAT= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA19213 NA19129 +15 201 . A G 117.048 . 
DP=80;VDB=0.895839;SGB=-4.94122;RPB=0.999937;MQB=0.979662;MQSB=0.974642;BQB=0.739382;MQ0F=0;AC=4;AN=4;DP4=19,17,21,15;MQ=59 GT:PL:AD 1/1:144,12,0:0,4 1/1:85,9,0:0,3 diff --git a/test/mpileup.hwe.2.out b/test/mpileup.hwe.2.out index d3cc90753..9b3d3e2ef 100644 --- a/test/mpileup.hwe.2.out +++ b/test/mpileup.hwe.2.out @@ -17,11 +17,9 @@ ##FORMAT= ##FORMAT= ##FORMAT= -##INFO= -##INFO= ##INFO= ##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA07000 NA07056 NA12046 NA12144 NA12156 NA12234 NA12249 NA12282 NA12283 NA12286 NA12340 NA12341 NA12342 NA12399 NA12414 NA12546 NA12717 NA12718 NA12748 NA12749 NA12750 NA12776 NA12828 NA12843 NA18502 NA18867 NA18908 NA18933 NA18934 NA19108 NA19117 NA19119 NA19129 NA19131 NA19138 NA19147 NA19149 NA19159 NA19185 NA19189 NA19197 NA19198 NA19206 NA19213 NA19214 NA19223 NA19235 NA19236 NA19238 NA19239 NA19248 NA19256 -15 201 . A G 999 . DP=80;VDB=0.895839;SGB=-4.94122;RPB=0.999937;MQB=0.979662;MQSB=0.974642;BQB=0.739382;MQ0F=0;ICB=0.0721108;HOB=0.158099;AC=69;AN=104;DP4=19,17,21,15;MQ=59 GT:PL:AD 0/1:0,3,36:1,0 0/0:0,6,93:2,0 0/1:0,3,33:1,0 0/1:0,3,32:1,0 0/1:0,3,36:1,0 0/1:0,3,32:1,0 0/1:0,3,40:1,0 0/0:0,6,79:2,0 0/1:0,3,42:1,0 0/0:0,6,64:2,0 0/0:0,6,67:2,0 0/0:0,6,66:2,0 0/0:0,9,96:3,0 0/1:0,3,32:1,0 0/1:0,3,36:1,0 0/1:0,3,31:1,0 0/1:0,3,44:1,0 0/1:0,3,44:1,0 0/0:0,6,61:2,0 0/1:0,3,60:1,0 0/0:0,6,60:2,0 0/0:0,6,57:2,0 0/1:0,3,31:1,0 0/0:0,6,68:2,0 0/1:0,3,60:1,0 1/1:39,3,0:0,1 1/1:31,3,0:0,1 1/1:37,3,0:0,1 1/1:60,3,0:0,1 1/1:38,3,0:0,1 1/1:36,3,0:0,1 1/1:37,3,0:0,1 1/1:85,9,0:0,3 1/1:39,3,0:0,1 1/1:31,3,0:0,1 1/1:63,6,0:0,2 1/1:29,3,0:0,1 1/1:32,3,0:0,1 1/1:43,3,0:0,1 1/1:27,3,0:0,1 1/1:43,3,0:0,1 1/1:34,3,0:0,1 1/1:46,3,0:0,1 1/1:144,12,0:0,4 1/1:59,3,0:0,1 1/1:16,3,0:0,1 1/1:37,3,0:0,1 1/1:57,6,0:0,2 1/1:91,6,0:0,2 1/1:21,3,0:0,1 1/1:52,6,0:0,2 1/1:26,3,0:0,1 +15 201 . A G 1051.64 . 
DP=80;VDB=0.895839;SGB=-4.94122;RPB=0.999937;MQB=0.979662;MQSB=0.974642;BQB=0.739382;MQ0F=0;AC=69;AN=104;DP4=19,17,21,15;MQ=59 GT:PL:AD 0/1:0,3,36:1,0 0/0:0,6,93:2,0 0/1:0,3,33:1,0 0/1:0,3,32:1,0 0/1:0,3,36:1,0 0/1:0,3,32:1,0 0/1:0,3,40:1,0 0/0:0,6,79:2,0 0/1:0,3,42:1,0 0/0:0,6,64:2,0 0/0:0,6,67:2,0 0/0:0,6,66:2,0 0/0:0,9,96:3,0 0/1:0,3,32:1,0 0/1:0,3,36:1,0 0/1:0,3,31:1,0 0/1:0,3,44:1,0 0/1:0,3,44:1,0 0/0:0,6,61:2,0 0/1:0,3,60:1,0 0/0:0,6,60:2,0 0/0:0,6,57:2,0 0/1:0,3,31:1,0 0/0:0,6,68:2,0 0/1:0,3,60:1,0 1/1:39,3,0:0,1 1/1:31,3,0:0,1 1/1:37,3,0:0,1 1/1:60,3,0:0,1 1/1:38,3,0:0,1 1/1:36,3,0:0,1 1/1:37,3,0:0,1 1/1:85,9,0:0,3 1/1:39,3,0:0,1 1/1:31,3,0:0,1 1/1:63,6,0:0,2 1/1:29,3,0:0,1 1/1:32,3,0:0,1 1/1:43,3,0:0,1 1/1:27,3,0:0,1 1/1:43,3,0:0,1 1/1:34,3,0:0,1 1/1:46,3,0:0,1 1/1:144,12,0:0,4 1/1:59,3,0:0,1 1/1:16,3,0:0,1 1/1:37,3,0:0,1 1/1:57,6,0:0,2 1/1:91,6,0:0,2 1/1:21,3,0:0,1 1/1:52,6,0:0,2 1/1:26,3,0:0,1 diff --git a/test/mpileup.hwe.3.out b/test/mpileup.hwe.3.out index 5645f8479..76166969e 100644 --- a/test/mpileup.hwe.3.out +++ b/test/mpileup.hwe.3.out @@ -17,12 +17,10 @@ ##FORMAT= ##FORMAT= ##FORMAT= -##INFO= -##INFO= ##INFO= ##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA07000 NA07056 NA12046 NA12144 NA12156 NA12234 NA12249 NA12282 NA12283 NA12286 NA12340 NA12341 NA12342 NA12399 NA12414 NA12546 NA12717 NA12718 NA12748 NA12749 NA12750 NA12776 NA12828 NA12843 NA18502 NA18867 NA18908 NA18933 NA18934 NA19108 NA19117 NA19119 NA19129 NA19131 NA19138 NA19147 NA19149 NA19159 NA19185 NA19189 NA19197 NA19198 NA19206 NA19213 NA19214 NA19223 NA19235 NA19236 NA19238 NA19239 NA19248 NA19256 -15 198 . C A 12.8211 . 
DP=79;SGB=-0.627953;RPB=1;MQB=1;MQSB=0.974642;BQB=1;MQ0F=0;ICB=0.490573;HOB=0.0377219;AC=2;AN=104;DP4=40,31,0,1;MQ=59 GT:PL:AD 0/0:0,3,44:1,0 0/0:0,6,95:2,0 0/0:0,3,39:1,0 0/0:0,3,43:1,0 0/0:0,3,34:1,0 0/0:0,3,37:1,0 0/0:0,3,41:1,0 0/0:0,6,87:2,0 0/0:0,3,46:1,0 0/0:0,6,76:2,0 0/0:0,6,80:2,0 0/0:0,6,78:2,0 0/0:0,9,112:3,0 0/0:0,3,41:1,0 1/1:35,3,0:0,1 0/0:0,3,33:1,0 0/0:0,3,44:1,0 0/0:0,3,45:1,0 0/0:0,6,69:2,0 0/0:0,3,60:1,0 0/0:0,6,70:2,0 0/0:0,6,57:2,0 0/0:0,3,37:1,0 0/0:0,6,79:2,0 0/0:0,3,60:1,0 0/0:0,3,42:1,0 0/0:0,3,39:1,0 0/0:0,3,44:1,0 0/0:0,3,60:1,0 0/0:0,3,39:1,0 0/0:0,3,41:1,0 0/0:0,3,44:1,0 0/0:0,9,98:3,0 0/0:0,3,40:1,0 0/0:0,3,38:1,0 0/0:0,6,72:2,0 0/0:0,3,29:1,0 0/0:0,3,34:1,0 0/0:0,3,43:1,0 0/0:0,3,34:1,0 0/0:0,3,44:1,0 0/0:0,3,36:1,0 0/0:0,3,47:1,0 0/0:0,12,163:4,0 0/0:0,3,60:1,0 0/0:0,3,41:1,0 0/0:0,3,37:1,0 0/0:0,6,73:2,0 0/0:0,6,96:2,0 0/0:0,3,40:1,0 0/0:0,6,67:2,0 0/0:0,3,33:1,0 -15 201 . A G 999 . DP=80;VDB=0.895839;SGB=-4.94122;RPB=0.999937;MQB=0.979662;MQSB=0.974642;BQB=0.739382;MQ0F=0;ICB=5.51698e-12;HOB=0.49926;AC=54;AN=104;DP4=19,17,21,15;MQ=59 GT:PL:AD 0/0:0,3,36:1,0 0/0:0,6,93:2,0 0/0:0,3,33:1,0 0/0:0,3,32:1,0 0/0:0,3,36:1,0 0/0:0,3,32:1,0 0/0:0,3,40:1,0 0/0:0,6,79:2,0 0/0:0,3,42:1,0 0/0:0,6,64:2,0 0/0:0,6,67:2,0 0/0:0,6,66:2,0 0/0:0,9,96:3,0 0/0:0,3,32:1,0 0/0:0,3,36:1,0 0/0:0,3,31:1,0 0/0:0,3,44:1,0 0/0:0,3,44:1,0 0/0:0,6,61:2,0 0/0:0,3,60:1,0 0/0:0,6,60:2,0 0/0:0,6,57:2,0 0/0:0,3,31:1,0 0/0:0,6,68:2,0 0/0:0,3,60:1,0 1/1:39,3,0:0,1 1/1:31,3,0:0,1 1/1:37,3,0:0,1 1/1:60,3,0:0,1 1/1:38,3,0:0,1 1/1:36,3,0:0,1 1/1:37,3,0:0,1 1/1:85,9,0:0,3 1/1:39,3,0:0,1 1/1:31,3,0:0,1 1/1:63,6,0:0,2 1/1:29,3,0:0,1 1/1:32,3,0:0,1 1/1:43,3,0:0,1 1/1:27,3,0:0,1 1/1:43,3,0:0,1 1/1:34,3,0:0,1 1/1:46,3,0:0,1 1/1:144,12,0:0,4 1/1:59,3,0:0,1 1/1:16,3,0:0,1 1/1:37,3,0:0,1 1/1:57,6,0:0,2 1/1:91,6,0:0,2 1/1:21,3,0:0,1 1/1:52,6,0:0,2 1/1:26,3,0:0,1 +15 198 . C A 12.8224 . 
DP=79;SGB=-0.627953;RPB=1;MQB=1;MQSB=0.974642;BQB=1;MQ0F=0;AC=2;AN=104;DP4=40,31,0,1;MQ=59 GT:PL:AD 0/0:0,3,44:1,0 0/0:0,6,95:2,0 0/0:0,3,39:1,0 0/0:0,3,43:1,0 0/0:0,3,34:1,0 0/0:0,3,37:1,0 0/0:0,3,41:1,0 0/0:0,6,87:2,0 0/0:0,3,46:1,0 0/0:0,6,76:2,0 0/0:0,6,80:2,0 0/0:0,6,78:2,0 0/0:0,9,112:3,0 0/0:0,3,41:1,0 1/1:35,3,0:0,1 0/0:0,3,33:1,0 0/0:0,3,44:1,0 0/0:0,3,45:1,0 0/0:0,6,69:2,0 0/0:0,3,60:1,0 0/0:0,6,70:2,0 0/0:0,6,57:2,0 0/0:0,3,37:1,0 0/0:0,6,79:2,0 0/0:0,3,60:1,0 0/0:0,3,42:1,0 0/0:0,3,39:1,0 0/0:0,3,44:1,0 0/0:0,3,60:1,0 0/0:0,3,39:1,0 0/0:0,3,41:1,0 0/0:0,3,44:1,0 0/0:0,9,98:3,0 0/0:0,3,40:1,0 0/0:0,3,38:1,0 0/0:0,6,72:2,0 0/0:0,3,29:1,0 0/0:0,3,34:1,0 0/0:0,3,43:1,0 0/0:0,3,34:1,0 0/0:0,3,44:1,0 0/0:0,3,36:1,0 0/0:0,3,47:1,0 0/0:0,12,163:4,0 0/0:0,3,60:1,0 0/0:0,3,41:1,0 0/0:0,3,37:1,0 0/0:0,6,73:2,0 0/0:0,6,96:2,0 0/0:0,3,40:1,0 0/0:0,6,67:2,0 0/0:0,3,33:1,0 +15 201 . A G 121.59 . DP=80;VDB=0.895839;SGB=-4.94122;RPB=0.999937;MQB=0.979662;MQSB=0.974642;BQB=0.739382;MQ0F=0;AC=50;AN=104;DP4=19,17,21,15;MQ=59 GT:PL:AD 0/0:0,3,36:1,0 0/0:0,6,93:2,0 0/0:0,3,33:1,0 0/0:0,3,32:1,0 0/0:0,3,36:1,0 0/0:0,3,32:1,0 0/0:0,3,40:1,0 0/0:0,6,79:2,0 0/0:0,3,42:1,0 0/0:0,6,64:2,0 0/0:0,6,67:2,0 0/0:0,6,66:2,0 0/0:0,9,96:3,0 0/0:0,3,32:1,0 0/0:0,3,36:1,0 0/0:0,3,31:1,0 0/0:0,3,44:1,0 0/0:0,3,44:1,0 0/0:0,6,61:2,0 0/0:0,3,60:1,0 0/0:0,6,60:2,0 0/0:0,6,57:2,0 0/0:0,3,31:1,0 0/0:0,6,68:2,0 0/0:0,3,60:1,0 1/1:39,3,0:0,1 1/1:31,3,0:0,1 1/1:37,3,0:0,1 1/1:60,3,0:0,1 1/1:38,3,0:0,1 1/1:36,3,0:0,1 1/1:37,3,0:0,1 1/1:85,9,0:0,3 1/1:39,3,0:0,1 1/1:31,3,0:0,1 1/1:63,6,0:0,2 1/1:29,3,0:0,1 1/1:32,3,0:0,1 1/1:43,3,0:0,1 1/1:27,3,0:0,1 1/1:43,3,0:0,1 1/1:34,3,0:0,1 1/1:46,3,0:0,1 1/1:144,12,0:0,4 1/1:59,3,0:0,1 0/0:16,3,0:0,1 1/1:37,3,0:0,1 1/1:57,6,0:0,2 1/1:91,6,0:0,2 0/0:21,3,0:0,1 1/1:52,6,0:0,2 1/1:26,3,0:0,1 diff --git a/test/mpileup.hwe.4.out b/test/mpileup.hwe.4.out index b1509eabc..b84853c81 100644 --- a/test/mpileup.hwe.4.out +++ b/test/mpileup.hwe.4.out @@ -17,11 +17,9 @@ 
##FORMAT= ##FORMAT= ##FORMAT= -##INFO= -##INFO= ##INFO= ##INFO= ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA07000 NA07056 NA12046 NA12144 NA12156 NA12234 NA12249 NA12282 NA12283 NA12286 NA12340 NA12341 NA12342 NA12399 NA12414 NA12546 NA12717 NA12718 NA12748 NA12749 NA12750 NA12776 NA12828 NA12843 NA18502 NA18867 NA18908 NA18933 NA18934 NA19108 NA19117 NA19119 NA19129 NA19131 NA19138 NA19147 NA19149 NA19159 NA19185 NA19189 NA19197 NA19198 NA19206 NA19213 NA19214 NA19223 NA19235 NA19236 NA19238 NA19239 NA19248 NA19256 -15 201 . A G 999 . DP=80;VDB=0.895839;SGB=-4.94122;RPB=0.999937;MQB=0.979662;MQSB=0.974642;BQB=0.739382;MQ0F=0;ICB=4.28809e-11;HOB=0.479105;AC=55;AN=104;DP4=19,17,21,15;MQ=59 GT:PL:AD 0/0:0,3,36:1,0 0/0:0,6,93:2,0 0/0:0,3,33:1,0 0/0:0,3,32:1,0 0/0:0,3,36:1,0 0/0:0,3,32:1,0 0/0:0,3,40:1,0 0/0:0,6,79:2,0 0/0:0,3,42:1,0 0/0:0,6,64:2,0 0/0:0,6,67:2,0 0/0:0,6,66:2,0 0/0:0,9,96:3,0 0/0:0,3,32:1,0 0/0:0,3,36:1,0 0/0:0,3,31:1,0 0/0:0,3,44:1,0 0/0:0,3,44:1,0 0/0:0,6,61:2,0 0/0:0,3,60:1,0 0/0:0,6,60:2,0 0/0:0,6,57:2,0 0/0:0,3,31:1,0 0/0:0,6,68:2,0 0/1:0,3,60:1,0 1/1:39,3,0:0,1 1/1:31,3,0:0,1 1/1:37,3,0:0,1 1/1:60,3,0:0,1 1/1:38,3,0:0,1 1/1:36,3,0:0,1 1/1:37,3,0:0,1 1/1:85,9,0:0,3 1/1:39,3,0:0,1 1/1:31,3,0:0,1 1/1:63,6,0:0,2 1/1:29,3,0:0,1 1/1:32,3,0:0,1 1/1:43,3,0:0,1 1/1:27,3,0:0,1 1/1:43,3,0:0,1 1/1:34,3,0:0,1 1/1:46,3,0:0,1 1/1:144,12,0:0,4 1/1:59,3,0:0,1 1/1:16,3,0:0,1 1/1:37,3,0:0,1 1/1:57,6,0:0,2 1/1:91,6,0:0,2 1/1:21,3,0:0,1 1/1:52,6,0:0,2 1/1:26,3,0:0,1 +15 201 . A G 1211.32 . 
DP=80;VDB=0.895839;SGB=-4.94122;RPB=0.999937;MQB=0.979662;MQSB=0.974642;BQB=0.739382;MQ0F=0;AC=55;AN=104;DP4=19,17,21,15;MQ=59 GT:PL:AD 0/0:0,3,36:1,0 0/0:0,6,93:2,0 0/0:0,3,33:1,0 0/0:0,3,32:1,0 0/0:0,3,36:1,0 0/0:0,3,32:1,0 0/0:0,3,40:1,0 0/0:0,6,79:2,0 0/0:0,3,42:1,0 0/0:0,6,64:2,0 0/0:0,6,67:2,0 0/0:0,6,66:2,0 0/0:0,9,96:3,0 0/0:0,3,32:1,0 0/0:0,3,36:1,0 0/0:0,3,31:1,0 0/0:0,3,44:1,0 0/0:0,3,44:1,0 0/0:0,6,61:2,0 0/0:0,3,60:1,0 0/0:0,6,60:2,0 0/0:0,6,57:2,0 0/0:0,3,31:1,0 0/0:0,6,68:2,0 0/1:0,3,60:1,0 1/1:39,3,0:0,1 1/1:31,3,0:0,1 1/1:37,3,0:0,1 1/1:60,3,0:0,1 1/1:38,3,0:0,1 1/1:36,3,0:0,1 1/1:37,3,0:0,1 1/1:85,9,0:0,3 1/1:39,3,0:0,1 1/1:31,3,0:0,1 1/1:63,6,0:0,2 1/1:29,3,0:0,1 1/1:32,3,0:0,1 1/1:43,3,0:0,1 1/1:27,3,0:0,1 1/1:43,3,0:0,1 1/1:34,3,0:0,1 1/1:46,3,0:0,1 1/1:144,12,0:0,4 1/1:59,3,0:0,1 1/1:16,3,0:0,1 1/1:37,3,0:0,1 1/1:57,6,0:0,2 1/1:91,6,0:0,2 1/1:21,3,0:0,1 1/1:52,6,0:0,2 1/1:26,3,0:0,1 diff --git a/test/test.pl b/test/test.pl index 8fade275b..633d1d5db 100755 --- a/test/test.pl +++ b/test/test.pl @@ -278,10 +278,10 @@ test_vcf_call($opts,in=>'mpileup.X',out=>'mpileup.X.out',args=>'-mv --ploidy-file {PATH}/mpileup.ploidy -S {PATH}/mpileup.ped'); test_vcf_call($opts,in=>'mpileup.X',out=>'mpileup.X.2.out',args=>'-mv --ploidy-file {PATH}/mpileup.ploidy -S {PATH}/mpileup.2.samples'); test_vcf_call($opts,in=>'mpileup.NA19213.NA19129',out=>'mpileup.hwe.1.out',args=>'-mv'); -test_vcf_call($opts,in=>'mpileup.NA19213.NA19129',out=>'mpileup.hwe.1.out',args=>'-mv -G -'); +test_vcf_call($opts,in=>'mpileup.NA19213.NA19129',out=>'mpileup.hwe.1b.out',args=>'-mv -G AD:-'); test_vcf_call($opts,in=>'mpileup.hwe',out=>'mpileup.hwe.2.out',args=>'-mv'); -test_vcf_call($opts,in=>'mpileup.hwe',out=>'mpileup.hwe.3.out',args=>'-mv -G -'); -test_vcf_call($opts,in=>'mpileup.hwe',out=>'mpileup.hwe.4.out',args=>'-mv -G {PATH}/mpileup.hwe.samples'); +test_vcf_call($opts,in=>'mpileup.hwe',out=>'mpileup.hwe.3.out',args=>'-mv -G AD:-'); # 21,3,0 becomes 0/0 because of the prior 
-P +test_vcf_call($opts,in=>'mpileup.hwe',out=>'mpileup.hwe.4.out',args=>'-mv -G AD:{PATH}/mpileup.hwe.samples'); test_vcf_call_cAls($opts,in=>'mpileup',out=>'mpileup.cAls.out',tab=>'mpileup'); test_vcf_call_cAls($opts,in=>'mpileup.2',out=>'mpileup.cAls.2.out',tab=>'mpileup.2'); test_vcf_call_cAls($opts,in=>'mpileup.3',out=>'mpileup.cAls.3.out',tab=>'mpileup.3',args=>'-i'); @@ -296,6 +296,12 @@ test_vcf_call($opts,in=>'mpileup.c.X',out=>'mpileup.c.X.out',args=>'-cv --ploidy-file {PATH}/mpileup.ploidy -S {PATH}/mpileup.samples'); test_vcf_call($opts,in=>'mpileup.c.X',out=>'mpileup.c.X.out',args=>'-cv --ploidy-file {PATH}/mpileup.ploidy -S {PATH}/mpileup.ped'); test_vcf_call($opts,in=>'mpileup.c.X',out=>'mpileup.c.X.2.out',args=>'-cv --ploidy-file {PATH}/mpileup.ploidy -S {PATH}/mpileup.2.samples'); +test_vcf_call($opts,in=>'call-G',out=>'call-G.1.out',args=>'-mv'); +test_vcf_call($opts,in=>'call-G',out=>'call-G.2.out',args=>'-mv -G AD:-'); +test_vcf_call($opts,in=>'call-G.2',out=>'call-G.2.1.out',args=>'-mv -F AN_POP,AC_POP'); +test_vcf_call($opts,in=>'call.af-fixation',out=>'call.af-fixation.1.out',args=>'-m'); +test_vcf_call($opts,in=>'call.af-fixation',out=>'call.af-fixation.2.out',args=>'-m -G {PATH}/call.af-fixation.txt'); +test_vcf_call($opts,in=>'call.af-fixation',out=>'call.af-fixation.3.out',args=>'-m -G {PATH}/call.af-fixation.txt -a GP,GQ'); test_vcf_filter($opts,in=>'view.filter',out=>'view.filter.6.out',args=>q[-S. -e'TXT0="text"'],reg=>''); test_vcf_filter($opts,in=>'view.filter',out=>'view.filter.7.out',args=>q[-S. -e'FMT/FRS[*:1]="BB"'],reg=>''); test_vcf_filter($opts,in=>'view.filter',out=>'view.filter.8.out',args=>q[-S. 
-e'FMT/FGS[*:0]="AAAAAA"'],reg=>''); diff --git a/test/trio-dnm.1.vcf b/test/trio-dnm.1.vcf deleted file mode 100644 index 7d15610dd..000000000 --- a/test/trio-dnm.1.vcf +++ /dev/null @@ -1,31 +0,0 @@ -##fileformat=VCFv4.2 -##FILTER= -##FORMAT= -##FORMAT= -##FORMAT= -##INFO= -##INFO= -##INFO= -##contig= -##contig= -##contig= -##reference=file:///lustre/scratch113/resources/ref/Homo_sapiens/1000Genomes_hs37d5/hs37d5.fa -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT proband father mother -1 1 . G A,T . . TP GT:AD:PL 0/2:62,0,27:733,919,3060,0,2141,2060 0/0:35,0,0:0,99,1485,99,1485,1485 0/0:38,0,0:0,102,1530,102,1530,1530 -1 2 . G T,A . . TP GT:AD:PL 0/1:67,14,0:257,0,2105,458,2146,2605 0/0:36,0,0:0,99,1485,99,1485,1485 0/0:36,0,0:0,99,1485,99,1485,1485 -1 3 . G A . . TP GT:AD:PL 0/1:71,14:241,0,2314 0/0:45,0:0,101,1530 0/0:43,0:0,99,1485 -1 4 . C T . . TP GT:AD:PL 0/1:111,24:504,0,3776 0/0:35,0:0,99,1485 0/0:36,0:0,99,1485 -1 5 . C A . . TP GT:AD:PL 0/1:30,6:124,0,981 0/0:37,0:0,99,1485 0/0:32,0:0,90,1350 -1 8 . A G . . TP GT:AD:PL 0/1:434,52:859,0,18086 0/0:38,0:0,99,1485 0/0:38,0:0,99,1485 -2 1 . A G . . UN GT:AD:PL 0/1:2,5:179,0,55 0/0:7,0:0,0,180 0/0:4,0:0,12,148 -2 2 . A G . . UN GT:AD:PL 0/1:4,5:159,0,126 0/0:1,0:0,3,39 0/0:4,0:0,9,135 -2 3 . A G . . UN GT:AD:PL 0/1:4,4:137,0,107 0/0:6,0:0,18,213 0/0:8,0:0,0,232 -3 1 . A G . . FP GT:AD:PL 0/1:7,9:357,0,408 0/0:15,0:0,39,585 0/1:4,3:114,0,550 -3 2 . C A . . FP GT:AD:PL 0/1:13,15:453,0,442 0/1:29,30:913,0,1011 0/0:39,0:0,99,1485 -3 3 . A G . . FP GT:AD:PL 0/1:11,12:361,0,358 0/0:21,0:0,51,765 0/1:10,15:538,0,292 -3 4 . A G,C . . FP GT:PL:AD 0/0:0,255,255,255,255,255:306,11,0 0/0:0,255,255,255,255,255:328,1,1 0/0:0,255,255,255,255,255:318,0,0 -3 5 . A G . . FP GT:AD:PL 0/1:33,32:890,0,963 0/1:56,45:1328,0,1809 0/0:36,0:0,99,1485 -3 6 . A G . . FP GT:AD:PL 0/1:19,24:737,0,649 0/0:48,0:0,108,1620 0/1:25,22:644,0,836 -3 7 . A G . . FP GT:AD:PL 0/1:73,90:2864,0,2197 0/0:42,0:0,99,1485 0/1:69,74:2395,0,2064 -3 8 . 
A G . . FP GT:AD:PL 0/1:115,128:4130,0,3542 0/0:34,0:0,99,1360 0/1:137,89:2571,0,4411 -3 9 . A G . . FP GT:AD:PL 0/1:18,11:311,0,627 0/1:3,3:51,0,105 0/0:19,0:0,57,764 diff --git a/test/trio-dnm.2.vcf b/test/trio-dnm.2.vcf deleted file mode 100644 index a36f5d0ff..000000000 --- a/test/trio-dnm.2.vcf +++ /dev/null @@ -1,31 +0,0 @@ -##fileformat=VCFv4.2 -##FILTER= -##FORMAT= -##FORMAT= -##FORMAT= -##INFO= -##INFO= -##INFO= -##contig= -##contig= -##contig= -##reference=file:///lustre/scratch113/resources/ref/Homo_sapiens/1000Genomes_hs37d5/hs37d5.fa -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT proband father mother -1 1 . G A,T . . TP GT:AD:PL 0/2:62,0,27,0:733,919,3060,0,2141,2060 0/0:35,0,0:0,99,1485,99,1485,1485 0/0:38,0,0:0,102,1530,102,1530,1530 -1 2 . G T,A . . TP GT:AD:PL 0/1:67,14,0,0:257,0,2105,458,2146,2605 0/0:36,0,0:0,99,1485,99,1485,1485 0/0:36,0,0:0,99,1485,99,1485,1485 -1 3 . G A . . TP GT:AD:PL 0/1:71,14,0:241,0,2314 0/0:45,0:0,101,1530 0/0:43,0:0,99,1485 -1 4 . C T . . TP GT:AD:PL 0/1:111,24,0:504,0,3776 0/0:35,0:0,99,1485 0/0:36,0:0,99,1485 -1 5 . C A . . TP GT:AD:PL 0/1:30,6,0:124,0,981 0/0:37,0:0,99,1485 0/0:32,0:0,90,1350 -1 8 . A G . . TP GT:AD:PL 0/1:434,52,0:859,0,18086 0/0:38,0:0,99,1485 0/0:38,0:0,99,1485 -2 1 . A G . . UN GT:AD:PL 0/1:2,5,0:179,0,55 0/0:7,0:0,0,180 0/0:4,0:0,12,148 -2 2 . A G . . UN GT:AD:PL 0/1:4,5,0:159,0,126 0/0:1,0:0,3,39 0/0:4,0:0,9,135 -2 3 . A G . . UN GT:AD:PL 0/1:4,4,0:137,0,107 0/0:6,0:0,18,213 0/0:8,0:0,0,232 -3 1 . A G . . FP GT:AD:PL 0/1:7,9,0,0:357,0,408 0/0:15,0:0,39,585 0/1:4,3:114,0,550 -3 2 . C A . . FP GT:AD:PL 0/1:13,15,0:453,0,442 0/1:29,30:913,0,1011 0/0:39,0:0,99,1485 -3 3 . A G . . FP GT:AD:PL 0/1:11,12,0:361,0,358 0/0:21,0:0,51,765 0/1:10,15:538,0,292 -3 4 . A G,C . . FP GT:PL:AD 0/0:0,255,255,255,255,255:306,11,0,0 0/0:0,255,255,255,255,255:328,1,1 0/0:0,255,255,255,255,255:318,0,0 -3 5 . A G . . FP GT:AD:PL 0/1:33,32,0:890,0,963 0/1:56,45:1328,0,1809 0/0:36,0:0,99,1485 -3 6 . A G . . 
FP GT:AD:PL 0/1:19,24,0:737,0,649 0/0:48,0:0,108,1620 0/1:25,22:644,0,836 -3 7 . A G . . FP GT:AD:PL 0/1:73,90,0:2864,0,2197 0/0:42,0:0,99,1485 0/1:69,74:2395,0,2064 -3 8 . A G . . FP GT:AD:PL 0/1:115,128,0:4130,0,3542 0/0:34,0:0,99,1360 0/1:137,89:2571,0,4411 -3 9 . A G . . FP GT:AD:PL 0/1:18,11,0:311,0,627 0/1:3,3:51,0,105 0/0:19,0:0,57,764 diff --git a/vcfcall.c b/vcfcall.c index 72387f105..0bbcadbec 100644 --- a/vcfcall.c +++ b/vcfcall.c @@ -740,6 +740,7 @@ static void destroy_data(args_t *args) free(args->samples_map); free(args->sample2sex); free(args->aux.ploidy); + free(args->aux.sample_groups); free(args->str.s); if ( args->gvcf ) gvcf_destroy(args->gvcf); bcf_hdr_destroy(args->aux.hdr); @@ -769,7 +770,20 @@ void parse_novel_rate(args_t *args, const char *str) else error("Could not parse --novel-rate %s\n", str); } -static int parse_format_flag(const char *str) +static void list_annotations(FILE *fp) +{ + fprintf(fp, + "\n" + "Optional INFO annotations available with -m (\"INFO/\" prefix is optional):\n" + " INFO/PV4 .. P-values for strand bias, baseQ bias, mapQ bias and tail distance bias (Number=4,Type=Float)\n" + "\n" + "Optional FORMAT annotations available with -m (\"FORMAT/\" prefix is optional):\n" + " FORMAT/GQ .. Phred-scaled genotype quality (Number=1,Type=Integer)\n" + " FORMAT/GP .. 
Phred-scaled genotype posterior probabilities (Number=G,Type=Float)\n" + "\n"); +} + +static int parse_output_tags(const char *str) { int flag = 0; const char *ss = str; @@ -777,8 +791,9 @@ static int parse_format_flag(const char *str) { const char *se = ss; while ( *se && *se!=',' ) se++; - if ( !strncasecmp(ss,"GQ",se-ss) ) flag |= CALL_FMT_GQ; - else if ( !strncasecmp(ss,"GP",se-ss) ) flag |= CALL_FMT_GP; + if ( !strncasecmp(ss,"GQ",se-ss) || !strncasecmp(ss,"FORMAT/GQ",se-ss) || !strncasecmp(ss,"FMT/GQ",se-ss) ) flag |= CALL_FMT_GQ; + else if ( !strncasecmp(ss,"GP",se-ss) || !strncasecmp(ss,"FORMAT/GP",se-ss) || !strncasecmp(ss,"FMT/GP",se-ss) ) flag |= CALL_FMT_GP; + else if ( !strncasecmp(ss,"PV4",se-ss) || !strncasecmp(ss,"INFO/PV4",se-ss) ) flag |= CALL_FMT_PV4; else { fprintf(stderr,"Could not parse \"%s\"\n", str); @@ -857,37 +872,41 @@ static void usage(args_t *args) fprintf(stderr, "Usage: bcftools call [options] \n"); fprintf(stderr, "\n"); fprintf(stderr, "File format options:\n"); - fprintf(stderr, " --no-version do not append version and command line to the header\n"); - fprintf(stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(stderr, " -O, --output-type output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); - fprintf(stderr, " --ploidy [?] predefined ploidy, 'list' to print available settings, append '?' 
for details\n"); - fprintf(stderr, " --ploidy-file space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n"); - fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(stderr, " -s, --samples list of samples to include [all samples]\n"); - fprintf(stderr, " -S, --samples-file PED file or a file with an optional column with sex (see man page for details) [all samples]\n"); - fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); + fprintf(stderr, " --no-version Do not append version and command line to the header\n"); + fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n"); + fprintf(stderr, " -O, --output-type b|u|z|v Output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); + fprintf(stderr, " --ploidy ASSEMBLY[?] Predefined ploidy, 'list' to print available settings, append '?' 
for details\n"); + fprintf(stderr, " --ploidy-file FILE Space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n"); + fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(stderr, " -s, --samples LIST List of samples to include [all samples]\n"); + fprintf(stderr, " -S, --samples-file FILE PED file or a file with an optional column with sex (see man page for details) [all samples]\n"); + fprintf(stderr, " -t, --targets REGION Similar to -r but streams rather than index-jumps\n"); + fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(stderr, " --threads INT Use multithreading with INT worker threads [0]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Input/output options:\n"); - fprintf(stderr, " -A, --keep-alts keep all possible alternate alleles at variant sites\n"); - fprintf(stderr, " -f, --format-fields output format fields: GQ,GP (lowercase allowed) []\n"); - fprintf(stderr, " -F, --prior-freqs use prior allele frequencies\n"); - fprintf(stderr, " -G, --group-samples group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling\n"); - fprintf(stderr, " -g, --gvcf ,[...] group non-variant sites into gVCF blocks by minimum per-sample DP\n"); - fprintf(stderr, " -i, --insert-missed output also sites missed by mpileup but present in -T\n"); - fprintf(stderr, " -M, --keep-masked-ref keep sites with masked reference allele (REF=N)\n"); - fprintf(stderr, " -V, --skip-variants skip indels/snps\n"); - fprintf(stderr, " -v, --variants-only output variant sites only\n"); + fprintf(stderr, " -A, --keep-alts Keep all possible alternate alleles at variant sites\n"); + fprintf(stderr, " -a, --annotate LIST Optional tags to output (lowercase allowed); '?' to list available tags\n"); +//todo? 
+// fprintf(stderr, " -a, --annots LIST Add annotations: GQ,GP,PV4 (lowercase allowed). Prefixed with ^ indicates a request for\n"); +// fprintf(stderr, " tag removal [^I16,^QS,^FMT/QS]\n"); + fprintf(stderr, " -F, --prior-freqs AN,AC Use prior allele frequencies, determined from these pre-filled tags\n"); + fprintf(stderr, " -G, --group-samples [TAG:]FILE|- Group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling.\n"); + fprintf(stderr, " This requires FORMAT/QS or other Number=R,Type=Integer tag such as FORMAT/AD\n"); + fprintf(stderr, " -g, --gvcf INT,[...] Group non-variant sites into gVCF blocks by minimum per-sample DP\n"); + fprintf(stderr, " -i, --insert-missed Output also sites missed by mpileup but present in -T\n"); + fprintf(stderr, " -M, --keep-masked-ref Keep sites with masked reference allele (REF=N)\n"); + fprintf(stderr, " -V, --skip-variants TYPE Skip indels/snps\n"); + fprintf(stderr, " -v, --variants-only Output variant sites only\n"); fprintf(stderr, "\n"); fprintf(stderr, "Consensus/variant calling options:\n"); - fprintf(stderr, " -c, --consensus-caller the original calling method (conflicts with -m)\n"); - fprintf(stderr, " -C, --constrain one of: alleles, trio (see manual)\n"); - fprintf(stderr, " -m, --multiallelic-caller alternative model for multiallelic and rare-variant calling (conflicts with -c)\n"); - fprintf(stderr, " -n, --novel-rate ,[...] 
likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n"); - fprintf(stderr, " -p, --pval-threshold variant if P(ref|D) mutation rate (use bigger for greater sensitivity), use with -m [1.1e-3]\n"); + fprintf(stderr, " -c, --consensus-caller The original calling method (conflicts with -m)\n"); + fprintf(stderr, " -C, --constrain STR One of: alleles, trio (see manual)\n"); + fprintf(stderr, " -m, --multiallelic-caller Alternative model for multiallelic and rare-variant calling (conflicts with -c)\n"); + fprintf(stderr, " -n, --novel-rate FLOAT,[...] Likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n"); + fprintf(stderr, " -p, --pval-threshold FLOAT Variant if P(ref|D)= 0) + while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:a:ig:XYF:G:", loptions, NULL)) >= 0) { switch (c) { @@ -969,8 +989,12 @@ int main_vcfcall(int argc, char *argv[]) case 1 : ploidy = optarg; break; case 'X': ploidy = "X"; fprintf(stderr,"Warning: -X will be deprecated, please use --ploidy instead.\n"); break; case 'Y': ploidy = "Y"; fprintf(stderr,"Warning: -Y will be deprecated, please use --ploidy instead.\n"); break; - case 'G': args.aux.sample_groups = optarg; break; - case 'f': args.aux.output_tags |= parse_format_flag(optarg); break; + case 'G': args.aux.sample_groups = strdup(optarg); break; + case 'f': fprintf(stderr,"Warning: -f, --format-fields will be deprecated, please use -a, --annotate instead.\n"); + case 'a': + if (optarg[0]=='?') { list_annotations(stderr); return 1; } + args.aux.output_tags |= parse_output_tags(optarg); + break; case 'M': args.flag &= ~CF_ACGT_ONLY; break; // keep sites where REF is N case 'N': args.flag |= CF_ACGT_ONLY; break; // omit sites where first base in REF is N (the new default) case 'A': args.aux.flag |= CALL_KEEPALT; break; From 8a744dd8591946f6ae013fe8d18b6f7101f06da3 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 16 Dec 
2020 11:53:04 +0000 Subject: [PATCH 27/81] Rename trio-dnm to trio-dnm2 to avoid confusion --- doc/bcftools.txt | 2 +- plugins/{trio-dnm.c => trio-dnm2.c} | 10 +++++----- test/test.pl | 24 ++++++++++++------------ 3 files changed, 18 insertions(+), 18 deletions(-) rename plugins/{trio-dnm.c => trio-dnm2.c} (99%) diff --git a/doc/bcftools.txt b/doc/bcftools.txt index 5ece9eb8f..14530547e 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -2245,7 +2245,7 @@ By default, appropriate system directories are searched for installed plugins. *tag2tag*:: convert between similar tags, such as GL and GP -*trio-dnm*:: +*trio-dnm2*:: screen variants for possible de-novo mutations in trios *trio-stats*:: diff --git a/plugins/trio-dnm.c b/plugins/trio-dnm2.c similarity index 99% rename from plugins/trio-dnm.c rename to plugins/trio-dnm2.c index e127ecac2..af696239e 100644 --- a/plugins/trio-dnm.c +++ b/plugins/trio-dnm2.c @@ -115,7 +115,7 @@ static const char *usage_text(void) return "\n" "About: Screen variants for possible de-novo mutations in trios\n" - "Usage: bcftools +trio-dnm [Plugin Options]\n" + "Usage: bcftools +trio-dnm2 [Plugin Options]\n" "Plugin options:\n" " -e, --exclude EXPR exclude trios for which the expression is true (one matching sample invalidates a trio)\n" " --force-AD calculate VAF even if the number of FMT/AD fields is incorrect. 
Use at your own risk!\n" @@ -142,20 +142,20 @@ static const char *usage_text(void) "\n" "Example:\n" " # Annotate VCF with FORMAT/DNM, run for a single trio\n" - " bcftools +trio-dnm -p proband,father,mother file.bcf\n" + " bcftools +trio-dnm2 -p proband,father,mother file.bcf\n" "\n" " # Same as above, but read the trio(s) from a PED file\n" - " bcftools +trio-dnm -P file.ped file.bcf\n" + " bcftools +trio-dnm2 -P file.ped file.bcf\n" "\n" " # Same as above plus extract a list of significant DNMs using the bcftools/query command\n" - " bcftools +trio-dnm -P file.ped file.bcf -Ou | bcftools query -i'DNM>10' -f'[%CHROM:%POS %SAMPLE %DNM\\n]'\n" + " bcftools +trio-dnm2 -P file.ped file.bcf -Ou | bcftools query -i'DNM>10' -f'[%CHROM:%POS %SAMPLE %DNM\\n]'\n" "\n" " # A complete example with a variant calling step. Note that this is one long\n" " # command and should be on a single line. Also note that a filtering step is\n" " # recommended, e.g. by depth and VAF (not shown here):\n" " bcftools mpileup -a AD,QS -f ref.fa -Ou proband.bam father.bam mother.bam |\n" " bcftools call -mv -Ou |\n" - " bcftools +trio-dnm -p proband,father,mother -Oz -o output.vcf.gz\n" + " bcftools +trio-dnm2 -p proband,father,mother -Oz -o output.vcf.gz\n" "\n"; } diff --git a/test/test.pl b/test/test.pl index 633d1d5db..56af50836 100755 --- a/test/test.pl +++ b/test/test.pl @@ -486,18 +486,18 @@ test_vcf_plugin($opts,in=>'contrast',out=>'contrast.out',cmd=>'+contrast',args=>'-a PASSOC,FASSOC,NOVELAL,NOVELGT -0 {PATH}/contrast0.txt -1 {PATH}/contrast1.txt'); test_vcf_plugin($opts,in=>'contrast.1',out=>'contrast.1.1.out',cmd=>'+contrast',args=>'-a NOVELAL,NOVELGT -0 A -1 B'); test_vcf_plugin($opts,in=>'contrast.1',out=>'contrast.1.2.out',cmd=>'+contrast',args=>'-a NOVELGT -0 A -1 B'); -test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.1',out=>'trio-dnm/trio-dnm.1.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother -u ppl -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); 
-test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.2',out=>'trio-dnm/trio-dnm.1.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother -u ppl -u tag=DNM:log --force-AD | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); -test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.4',out=>'trio-dnm/trio-dnm.4.1.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother -u DNG | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); -test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.4',out=>'trio-dnm/trio-dnm.4.1.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); -test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.4',out=>'trio-dnm/trio-dnm.4.2.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother -u DNG -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); -test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.4',out=>'trio-dnm/trio-dnm.4.2.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); -test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.5',out=>'trio-dnm/trio-dnm.5.1.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother -u DNG -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); -test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.5',out=>'trio-dnm/trio-dnm.5.1.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); -test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.6',out=>'trio-dnm/trio-dnm.6.1.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother -u DNG -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); # incorrect miss by DNG -test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.6',out=>'trio-dnm/trio-dnm.6.2.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); 
-test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.7',out=>'trio-dnm/trio-dnm.7.1.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother -u DNG -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); # incorrect miss, low PL -test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.7',out=>'trio-dnm/trio-dnm.7.1.out',cmd=>'+trio-dnm',args=>"-p proband,father,mother -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); +test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.1',out=>'trio-dnm/trio-dnm.1.out',cmd=>'+trio-dnm2',args=>"-p proband,father,mother -u ppl -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); +test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.2',out=>'trio-dnm/trio-dnm.1.out',cmd=>'+trio-dnm2',args=>"-p proband,father,mother -u ppl -u tag=DNM:log --force-AD | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); +test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.4',out=>'trio-dnm/trio-dnm.4.1.out',cmd=>'+trio-dnm2',args=>"-p proband,father,mother -u DNG | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); +test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.4',out=>'trio-dnm/trio-dnm.4.1.out',cmd=>'+trio-dnm2',args=>"-p proband,father,mother | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); +test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.4',out=>'trio-dnm/trio-dnm.4.2.out',cmd=>'+trio-dnm2',args=>"-p proband,father,mother -u DNG -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); +test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.4',out=>'trio-dnm/trio-dnm.4.2.out',cmd=>'+trio-dnm2',args=>"-p proband,father,mother -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); +test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.5',out=>'trio-dnm/trio-dnm.5.1.out',cmd=>'+trio-dnm2',args=>"-p proband,father,mother -u DNG -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); 
+test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.5',out=>'trio-dnm/trio-dnm.5.1.out',cmd=>'+trio-dnm2',args=>"-p proband,father,mother -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); +test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.6',out=>'trio-dnm/trio-dnm.6.1.out',cmd=>'+trio-dnm2',args=>"-p proband,father,mother -u DNG -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); # incorrect miss by DNG +test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.6',out=>'trio-dnm/trio-dnm.6.2.out',cmd=>'+trio-dnm2',args=>"-p proband,father,mother -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); +test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.7',out=>'trio-dnm/trio-dnm.7.1.out',cmd=>'+trio-dnm2',args=>"-p proband,father,mother -u DNG -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); # incorrect miss, low PL +test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.7',out=>'trio-dnm/trio-dnm.7.1.out',cmd=>'+trio-dnm2',args=>"-p proband,father,mother -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); test_vcf_plugin($opts,in=>'gvcfz',out=>'gvcfz.1.out',cmd=>'+gvcfz',args=>qq[-g 'PASS:GT!="alt"' -a | $$opts{bin}/bcftools query -f'%POS\\t%REF\\t%ALT\\t%END[\\t%GT][\\t%DP][\\t%GQ][\\t%RGQ]\\n']); test_vcf_plugin($opts,in=>'gvcfz',out=>'gvcfz.2.out',cmd=>'+gvcfz',args=>qq[-g 'PASS:GQ>10; FLT:-' -a | $$opts{bin}/bcftools query -f'%POS\\t%REF\\t%ALT\\t%FILTER\\t%END[\\t%GT][\\t%DP][\\t%GQ][\\t%RGQ]\\n']); test_vcf_plugin($opts,in=>'gvcfz.2',out=>'gvcfz.2.1.out',cmd=>'+gvcfz',args=>qq[-g 'PASS:GT!="alt"' -a | $$opts{bin}/bcftools query -f'%POS\\t%REF\\t%ALT\\t%FILTER\\t%END[\\t%GT][\\t%DP]\\n']); From d31436da69714cf432a850597d9ece9076fbed9e Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 12 Jan 2021 11:34:40 +0000 Subject: [PATCH 28/81] Update CI test configuration to recurse htslib submodules In anticipation of samtools/htslib#929 merge, which adds a submodule. 
Also makes everything use autoreconf -i when setting up HTSlib as that will be required for autoconf 2.70. Increases the memory limit for the address sanitizer cirrus-ci test. While the average memory used seems to be low, occasionally some test processes may get big enough to trigger an out of memory killer. Note that both Travis and cirrus-ci use the .travis/clone script. --- .appveyor.yml | 6 +++--- .cirrus.yml | 8 +++++--- .travis.yml | 4 ++-- .travis/clone | 2 +- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 02554d74e..49aa617a2 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -35,15 +35,15 @@ install: clone_script: - "sh -lc \"if test x$APPVEYOR_PULL_REQUEST_HEAD_REPO_NAME != x ; then git clone --branch=$APPVEYOR_PULL_REQUEST_HEAD_REPO_BRANCH https://github.com/$APPVEYOR_PULL_REQUEST_HEAD_REPO_NAME $APPVEYOR_BUILD_FOLDER ; else false ; fi || git clone --branch=$APPVEYOR_REPO_BRANCH https://github.com/$APPVEYOR_REPO_NAME $APPVEYOR_BUILD_FOLDER\"" - "sh -lc \"git show-branch --sha1-name HEAD" - - "sh -lc \"git clone --branch=$APPVEYOR_REPO_BRANCH https://github.com/`echo $APPVEYOR_REPO_NAME|sed 's#/bcftools#/htslib#'`.git $APPVEYOR_BUILD_FOLDER/htslib || git clone https://github.com/samtools/htslib.git $APPVEYOR_BUILD_FOLDER/htslib \"" + - "sh -lc \"git clone --recurse-submodules --shallow-submodules --branch=$APPVEYOR_REPO_BRANCH https://github.com/`echo $APPVEYOR_REPO_NAME|sed 's#/bcftools#/htslib#'`.git $APPVEYOR_BUILD_FOLDER/htslib || git clone --recurse-submodules --shallow-submodules https://github.com/samtools/htslib.git $APPVEYOR_BUILD_FOLDER/htslib \"" - "sh -lc \"cd $APPVEYOR_BUILD_FOLDER/htslib && git show-branch --sha1-name HEAD\"" build_script: - set HOME=. 
- set MSYSTEM=MINGW64 - set PATH=C:/msys64/usr/bin;C:/msys64/mingw64/bin;%PATH% - - "sh -lc \"(cd htslib; aclocal && autoheader && autoconf)\"" - - "sh -lc \"aclocal && autoheader && autoconf && ./configure && make -j2\"" + - "sh -lc \"(cd htslib; autoreconf -i)\"" + - "sh -lc \"autoreconf -i && ./configure && make -j2\"" test_script: - set HOME=. diff --git a/.cirrus.yml b/.cirrus.yml index 3e0d8e27f..c1df245c7 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -32,8 +32,8 @@ compile_template: &COMPILE compile_script: | if test "$USE_CONFIG" = "yes"; then - (cd $HTSDIR && autoreconf) - autoreconf + (cd $HTSDIR && autoreconf -i) + autoreconf -i ./configure || (cat config.log; /bin/false) make -j3 else @@ -89,7 +89,9 @@ ubuntu_task: matrix: - environment: USE_CONFIG: no - - environment: + - container: + memory: 2G + environment: USE_CONFIG: yes CFLAGS: -g -Wall -O3 -fsanitize=address LDFLAGS: -fsanitize=address -Wl,-rpath,`pwd`/inst/lib diff --git a/.travis.yml b/.travis.yml index 14bb9ed9a..a882e5722 100644 --- a/.travis.yml +++ b/.travis.yml @@ -59,8 +59,8 @@ before_script: script: | if test "$USE_CONFIG" = "yes"; then - ( cd "$HTSDIR" && autoreconf ) && \ - autoreconf && \ + ( cd "$HTSDIR" && autoreconf -i) && \ + autoreconf -i && \ ./configure && \ make && \ make test diff --git a/.travis/clone b/.travis/clone index f4b823c23..a561f9150 100755 --- a/.travis/clone +++ b/.travis/clone @@ -14,4 +14,4 @@ ref='' [ -z "$ref" ] && repository='git://github.com/samtools/htslib.git' set -x -git clone --depth=1 ${ref:+--branch="$branch"} "$repository" "$localdir" +git clone --recurse-submodules --shallow-submodules --depth=1 ${ref:+--branch="$branch"} "$repository" "$localdir" From e4bc1b725b62f41289b0e10f8daaeeee4eeddcbc Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 19 Jan 2021 15:42:17 +0000 Subject: [PATCH 29/81] Add explicit --group-samples-tag option. 
Resolves #1370 --- mcall.c | 26 +++++++++----------------- test/test.pl | 8 ++++---- vcfcall.c | 8 +++++--- 3 files changed, 18 insertions(+), 24 deletions(-) diff --git a/mcall.c b/mcall.c index e02157a85..b30f1b007 100644 --- a/mcall.c +++ b/mcall.c @@ -262,15 +262,8 @@ static void init_sample_groups(call_t *call) return; } - // Parse tag (optional) and file name - char *fname = call->sample_groups; - while ( *fname && *fname!=':' ) fname++; - if ( *fname ) + if ( call->sample_groups_tag ) { - call->sample_groups_tag = call->sample_groups; - *fname = 0; - fname++; - // Is the tag defined in the header? int tag_id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,call->sample_groups_tag); if ( tag_id==-1 ) error("No such tag \"%s\"\n",call->sample_groups_tag); @@ -286,11 +279,10 @@ static void init_sample_groups(call_t *call) if ( tag_id >= 0 && bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,tag_id) ) call->sample_groups_tag = "AD"; else error("Error: neither \"AD\" nor \"QS\" FORMAT tag exists and no alternative given with -G\n"); } - fname = call->sample_groups; } // Read samples/groups - if ( !strcmp("-",fname) ) + if ( !strcmp("-",call->sample_groups) ) { // single-sample calling, each sample creates its own group call->nsmpl_grp = nsmpl; @@ -305,8 +297,8 @@ static void init_sample_groups(call_t *call) else { int nlines; - char **lines = hts_readlist(fname, 1, &nlines); - if ( !lines ) error("Could not read the file: %s\n", fname); + char **lines = hts_readlist(call->sample_groups, 1, &nlines); + if ( !lines ) error("Could not read the file: %s\n", call->sample_groups); uint32_t *smpl2grp = (uint32_t*)calloc(nsmpl,sizeof(uint32_t)); uint32_t *grp2n = (uint32_t*)calloc(nsmpl,sizeof(uint32_t)); @@ -317,14 +309,14 @@ static void init_sample_groups(call_t *call) { char *ptr = lines[i]; while ( *ptr && !isspace(*ptr) ) ptr++; - if ( !*ptr ) error("Could not parse the line in %s, expected a sample name followed by tab and a population name: %s\n",fname,lines[i]); + if ( !*ptr ) 
error("Could not parse the line in %s, expected a sample name followed by tab and a population name: %s\n",call->sample_groups,lines[i]); char *tmp = ptr; while ( *ptr && isspace(*ptr) ) ptr++; - if ( !*ptr ) error("Could not parse the line in %s, expected a sample name followed by tab and a population name: %s\n",fname,lines[i]); + if ( !*ptr ) error("Could not parse the line in %s, expected a sample name followed by tab and a population name: %s\n",call->sample_groups,lines[i]); *tmp = 0; int ismpl = bcf_hdr_id2int(call->hdr, BCF_DT_SAMPLE, lines[i]); if ( ismpl<0 ) continue; - if ( smpl2grp[ismpl] ) error("Error: the sample \"%s\" is listed twice in %s\n", lines[i],fname); + if ( smpl2grp[ismpl] ) error("Error: the sample \"%s\" is listed twice in %s\n", lines[i],call->sample_groups); if ( !khash_str2int_has_key(grp2idx,ptr+1) ) { khash_str2int_set(grp2idx, ptr+1, call->nsmpl_grp); @@ -337,12 +329,12 @@ static void init_sample_groups(call_t *call) smpl2grp[ismpl] = igrp+1; // +1 to distinguish unlisted samples } khash_str2int_destroy(grp2idx); - if ( !call->nsmpl_grp ) error("Could not parse the file, no matching samples found: %s\n", fname); + if ( !call->nsmpl_grp ) error("Could not parse the file, no matching samples found: %s\n", call->sample_groups); call->smpl_grp = (smpl_grp_t*)calloc(call->nsmpl_grp,sizeof(*call->smpl_grp)); for (i=0; ihdr->samples[i],fname); + if ( !smpl2grp[i] ) error("Error: The sample \"%s\" is not listed in %s\n",call->hdr->samples[i],call->sample_groups); int igrp = smpl2grp[i] - 1; if ( !call->smpl_grp[igrp].nsmpl ) call->smpl_grp[igrp].smpl = (uint32_t*)calloc(grp2n[igrp],sizeof(uint32_t)); diff --git a/test/test.pl b/test/test.pl index 56af50836..ad233c8fd 100755 --- a/test/test.pl +++ b/test/test.pl @@ -278,10 +278,10 @@ test_vcf_call($opts,in=>'mpileup.X',out=>'mpileup.X.out',args=>'-mv --ploidy-file {PATH}/mpileup.ploidy -S {PATH}/mpileup.ped'); test_vcf_call($opts,in=>'mpileup.X',out=>'mpileup.X.2.out',args=>'-mv 
--ploidy-file {PATH}/mpileup.ploidy -S {PATH}/mpileup.2.samples'); test_vcf_call($opts,in=>'mpileup.NA19213.NA19129',out=>'mpileup.hwe.1.out',args=>'-mv'); -test_vcf_call($opts,in=>'mpileup.NA19213.NA19129',out=>'mpileup.hwe.1b.out',args=>'-mv -G AD:-'); +test_vcf_call($opts,in=>'mpileup.NA19213.NA19129',out=>'mpileup.hwe.1b.out',args=>'-mv -G - --group-samples-tag AD'); test_vcf_call($opts,in=>'mpileup.hwe',out=>'mpileup.hwe.2.out',args=>'-mv'); -test_vcf_call($opts,in=>'mpileup.hwe',out=>'mpileup.hwe.3.out',args=>'-mv -G AD:-'); # 21,3,0 becomes 0/0 because of the prior -P -test_vcf_call($opts,in=>'mpileup.hwe',out=>'mpileup.hwe.4.out',args=>'-mv -G AD:{PATH}/mpileup.hwe.samples'); +test_vcf_call($opts,in=>'mpileup.hwe',out=>'mpileup.hwe.3.out',args=>'-mv -G - --group-samples-tag AD'); # 21,3,0 becomes 0/0 because of the prior -P +test_vcf_call($opts,in=>'mpileup.hwe',out=>'mpileup.hwe.4.out',args=>'-mv -G {PATH}/mpileup.hwe.samples --group-samples-tag AD'); test_vcf_call_cAls($opts,in=>'mpileup',out=>'mpileup.cAls.out',tab=>'mpileup'); test_vcf_call_cAls($opts,in=>'mpileup.2',out=>'mpileup.cAls.2.out',tab=>'mpileup.2'); test_vcf_call_cAls($opts,in=>'mpileup.3',out=>'mpileup.cAls.3.out',tab=>'mpileup.3',args=>'-i'); @@ -297,7 +297,7 @@ test_vcf_call($opts,in=>'mpileup.c.X',out=>'mpileup.c.X.out',args=>'-cv --ploidy-file {PATH}/mpileup.ploidy -S {PATH}/mpileup.ped'); test_vcf_call($opts,in=>'mpileup.c.X',out=>'mpileup.c.X.2.out',args=>'-cv --ploidy-file {PATH}/mpileup.ploidy -S {PATH}/mpileup.2.samples'); test_vcf_call($opts,in=>'call-G',out=>'call-G.1.out',args=>'-mv'); -test_vcf_call($opts,in=>'call-G',out=>'call-G.2.out',args=>'-mv -G AD:-'); +test_vcf_call($opts,in=>'call-G',out=>'call-G.2.out',args=>'-mv -G - --group-samples-tag AD'); test_vcf_call($opts,in=>'call-G.2',out=>'call-G.2.1.out',args=>'-mv -F AN_POP,AC_POP'); test_vcf_call($opts,in=>'call.af-fixation',out=>'call.af-fixation.1.out',args=>'-m'); 
test_vcf_call($opts,in=>'call.af-fixation',out=>'call.af-fixation.2.out',args=>'-m -G {PATH}/call.af-fixation.txt'); diff --git a/vcfcall.c b/vcfcall.c index 0bbcadbec..1379a8f60 100644 --- a/vcfcall.c +++ b/vcfcall.c @@ -740,7 +740,6 @@ static void destroy_data(args_t *args) free(args->samples_map); free(args->sample2sex); free(args->aux.ploidy); - free(args->aux.sample_groups); free(args->str.s); if ( args->gvcf ) gvcf_destroy(args->gvcf); bcf_hdr_destroy(args->aux.hdr); @@ -892,8 +891,9 @@ static void usage(args_t *args) // fprintf(stderr, " -a, --annots LIST Add annotations: GQ,GP,PV4 (lowercase allowed). Prefixed with ^ indicates a request for\n"); // fprintf(stderr, " tag removal [^I16,^QS,^FMT/QS]\n"); fprintf(stderr, " -F, --prior-freqs AN,AC Use prior allele frequencies, determined from these pre-filled tags\n"); - fprintf(stderr, " -G, --group-samples [TAG:]FILE|- Group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling.\n"); + fprintf(stderr, " -G, --group-samples FILE|- Group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling.\n"); fprintf(stderr, " This requires FORMAT/QS or other Number=R,Type=Integer tag such as FORMAT/AD\n"); + fprintf(stderr, " --group-samples-tag TAG The tag to use with -G, by default FORMAT/QS and FORMAT/AD are checked automatically\n"); fprintf(stderr, " -g, --gvcf INT,[...] 
Group non-variant sites into gVCF blocks by minimum per-sample DP\n"); fprintf(stderr, " -i, --insert-missed Output also sites missed by mpileup but present in -T\n"); fprintf(stderr, " -M, --keep-masked-ref Keep sites with masked reference allele (REF=N)\n"); @@ -951,6 +951,7 @@ int main_vcfcall(int argc, char *argv[]) {"prior-freqs",required_argument,NULL,'F'}, {"gvcf",required_argument,NULL,'g'}, {"group-samples",required_argument,NULL,'G'}, + {"group-samples-tag",required_argument,NULL,3}, {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, {"regions",required_argument,NULL,'r'}, @@ -989,7 +990,8 @@ int main_vcfcall(int argc, char *argv[]) case 1 : ploidy = optarg; break; case 'X': ploidy = "X"; fprintf(stderr,"Warning: -X will be deprecated, please use --ploidy instead.\n"); break; case 'Y': ploidy = "Y"; fprintf(stderr,"Warning: -Y will be deprecated, please use --ploidy instead.\n"); break; - case 'G': args.aux.sample_groups = strdup(optarg); break; + case 'G': args.aux.sample_groups = optarg; break; + case 3 : args.aux.sample_groups_tag = optarg; break; case 'f': fprintf(stderr,"Warning: -f, --format-fields will be deprecated, please use -a, --annotate instead.\n"); case 'a': if (optarg[0]=='?') { list_annotations(stderr); return 1; } From 90c6048418e8bdce99603d12fde4093c93cf4a55 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 20 Jan 2021 08:33:17 +0000 Subject: [PATCH 30/81] Clarification of merge -R behavior. Resolves #1374 --- doc/bcftools.txt | 6 ++++-- vcfmerge.c | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/doc/bcftools.txt b/doc/bcftools.txt index 14530547e..c24b5f708 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -1705,10 +1705,12 @@ For "vertical" merge take a look at *<>* or *<>* *-r, --regions* 'chr'|'chr:pos'|'chr:from-to'|'chr:from-'[,...]:: - see *<>* + see *<>*. Note that unlike in other commands, + records are restricted only by position and REF overlaps are neglected. 
*-R, --regions-file* 'file':: - see *<>* + see *<>*. Note that unlike in other commands, + records are restricted only by position and REF overlaps are neglected. *--threads* 'INT':: see *<>* diff --git a/vcfmerge.c b/vcfmerge.c index fd8af8d3a..0019430da 100644 --- a/vcfmerge.c +++ b/vcfmerge.c @@ -3102,8 +3102,8 @@ static void usage(void) fprintf(stderr, " --no-version do not append version and command line to the header\n"); fprintf(stderr, " -o, --output write output to a file [standard output]\n"); fprintf(stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); - fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); + fprintf(stderr, " -r, --regions restrict by POS to comma-separated list of regions\n"); + fprintf(stderr, " -R, --regions-file restrict by POS to regions listed in a file\n"); fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); fprintf(stderr, "\n"); exit(1); From 8aa9ac13e2e3cca190b7c95cae7fe6402be78256 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 20 Jan 2021 10:02:04 +0000 Subject: [PATCH 31/81] Update mpileup test to reflect https://github.com/samtools/htslib/commit/481bd2226195d3d6f9bc79f9b80d9750b2b43b76 --- test/mpileup/indel-AD.1.out | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/test/mpileup/indel-AD.1.out b/test/mpileup/indel-AD.1.out index 11fad7757..f33bde2b1 100644 --- a/test/mpileup/indel-AD.1.out +++ b/test/mpileup/indel-AD.1.out @@ -165,24 +165,24 @@ 000000F 535 . G <*> 0 . DP=125;I16=67,42,0,0,4349,188647,0,0,6540,392400,0,0,2476,59300,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:109,0 000000F 536 . T <*> 0 . DP=125;I16=61,42,0,0,4238,186282,0,0,6180,370800,0,0,2330,55772,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:103,0 000000F 537 . A <*> 0 . 
DP=125;I16=63,46,0,0,4401,190703,0,0,6540,392400,0,0,2484,59546,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:109,0 -000000F 537 . AC A 0 . INDEL;IDV=60;IMF=0.48;DP=125;I16=38,30,31,26,216,1072,1929,68145,4080,244800,3420,205200,1555,37389,1281,30547;QS=0.531796,0.468204;VDB=0.0330029;SGB=-0.693147;MQSB=1;MQ0F=0 PL:AD 255,0,255:68,57 +000000F 537 . AC A 0 . INDEL;IDV=60;IMF=0.48;DP=125;I16=38,30,31,26,215,1051,1927,68083,4080,244800,3420,205200,1555,37389,1281,30547;QS=0.532054,0.467946;VDB=0.0330029;SGB=-0.693147;MQSB=1;MQ0F=0 PL:AD 255,0,255:68,57 000000F 538 . C <*> 0 . DP=65;I16=34,19,0,0,2157,94505,0,0,3180,190800,0,0,1211,29361,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,160,255:53,0 -000000F 538 . CT C 0 . INDEL;IDV=64;IMF=0.512;DP=125;I16=33,34,36,22,206,950,1952,68106,4020,241200,3480,208800,1535,36845,1305,31339;QS=0.524714,0.475286;VDB=0.0281315;SGB=-0.693147;MQSB=1;MQ0F=0 PL:AD 255,0,255:67,58 +000000F 538 . CT C 0 . INDEL;IDV=64;IMF=0.512;DP=125;I16=33,34,36,22,205,939,1952,68106,4020,241200,3480,208800,1535,36845,1305,31339;QS=0.524714,0.475286;VDB=0.0281315;SGB=-0.693147;MQSB=1;MQ0F=0 PL:AD 255,0,255:67,58 000000F 539 . T <*> 0 . DP=60;I16=29,24,0,0,2172,96558,0,0,3180,190800,0,0,1199,28695,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,160,255:53,0 000000F 540 . G <*> 0 . DP=124;I16=62,40,0,0,4079,174277,0,0,6120,367200,0,0,2330,56376,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:102,0 000000F 541 . G <*> 0 . DP=124;I16=63,45,0,0,4180,177706,0,0,6480,388800,0,0,2457,59503,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:108,0 -000000F 542 . T <*> 0 . DP=124;I16=61,41,0,0,3930,162288,0,0,6120,367200,0,0,2321,56439,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:102,0 +000000F 542 . T <*> 0 . DP=124;I16=61,40,0,0,3942,164588,0,0,6060,363600,0,0,2296,55814,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:101,0 000000F 543 . C <*> 0 . DP=122;I16=60,42,0,0,3946,163434,0,0,6120,367200,0,0,2369,57467,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:102,0 000000F 544 . T <*> 0 . 
DP=121;I16=60,45,0,0,4162,176870,0,0,6300,378000,0,0,2442,59388,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:105,0 000000F 545 . G <*> 0 . DP=121;I16=58,45,0,0,4038,169874,0,0,6180,370800,0,0,2421,59221,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:103,0 000000F 546 . G <*> 0 . DP=121;I16=63,47,0,0,4266,177590,0,0,6600,396000,0,0,2546,61936,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:110,0 000000F 547 . A <*> 0 . DP=119;I16=62,44,0,0,4136,173460,0,0,6360,381600,0,0,2497,60853,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:106,0 -000000F 548 . C <*> 0 . DP=119;I16=62,41,0,0,3969,162355,0,0,6180,370800,0,0,2437,59493,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:103,0 +000000F 548 . C <*> 0 . DP=119;I16=61,41,0,0,3969,163237,0,0,6120,367200,0,0,2412,58868,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:102,0 000000F 549 . C A,<*> 0 . DP=117;I16=34,24,29,19,2184,85418,1915,83023,3480,208800,2880,172800,1363,33477,1116,26886;QS=0.538404,0.461596,0;VDB=0.30007;SGB=-0.693147;RPB=0.890173;MQB=1;MQSB=1;BQB=0.513155;MQ0F=0 PL:AD 255,0,255,255,255,255:58,48,0 000000F 550 . G T,<*> 0 . DP=117;I16=59,45,0,1,4109,172725,22,484,6240,374400,60,3600,2419,58857,25,625;QS=0.994559,0.0054415,0;SGB=-0.379885;RPB=1;MQB=1;MQSB=1;BQB=1;MQ0F=0 PL:AD 0,255,255,255,255,255:104,1,0 000000F 551 . G <*> 0 . DP=116;I16=59,45,0,0,4136,173538,0,0,6240,374400,0,0,2443,59581,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:104,0 -000000F 552 . A <*> 0 . DP=116;I16=54,48,0,0,3895,157609,0,0,6120,367200,0,0,2374,57728,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:102,0 -000000F 553 . G <*> 0 . DP=116;I16=58,47,0,0,4042,165082,0,0,6300,378000,0,0,2449,59783,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:105,0 +000000F 552 . A <*> 0 . DP=116;I16=54,47,0,0,3895,160643,0,0,6060,363600,0,0,2349,57103,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:101,0 +000000F 553 . G <*> 0 . DP=116;I16=58,46,0,0,4042,167788,0,0,6240,374400,0,0,2424,59158,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:104,0 000000F 554 . A <*> 0 . 
DP=113;I16=54,47,0,0,3991,164831,0,0,6060,363600,0,0,2416,58918,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:101,0 000000F 555 . G <*> 0 . DP=112;I16=58,45,0,0,4006,164296,0,0,6180,370800,0,0,2450,59688,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:103,0 000000F 556 . A <*> 0 . DP=112;I16=55,43,0,0,3826,155856,0,0,5880,352800,0,0,2313,56193,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:98,0 @@ -196,7 +196,7 @@ 000000F 564 . G <*> 0 . DP=109;I16=58,43,0,0,3956,162670,0,0,6060,363600,0,0,2314,56052,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:101,0 000000F 565 . A <*> 0 . DP=108;I16=54,41,0,0,3766,154308,0,0,5700,342000,0,0,2151,51943,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:95,0 000000F 566 . T <*> 0 . DP=108;I16=57,44,0,0,3990,164950,0,0,6060,363600,0,0,2284,55308,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:101,0 -000000F 567 . G <*> 0 . DP=108;I16=56,42,0,0,3823,154597,0,0,5880,352800,0,0,2227,54105,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:98,0 +000000F 567 . G <*> 0 . DP=108;I16=56,41,0,0,3823,157549,0,0,5820,349200,0,0,2202,53480,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:97,0 000000F 568 . T <*> 0 . DP=106;I16=54,43,0,0,3736,148330,0,0,5820,349200,0,0,2200,53288,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:97,0 000000F 569 . C <*> 0 . DP=104;I16=53,40,0,0,3686,150380,0,0,5580,334800,0,0,2135,51653,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:93,0 000000F 570 . T <*> 0 . DP=104;I16=54,45,0,0,3862,156134,0,0,5940,356400,0,0,2268,54950,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:99,0 @@ -213,7 +213,7 @@ 000000F 581 . C <*> 0 . DP=95;I16=48,42,0,0,3504,140196,0,0,5400,324000,0,0,2033,48757,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,255,255:90,0 000000F 582 . A <*> 0 . DP=94;I16=46,37,0,0,3342,137406,0,0,4980,298800,0,0,1889,45075,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,250,255:83,0 000000F 583 . G <*> 0 . DP=90;I16=44,40,0,0,3305,134325,0,0,5040,302400,0,0,1897,45227,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,253,255:84,0 -000000F 583 . GAAAAA GAAAA 0 . 
INDEL;IDV=2;IMF=0.0222222;DP=90;I16=47,41,1,1,977,18253,38,722,5280,316800,120,7200,1993,47543,50,1250;QS=0.977475,0.0225252;VDB=5.8801e-07;SGB=-0.453602;MQSB=1;MQ0F=0 PL:AD 0,230,192:88,2 +000000F 583 . GAAAAA GAAAA 0 . INDEL;IDV=2;IMF=0.0222222;DP=90;I16=47,41,1,1,975,18221,38,722,5280,316800,120,7200,1993,47543,50,1250;QS=0.977475,0.0225252;VDB=5.8801e-07;SGB=-0.453602;MQSB=1;MQ0F=0 PL:AD 0,230,192:88,2 000000F 584 . A <*> 0 . DP=87;I16=43,38,0,0,3130,126212,0,0,4860,291600,0,0,1842,43878,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,244,255:81,0 000000F 585 . A <*> 0 . DP=88;I16=43,37,0,0,3175,129709,0,0,4800,288000,0,0,1795,42431,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,241,255:80,0 000000F 586 . A <*> 0 . DP=88;I16=38,36,0,0,2877,116309,0,0,4440,266400,0,0,1664,39564,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,223,255:74,0 @@ -221,7 +221,7 @@ 000000F 588 . A <*> 0 . DP=87;I16=38,36,0,0,2862,114152,0,0,4440,266400,0,0,1623,37917,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,223,255:74,0 000000F 589 . G <*> 0 . DP=87;I16=44,37,0,0,3066,122368,0,0,4860,291600,0,0,1755,40979,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,244,255:81,0 000000F 590 . T <*> 0 . DP=87;I16=43,36,0,0,2960,115286,0,0,4740,284400,0,0,1681,39133,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,238,255:79,0 -000000F 591 . G <*> 0 . DP=86;I16=40,40,0,0,2985,114981,0,0,4800,288000,0,0,1710,39762,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,241,255:80,0 +000000F 591 . G <*> 0 . DP=86;I16=40,39,0,0,2985,116439,0,0,4740,284400,0,0,1685,39137,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,238,255:79,0 000000F 592 . A <*> 0 . DP=85;I16=39,31,0,0,2681,104999,0,0,4200,252000,0,0,1487,34521,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,211,255:70,0 000000F 593 . C <*> 0 . DP=84;I16=40,37,0,0,2864,109790,0,0,4620,277200,0,0,1614,37268,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,232,255:77,0 000000F 594 . T <*> 0 . DP=84;I16=35,37,0,0,2788,110382,0,0,4320,259200,0,0,1503,35071,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,217,255:72,0 @@ -254,7 +254,7 @@ 000000F 621 . C <*> 0 . 
DP=49;I16=23,21,0,0,1710,68988,0,0,2640,158400,0,0,891,21065,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,132,255:44,0 000000F 622 . T <*> 0 . DP=47;I16=19,21,0,0,1570,62196,0,0,2400,144000,0,0,816,19338,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,120,255:40,0 000000F 623 . A <*> 0 . DP=46;I16=21,20,0,0,1569,61081,0,0,2460,147600,0,0,812,19118,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,123,255:41,0 -000000F 624 . T <*> 0 . DP=45;I16=18,20,0,0,1438,55408,0,0,2280,136800,0,0,767,17931,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,114,255:38,0 +000000F 624 . T <*> 0 . DP=45;I16=19,19,0,0,1450,56440,0,0,2280,136800,0,0,742,17306,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,114,255:38,0 000000F 625 . C <*> 0 . DP=43;I16=17,21,0,0,1422,54590,0,0,2280,136800,0,0,788,18620,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,114,255:38,0 000000F 626 . T <*> 0 . DP=42;I16=16,19,0,0,1335,51875,0,0,2100,126000,0,0,750,17770,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,105,255:35,0 000000F 627 . A <*> 0 . DP=42;I16=17,21,0,0,1410,54084,0,0,2280,136800,0,0,781,18335,0,0;QS=1,0;MQSB=1;MQ0F=0 PL:AD 0,114,255:38,0 From 30dbc3c39ed92d3617fb0737ba533b1c97c3907e Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 20 Jan 2021 10:06:38 +0000 Subject: [PATCH 32/81] Make trio-dnm2 output log-scaled scores by default; add new `-u pns` parental noise threshold applied only if the variant allele is observed in a single parent, not both; autodetected output file type from the suffix (e.g. .bcf, .vcf, .vcf.gz) --- bcftools.h | 1 + plugins/trio-dnm2.c | 97 ++++++++++++++++++++++------------ test/trio-dnm/trio-dnm.4.1.out | 2 +- version.c | 10 +++- 4 files changed, 73 insertions(+), 37 deletions(-) diff --git a/bcftools.h b/bcftools.h index 07f1db35e..08e2a6768 100644 --- a/bcftools.h +++ b/bcftools.h @@ -50,6 +50,7 @@ void error_errno(const char *format, ...) 
HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd); const char *hts_bcf_wmode(int file_type); +const char *hts_bcf_wmode2(int file_type, char *fname); void *smalloc(size_t size); // safe malloc diff --git a/plugins/trio-dnm2.c b/plugins/trio-dnm2.c index af696239e..8b47855bd 100644 --- a/plugins/trio-dnm2.c +++ b/plugins/trio-dnm2.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2018-2020 Genome Research Ltd. + Copyright (c) 2018-2021 Genome Research Ltd. Author: Petr Danecek @@ -96,7 +96,8 @@ typedef struct char *dnm_score_tag; // the argument of --use tag, by default DNM:int int dnm_score_is_float; // given by e.g. --use tag DNM:float double mrate; // --use mrate, mutation rate - double pnoise_abs,pnoise_frac; // --use pn|pnoise + double pnoise_abs,pnoise_frac; // --use pn|pnoise or --use pns + int pnoise_strict; // set to 1 if pns was used or 0 if pn int use_ppl, use_ppl_qs; // --use ppl and --use ppl-qs int use_dng_priors; // --use dng-priors priors_t priors, priors_X, priors_XX; @@ -117,26 +118,27 @@ static const char *usage_text(void) "About: Screen variants for possible de-novo mutations in trios\n" "Usage: bcftools +trio-dnm2 [Plugin Options]\n" "Plugin options:\n" - " -e, --exclude EXPR exclude trios for which the expression is true (one matching sample invalidates a trio)\n" - " --force-AD calculate VAF even if the number of FMT/AD fields is incorrect. Use at your own risk!\n" - " -i, --include EXPR include trios for which the expression is true (one failing samples invalidates a trio)\n" - " -m, --min-score NUM do not add FMT/DNM annotation if the score is smaller than NUM\n" - " -o, --output FILE output file name [stdout]\n" + " -e, --exclude EXPR Exclude trios for which the expression is true (one matching sample invalidates a trio)\n" + " --force-AD Calculate VAF even if the number of FMT/AD fields is incorrect. 
Use at your own risk!\n" + " -i, --include EXPR Include trios for which the expression is true (one failing samples invalidates a trio)\n" + " -m, --min-score NUM Do not add FMT/DNM annotation if the score is smaller than NUM\n" + " -o, --output FILE Output file name [stdout]\n" " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n" - " -p, --pfm [1X:|2X:]P,F,M sample names of child (the proband), father, mother; \"1X:\" for male pattern of chrX inheritance [2X:]\n" + " -p, --pfm [1X:|2X:]P,F,M Sample names of child (the proband), father, mother; \"1X:\" for male pattern of chrX inheritance [2X:]\n" " -P, --ped FILE PED file with the columns: ,proband,father,mother,sex(1:male,2:female)\n" - " -r, --regions REG restrict to comma-separated list of regions\n" - " -R, --regions-file FILE restrict to regions listed in a file\n" - " -t, --targets REG similar to -r but streams rather than index-jumps\n" - " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" - " -u, --use OPTION[=VALUE] various options to tweak:\n" - " DNG use the original DeNovoGear model\n" - " dng-priors use the original DeNovoGear priors\n" - " mrate=NUM mutation rate for DNG and AC-DNG models [-u mrate=1e-8]\n" - " pn|pnoise=FRAC[,NUM] noise tolerance (or mosaicity) in parents, given as fraction of QS or number of reads [-u pn=0.045,0]\n" - " ppl use parental genotype likelihoods (FMT/PL rather than FMT/QS)\n" - " tag=TAG[:phred|log] annotation to add, either as phred quality (int) or log-scaled (float) [-u tag=DNM:phred]\n" - " -X, --chrX LIST regions with the chr X inheritance pattern or one of the predefined lists, exclude PARs [GRCh37]\n" + " -r, --regions REG Restrict to comma-separated list of regions\n" + " -R, --regions-file FILE Restrict to regions listed in a file\n" + " -t, --targets REG Similar to -r but streams rather than index-jumps\n" + " -T, --targets-file FILE Similar to -R but streams rather than 
index-jumps\n" + " -u, --use OPTION[=VALUE] Various options to tweak:\n" + " DNG Use the original DeNovoGear model, implies -u dng-priors\n" + " dng-priors Use the original DeNovoGear priors (including bugs in prior assignment)\n" + " mrate=NUM Mutation rate for DNG and AC-DNG models [-u mrate=1e-8]\n" + " pn=FRAC[,NUM] Tolerance to parental noise or mosaicity, given as fraction of QS or number of reads [-u pn=0,0]\n" + " pns=FRAC[,NUM] Same as `pn` but is not applied to alleles observed in both parents [-u pns=0.045,0]\n" + " ppl Use parental genotype likelihoods (FMT/PL rather than FMT/QS)\n" + " tag=TAG[:phred|log] Annotation to add, either as phred quality (int) or log-scaled (float) [-u tag=DNM:log]\n" + " -X, --chrX LIST Regions with the chr X inheritance pattern or one of the predefined lists, exclude PARs [GRCh37]\n" " GRCh37 .. X:1-60000,chrX:1-60000,X:2699521-154931043,chrX:2699521-154931043\n" " GRCh38 .. X:1-9999,chrX:1-9999,X:2781480-155701381,chrX:2781480-155701381\n" "\n" @@ -216,11 +218,21 @@ static void parse_ped(args_t *args, char *fname) } while ( hts_getline(fp, KS_SEP_LINE, &str)>=0 ); - fprintf(stderr,"Identified %d complete trio%s in the VCF file\n", args->ntrio,args->ntrio==1?"":"s"); - // sort the sample by index so that they are accessed more or less sequentially qsort(args->trio,args->ntrio,sizeof(trio_t),cmp_trios); + // check for duplicates + int i; + for (i=1; intrio; i++) + { + trio_t *ta = &args->trio[i-1]; + trio_t *tb = &args->trio[i]; + if ( ta->idx[0]==tb->idx[0] && ta->idx[1]==tb->idx[1] && ta->idx[2]==tb->idx[2] ) + error("Error: duplicate trio entries detected in the PED file: %s\n",fname); + } + + fprintf(stderr,"Identified %d complete trio%s in the VCF file\n", args->ntrio,args->ntrio==1?"":"s"); + free(str.s); free(off); if ( hts_close(fp)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,fname); @@ -646,7 +658,7 @@ static void init_data(args_t *args) args->chrX_idx = regidx_init_string(rmme, regidx_parse_reg, NULL, 0, NULL); free(rmme); - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); + args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname)); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); @@ -741,7 +753,7 @@ static double process_trio_ACM(args_t *args, priors_t *priors, int nals, double else if ( fa==fb ) fpl += qs[iFATHER][i]; } - } + } int mi = 0; for (ma=0; mapprob[fi][mi][ci]; sum = sum_log(sum,val); +#define DEBUG 0 #if DEBUG if(val!=-HUGE_VAL) fprintf(stderr,"m,f,c: %d%d+%d%d=%d%d dn=%d (%d,%d,%d) mpl,fpl,cpl: %+e %+e %+e \t prior:%+e \t pval=%+e sum=%+e %c\n", @@ -961,7 +974,7 @@ static void process_record(args_t *args, bcf1_t *rec) { if ( args->filter && !args->trio[i].pass ) continue; - // Samples can be in any other in the VCF, set PL and QS to reflect the iFATHER,iMOTHER,iCHILD indices + // Samples can be in any order in the VCF, set PL and QS to reflect the iFATHER,iMOTHER,iCHILD indices double *ppl[3]; double *pqs[3]; for (j=0; j<3; j++) // set trio PLs @@ -974,30 +987,41 @@ static void process_record(args_t *args, bcf1_t *rec) } if ( args->use_model&USE_ACM ) // set trio QS { + int32_t *ad_f = NULL, *ad_m = NULL; + if ( args->pnoise_strict && args->ad ) + { + // apply noise tolerance for alleles observed in a single parent only + ad_f = args->ad + n_ad * args->trio[i].idx[iFATHER]; + ad_m = args->ad + n_ad * args->trio[i].idx[iMOTHER]; + } for (j=0; j<3; j++) { int32_t *ad = (args->pnoise_abs && args->ad ) ? 
args->ad + n_ad * args->trio[i].idx[j] : NULL; int32_t *qs = args->qs + nqs1 * args->trio[i].idx[j]; - double *dst = pqs[j] = args->qs3 + j*nqs1; + pqs[j] = args->qs3 + j*nqs1; double noise_tolerance = 0; + double sum_qs = 0, sum_ad = 0; if ( j!=iCHILD ) { - double sum_qs = 0, sum_ad = 0; for (k=0; kpnoise_frac; if ( ad ) { for (k=0; kpnoise_abs * sum_qs / sum_ad ) - noise_tolerance = args->pnoise_abs * sum_qs / sum_ad; + if ( args->pnoise_abs ) + { + if ( noise_tolerance < args->pnoise_abs * sum_qs / sum_ad ) + noise_tolerance = args->pnoise_abs * sum_qs / sum_ad; + } } } for (k=0; kpnoise_strict || !ad_f[k] || !ad_m[k] ) val -= noise_tolerance; if ( val < 0 ) val = 0; if ( val > 255 ) val = 255; - dst[k] = phred2log(val); + pqs[j][k] = phred2log(val); } } if ( args->use_ppl_qs ) @@ -1073,9 +1097,9 @@ static void set_option(args_t *args, char *optarg) args->mrate = strtod(val,&tmp); if ( *tmp ) error("Could not parse: -u %s\n", optarg); } - else if ( !strcasecmp(opt,"pn") || !strcasecmp(opt,"pnoise") ) + else if ( !strcasecmp(opt,"pn") || !strcasecmp(opt,"pnoise") || !strcasecmp(opt,"pns") ) { - if ( !val ) error("Error: expected value with -u pnoise, e.g. -u pnoise=0.05\n"); + if ( !val ) error("Error: expected value with -u %s, e.g. -u %s=0.05\n",opt,opt); args->pnoise_frac = strtod(val,&tmp); if ( *tmp && *tmp==',' ) { @@ -1084,6 +1108,7 @@ static void set_option(args_t *args, char *optarg) } if ( args->pnoise_frac<0 || args->pnoise_frac>1 ) error("Error: expected value from the interval [0,1] for -u %s\n", optarg); if ( args->pnoise_abs<0 ) error("Error: expected positive value for -u %s\n", optarg); + args->pnoise_strict = !strcasecmp(opt,"pn") ? 
0 : 1; } else if ( !strcasecmp(opt,"DNG") ) { args->use_model = USE_DNG; args->use_dng_priors = 1; } else if ( !strcasecmp(opt,"dng-priors") ) args->use_dng_priors = 1; @@ -1104,9 +1129,11 @@ int run(int argc, char **argv) args->output_fname = "-"; args->dnm_score_tag = strdup("DNM:phred"); args->mrate = 1e-8; - args->pnoise_frac = 0.045; - args->pnoise_abs = 0; + args->pnoise_frac = 0.045; + args->pnoise_abs = 0; + args->pnoise_strict = 1; args->use_model = USE_ACM; + args->dnm_score_is_float = 1; static struct option loptions[] = { {"chrX",required_argument,0,'X'}, diff --git a/test/trio-dnm/trio-dnm.4.1.out b/test/trio-dnm/trio-dnm.4.1.out index 817198513..77de513cb 100644 --- a/test/trio-dnm/trio-dnm.4.1.out +++ b/test/trio-dnm/trio-dnm.4.1.out @@ -1 +1 @@ - 255 . . 55 0 0 + 0 . . 55 0 0 diff --git a/version.c b/version.c index 19cec91ac..abbeb5033 100644 --- a/version.c +++ b/version.c @@ -60,7 +60,6 @@ void error_errno(const char *format, ...) exit(-1); } - const char *hts_bcf_wmode(int file_type) { if ( file_type == FT_BCF ) return "wbu"; // uncompressed BCF @@ -69,4 +68,13 @@ const char *hts_bcf_wmode(int file_type) return "w"; // uncompressed VCF } +const char *hts_bcf_wmode2(int file_type, char *fname) +{ + int len = strlen(fname); + if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) return hts_bcf_wmode(FT_BCF|FT_GZ); + if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) return hts_bcf_wmode(FT_VCF); + if ( len >= 7 && !strcasecmp(".vcf.gz",fname+len-7) ) return hts_bcf_wmode(FT_VCF|FT_GZ); + if ( len >= 8 && !strcasecmp(".vcf.bgz",fname+len-8) ) return hts_bcf_wmode(FT_VCF|FT_GZ); + return hts_bcf_wmode(file_type); +} From 7496b17e2c17704accdb7fdc858689e0d96fa79b Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 20 Jan 2021 10:24:30 +0000 Subject: [PATCH 33/81] Autodetected output file type from the output file suffix (e.g. 
.bcf, .vcf, .vcf.gz) --- csq.c | 4 ++-- mpileup.c | 4 ++-- plugins/contrast.c | 4 ++-- plugins/gvcfz.c | 4 ++-- plugins/isecGT.c | 4 ++-- plugins/mendelian.c | 4 ++-- plugins/prune.c | 4 ++-- plugins/remove-overlaps.c | 4 ++-- plugins/scatter.c | 4 ++-- plugins/split-vep.c | 4 ++-- plugins/split.c | 4 ++-- vcfannotate.c | 4 ++-- vcfcall.c | 4 ++-- vcfconcat.c | 4 ++-- vcfconvert.c | 14 +++++++------- vcffilter.c | 4 ++-- vcfisec.c | 6 +++--- vcfmerge.c | 4 ++-- vcfnorm.c | 4 ++-- vcfplugin.c | 4 ++-- vcfsort.c | 4 ++-- version.c | 3 ++- 22 files changed, 50 insertions(+), 49 deletions(-) diff --git a/csq.c b/csq.c index 4c01a14d1..f1b3aa4a9 100644 --- a/csq.c +++ b/csq.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2016-2020 Genome Research Ltd. + Copyright (c) 2016-2021 Genome Research Ltd. Author: Petr Danecek @@ -1404,7 +1404,7 @@ void init_data(args_t *args) } else { - args->out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type)); + args->out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode2(args->output_type,args->output_fname)); if ( args->out_fh == NULL ) error("[%s] Error: cannot write to %s: %s\n", __func__,args->output_fname? args->output_fname : "standard output", strerror(errno)); if ( args->n_threads > 0) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->sr->p); diff --git a/mpileup.c b/mpileup.c index 324ef577f..75ba5873d 100644 --- a/mpileup.c +++ b/mpileup.c @@ -1,6 +1,6 @@ /* mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools - Copyright (C) 2008-2020 Genome Research Ltd. + Copyright (C) 2008-2021 Genome Research Ltd. Portions copyright (C) 2009-2012 Broad Institute. 
Author: Heng Li @@ -499,7 +499,7 @@ static int mpileup(mplp_conf_t *conf) fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, conf->gplp->n, conf->nfiles); // write the VCF header - conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode(conf->output_type)); + conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode2(conf->output_type,conf->output_fname)); if (conf->bcf_fp == NULL) { fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno)); exit(EXIT_FAILURE); diff --git a/plugins/contrast.c b/plugins/contrast.c index bdc30483f..81f914437 100644 --- a/plugins/contrast.c +++ b/plugins/contrast.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2018-2020 Genome Research Ltd. + Copyright (c) 2018-2021 Genome Research Ltd. Author: Petr Danecek @@ -213,7 +213,7 @@ static void init_data(args_t *args) read_sample_list_or_file(args->hdr, args->control_samples_str, &args->control_smpl, &args->ncontrol_smpl, args->force_samples); read_sample_list_or_file(args->hdr, args->case_samples_str, &args->case_smpl, &args->ncase_smpl, args->force_samples); - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); + args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname)); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); diff --git a/plugins/gvcfz.c b/plugins/gvcfz.c index f169d3977..07d826c23 100644 --- a/plugins/gvcfz.c +++ b/plugins/gvcfz.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2017 Genome Research Ltd. + Copyright (C) 2017-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -366,7 +366,7 @@ int run(int argc, char **argv) if ( args->filter_str ) args->filter = filter_init(args->hdr_in, args->filter_str); init_groups(args); - args->fh_out = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); + args->fh_out = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname)); if ( bcf_hdr_write(args->fh_out, args->hdr_out)!=0 ) error("Failed to write the header\n"); while ( bcf_sr_next_line(args->sr) ) process_gvcf(args); flush_block(args, NULL); diff --git a/plugins/isecGT.c b/plugins/isecGT.c index d256c58ef..5383045e9 100644 --- a/plugins/isecGT.c +++ b/plugins/isecGT.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2016 Genome Research Ltd. + Copyright (C) 2016-2021 Genome Research Ltd. Author: Petr Danecek @@ -130,7 +130,7 @@ int run(int argc, char **argv) args->hdr_a = bcf_sr_get_header(args->sr,0); args->hdr_b = bcf_sr_get_header(args->sr,1); smpl_ilist_t *smpl = smpl_ilist_map(args->hdr_a, args->hdr_b, SMPL_STRICT); - args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); + args->out_fh = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname)); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( bcf_hdr_write(args->out_fh, args->hdr_a)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); diff --git a/plugins/mendelian.c b/plugins/mendelian.c index f75dce052..d1fe4442f 100644 --- a/plugins/mendelian.c +++ b/plugins/mendelian.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2015-2018 Genome Research Ltd. + Copyright (c) 2015-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -441,7 +441,7 @@ int run(int argc, char **argv) args.hdr = bcf_sr_get_header(args.sr, 0); if ( args.mode!=MODE_COUNT ) { - args.out_fh = hts_open(args.output_fname,hts_bcf_wmode(args.output_type)); + args.out_fh = hts_open(args.output_fname,hts_bcf_wmode2(args.output_type,args.output_fname)); if ( args.out_fh == NULL ) error("Can't write to \"%s\": %s\n", args.output_fname, strerror(errno)); if ( args.mode&MODE_ANNOTATE ) bcf_hdr_append(args.hdr, "##INFO="); diff --git a/plugins/prune.c b/plugins/prune.c index 59a680bc6..5d669f17b 100644 --- a/plugins/prune.c +++ b/plugins/prune.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2017-2020 Genome Research Ltd. + Copyright (C) 2017-2021 Genome Research Ltd. Author: Petr Danecek @@ -128,7 +128,7 @@ static void init_data(args_t *args) if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); args->hdr = bcf_sr_get_header(args->sr,0); - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); + args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname)); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->ld_filter && strcmp(".",args->ld_filter) ) diff --git a/plugins/remove-overlaps.c b/plugins/remove-overlaps.c index cc786eaf0..034bd49f2 100644 --- a/plugins/remove-overlaps.c +++ b/plugins/remove-overlaps.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2017-2020 Genome Research Ltd. + Copyright (C) 2017-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -95,7 +95,7 @@ static void init_data(args_t *args) if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); args->hdr = bcf_sr_get_header(args->sr,0); - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); + args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname)); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); diff --git a/plugins/scatter.c b/plugins/scatter.c index 624928b54..7ce524fef 100644 --- a/plugins/scatter.c +++ b/plugins/scatter.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (C) 2020 Giulio Genovese + Copyright (C) 2020-2021 Giulio Genovese Author: Giulio Genovese @@ -175,7 +175,7 @@ static void open_set(subset_t *set, args_t *args) if ( args->output_type & FT_BCF ) kputs(".bcf", &args->str); else if ( args->output_type & FT_GZ ) kputs(".vcf.gz", &args->str); else kputs(".vcf", &args->str); - set->fh = hts_open(args->str.s, hts_bcf_wmode(args->output_type)); + set->fh = hts_open(args->str.s, hts_bcf_wmode2(args->output_type,args->str.s)); if ( set->fh == NULL ) error("[%s] Error: cannot write to \"%s\": %s\n", __func__, args->str.s, strerror(errno)); if ( args->n_threads > 0) hts_set_opt(set->fh, HTS_OPT_THREAD_POOL, args->sr->p); diff --git a/plugins/split-vep.c b/plugins/split-vep.c index 73dee9559..acee389cb 100644 --- a/plugins/split-vep.c +++ b/plugins/split-vep.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2019-2020 Genome Research Ltd. + Copyright (c) 2019-2021 Genome Research Ltd. Author: Petr Danecek @@ -1067,7 +1067,7 @@ int run(int argc, char **argv) args->fh_bgzf = bgzf_open(args->output_fname, args->output_type&FT_GZ ? 
"wg" : "wu"); else { - args->fh_vcf = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); + args->fh_vcf = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname)); if ( args->record_cmd_line ) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_split-vep"); if ( bcf_hdr_write(args->fh_vcf, args->hdr_out)!=0 ) error("Failed to write the header to %s\n", args->output_fname); } diff --git a/plugins/split.c b/plugins/split.c index 4211fde18..f2a5c54bc 100644 --- a/plugins/split.c +++ b/plugins/split.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2017-2020 Genome Research Ltd. + Copyright (C) 2017-2021 Genome Research Ltd. Author: Petr Danecek @@ -429,7 +429,7 @@ static void init_data(args_t *args) if ( args->output_type & FT_BCF ) kputs(".bcf", &str); else if ( args->output_type & FT_GZ ) kputs(".vcf.gz", &str); else kputs(".vcf", &str); - set->fh = hts_open(str.s, hts_bcf_wmode(args->output_type)); + set->fh = hts_open(str.s, hts_bcf_wmode2(args->output_type,str.s)); if ( set->fh == NULL ) error("[%s] Error: cannot write to \"%s\": %s\n", __func__, str.s, strerror(errno)); if ( args->hts_opts ) { diff --git a/vcfannotate.c b/vcfannotate.c index ddb5a88f6..238120ac7 100644 --- a/vcfannotate.c +++ b/vcfannotate.c @@ -1,6 +1,6 @@ /* vcfannotate.c -- Annotate and edit VCF/BCF files. - Copyright (C) 2013-2020 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -2635,7 +2635,7 @@ static void init_data(args_t *args) if ( args->rename_chrs ) rename_chrs(args, args->rename_chrs); if ( args->rename_annots ) rename_annots(args, args->rename_annots); - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); + args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname)); if ( args->out_fh == NULL ) error("[%s] Error: cannot write to \"%s\": %s\n", __func__,args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); diff --git a/vcfcall.c b/vcfcall.c index 1379a8f60..03cd918bd 100644 --- a/vcfcall.c +++ b/vcfcall.c @@ -1,6 +1,6 @@ /* vcfcall.c -- SNP/indel variant calling from VCF/BCF. - Copyright (C) 2013-2020 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -687,7 +687,7 @@ static void init_data(args_t *args) if ( args->aux.flag & CALL_CONSTR_ALLELES ) args->vcfbuf = vcfbuf_init(args->aux.hdr, 0); - args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); + args->out_fh = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname)); if ( args->out_fh == NULL ) error("Error: cannot write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); diff --git a/vcfconcat.c b/vcfconcat.c index 4bdfb51d3..185e527af 100644 --- a/vcfconcat.c +++ b/vcfconcat.c @@ -1,6 +1,6 @@ /* vcfconcat.c -- Concatenate or combine VCF/BCF files. - Copyright (C) 2013-2019 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -116,7 +116,7 @@ static void init_data(args_t *args) bcf_hdr_append(args->out_hdr,"##FORMAT="); } if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat"); - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); + args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname)); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->allow_overlaps || args->phased_concat ) { diff --git a/vcfconvert.c b/vcfconvert.c index 93b212628..3694a29da 100644 --- a/vcfconvert.c +++ b/vcfconvert.c @@ -1,6 +1,6 @@ /* vcfconvert.c -- convert between VCF/BCF and related formats. - Copyright (C) 2013-2020 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -394,7 +394,7 @@ static void gensample_to_vcf(args_t *args) for (i=0; ioutfname,hts_bcf_wmode(args->output_type)); + htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname)); if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname); @@ -522,7 +522,7 @@ static void haplegendsample_to_vcf(args_t *args) for (i=0; ioutfname,hts_bcf_wmode(args->output_type)); + htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname)); if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->outfname); @@ -636,7 +636,7 @@ static void hapsample_to_vcf(args_t *args) for (i=0; ioutfname,hts_bcf_wmode(args->output_type)); + htsFile 
*out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname)); if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); @@ -1224,7 +1224,7 @@ static void tsv_to_vcf(args_t *args) bcf_hdr_add_sample(args->header, NULL); args->gts = (int32_t *) malloc(sizeof(int32_t)*n*2); - htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type)); + htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname)); if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if ( bcf_hdr_write(out_fh,args->header)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname); @@ -1276,7 +1276,7 @@ static void tsv_to_vcf(args_t *args) static void vcf_to_vcf(args_t *args) { open_vcf(args,NULL); - htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type)); + htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname)); if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); @@ -1305,7 +1305,7 @@ static void gvcf_to_vcf(args_t *args) if ( !args->ref ) error("Could not load the fai index for reference %s\n", args->ref_fname); open_vcf(args,NULL); - htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode(args->output_type)); + htsFile *out_fh = hts_open(args->outfname,hts_bcf_wmode2(args->output_type,args->outfname)); if ( out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->outfname, strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); diff --git a/vcffilter.c b/vcffilter.c index b725ed087..1ac555564 100644 --- 
a/vcffilter.c +++ b/vcffilter.c @@ -1,6 +1,6 @@ /* vcffilter.c -- Apply fixed-threshold filters. - Copyright (C) 2013-2020 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -80,7 +80,7 @@ args_t; static void init_data(args_t *args) { - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); + args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname)); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); diff --git a/vcfisec.c b/vcfisec.c index 261841ce6..4cad61ded 100644 --- a/vcfisec.c +++ b/vcfisec.c @@ -1,6 +1,6 @@ /* vcfisec.c -- Create intersections, unions and complements of VCF files. - Copyright (C) 2012-2019 Genome Research Ltd. + Copyright (C) 2012-2021 Genome Research Ltd. Author: Petr Danecek @@ -141,7 +141,7 @@ void isec_vcf(args_t *args) if ( args->targets_list && files->nreaders==1 ) out_std = 1; if ( out_std ) { - out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type)); + out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode2(args->output_type,args->output_fname)); if ( out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? 
args->output_fname : "standard output", strerror(errno)); if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads); if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec"); @@ -356,7 +356,7 @@ static void init_data(args_t *args) #define OPEN_FILE(i,j) { \ open_file(&args->fnames[i], NULL, "%s/%04d.%s", args->prefix, i, suffix); \ - args->fh_out[i] = hts_open(args->fnames[i], hts_bcf_wmode(args->output_type)); \ + args->fh_out[i] = hts_open(args->fnames[i], hts_bcf_wmode2(args->output_type,args->fnames[i])); \ if ( !args->fh_out[i] ) error("Could not open %s\n", args->fnames[i]); \ if ( args->n_threads ) hts_set_threads(args->fh_out[i], args->n_threads); \ if (args->record_cmd_line) bcf_hdr_append_version(args->files->readers[j].header,args->argc,args->argv,"bcftools_isec"); \ diff --git a/vcfmerge.c b/vcfmerge.c index 0019430da..f31fe98b1 100644 --- a/vcfmerge.c +++ b/vcfmerge.c @@ -1,6 +1,6 @@ /* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file. - Copyright (C) 2012-2020 Genome Research Ltd. + Copyright (C) 2012-2021 Genome Research Ltd. Author: Petr Danecek @@ -3007,7 +3007,7 @@ void hdr_add_localized_tags(args_t *args, bcf_hdr_t *hdr) } void merge_vcf(args_t *args) { - args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); + args->out_fh = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname)); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); //hts_set_threads(args->out_fh, args->n_threads); args->out_hdr = bcf_hdr_init("w"); diff --git a/vcfnorm.c b/vcfnorm.c index bd30b57e9..24e68e2a6 100644 --- a/vcfnorm.c +++ b/vcfnorm.c @@ -1,6 +1,6 @@ /* vcfnorm.c -- Left-align and normalize indels. - Copyright (C) 2013-2020 Genome Research Ltd. 
+ Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -1917,7 +1917,7 @@ static void normalize_line(args_t *args, bcf1_t **line_ptr) static void normalize_vcf(args_t *args) { - htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); + htsFile *out = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname)); if ( out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_opt(out, HTS_OPT_THREAD_POOL, args->files->p); diff --git a/vcfplugin.c b/vcfplugin.c index e4f0a8124..74a96e08b 100644 --- a/vcfplugin.c +++ b/vcfplugin.c @@ -1,6 +1,6 @@ /* vcfplugin.c -- plugin modules for operating on VCF/BCF files. - Copyright (C) 2013-2020 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -522,7 +522,7 @@ static void init_data(args_t *args) if (args->record_cmd_line) bcf_hdr_append_version(args->hdr_out, args->argc, args->argv, "bcftools_plugin"); if ( !args->drop_header ) { - args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); + args->out_fh = hts_open(args->output_fname,hts_bcf_wmode2(args->output_type,args->output_fname)); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); if ( bcf_hdr_write(args->out_fh, args->hdr_out)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); diff --git a/vcfsort.c b/vcfsort.c index ea63ab8d6..f56dbed3e 100644 --- a/vcfsort.c +++ b/vcfsort.c @@ -1,6 +1,6 @@ /* vcfsort.c -- sort subcommand - Copyright (C) 2017-2020 Genome Research Ltd. + Copyright (C) 2017-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -227,7 +227,7 @@ void merge_blocks(args_t *args) blk_read(args, bhp, args->hdr, blk); } - htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); + htsFile *out = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname)); if ( bcf_hdr_write(out, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,args->output_fname); while ( bhp->ndat ) { diff --git a/version.c b/version.c index abbeb5033..b340373f8 100644 --- a/version.c +++ b/version.c @@ -1,6 +1,6 @@ /* version.c -- report version numbers for plugins. - Copyright (C) 2014 Genome Research Ltd. + Copyright (C) 2014-2021 Genome Research Ltd. Author: Petr Danecek @@ -70,6 +70,7 @@ const char *hts_bcf_wmode(int file_type) const char *hts_bcf_wmode2(int file_type, char *fname) { + if ( !fname ) return hts_bcf_wmode(file_type); int len = strlen(fname); if ( len >= 4 && !strcasecmp(".bcf",fname+len-4) ) return hts_bcf_wmode(FT_BCF|FT_GZ); if ( len >= 4 && !strcasecmp(".vcf",fname+len-4) ) return hts_bcf_wmode(FT_VCF); From 4f733bc23a2ca24080aa2bc8a0c8fc1d391d3016 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 20 Jan 2021 10:54:16 +0000 Subject: [PATCH 34/81] Make `merge -R` behavior consistent with other commands and pull in overlapping records with POS outside of the regions. Tentative fix for #1374, reverts documentation change of 8aa9ac13e2 --- doc/bcftools.txt | 6 ++---- vcfmerge.c | 10 +++------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/doc/bcftools.txt b/doc/bcftools.txt index c24b5f708..14530547e 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -1705,12 +1705,10 @@ For "vertical" merge take a look at *<>* or *<>* *-r, --regions* 'chr'|'chr:pos'|'chr:from-to'|'chr:from-'[,...]:: - see *<>*. Note that unlike in other commands, - records are restricted only by position and REF overlaps are neglected. + see *<>* *-R, --regions-file* 'file':: - see *<>*. 
Note that unlike in other commands, - records are restricted only by position and REF overlaps are neglected. + see *<>* *--threads* 'INT':: see *<>* diff --git a/vcfmerge.c b/vcfmerge.c index f31fe98b1..637e1b910 100644 --- a/vcfmerge.c +++ b/vcfmerge.c @@ -2901,13 +2901,9 @@ void stage_line(args_t *args) void merge_line(args_t *args) { - if ( args->regs ) - { - if ( !regidx_overlap(args->regs,args->maux->chr,args->maux->pos,args->maux->pos,NULL) ) return; - } - bcf1_t *out = args->out_line; merge_chrom2qual(args, out); + if ( args->regs && !regidx_overlap(args->regs,args->maux->chr,out->pos,out->pos+out->rlen-1,NULL) ) return; merge_filter(args, out); merge_info(args, out); if ( args->do_gvcf ) @@ -3102,8 +3098,8 @@ static void usage(void) fprintf(stderr, " --no-version do not append version and command line to the header\n"); fprintf(stderr, " -o, --output write output to a file [standard output]\n"); fprintf(stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); - fprintf(stderr, " -r, --regions restrict by POS to comma-separated list of regions\n"); - fprintf(stderr, " -R, --regions-file restrict by POS to regions listed in a file\n"); + fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); + fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); fprintf(stderr, " --threads use multithreading with worker threads [0]\n"); fprintf(stderr, "\n"); exit(1); From b66541ec4b84433e522768707bad373b15514c84 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Fri, 22 Jan 2021 15:02:46 +0000 Subject: [PATCH 35/81] Add new `+prune -N, --nsites-per-win-mode` mode. 
Resolves #1050 --- plugins/prune.c | 20 ++++++++++++++++--- test/prune.1.6.out | 11 +++++++++++ test/prune.1.7.out | 11 +++++++++++ test/test.pl | 4 +++- vcfbuf.c | 49 +++++++++++++++++++++++++++++++++++++++------- vcfbuf.h | 3 ++- 6 files changed, 86 insertions(+), 12 deletions(-) create mode 100644 test/prune.1.6.out create mode 100644 test/prune.1.7.out diff --git a/plugins/prune.c b/plugins/prune.c index 5d669f17b..baecf0e50 100644 --- a/plugins/prune.c +++ b/plugins/prune.c @@ -61,6 +61,7 @@ typedef struct char *ld_annot[VCFBUF_LD_N], *ld_annot_pos[VCFBUF_LD_N]; int ld_mask; int argc, region_is_file, target_is_file, output_type, ld_filter_id, rand_missing, nsites, ld_win; + char *nsites_mode; int keep_sites; char **argv, *region, *target, *fname, *output_fname, *ld_filter; htsFile *out_fh; @@ -89,7 +90,8 @@ static const char *usage_text(void) " -i, --include EXPR include only sites for which the expression is true\n" " -k, --keep-sites leave sites filtered by -i/-e unchanged instead of discarding them\n" " -m, --max [r2|LD=]FLOAT remove sites with r2 or Lewontin's D bigger than FLOAT within the -w window\n" - " -n, --nsites-per-win N keep at most N sites in the -w window, removing sites with small AF first\n" + " -n, --nsites-per-win N keep at most N sites in the -w window. 
See also -N, --nsites-per-win-mode\n" + " -N, --nsites-per-win-mode STR keep sites with biggest AF (\"maxAF\"); sites that come first (\"1st\"); pick randomly (\"rand\") [maxAF]\n" " -o, --output FILE write output to the FILE [standard output]\n" " -O, --output-type b|u|z|v b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n" " --randomize-missing replace missing data with randomly assigned genotype based on site's allele frequency\n" @@ -185,7 +187,11 @@ static void init_data(args_t *args) if ( args->ld_max_set[VCFBUF_LD_IDX_LD] ) vcfbuf_set_opt(args->vcfbuf,double,LD_MAX_LD,args->ld_max[VCFBUF_LD_IDX_LD]); if ( args->ld_max_set[VCFBUF_LD_IDX_HD] ) vcfbuf_set_opt(args->vcfbuf,double,LD_MAX_HD,args->ld_max[VCFBUF_LD_IDX_HD]); if ( args->rand_missing ) vcfbuf_set_opt(args->vcfbuf,int,LD_RAND_MISSING,1); - if ( args->nsites ) vcfbuf_set_opt(args->vcfbuf,int,VCFBUF_NSITES,args->nsites); + if ( args->nsites ) + { + vcfbuf_set_opt(args->vcfbuf,int,VCFBUF_NSITES,args->nsites); + vcfbuf_set_opt(args->vcfbuf,char*,VCFBUF_NSITES_MODE,args->nsites_mode); + } if ( args->af_tag ) vcfbuf_set_opt(args->vcfbuf,char*,VCFBUF_AF_TAG,args->af_tag); if ( args->filter_str ) @@ -266,6 +272,7 @@ int run(int argc, char **argv) args->output_type = FT_VCF; args->output_fname = "-"; args->ld_win = -100e3; + args->nsites_mode = "maxAF"; static struct option loptions[] = { {"keep-sites",no_argument,NULL,'k'}, @@ -281,12 +288,13 @@ int run(int argc, char **argv) {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, {"nsites-per-win",required_argument,NULL,'n'}, + {"nsites-per-win-mode",required_argument,NULL,'N'}, {"window",required_argument,NULL,'w'}, {NULL,0,NULL,0} }; int c; char *tmp; - while ((c = getopt_long(argc, argv, "vr:R:t:T:m:o:O:a:f:i:e:n:w:k",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "vr:R:t:T:m:o:O:a:f:i:e:n:N:w:k",loptions,NULL)) >= 0) { switch (c) { @@ -329,6 +337,12 @@ int run(int argc, char 
**argv) args->nsites = strtod(optarg,&tmp); if ( tmp==optarg || *tmp ) error("Could not parse: --nsites-per-win %s\n", optarg); break; + case 'N': + if ( !strcasecmp(optarg,"maxAF") ) args->nsites_mode = optarg; + else if ( !strcasecmp(optarg,"1st") ) args->nsites_mode = optarg; + else if ( !strcasecmp(optarg,"rand") ) args->nsites_mode = optarg; + else error("The mode \"%s\" is not recognised\n",optarg); + break; case 'm': if ( !strncasecmp("R2=",optarg,3) ) { diff --git a/test/prune.1.6.out b/test/prune.1.6.out new file mode 100644 index 000000000..d3ab4d6a0 --- /dev/null +++ b/test/prune.1.6.out @@ -0,0 +1,11 @@ +##fileformat=VCFv4.2 +##FILTER= +##reference=file://some/path/human_g1k_v37.fasta +##contig= +##FORMAT= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s1 s2 s3 +1 101 . T A . . AF=0.3 GT 0/1 0/1 0/1 +1 103 . T A . . AF=0.1 GT 0/1 0/0 0/0 +1 105 . T A . . AF=0.2 GT 0/0 0/0 0/0 +1 107 . T A . . AF=0.3 GT 0/1 0/0 0/1 diff --git a/test/prune.1.7.out b/test/prune.1.7.out new file mode 100644 index 000000000..3682ac60a --- /dev/null +++ b/test/prune.1.7.out @@ -0,0 +1,11 @@ +##fileformat=VCFv4.2 +##FILTER= +##reference=file://some/path/human_g1k_v37.fasta +##contig= +##FORMAT= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s1 s2 s3 +1 101 . T A . . AF=0.3 GT 0/1 0/1 0/1 +1 104 . T A . . AF=0.3 GT 0/0 0/0 0/0 +1 105 . T A . . AF=0.2 GT 0/0 0/0 0/0 +1 107 . T A . . AF=0.3 GT 0/1 0/0 0/1 diff --git a/test/test.pl b/test/test.pl index ad233c8fd..233d549be 100755 --- a/test/test.pl +++ b/test/test.pl @@ -1,6 +1,6 @@ #!/usr/bin/env perl # -# Copyright (C) 2012-2020 Genome Research Ltd. +# Copyright (C) 2012-2021 Genome Research Ltd. 
# # Author: Petr Danecek # @@ -536,6 +536,8 @@ test_vcf_plugin($opts,in=>'prune.1',out=>'prune.1.3.out',cmd=>'+prune -w 2 -a r2 -m 0.5 '); # prune within 2bp, max r2=0.5 test_vcf_plugin($opts,in=>'prune.1',out=>'prune.1.4.out',cmd=>'+prune -w 2bp -n 1 --AF-tag AF'); # leave 1 site within 2bp windows, prioritize by AF test_vcf_plugin($opts,in=>'prune.1',out=>'prune.1.5.out',cmd=>q[+prune -w 2bp -n 1 --AF-tag AF -i 'GT="alt"']); # same as above but first discard REF-only sites +test_vcf_plugin($opts,in=>'prune.1',out=>'prune.1.6.out',cmd=>'+prune -w 2bp -n 1 -N 1st'); +test_vcf_plugin($opts,in=>'prune.1',out=>'prune.1.7.out',cmd=>'+prune -w 2bp -n 1 -N rand'); test_plugin_split($opts,in=>'split.1',out=>'split.1.1.out',tmp=>'split.1.1'); test_plugin_split($opts,in=>'split.1',out=>'split.1.2.out',tmp=>'split.1.2',args=>'-S {PATH}/split.smpl.1.2.txt'); test_plugin_split($opts,in=>'split.1',out=>'split.1.3.out',tmp=>'split.1.3',args=>'-S {PATH}/split.smpl.1.3.txt'); diff --git a/vcfbuf.c b/vcfbuf.c index 13eeea9fb..a1fe76657 100644 --- a/vcfbuf.c +++ b/vcfbuf.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2016-2020 Genome Research Ltd. + Copyright (c) 2016-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -46,9 +46,12 @@ typedef struct } vcfrec_t; +#define PRUNE_MODE_MAX_AF 1 +#define PRUNE_MODE_1ST 2 +#define PRUNE_MODE_RAND 3 typedef struct { - int max_sites, mvrec, mac, mfarr; + int max_sites, mvrec, mac, mfarr, mode; int *ac, *idx; float *farr; char *af_tag; @@ -113,10 +116,24 @@ void vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, void *value) if ( key==LD_MAX_LD ) { buf->ld.max[VCFBUF_LD_IDX_LD] = *((double*)value); return; } if ( key==LD_MAX_HD ) { buf->ld.max[VCFBUF_LD_IDX_HD] = *((double*)value); return; } - if ( key==VCFBUF_NSITES ) { buf->prune.max_sites = *((int*)value); return; } + if ( key==VCFBUF_NSITES ) + { + buf->prune.max_sites = *((int*)value); + if ( !buf->prune.mode ) buf->prune.mode = PRUNE_MODE_MAX_AF; + return; + } if ( key==VCFBUF_AF_TAG ) { buf->prune.af_tag = *((char**)value); return; } if ( key==VCFBUF_OVERLAP_WIN ) { buf->overlap.active = *((int*)value); return; } if ( key==VCFBUF_RMDUP) { buf->rmdup.active = *((int*)value); return; } + + if ( key==VCFBUF_NSITES_MODE ) + { + char *mode = *((char**)value); + if ( !strcasecmp(mode,"maxAF") ) buf->prune.mode = PRUNE_MODE_MAX_AF; + else if ( !strcasecmp(mode,"1st") ) buf->prune.mode = PRUNE_MODE_1ST; + else if ( !strcasecmp(mode,"rand") ) buf->prune.mode = PRUNE_MODE_RAND; + else error("The mode \"%s\" is not recognised\n",mode); + } } int vcfbuf_nsites(vcfbuf_t *buf) @@ -176,6 +193,26 @@ static void _prune_sites(vcfbuf_t *buf, int flush_all) { int nbuf = flush_all ? buf->rbuf.n : buf->rbuf.n - 1; + int nprune = nbuf - buf->prune.max_sites; + int i,k,irec = 0; + if ( buf->prune.mode==PRUNE_MODE_1ST ) + { + int eoff = flush_all ? 1 : 2; + for (i=0; irbuf, vcfrec_t, buf->rbuf.n - eoff, buf->vcf); + return; + } + if ( buf->prune.mode==PRUNE_MODE_RAND ) + { + int eoff = flush_all ? 
0 : 1; + for (i=0; irbuf.n - eoff) * (double)rand() / RAND_MAX; + rbuf_remove_kth(&buf->rbuf, vcfrec_t, j, buf->vcf); + } + return; + } + if ( nbuf > buf->prune.mvrec ) { buf->prune.idx = (int*) realloc(buf->prune.idx, nbuf*sizeof(int)); @@ -184,7 +221,6 @@ static void _prune_sites(vcfbuf_t *buf, int flush_all) } // set allele frequency and prepare buffer for sorting - int i,k,irec = 0; for (i=-1; rbuf_next(&buf->rbuf,&i) && irecvcf[i].rec; @@ -217,7 +253,6 @@ static void _prune_sites(vcfbuf_t *buf, int flush_all) // sort the rbuf indexes to be pruned descendently so that j-th rbuf index // is removed before i-th index if iprune.max_sites; for (i=0; iprune.idx[i] = buf->prune.vrec[i]->idx; @@ -399,7 +434,7 @@ static int _calc_r2_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec, vcfbuf_ld_t *l if ( aptr[j]==bcf_gt_missing ) { if ( !buf->ld.rand_missing ) break; - if ( rand()/RAND_MAX >= aaf ) adsg += 1; + if ( (double)rand()/RAND_MAX >= aaf ) adsg += 1; } else if ( bcf_gt_allele(aptr[j]) ) adsg += 1; an++; @@ -410,7 +445,7 @@ static int _calc_r2_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec, vcfbuf_ld_t *l if ( bptr[j]==bcf_gt_missing ) { if ( !buf->ld.rand_missing ) break; - if ( rand()/RAND_MAX >= baf ) bdsg += 1; + if ( (double)rand()/RAND_MAX >= baf ) bdsg += 1; } else if ( bcf_gt_allele(bptr[j]) ) bdsg += 1; bn++; diff --git a/vcfbuf.h b/vcfbuf.h index 658be3a4d..d3be6c53c 100644 --- a/vcfbuf.h +++ b/vcfbuf.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2017-2020 Genome Research Ltd. + Copyright (c) 2017-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -41,6 +41,7 @@ typedef enum VCFBUF_OVERLAP_WIN, // keep only overlapping variants in the window VCFBUF_RMDUP, // remove duplicate sites (completely) VCFBUF_NSITES, // leave at max this many sites in the window + VCFBUF_NSITES_MODE, // one of: maxAF (keep sites with max AF), 1st (sites that come first), rand (pick randomly) VCFBUF_AF_TAG, // use this INFO tag with VCFBUF_NSITES // LD related options From 0020d0277220cba3f91f518d597021e9b13786ac Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Fri, 22 Jan 2021 16:52:09 +0000 Subject: [PATCH 36/81] Add new --mark-del, --mark-ins, and --mark-snv options. Resolves #1381 and #1170 --- consensus.c | 172 ++++++++++++++++++++++++++++++++---------- doc/bcftools.txt | 9 +++ test/consensus.15.fa | 20 +++++ test/consensus.15.out | 20 +++++ test/test.pl | 1 + 5 files changed, 182 insertions(+), 40 deletions(-) create mode 100644 test/consensus.15.fa create mode 100644 test/consensus.15.out diff --git a/consensus.c b/consensus.c index 48d0e015f..ceee2ab8e 100644 --- a/consensus.c +++ b/consensus.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2014-2020 Genome Research Ltd. + Copyright (c) 2014-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -106,6 +106,7 @@ typedef struct char **argv; int argc, output_iupac, haplotype, allele, isample, napplied; char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname, missing_allele, absent_allele; + char mark_del, mark_ins, mark_snv; } args_t; @@ -398,6 +399,46 @@ static void freeze_ref(args_t *args, bcf1_t *rec) args->fa_frz_pos = rec->pos + rec->rlen - 1; args->fa_frz_mod = rec->pos - args->fa_ori_pos + args->fa_mod_off + rec->rlen; } +static char *mark_del(char *ref, int rlen, char *alt, int mark) +{ + char *out = malloc(rlen+1); + int i; + if ( alt ) + { + int nalt = strlen(alt); + for (i=0; i + { + int nref = strlen(ref); + for (i=0; id.allele[ialt]; + int rmme_alt = 0; + int len_diff = 0, alen = 0; int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off; if ( idx<0 ) @@ -623,10 +667,10 @@ static void apply_variant(args_t *args, bcf1_t *rec) if ( rec->rlen > args->fa_buf.l - idx ) { rec->rlen = args->fa_buf.l - idx; - alen = strlen(rec->d.allele[ialt]); + alen = strlen(alt_allele); if ( alen > rec->rlen ) { - rec->d.allele[ialt][rec->rlen] = 0; + alt_allele[rec->rlen] = 0; fprintf(stderr,"Warning: trimming variant starting at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); } } @@ -634,15 +678,15 @@ static void apply_variant(args_t *args, bcf1_t *rec) error("FIXME: %s:%"PRId64" .. 
idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off); // sanity check the reference base - if ( rec->d.allele[ialt][0]=='<' ) + if ( alt_allele[0]=='<' ) { // TODO: symbolic deletions probably need more work above with PICK_SHORT|PICK_LONG - if ( strcasecmp(rec->d.allele[ialt],"") && strcasecmp(rec->d.allele[ialt],"<*>") && strcasecmp(rec->d.allele[ialt],"") ) + if ( strcasecmp(alt_allele,"") && strcasecmp(alt_allele,"<*>") && strcasecmp(alt_allele,"") ) error("Symbolic alleles other than , <*> or are currently not supported, e.g. %s at %s:%"PRId64".\n" "Please use filtering expressions to exclude such sites, for example by running with: -e 'ALT~\"<.*>\"'\n", - rec->d.allele[ialt],bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - if ( !strcasecmp(rec->d.allele[ialt],"") ) + alt_allele,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); + if ( !strcasecmp(alt_allele,"") ) { static int multibase_ref_del_warned = 0; if ( rec->d.allele[0][1]!=0 && !multibase_ref_del_warned ) @@ -652,10 +696,19 @@ static void apply_variant(args_t *args, bcf1_t *rec) " (This warning is printed only once.)\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); multibase_ref_del_warned = 1; } - - len_diff = 1-rec->rlen; - rec->d.allele[ialt] = rec->d.allele[0]; // according to VCF spec, the first REF base must precede the event - alen = 1; + if ( args->mark_del ) // insert dashes instead of delete sequence + { + alt_allele = mark_del(rec->d.allele[0], rec->rlen, NULL, args->mark_del); + alen = rec->rlen; + len_diff = 0; + rmme_alt = 1; + } + else + { + len_diff = 1-rec->rlen; + alt_allele = rec->d.allele[0]; // according to VCF spec, the first REF base must precede the event + alen = 1; + } } else { @@ -691,24 +744,45 @@ static void apply_variant(args_t *args, bcf1_t *rec) " .vcf: [%s] <- (REF)\n" " .vcf: [%s] <- (ALT)\n" " .fa: [%s]%c%s\n", - bcf_seqname(args->hdr,rec),(int64_t) 
rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx, + bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, rec->d.allele[0], alt_allele, args->fa_buf.s+idx, tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:"" ); } - alen = strlen(rec->d.allele[ialt]); + alen = strlen(alt_allele); len_diff = alen - rec->rlen; + + if ( args->mark_del && len_diff<0 ) + { + alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del); + alen = rec->rlen; + len_diff = 0; + rmme_alt = 1; + } } else { - alen = strlen(rec->d.allele[ialt]); + alen = strlen(alt_allele); len_diff = alen - rec->rlen; + + if ( args->mark_del && len_diff<0 ) + { + alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del); + alen = rec->rlen; + len_diff = 0; + rmme_alt = 1; + } } args->fa_case = toupper(args->fa_buf.s[idx])==args->fa_buf.s[idx] ? TO_UPPER : TO_LOWER; if ( args->fa_case==TO_UPPER ) - for (i=0; id.allele[ialt][i] = toupper(rec->d.allele[ialt][i]); + for (i=0; id.allele[ialt][i] = tolower(rec->d.allele[ialt][i]); + for (i=0; imark_ins && len_diff>0 ) + mark_ins(rec->d.allele[0], alt_allele, args->mark_ins); + if ( args->mark_snv ) + mark_snv(rec->d.allele[0], alt_allele, args->mark_snv); if ( len_diff <= 0 ) { @@ -720,7 +794,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) args->fa_frz_mod = idx + alen; for (i=trim_beg; ifa_buf.s[idx+i] = rec->d.allele[ialt][i]; + args->fa_buf.s[idx+i] = alt_allele[i]; if ( len_diff ) memmove(args->fa_buf.s+idx+alen,args->fa_buf.s+idx+rec->rlen,args->fa_buf.l-idx-rec->rlen); @@ -740,16 +814,16 @@ static void apply_variant(args_t *args, bcf1_t *rec) // 1 C T // 1 C CAA int ibeg = 0; - while ( ibegd.allele[0][ibeg]==rec->d.allele[ialt][ibeg] && rec->pos + ibeg <= args->prev_base_pos ) ibeg++; + while ( ibegd.allele[0][ibeg]==alt_allele[ibeg] && rec->pos + ibeg <= args->prev_base_pos ) ibeg++; for (i=ibeg; ifa_buf.s[idx+i] = rec->d.allele[ialt][i]; + args->fa_buf.s[idx+i] = alt_allele[i]; args->fa_frz_mod = 
idx + alen - ibeg + 1; } if (args->chain && len_diff != 0) { // If first nucleotide of both REF and ALT are the same... (indels typically include the nucleotide before the variant) - if ( strncasecmp(rec->d.allele[0],rec->d.allele[ialt],1) == 0) + if ( strncasecmp(rec->d.allele[0],alt_allele,1) == 0) { // ...extend the block by 1 bp: start is 1 bp further and alleles are 1 bp shorter push_chain_gap(args->chain, rec->pos + 1, rec->rlen - 1, rec->pos + 1 + args->fa_mod_off, alen - 1); @@ -764,6 +838,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) args->fa_mod_off += len_diff; args->fa_frz_pos = rec->pos + rec->rlen - 1; args->napplied++; + if ( rmme_alt ) free(alt_allele); } @@ -894,26 +969,29 @@ static void usage(args_t *args) fprintf(stderr, " information, such as INFO/AD or FORMAT/AD.\n"); fprintf(stderr, "Usage: bcftools consensus [OPTIONS] \n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -c, --chain write a chain file for liftover\n"); - fprintf(stderr, " -a, --absent replace positions absent from VCF with \n"); - fprintf(stderr, " -e, --exclude exclude sites for which the expression is true (see man page for details)\n"); - fprintf(stderr, " -f, --fasta-ref reference sequence in fasta format\n"); - fprintf(stderr, " -H, --haplotype choose which allele to use from the FORMAT/GT field, note\n"); - fprintf(stderr, " the codes are case-insensitive:\n"); - fprintf(stderr, " 1: first allele from GT, regardless of phasing\n"); - fprintf(stderr, " 2: second allele from GT, regardless of phasing\n"); - fprintf(stderr, " R: REF allele in het genotypes\n"); - fprintf(stderr, " A: ALT allele\n"); - fprintf(stderr, " LR,LA: longer allele and REF/ALT if equal length\n"); - fprintf(stderr, " SR,SA: shorter allele and REF/ALT if equal length\n"); - fprintf(stderr, " 1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n"); - fprintf(stderr, " -i, --include select sites for which the expression is true (see man page for details)\n"); - 
fprintf(stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n"); - fprintf(stderr, " -m, --mask replace regions with N\n"); - fprintf(stderr, " -M, --missing output instead of skipping a missing genotype \"./.\"\n"); - fprintf(stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(stderr, " -p, --prefix prefix to add to output sequence names\n"); - fprintf(stderr, " -s, --sample apply variants of the given sample\n"); + fprintf(stderr, " -c, --chain FILE write a chain file for liftover\n"); + fprintf(stderr, " -a, --absent CHAR replace positions absent from VCF with CHAR\n"); + fprintf(stderr, " -e, --exclude EXPR exclude sites for which the expression is true (see man page for details)\n"); + fprintf(stderr, " -f, --fasta-ref FILE reference sequence in fasta format\n"); + fprintf(stderr, " -H, --haplotype WHICH choose which allele to use from the FORMAT/GT field, note\n"); + fprintf(stderr, " the codes are case-insensitive:\n"); + fprintf(stderr, " 1: first allele from GT, regardless of phasing\n"); + fprintf(stderr, " 2: second allele from GT, regardless of phasing\n"); + fprintf(stderr, " R: REF allele in het genotypes\n"); + fprintf(stderr, " A: ALT allele\n"); + fprintf(stderr, " LR,LA: longer allele and REF/ALT if equal length\n"); + fprintf(stderr, " SR,SA: shorter allele and REF/ALT if equal length\n"); + fprintf(stderr, " 1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n"); + fprintf(stderr, " -i, --include EXPR select sites for which the expression is true (see man page for details)\n"); + fprintf(stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n"); + fprintf(stderr, " --mark-del CHAR instead of removing sequence, insert CHAR for deletions\n"); + fprintf(stderr, " --mark-ins uc|lc highlight inserted sequence in uppercase (uc) or lowercase (lc), leaving the rest as is\n"); + fprintf(stderr, " --mark-snv uc|lc highlight substitutions in uppercase 
(uc) or lowercase (lc), leaving the rest as is\n"); + fprintf(stderr, " -m, --mask FILE replace regions with N\n"); + fprintf(stderr, " -M, --missing CHAR output CHAR instead of skipping a missing genotype \"./.\"\n"); + fprintf(stderr, " -o, --output FILE write output to a file [standard output]\n"); + fprintf(stderr, " -p, --prefix STRING prefix to add to output sequence names\n"); + fprintf(stderr, " -s, --sample NAME apply variants of the given sample\n"); fprintf(stderr, "Examples:\n"); fprintf(stderr, " # Get the consensus for one region. The fasta header lines are then expected\n"); fprintf(stderr, " # in the form \">chr:from-to\".\n"); @@ -929,6 +1007,9 @@ int main_consensus(int argc, char *argv[]) static struct option loptions[] = { + {"mark-del",required_argument,NULL,1}, + {"mark-ins",required_argument,NULL,2}, + {"mark-snv",required_argument,NULL,3}, {"exclude",required_argument,NULL,'e'}, {"include",required_argument,NULL,'i'}, {"sample",1,0,'s'}, @@ -948,6 +1029,17 @@ int main_consensus(int argc, char *argv[]) { switch (c) { + case 1 : args->mark_del = optarg[0]; break; + case 2 : + if ( !strcasecmp(optarg,"uc") ) args->mark_ins = 'u'; + else if ( !strcasecmp(optarg,"lc") ) args->mark_ins = 'l'; + else error("The argument is not recognised: --mark-ins %s\n",optarg); + break; + case 3 : + if ( !strcasecmp(optarg,"uc") ) args->mark_snv = 'u'; + else if ( !strcasecmp(optarg,"lc") ) args->mark_snv = 'l'; + else error("The argument is not recognised: --mark-snv %s\n",optarg); + break; case 'p': args->chr_prefix = optarg; break; case 's': args->sample = optarg; break; case 'o': args->output_fname = optarg; break; diff --git a/doc/bcftools.txt b/doc/bcftools.txt index 14530547e..27798bde0 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -850,6 +850,15 @@ depth information, such as INFO/AD or FORMAT/AD. 
For that, consider using the *-I, --iupac-codes*:: output variants in the form of IUPAC ambiguity codes +*--mark-del* 'CHAR':: + instead of removing sequence, insert CHAR for deletions + +*--mark-ins* 'uc'|'lc':: + highlight inserted sequence in upper (uc) or lower (lc) case, leaving the rest of the sequence as is + +*--mark-snv* 'uc'|'lc':: + highlight substitutions in upper (uc) or lower (lc) case, leaving the rest of the sequence as is + *-m, --mask* 'FILE':: BED file or TAB file with regions to be replaced with N. See discussion of *--regions-file* in *<>* for file diff --git a/test/consensus.15.fa b/test/consensus.15.fa new file mode 100644 index 000000000..d9a9619d0 --- /dev/null +++ b/test/consensus.15.fa @@ -0,0 +1,20 @@ +>1:2-501 +taccatatgtgacatataaaaaagaacataacctacgtatcaactaaagtggttgtttg +cagaaaaggaagacttaaaaagagtcagtactaacctacataatatatacaatgttcatt +aaataataaaatgagctcatcatacttaggtcatcataaatatatctgaaattcacaaat +attgatcaaatggtaaaatagacaagtagattttaataggttaaacaattactgattctc +ttgaaagaataaatttaatatgagacctatttcattataatgaactcacaaattagaaac +ttcacactgggggctggagagatggctcagtagttaagaacactgactgctcttctgaag +gtcctgagttcaaatcccagcaaccacatggtgacttacaaccatctgtaatgacatctg +atgccctctggtgtgtctgaagacagctacagtgtacttacataaaataataaataaatc +tttaaaaacaaaaaaaaagaa +>2 +gaagatcttttccttattaaggatctgaagctctgtagatttgtattctattaaacatgg +agagattagtgattttccatattctttaagtcattttagagtaatgtgttcttaagataa +atcagaaaaacaaaaacttgtgctttcctgtttgaaaaacaaacagctgtggggaatggt +gtcgggacagcctttttataaaatttttctaaataatgttgaggctttgatacgtcaaag +ttatatttcaaatggaatcacttagacctcgtttctgagtgtcaatggccatattgggga +tttgctgctgccaatgacagcacaccctgggaatgccccaactacttactacaaagcagt +gttacatggagaagatcttcaagagtctttttgctagatctttccttggcttttgatgtg +actcctctcaataaaatccacagtaatatagtgagtggtctcctgctccaaaccagtatt +tcagacacagttaatccagac diff --git a/test/consensus.15.out b/test/consensus.15.out new file mode 100644 index 000000000..f8b95469f --- /dev/null +++ b/test/consensus.15.out @@ -0,0 +1,20 @@ +>1:2-501 
+TACAAAATATga--tAAAATCAaAAAGAACATAACCTACGTATCAACTAAAGTGGTTGTT +TGAAGAAAAGGAAGACTTAAAAAGAGTCAGTACTAACCTACATAATATATACAATGTTCA +TTAAATAATAAAATGAGCTCATCATACTTAGGTCATCATAAATATATCTGAAATTCACAA +ATATTGATCAAATGGTAAAATAGACAAGTAGATTTTAATAGGTTAAACAATTACTGATTC +TCTTGAAAGAATAAATTTAATATGAGACCTATTTCATTATAATGAACTCACAAATTAGAA +ACTTCACACTGGGGGCTGGAGAGATGGCTCAGTAGTTAAGAACACTGACTGCTCTTCTGA +AGGTCCTGAGTTCAAATCCCAGCAACCACATGGTGACTTACAACCATCTGTAATGACATC +TGATGCCCTCTGGTGTGTCTGAAGACAGCTACAGTGTACTTACATAAAATAATAAATAAA +TCTTTAAAAACAAAAAAAAAGAA +>2 +gaagatcttttccttattaaggatctgaagctctgtagatttgtattctattaaacatgg +AA--attagtgattttccatattctttaagtcattttagagtaatgtgttcttaagatT- +-tcagaaaaacaaaaacttgtgctttcctgtttgaaaaacaaacagctgtggggaatgGA +CGTACGTtgtcgggacagcctttttatA----------aaataatgttgaggctttgata +cgtcaaagttatatttcaaatggaatcacttagacctcgtttctgagtgtcaatggccat +attggggAtttgctgctgccaatgacaGcacaccctgggaatgccccaactacttactac +aaagcagtgttacatggagaagatcttcaagagtctttttgctagatctttccttggctt +ttgatgtgactcctctcaataaaatccacagtaatatagtgagtggtctcctgctccaaa +ccagtattCcagacacagttaatccagac diff --git a/test/test.pl b/test/test.pl index 233d549be..2bde53228 100755 --- a/test/test.pl +++ b/test/test.pl @@ -625,6 +625,7 @@ test_vcf_consensus($opts,in=>'consensus.12',out=>'consensus.12.out',fa=>'consensus.12.fa',args=>''); test_vcf_consensus($opts,in=>'consensus.13',out=>'consensus.13.out',fa=>'consensus.13.fa',args=>''); test_vcf_consensus($opts,in=>'consensus.14',out=>'consensus.14.out',fa=>'consensus.14.fa',args=>''); +test_vcf_consensus($opts,in=>'consensus.12',out=>'consensus.15.out',fa=>'consensus.12.fa',args=>'--mark-del - --mark-ins uc --mark-snv uc'); test_mpileup($opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.1.out',args=>q[-r17:100-150],test_list=>1); test_mpileup($opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.2.out',args=>q[-a DP,DV -r17:100-600]); # test files from samtools mpileup test suite 
test_mpileup($opts,in=>[qw(mpileup.1)],out=>'mpileup/mpileup.3.out',args=>q[-B --ff 0x14 -r17:1050-1060]); # test file converted to vcf from samtools mpileup test suite From 94e91e25a0c379c090390a552c39b32a3f740065 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Sat, 23 Jan 2021 16:01:06 +0000 Subject: [PATCH 37/81] Add --random-seed option to make tests reproducible --- plugins/prune.c | 15 ++++++++++++++- test/test.pl | 2 +- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/plugins/prune.c b/plugins/prune.c index baecf0e50..6405dfcea 100644 --- a/plugins/prune.c +++ b/plugins/prune.c @@ -40,6 +40,7 @@ #include #include #include +#include #include "bcftools.h" #include "vcfbuf.h" #include "filter.h" @@ -60,7 +61,7 @@ typedef struct int ld_max_set[VCFBUF_LD_N]; char *ld_annot[VCFBUF_LD_N], *ld_annot_pos[VCFBUF_LD_N]; int ld_mask; - int argc, region_is_file, target_is_file, output_type, ld_filter_id, rand_missing, nsites, ld_win; + int argc, region_is_file, target_is_file, output_type, ld_filter_id, rand_missing, nsites, ld_win, rseed; char *nsites_mode; int keep_sites; char **argv, *region, *target, *fname, *output_fname, *ld_filter; @@ -94,6 +95,7 @@ static const char *usage_text(void) " -N, --nsites-per-win-mode STR keep sites with biggest AF (\"maxAF\"); sites that come first (\"1st\"); pick randomly (\"rand\") [maxAF]\n" " -o, --output FILE write output to the FILE [standard output]\n" " -O, --output-type b|u|z|v b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n" + " --random-seed INT use the provided random seed for reproducibility\n" " --randomize-missing replace missing data with randomly assigned genotype based on site's allele frequency\n" " -r, --regions REGION restrict to comma-separated list of regions\n" " -R, --regions-file FILE restrict to regions listed in a file\n" @@ -273,11 +275,13 @@ int run(int argc, char **argv) args->output_fname = "-"; args->ld_win = -100e3; args->nsites_mode = "maxAF"; + 
args->rseed = time(NULL); static struct option loptions[] = { {"keep-sites",no_argument,NULL,'k'}, {"randomize-missing",no_argument,NULL,1}, {"AF-tag",required_argument,NULL,2}, + {"random-seed",required_argument,NULL,3}, {"exclude",required_argument,NULL,'e'}, {"include",required_argument,NULL,'i'}, {"annotate",required_argument,NULL,'a'}, @@ -300,6 +304,10 @@ int run(int argc, char **argv) { case 1 : args->rand_missing = 1; break; case 2 : args->af_tag = optarg; break; + case 3 : + args->rseed = strtol(optarg,&tmp,10); + if ( tmp==optarg || *tmp ) error("Could not parse: --random-seed %s\n", optarg); + break; case 'k': args->keep_sites = 1; break; case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; @@ -399,6 +407,11 @@ int run(int argc, char **argv) if ( !args->ld_mask && !args->nsites ) error("%sError: Expected pruning (--max,--nsites-per-win) or annotation (--annotate) options\n\n", usage_text()); if ( args->ld_filter && strcmp(".",args->ld_filter) && !(args->ld_mask & LD_SET_MAX) ) error("The --set-filter option requires --max.\n"); if ( args->keep_sites && args->nsites ) error("The --keep-sites option cannot be combined with --nsites-per-win\n"); + if ( args->rand_missing || (args->nsites_mode && !strcasecmp(args->nsites_mode,"rand")) ) + { + fprintf(stderr,"Using random seed: %d\n",args->rseed); + srand(args->rseed); + } if ( optind==argc ) { diff --git a/test/test.pl b/test/test.pl index 2bde53228..caff67ef4 100755 --- a/test/test.pl +++ b/test/test.pl @@ -537,7 +537,7 @@ test_vcf_plugin($opts,in=>'prune.1',out=>'prune.1.4.out',cmd=>'+prune -w 2bp -n 1 --AF-tag AF'); # leave 1 site within 2bp windows, prioritize by AF test_vcf_plugin($opts,in=>'prune.1',out=>'prune.1.5.out',cmd=>q[+prune -w 2bp -n 1 --AF-tag AF -i 'GT="alt"']); # same as above but first discard REF-only sites test_vcf_plugin($opts,in=>'prune.1',out=>'prune.1.6.out',cmd=>'+prune -w 2bp 
-n 1 -N 1st'); -test_vcf_plugin($opts,in=>'prune.1',out=>'prune.1.7.out',cmd=>'+prune -w 2bp -n 1 -N rand'); +test_vcf_plugin($opts,in=>'prune.1',out=>'prune.1.7.out',cmd=>'+prune -w 2bp -n 1 -N rand --random-seed 1'); test_plugin_split($opts,in=>'split.1',out=>'split.1.1.out',tmp=>'split.1.1'); test_plugin_split($opts,in=>'split.1',out=>'split.1.2.out',tmp=>'split.1.2',args=>'-S {PATH}/split.smpl.1.2.txt'); test_plugin_split($opts,in=>'split.1',out=>'split.1.3.out',tmp=>'split.1.3',args=>'-S {PATH}/split.smpl.1.3.txt'); From 61bab7529f6ceb7d07e143ba0ff1d21ec4aa5982 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Mon, 25 Jan 2021 09:47:49 +0000 Subject: [PATCH 38/81] Switch to PCG to ensure test reproducibility across platforms --- Makefile | 8 ++-- pcg.c | 116 +++++++++++++++++++++++++++++++++++++++++++++ pcg.h | 80 +++++++++++++++++++++++++++++++ plugins/prune.c | 10 ++-- test/prune.1.7.out | 8 ++-- vcfbuf.c | 14 ++++-- vcfbuf.h | 2 + vcfgtcheck.c | 16 +------ 8 files changed, 225 insertions(+), 29 deletions(-) create mode 100644 pcg.c create mode 100644 pcg.h diff --git a/Makefile b/Makefile index 0c247d8b4..a1894b38b 100644 --- a/Makefile +++ b/Makefile @@ -42,7 +42,8 @@ OBJS = main.o vcfindex.o tabix.o \ regidx.o smpl_ilist.o csq.o vcfbuf.o \ mpileup.o bam2bcf.o bam2bcf_indel.o bam_sample.o \ vcfsort.o cols.o extsort.o dist.o \ - ccall.o em.o prob1.o kmin.o # the original samtools calling + ccall.o em.o prob1.o kmin.o \ + pcg.o PLUGIN_OBJS = vcfplugin.o prefix = /usr/local @@ -239,7 +240,7 @@ vcfcall.o: vcfcall.c $(htslib_vcf_h) $(htslib_kfunc_h) $(htslib_synced_bcf_reade vcfconcat.o: vcfconcat.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_thread_pool_h) $(bcftools_h) vcfconvert.o: vcfconvert.c $(htslib_faidx_h) $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_kseq_h) $(bcftools_h) $(filter_h) $(convert_h) $(tsv2vcf_h) vcffilter.o: vcffilter.c 
$(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) rbuf.h -vcfgtcheck.o: vcfgtcheck.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_kbitset_h) $(bcftools_h) extsort.h +vcfgtcheck.o: vcfgtcheck.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_kbitset_h) $(bcftools_h) extsort.h pcg.h vcfindex.o: vcfindex.c $(htslib_vcf_h) $(htslib_tbx_h) $(htslib_kstring_h) $(htslib_bgzf_h) $(bcftools_h) vcfisec.o: vcfisec.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcftools_h) $(filter_h) vcfmerge.o: vcfmerge.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) regidx.h $(bcftools_h) vcmp.h $(htslib_khash_h) @@ -279,7 +280,8 @@ bam_sample.o: bam_sample.c $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_st version.o: version.h version.c hclust.o: hclust.c $(htslib_hts_h) $(htslib_kstring_h) $(bcftools_h) hclust.h HMM.o: HMM.c $(htslib_hts_h) HMM.h -vcfbuf.o: vcfbuf.c $(htslib_vcf_h) $(htslib_vcfutils_h) $(bcftools_h) $(vcfbuf_h) rbuf.h +vcfbuf.o: vcfbuf.c $(htslib_vcf_h) $(htslib_vcfutils_h) $(bcftools_h) $(vcfbuf_h) rbuf.h pcg.h +pcg.o: pcg.c extsort.o: extsort.c $(bcftools_h) extsort.h kheap.h smpl_ilist.o: smpl_ilist.c $(bcftools_h) $(smpl_ilist_h) csq.o: csq.c $(htslib_hts_h) $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_h) $(htslib_khash_str2int_h) $(htslib_kseq_h) $(htslib_faidx_h) $(bcftools_h) $(filter_h) regidx.h kheap.h $(smpl_ilist_h) rbuf.h diff --git a/pcg.c b/pcg.c new file mode 100644 index 000000000..89e00645c --- /dev/null +++ b/pcg.c @@ -0,0 +1,116 @@ +/* + * PCG Random Number Generation for C. + * + * Copyright 2014 Melissa O'Neill + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * For additional information about the PCG random number generation scheme, + * including its license and other licensing options, visit + * + * http://www.pcg-random.org + */ + +/* + * This code is derived from the full C implementation, which is in turn + * derived from the canonical C++ PCG implementation. The C++ version + * has many additional features and is preferable if you can use C++ in + * your project. + */ + +#include "pcg.h" + +// state for global RNGs + +static pcg32_random_t pcg32_global = PCG32_INITIALIZER; + +// pcg32_srandom(initstate, initseq) +// pcg32_srandom_r(rng, initstate, initseq): +// Seed the rng. Specified in two parts, state initializer and a +// sequence selection constant (a.k.a. 
stream id) + +void pcg32_srandom_r(pcg32_random_t* rng, uint64_t initstate, uint64_t initseq) +{ + rng->state = 0U; + rng->inc = (initseq << 1u) | 1u; + pcg32_random_r(rng); + rng->state += initstate; + pcg32_random_r(rng); +} + +void pcg32_srandom(uint64_t seed, uint64_t seq) +{ + pcg32_srandom_r(&pcg32_global, seed, seq); +} + +// pcg32_random() +// pcg32_random_r(rng) +// Generate a uniformly distributed 32-bit random number + +uint32_t pcg32_random_r(pcg32_random_t* rng) +{ + uint64_t oldstate = rng->state; + rng->state = oldstate * 6364136223846793005ULL + rng->inc; + uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u; + uint32_t rot = oldstate >> 59u; + return (xorshifted >> rot) | (xorshifted << ((-rot) & 31)); +} + +uint32_t pcg32_random() +{ + return pcg32_random_r(&pcg32_global); +} + + +// pcg32_boundedrand(bound): +// pcg32_boundedrand_r(rng, bound): +// Generate a uniformly distributed number, r, where 0 <= r < bound + +uint32_t pcg32_boundedrand_r(pcg32_random_t* rng, uint32_t bound) +{ + // To avoid bias, we need to make the range of the RNG a multiple of + // bound, which we do by dropping output less than a threshold. + // A naive scheme to calculate the threshold would be to do + // + // uint32_t threshold = 0x100000000ull % bound; + // + // but 64-bit div/mod is slower than 32-bit div/mod (especially on + // 32-bit platforms). In essence, we do + // + // uint32_t threshold = (0x100000000ull-bound) % bound; + // + // because this version will calculate the same modulus, but the LHS + // value is less than 2^32. + + uint32_t threshold = -bound % bound; + + // Uniformity guarantees that this loop will terminate. In practice, it + // should usually terminate quickly; on average (assuming all bounds are + // equally likely), 82.25% of the time, we can expect it to require just + // one iteration. In the worst case, someone passes a bound of 2^31 + 1 + // (i.e., 2147483649), which invalidates almost 50% of the range. 
In + // practice, bounds are typically small and only a tiny amount of the range + // is eliminated. + for (;;) { + uint32_t r = pcg32_random_r(rng); + if (r >= threshold) + return r % bound; + } +} + + +uint32_t pcg32_boundedrand(uint32_t bound) +{ + return pcg32_boundedrand_r(&pcg32_global, bound); +} + diff --git a/pcg.h b/pcg.h new file mode 100644 index 000000000..db13d8cad --- /dev/null +++ b/pcg.h @@ -0,0 +1,80 @@ +/* + * PCG Random Number Generation for C. + * + * Copyright 2014 Melissa O'Neill + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * For additional information about the PCG random number generation scheme, + * including its license and other licensing options, visit + * + * http://www.pcg-random.org + */ + +/* + * This code is derived from the full C implementation, which is in turn + * derived from the canonical C++ PCG implementation. The C++ version + * has many additional features and is preferable if you can use C++ in + * your project. + */ + +#ifndef PCG_BASIC_H_INCLUDED +#define PCG_BASIC_H_INCLUDED 1 + +#include + +#if __cplusplus +extern "C" { +#endif + +struct pcg_state_setseq_64 { // Internals are *Private*. + uint64_t state; // RNG state. All values are possible. + uint64_t inc; // Controls which RNG sequence (stream) is + // selected. Must *always* be odd. +}; +typedef struct pcg_state_setseq_64 pcg32_random_t; + +#define PCG32_RAND_MAX UINT32_MAX + +// If you *must* statically initialize it, here's one. 
+ +#define PCG32_INITIALIZER { 0x853c49e6748fea9bULL, 0xda3e39cb94b95bdbULL } + +// pcg32_srandom(initstate, initseq) +// pcg32_srandom_r(rng, initstate, initseq): +// Seed the rng. Specified in two parts, state initializer and a +// sequence selection constant (a.k.a. stream id) + +void pcg32_srandom(uint64_t initstate, uint64_t initseq); +void pcg32_srandom_r(pcg32_random_t* rng, uint64_t initstate, + uint64_t initseq); + +// pcg32_random() +// pcg32_random_r(rng) +// Generate a uniformly distributed 32-bit random number + +uint32_t pcg32_random(void); +uint32_t pcg32_random_r(pcg32_random_t* rng); + +// pcg32_boundedrand(bound): +// pcg32_boundedrand_r(rng, bound): +// Generate a uniformly distributed number, r, where 0 <= r < bound + +uint32_t pcg32_boundedrand(uint32_t bound); +uint32_t pcg32_boundedrand_r(pcg32_random_t* rng, uint32_t bound); + +#if __cplusplus +} +#endif + +#endif // PCG_BASIC_H_INCLUDED diff --git a/plugins/prune.c b/plugins/prune.c index 6405dfcea..de4f63cd8 100644 --- a/plugins/prune.c +++ b/plugins/prune.c @@ -188,6 +188,11 @@ static void init_data(args_t *args) if ( args->ld_max_set[VCFBUF_LD_IDX_R2] ) vcfbuf_set_opt(args->vcfbuf,double,LD_MAX_R2,args->ld_max[VCFBUF_LD_IDX_R2]); if ( args->ld_max_set[VCFBUF_LD_IDX_LD] ) vcfbuf_set_opt(args->vcfbuf,double,LD_MAX_LD,args->ld_max[VCFBUF_LD_IDX_LD]); if ( args->ld_max_set[VCFBUF_LD_IDX_HD] ) vcfbuf_set_opt(args->vcfbuf,double,LD_MAX_HD,args->ld_max[VCFBUF_LD_IDX_HD]); + if ( args->rand_missing || (args->nsites_mode && !strcasecmp(args->nsites_mode,"rand")) ) + { + fprintf(stderr,"Using random seed: %d\n",args->rseed); + vcfbuf_set_opt(args->vcfbuf,double,RANDOM_SEED,args->rseed); + } if ( args->rand_missing ) vcfbuf_set_opt(args->vcfbuf,int,LD_RAND_MISSING,1); if ( args->nsites ) { @@ -407,11 +412,6 @@ int run(int argc, char **argv) if ( !args->ld_mask && !args->nsites ) error("%sError: Expected pruning (--max,--nsites-per-win) or annotation (--annotate) options\n\n", usage_text()); if ( 
args->ld_filter && strcmp(".",args->ld_filter) && !(args->ld_mask & LD_SET_MAX) ) error("The --set-filter option requires --max.\n"); if ( args->keep_sites && args->nsites ) error("The --keep-sites option cannot be combined with --nsites-per-win\n"); - if ( args->rand_missing || (args->nsites_mode && !strcasecmp(args->nsites_mode,"rand")) ) - { - fprintf(stderr,"Using random seed: %d\n",args->rseed); - srand(args->rseed); - } if ( optind==argc ) { diff --git a/test/prune.1.7.out b/test/prune.1.7.out index 3682ac60a..4e3841842 100644 --- a/test/prune.1.7.out +++ b/test/prune.1.7.out @@ -5,7 +5,7 @@ ##FORMAT= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s1 s2 s3 -1 101 . T A . . AF=0.3 GT 0/1 0/1 0/1 -1 104 . T A . . AF=0.3 GT 0/0 0/0 0/0 -1 105 . T A . . AF=0.2 GT 0/0 0/0 0/0 -1 107 . T A . . AF=0.3 GT 0/1 0/0 0/1 +1 102 . T A . . AF=0.2 GT 0/1 0/1 0/1 +1 103 . T A . . AF=0.1 GT 0/1 0/0 0/0 +1 106 . T A . . AF=0.1 GT 0/1 1/1 1/1 +1 108 . T A . . AF=0.2 GT 0/1 1/1 0/1 diff --git a/vcfbuf.c b/vcfbuf.c index a1fe76657..c0e720a83 100644 --- a/vcfbuf.c +++ b/vcfbuf.c @@ -29,6 +29,7 @@ #include #include "bcftools.h" #include "vcfbuf.h" +#include "pcg.h" #include "rbuf.h" typedef struct @@ -81,6 +82,7 @@ struct _vcfbuf_t prune_t prune; overlap_t overlap; rmdup_t rmdup; + pcg32_random_t rng; }; vcfbuf_t *vcfbuf_init(bcf_hdr_t *hdr, int win) @@ -116,6 +118,12 @@ void vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, void *value) if ( key==LD_MAX_LD ) { buf->ld.max[VCFBUF_LD_IDX_LD] = *((double*)value); return; } if ( key==LD_MAX_HD ) { buf->ld.max[VCFBUF_LD_IDX_HD] = *((double*)value); return; } + if ( key==RANDOM_SEED ) + { + uint64_t seed = *((uint64_t*)value); + pcg32_srandom_r(&buf->rng, seed, seed); + } + if ( key==VCFBUF_NSITES ) { buf->prune.max_sites = *((int*)value); @@ -207,7 +215,7 @@ static void _prune_sites(vcfbuf_t *buf, int flush_all) int eoff = flush_all ? 
0 : 1; for (i=0; irbuf.n - eoff) * (double)rand() / RAND_MAX; + int j = (buf->rbuf.n - eoff) * (double)pcg32_random_r(&buf->rng) / PCG32_RAND_MAX; rbuf_remove_kth(&buf->rbuf, vcfrec_t, j, buf->vcf); } return; @@ -434,7 +442,7 @@ static int _calc_r2_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec, vcfbuf_ld_t *l if ( aptr[j]==bcf_gt_missing ) { if ( !buf->ld.rand_missing ) break; - if ( (double)rand()/RAND_MAX >= aaf ) adsg += 1; + if ( (double)pcg32_random_r(&buf->rng)/PCG32_RAND_MAX >= aaf ) adsg += 1; } else if ( bcf_gt_allele(aptr[j]) ) adsg += 1; an++; @@ -445,7 +453,7 @@ static int _calc_r2_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec, vcfbuf_ld_t *l if ( bptr[j]==bcf_gt_missing ) { if ( !buf->ld.rand_missing ) break; - if ( (double)rand()/RAND_MAX >= baf ) bdsg += 1; + if ( (double)pcg32_random_r(&buf->rng)/PCG32_RAND_MAX >= baf ) bdsg += 1; } else if ( bcf_gt_allele(bptr[j]) ) bdsg += 1; bn++; diff --git a/vcfbuf.h b/vcfbuf.h index d3be6c53c..c011d03d8 100644 --- a/vcfbuf.h +++ b/vcfbuf.h @@ -44,6 +44,8 @@ typedef enum VCFBUF_NSITES_MODE, // one of: maxAF (keep sites with max AF), 1st (sites that come first), rand (pick randomly) VCFBUF_AF_TAG, // use this INFO tag with VCFBUF_NSITES + RANDOM_SEED, // initialize random seed generator used in LD_RAND_MISSING and VCFBUF_NSITES_MODE=rand + // LD related options LD_RAND_MISSING, // randomize rather than ignore missing genotypes LD_FILTER1, // exclude the next record inserted by vcfbuf_push() from LD analysis diff --git a/vcfgtcheck.c b/vcfgtcheck.c index 75dc026ec..099b4aea4 100644 --- a/vcfgtcheck.c +++ b/vcfgtcheck.c @@ -42,6 +42,7 @@ THE SOFTWARE. 
*/ #include #include "bcftools.h" #include "extsort.h" +#include "pcg.h" //#include "hclust.h" typedef struct @@ -178,22 +179,9 @@ static inline void diff_sites_reset(args_t *args) { kbs_clear(args->kbs_diff); } -/* - Generage a 32-bit random number, taken from - https://www.pcg-random.org/download.html#minimal-c-implementation -*/ -typedef struct { uint64_t state; uint64_t inc; } pcg32_random_t; -static uint32_t pcg32_random_r(pcg32_random_t* rng) -{ - uint64_t oldstate = rng->state; - rng->state = oldstate * 6364136223846793005ULL + (rng->inc|1); - uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u; - uint32_t rot = oldstate >> 59u; - return (xorshifted >> rot) | (xorshifted << ((-rot) & 31)); -} static inline void diff_sites_push(args_t *args, int ndiff, int rid, int pos) { - static pcg32_random_t rng = { 0x853c49e6748fea9bULL, 0xda3e39cb94b95bdbULL }; + static pcg32_random_t rng = PCG32_INITIALIZER; diff_sites_t *dat = (diff_sites_t*) malloc(args->diff_sites_size); memset(dat,0,sizeof(*dat)); // for debugging: prevent warnings about uninitialized memory coming from struct padding (not needed after rand added) dat->ndiff = ndiff; From 81e11c2f0e29ee9616fab472a001615615cb7d18 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Mon, 25 Jan 2021 12:24:16 +0000 Subject: [PATCH 39/81] Add forgotten \0 string termination --- consensus.c | 1 + 1 file changed, 1 insertion(+) diff --git a/consensus.c b/consensus.c index ceee2ab8e..d7c6de235 100644 --- a/consensus.c +++ b/consensus.c @@ -414,6 +414,7 @@ static char *mark_del(char *ref, int rlen, char *alt, int mark) for (i=0; i Date: Mon, 25 Jan 2021 12:43:38 +0000 Subject: [PATCH 40/81] More informative error message, see #294 --- vcfannotate.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vcfannotate.c b/vcfannotate.c index 238120ac7..6ebe8cdb4 100644 --- a/vcfannotate.c +++ b/vcfannotate.c @@ -501,7 +501,7 @@ static int setter_filter(args_t *args, bcf1_t *line, annot_col_t *col, void 
*dat if ( tab->cols[col->icol] && tab->cols[col->icol][0]=='.' && !tab->cols[col->icol][1] ) return 0; // don't replace with "." hts_expand(int,1,args->mtmpi,args->tmpi); args->tmpi[0] = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, tab->cols[col->icol]); - if ( args->tmpi[0]<0 ) error("The FILTER is not defined in the header: %s\n", tab->cols[col->icol]); + if ( args->tmpi[0]<0 ) error("The FILTER \"%s\" is not defined in the header, was the -h option provided?\n", tab->cols[col->icol]); if ( col->replace==SET_OR_APPEND ) return bcf_add_filter(args->hdr_out,line,args->tmpi[0]); if ( col->replace!=REPLACE_MISSING ) { @@ -2267,7 +2267,7 @@ static void init_columns(args_t *args) } int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_FMT,hdr_id) ) - error("The tag \"%s\" is not defined in %s\n", str.s, args->targets_fname); + error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", str.s, args->targets_fname); args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols); annot_col_t *col = &args->cols[args->ncols-1]; memset(col,0,sizeof(*col)); @@ -2376,7 +2376,7 @@ static void init_columns(args_t *args) *ptr = 0; tmp.l = 0; ksprintf(&tmp,"%s:=%s",key_src,ptr+1); *ptr = '='; error("The tag \"%s\" is not defined, is this what you want \"%s\" ?\n",key_src,tmp.s); } - error("The tag \"%s\" is not defined in %s\n", key_src,args->files->readers[1].fname); + error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_src,args->files->readers[1].fname); } tmp.l = 0; bcf_hrec_format_rename(hrec, key_dst, &tmp); @@ -2387,7 +2387,7 @@ static void init_columns(args_t *args) hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst); } else - error("The tag \"%s\" is not defined in %s\n", key_src, args->targets_fname); + error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_src, args->targets_fname); assert( 
bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) ); } col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id); From 3d8741c864ece1043891fdf0368a99bb55e72908 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 27 Jan 2021 08:46:26 +0000 Subject: [PATCH 41/81] Make the behavior of --hapsample and --hapsample2vcf consistent with each other and with the documentation by 1) --hapsample producing .samples rather than .sample file 2) make the first column CHROM:POS_REF_ALT rather than CHROM. Note that this breaks the HAPS format which expects the first column to be an integer: https://mathgen.stats.ox.ac.uk/genetics_software/shapeit/shapeit.html#hapsample However, this assumption is broken already by emitting whatever CHROM happens to be, e.g. "X" or "chr19". Resolves #1385 --- test/convert.hs.hap | 64 ++++++++++++++++++++++----------------------- vcfconvert.c | 4 +-- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/test/convert.hs.hap b/test/convert.hs.hap index a9cbbdd80..47589ade6 100644 --- a/test/convert.hs.hap +++ b/test/convert.hs.hap @@ -1,32 +1,32 @@ -X X:2698560_G_A 2698560 G A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -X X:2698630_A_G 2698630 A G 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -X X:2698758_CAA_C 2698758 CAA C 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -X X:2698769_AAG_A 2698769 AAG A 1 0 1 1 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 -X X:2698789_C_G 2698789 C G 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -X X:2698822_A_C 2698822 A C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -X X:2698831_G_A 2698831 G A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -X X:2698889_T_C 2698889 T C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -X X:2698923_G_A 2698923 G A 1 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 -X X:2698953_A_AGG 2698953 A AGG 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -X X:2698954_G_A 2698954 G A 1 0 1 1 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 -X X:2699002_C_A 2699002 C A 0 0 0 0 ? ? 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 -X X:2699025_T_C 2699025 T C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -X X:2699091_G_A 2699091 G A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -X X:2699187_T_C 2699187 T C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0* 1* -X X:2699188_G_C 2699188 G C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 -X X:2699189_T_C 2699189 T C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 -X X:2699217_C_T 2699217 C T 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -X X:2699246_C_A 2699246 C A 1 0 1 1 0 1 1 1 1 0 0 1 0 0 0 1 0 0 1 0 -X X:2699275_T_G 2699275 T G 0 0 0 0 1 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 -X X:2699350_A_T 2699350 A T 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 -X X:2699360_T_C 2699360 T C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 -X X:2699450_A_C 2699450 A C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 -X X:2699507_T_C 2699507 T C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -X X:2699555_C_A 2699555 C A 0 - 1 - 1 - 0 - 1 - 1 1 1 0 0 1 0 0 0 1 -X X:2699645_G_T 2699645 G T 0 - 1 - 0 - 0 - 0 - 0 0 0 0 0 1 0 0 0 0 -X X:2699676_G_A 2699676 G A 0 - 0 - 1 - 0 - 1 - 1 0 1 0 0 0 0 0 0 1 -X X:2699728_C_T 2699728 C T 0 - 0 - 0 - 0 - 0 - 0 0 0 0 0 0 0 0 0 0 -X X:2699775_C_A 2699775 C A 0 - 0 - 0 - 0 - 0 - 0 0 0 0 0 0 0 0 0 0 -X X:2699898_C_CT 2699898 C CT 0 - 0 - 1 - 0 - 1 - 1 0 1 0 0 0 0 0 0 1 -X X:2699968_A_G 2699968 A G ? 
- 0 - 0 - 1 - 0 - 0 1 0 0 0 0 0 1 1 0 -X X:2699970_T_C 2699970 T C 0 - 0 - 0 - 0 - 0 - 0 0 0 0 0 0 0 0 0 0 +X:2698560_G_A X:2698560_G_A 2698560 G A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2698630_A_G X:2698630_A_G 2698630 A G 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2698758_CAA_C X:2698758_CAA_C 2698758 CAA C 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2698769_AAG_A X:2698769_AAG_A 2698769 AAG A 1 0 1 1 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 +X:2698789_C_G X:2698789_C_G 2698789 C G 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2698822_A_C X:2698822_A_C 2698822 A C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2698831_G_A X:2698831_G_A 2698831 G A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2698889_T_C X:2698889_T_C 2698889 T C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2698923_G_A X:2698923_G_A 2698923 G A 1 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 +X:2698953_A_AGG X:2698953_A_AGG 2698953 A AGG 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2698954_G_A X:2698954_G_A 2698954 G A 1 0 1 1 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 +X:2699002_C_A X:2699002_C_A 2699002 C A 0 0 0 0 ? ? 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2699025_T_C X:2699025_T_C 2699025 T C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2699091_G_A X:2699091_G_A 2699091 G A 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2699187_T_C X:2699187_T_C 2699187 T C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0* 1* +X:2699188_G_C X:2699188_G_C 2699188 G C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 +X:2699189_T_C X:2699189_T_C 2699189 T C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 +X:2699217_C_T X:2699217_C_T 2699217 C T 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2699246_C_A X:2699246_C_A 2699246 C A 1 0 1 1 0 1 1 1 1 0 0 1 0 0 0 1 0 0 1 0 +X:2699275_T_G X:2699275_T_G 2699275 T G 0 0 0 0 1 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 +X:2699350_A_T X:2699350_A_T 2699350 A T 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 +X:2699360_T_C X:2699360_T_C 2699360 T C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 +X:2699450_A_C X:2699450_A_C 2699450 A C 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 +X:2699507_T_C X:2699507_T_C 2699507 T C 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +X:2699555_C_A X:2699555_C_A 2699555 C A 0 - 1 - 1 - 0 - 1 - 1 1 1 0 0 1 0 0 0 1 +X:2699645_G_T X:2699645_G_T 2699645 G T 0 - 1 - 0 - 0 - 0 - 0 0 0 0 0 1 0 0 0 0 +X:2699676_G_A X:2699676_G_A 2699676 G A 0 - 0 - 1 - 0 - 1 - 1 0 1 0 0 0 0 0 0 1 +X:2699728_C_T X:2699728_C_T 2699728 C T 0 - 0 - 0 - 0 - 0 - 0 0 0 0 0 0 0 0 0 0 +X:2699775_C_A X:2699775_C_A 2699775 C A 0 - 0 - 0 - 0 - 0 - 0 0 0 0 0 0 0 0 0 0 +X:2699898_C_CT X:2699898_C_CT 2699898 C CT 0 - 0 - 1 - 0 - 1 - 1 0 1 0 0 0 0 0 0 1 +X:2699968_A_G X:2699968_A_G 2699968 A G ? 
- 0 - 0 - 1 - 0 - 0 1 0 0 0 0 0 1 1 0 +X:2699970_T_C X:2699970_T_C 2699970 T C 0 - 0 - 0 - 0 - 0 - 0 0 0 0 0 0 0 0 0 0 diff --git a/vcfconvert.c b/vcfconvert.c index 3694a29da..80694b081 100644 --- a/vcfconvert.c +++ b/vcfconvert.c @@ -986,7 +986,7 @@ static void vcf_to_hapsample(args_t *args) if ( args->output_vcf_ids ) kputs("%CHROM %ID %POS %REF %FIRST_ALT ", &str); else - kputs("%CHROM %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str); + kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str); if ( args->hap2dip ) kputs("%_GT_TO_HAP2\n", &str); @@ -1003,7 +1003,7 @@ static void vcf_to_hapsample(args_t *args) if ( n_files==1 ) { int l = str.l; - kputs(".sample",&str); + kputs(".samples",&str); sample_fname = strdup(str.s); str.l = l; kputs(".hap.gz",&str); From 9b77faa06b56f2bc0b75546cc74fbe78aa026372 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 27 Jan 2021 13:55:23 +0000 Subject: [PATCH 42/81] Make +split-vep transcript selection work also on the raw CSQ/BCSQ annotation. Until now only CSQ subfields would be trimmed to include the worst (-s worst) or primary (-s primary) transcript, but CSQ remained unchanged with `-c CSQ` or `-f '%CSQ\n'`. 
--- plugins/split-vep.c | 238 ++++++++++++++++++++++++------------------ test/split-vep.21.out | 4 + test/split-vep.22.out | 4 + test/split-vep.8.vcf | 12 +++ test/test.pl | 2 + 5 files changed, 159 insertions(+), 101 deletions(-) create mode 100644 test/split-vep.21.out create mode 100644 test/split-vep.22.out create mode 100644 test/split-vep.8.vcf diff --git a/plugins/split-vep.c b/plugins/split-vep.c index acee389cb..bf8c850d4 100644 --- a/plugins/split-vep.c +++ b/plugins/split-vep.c @@ -114,6 +114,7 @@ typedef struct int niarr,miarr, nfarr,mfarr; col2type_t *column2type; int ncolumn2type; + int raw_vep_request; // raw VEP tag requested and will need subsetting } args_t; @@ -356,6 +357,26 @@ static const char *get_column_type(args_t *args, char *field) } return "String"; } +static int query_has_field(char *fmt, char *field, kstring_t *str) +{ + str->l = 0; + kputc('%',str); + kputs(field,str); + char end, *ptr = fmt; + while ( ptr ) + { + ptr = strstr(ptr,str->s); + if ( !ptr ) return 0; + end = ptr[str->l]; + if ( isalnum(end) || end=='_' || end=='.' 
) + { + ptr++; + continue; + } + break; + } + return 1; +} static void init_data(args_t *args) { args->sr = bcf_sr_init(); @@ -405,9 +426,79 @@ static void init_data(args_t *args) khash_str2int_set(args->field2idx, args->field[i], i); } + // Severity scale + kstring_t str = {0,0,0}; + args->csq2severity = khash_str2int_init(); + int severity = 0; + if ( args->severity ) + { + kstring_t tmp = {0,0,0}; + htsFile *fp = hts_open(args->severity,"r"); + if ( !fp ) error("Cannot read %s\n", args->severity); + while ( hts_getline(fp, KS_SEP_LINE, &tmp) > 0 ) + { + kputs(tmp.s, &str); + kputc('\n', &str); + } + free(tmp.s); + } + else + kputs(default_severity(),&str); + ep = str.s; + while ( *ep ) + { + if ( *ep=='#' ) + { + while ( *ep && *ep!='\n' ) { *ep = tolower(*ep); ep++; } + if ( !*ep ) break; + ep++; + continue; + } + char *bp = ep; + while ( *ep && !isspace(*ep) ) { *ep = tolower(*ep); ep++; } + char tmp = *ep; + *ep = 0; + args->nscale++; + args->scale = (char**) realloc(args->scale,args->nscale*sizeof(*args->scale)); + args->scale[args->nscale-1] = strdup(bp); + if ( !khash_str2int_has_key(args->csq2severity,args->scale[args->nscale-1]) ) + khash_str2int_set(args->csq2severity,args->scale[args->nscale-1], severity); + if ( !tmp ) break; + if ( tmp=='\n' ) severity++; + ep++; + while ( *ep && isspace(*ep) ) ep++; + } + + // Transcript and/or consequence selection + if ( !args->select ) args->select = "all:any"; + cols_t *cols = cols_split(args->select, NULL, ':'); + char *sel_tr = cols->off[0][0] ? cols->off[0] : "all"; + char *sel_csq = cols->n==2 && cols->off[1][0] ? 
cols->off[1] : "any"; + if ( !strcasecmp(sel_tr,"all") ) args->select_tr = SELECT_TR_ALL; + else if ( !strcasecmp(sel_tr,"worst") ) args->select_tr = SELECT_TR_WORST; + else if ( !strcasecmp(sel_tr,"primary") ) args->select_tr = SELECT_TR_PRIMARY; + else error("Error: the transcript selection key \"%s\" is not recognised.\n", sel_tr); + if ( !strcasecmp(sel_csq,"any") ) { args->min_severity = args->max_severity = SELECT_CSQ_ANY; } // to avoid unnecessary lookups + else + { + int len = strlen(sel_csq); + int severity, modifier = '='; + if ( sel_csq[len-1]=='+' ) { modifier = '+'; sel_csq[len-1] = 0; } + else if ( sel_csq[len-1]=='-' ) { modifier = '-'; sel_csq[len-1] = 0; } + if ( khash_str2int_get(args->csq2severity, sel_csq, &severity)!=0 ) + error("Error: the consequence \"%s\" is not recognised. Run \"bcftools +split-vep -S ?\" to see the default list.\n", sel_csq); + if ( modifier=='=' ) { args->min_severity = severity; args->max_severity = severity; } + else if ( modifier=='+' ) { args->min_severity = severity; args->max_severity = INT_MAX; } + else if ( modifier=='-' ) { args->min_severity = 0; args->max_severity = severity; } + } + cols_destroy(cols); + + // The 'CANONICAL' column to look up severity, its name is hardwired for now + if ( args->select_tr==SELECT_TR_PRIMARY && khash_str2int_get(args->field2idx,"CANONICAL",&args->primary_id)!=0 ) + error("The primary transcript was requested but the field \"CANONICAL\" is not present in INFO/%s: %s\n",args->vep_tag,hrec->vals[ret]); + // Create a text output as with `bcftools query -f`. 
For this we need to determine the fields to be extracted // from the formatting expression - kstring_t str = {0,0,0}; if ( args->format_str && !args->column_str ) { // Special case: -A was given, extract all fields, for this the -a tag (%CSQ) must be present @@ -415,41 +506,24 @@ static void init_data(args_t *args) for (i=0; infield; i++) { - str.l = 0; - kputc('%',&str); - kputs(args->field[i],&str); - char end, *ptr = args->format_str; - while ( ptr ) - { - ptr = strstr(ptr,str.s); - if ( !ptr ) break; - end = ptr[str.l]; - if ( isalnum(end) || end=='_' || end=='.' ) - { - ptr++; - continue; - } - break; - } - if ( !ptr ) continue; - ptr[str.l] = 0; - int tag_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, ptr+1); + if ( !query_has_field(args->format_str,args->field[i],&str) ) continue; + + int tag_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, args->field[i]); if ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,tag_id) ) - fprintf(stderr,"Note: ambiguous key %s, using the %s subfield of %s, not the INFO/%s tag\n", ptr,ptr+1,args->vep_tag,ptr+1); + fprintf(stderr,"Note: ambiguous key %%%s; using the %s subfield of %s, not the INFO/%s tag\n", args->field[i],args->field[i],args->vep_tag,args->field[i]); int olen = args->column_str ? 
strlen(args->column_str) : 0; - int nlen = strlen(ptr) - 1; + int nlen = strlen(args->field[i]); args->column_str = (char*)realloc(args->column_str, olen + nlen + 2); if ( olen ) { memcpy(args->column_str+olen,",",1); olen++; } - memcpy(args->column_str+olen,ptr+1,nlen); + memcpy(args->column_str+olen,args->field[i],nlen); args->column_str[olen+nlen] = 0; - - ptr[str.l] = end; } + if ( query_has_field(args->format_str,args->vep_tag,&str) ) args->raw_vep_request = 1; } // The "Consequence" column to look up severity, its name is hardwired for now @@ -473,10 +547,20 @@ static void init_data(args_t *args) { char *tp, *bp = ep; while ( *ep && *ep!=',' ) ep++; - char tmp = *ep; + char keep = *ep; *ep = 0; int type = -1; int idx_beg, idx_end; + if ( !strcmp("-",bp) ) + { + kstring_t str = {0,0,0}; + ksprintf(&str,"0-%d",args->nfield-1); + if ( keep ) ksprintf(&str,",%s",ep+1); + free(args->column_str); + args->column_str = str.s; + ep = str.s; + continue; + } if ( khash_str2int_get(args->field2idx, bp, &idx_beg)==0 ) idx_end = idx_beg; else if ( (tp=strrchr(bp,':')) ) @@ -513,6 +597,13 @@ static void init_data(args_t *args) else if ( !strcasecmp(mp+1,"flag") ) type = BCF_HT_FLAG; else error("The type \"%s\" (or column \"%s\"?) 
not recognised\n", mp+1,bp); } + else if ( !strcmp(bp,args->vep_tag) ) + { + args->raw_vep_request = 1; + if ( !keep ) break; + ep++; + continue; + } else error("No such column: \"%s\"\n", bp); } @@ -529,7 +620,7 @@ static void init_data(args_t *args) types[i] = type; i++; } - if ( !tmp ) break; + if ( !keep ) break; ep++; } args->annot = (annot_t*)calloc(args->nannot,sizeof(*args->annot)); @@ -555,6 +646,18 @@ static void init_data(args_t *args) ksprintf(&args->kstr,"##INFO=",type); bcf_hdr_printf(args->hdr_out, args->kstr.s, ann->tag,ann->field,args->vep_tag); } + if ( args->raw_vep_request && args->select_tr==SELECT_TR_ALL ) args->raw_vep_request = 0; + if ( args->raw_vep_request ) + { + args->nannot++; + args->annot = (annot_t*)realloc(args->annot,args->nannot*sizeof(*args->annot)); + annot_t *ann = &args->annot[args->nannot-1]; + ann->type = BCF_HT_STR; + ann->idx = -1; + ann->field = strdup(args->vep_tag); + ann->tag = strdup(args->vep_tag); + memset(&ann->str,0,sizeof(ann->str)); + } free(column); free(types); destroy_column2type(args); @@ -579,77 +682,7 @@ static void init_data(args_t *args) convert_set_option(args->convert, subset_samples, &args->smpl_pass); } - // Severity scale - args->csq2severity = khash_str2int_init(); - int severity = 0; - str.l = 0; - if ( args->severity ) - { - kstring_t tmp = {0,0,0}; - htsFile *fp = hts_open(args->severity,"r"); - if ( !fp ) error("Cannot read %s\n", args->severity); - while ( hts_getline(fp, KS_SEP_LINE, &tmp) > 0 ) - { - kputs(tmp.s, &str); - kputc('\n', &str); - } - free(tmp.s); - } - else - kputs(default_severity(),&str); - ep = str.s; - while ( *ep ) - { - if ( *ep=='#' ) - { - while ( *ep && *ep!='\n' ) { *ep = tolower(*ep); ep++; } - if ( !*ep ) break; - ep++; - continue; - } - char *bp = ep; - while ( *ep && !isspace(*ep) ) { *ep = tolower(*ep); ep++; } - char tmp = *ep; - *ep = 0; - args->nscale++; - args->scale = (char**) realloc(args->scale,args->nscale*sizeof(*args->scale)); - 
args->scale[args->nscale-1] = strdup(bp); - if ( !khash_str2int_has_key(args->csq2severity,args->scale[args->nscale-1]) ) - khash_str2int_set(args->csq2severity,args->scale[args->nscale-1], severity); - if ( !tmp ) break; - if ( tmp=='\n' ) severity++; - ep++; - while ( *ep && isspace(*ep) ) ep++; - } free(str.s); - - // Transcript and/or consequence selection - if ( !args->select ) args->select = "all:any"; - cols_t *cols = cols_split(args->select, NULL, ':'); - char *sel_tr = cols->off[0][0] ? cols->off[0] : "all"; - char *sel_csq = cols->n==2 && cols->off[1][0] ? cols->off[1] : "any"; - if ( !strcasecmp(sel_tr,"all") ) args->select_tr = SELECT_TR_ALL; - else if ( !strcasecmp(sel_tr,"worst") ) args->select_tr = SELECT_TR_WORST; - else if ( !strcasecmp(sel_tr,"primary") ) args->select_tr = SELECT_TR_PRIMARY; - else error("Error: the transcript selection key \"%s\" is not recognised.\n", sel_tr); - if ( !strcasecmp(sel_csq,"any") ) { args->min_severity = args->max_severity = SELECT_CSQ_ANY; } // to avoid unnecessary lookups - else - { - int len = strlen(sel_csq); - int severity, modifier = '='; - if ( sel_csq[len-1]=='+' ) { modifier = '+'; sel_csq[len-1] = 0; } - else if ( sel_csq[len-1]=='-' ) { modifier = '-'; sel_csq[len-1] = 0; } - if ( khash_str2int_get(args->csq2severity, sel_csq, &severity)!=0 ) - error("Error: the consequence \"%s\" is not recognised. 
Run \"bcftools +split-vep -S ?\" to see the default list.\n", sel_csq); - if ( modifier=='=' ) { args->min_severity = severity; args->max_severity = severity; } - else if ( modifier=='+' ) { args->min_severity = severity; args->max_severity = INT_MAX; } - else if ( modifier=='-' ) { args->min_severity = 0; args->max_severity = severity; } - } - cols_destroy(cols); - - // The 'CANONICAL' column to look up severity, its name is hardwired for now - if ( args->select_tr==SELECT_TR_PRIMARY && khash_str2int_get(args->field2idx,"CANONICAL",&args->primary_id)!=0 ) - error("The primary transcript was requested but the field \"CANONICAL\" is not present in INFO/%s: %s\n",args->vep_tag,hrec->vals[ret]); } static void destroy_data(args_t *args) { @@ -941,13 +974,16 @@ static void process_record(args_t *args, bcf1_t *rec) continue; } - if ( !*args->cols_csq->off[ann->idx] ) - annot_append(ann, "."); // missing value - else + char *ann_str = NULL; + if ( ann->idx==-1 ) ann_str = args->cols_tr->off[i]; + else if ( *args->cols_csq->off[ann->idx] ) ann_str = args->cols_csq->off[ann->idx]; + if ( ann_str ) { - annot_append(ann, args->cols_csq->off[ann->idx]); + annot_append(ann, ann_str); all_missing = 0; } + else + annot_append(ann, "."); // missing value } if ( args->duplicate ) diff --git a/test/split-vep.21.out b/test/split-vep.21.out new file mode 100644 index 000000000..6c831f7e0 --- /dev/null +++ b/test/split-vep.21.out @@ -0,0 +1,4 @@ +1 145075172 . G A . . BCSQ=intron|PDE4DIP||protein_coding;Consequence=intron;amino_acid_change=. +2 175216515 . A G . . BCSQ=intron|CIR1||protein_coding;Consequence=intron;amino_acid_change=. +20 37400393 . C T . . BCSQ=synonymous|ACTR5|ENST00000243903|protein_coding|+|586S|37400393C>T;Consequence=synonymous;amino_acid_change=586S +X 48822646 . T G . . 
BCSQ=missense|KCND1|ENST00000376477|protein_coding|-|135T>135P|48822646T>G;Consequence=missense;amino_acid_change=135T>135P diff --git a/test/split-vep.22.out b/test/split-vep.22.out new file mode 100644 index 000000000..dd2c348c0 --- /dev/null +++ b/test/split-vep.22.out @@ -0,0 +1,4 @@ +145075172 intron . intron|PDE4DIP||protein_coding +175216515 intron . intron|CIR1||protein_coding +37400393 synonymous 586S synonymous|ACTR5|ENST00000243903|protein_coding|+|586S|37400393C>T +48822646 missense 135T>135P missense|KCND1|ENST00000376477|protein_coding|-|135T>135P|48822646T>G diff --git a/test/split-vep.8.vcf b/test/split-vep.8.vcf new file mode 100644 index 000000000..9b24cdbad --- /dev/null +++ b/test/split-vep.8.vcf @@ -0,0 +1,12 @@ +##fileformat=VCFv4.2 +##FILTER= +##contig= +##contig= +##contig= +##contig= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 145075172 . G A . . BCSQ=intron|PDE4DIP||protein_coding,non_coding|PDE4DIP||retained_intron,non_coding|PDE4DIP||processed_transcript,intron|PDE4DIP||NMD +2 175216515 . A G . . BCSQ=intron|CIR1||protein_coding,non_coding|CIR1||retained_intron +20 37400393 . C T . . BCSQ=synonymous|ACTR5|ENST00000243903|protein_coding|+|586S|37400393C>T +X 48822646 . T G . . 
BCSQ=missense|KCND1|ENST00000376477|protein_coding|-|135T>135P|48822646T>G,missense|KCND1|ENST00000218176|protein_coding|-|512T>512P|48822646T>G,missense|KCND1|ENST00000419374|protein_coding|-|83T>83P|48822646T>G diff --git a/test/test.pl b/test/test.pl index caff67ef4..84f4e0bd2 100755 --- a/test/test.pl +++ b/test/test.pl @@ -525,6 +525,8 @@ test_vcf_plugin($opts,in=>'split-vep.6',out=>'split-vep.18.out',cmd=>'+split-vep',args=>qq[-c - | grep -v ^#]); test_vcf_plugin($opts,in=>'split-vep.6',out=>'split-vep.19.out',cmd=>'+split-vep',args=>qq[-c - -s worst | grep -v ^#]); test_vcf_plugin($opts,in=>'split-vep.7',out=>'split-vep.20.out',cmd=>'+split-vep',args=>qq[--annotation 'ANN' -c IMPACT -i 'INFO/IMPACT[*] ~ "MODIFIER"' | grep -v ^#]); +test_vcf_plugin($opts,in=>'split-vep.8',out=>'split-vep.21.out',cmd=>'+split-vep',args=>qq[-a BCSQ -s worst -c Consequence,amino_acid_change,BCSQ | grep -v ^#]); +test_vcf_plugin($opts,in=>'split-vep.8',out=>'split-vep.22.out',cmd=>'+split-vep',args=>qq[-a BCSQ -s worst -f '%POS\\t%Consequence\\t%amino_acid_change\\t%BCSQ\\n' | grep -v ^#]); test_vcf_plugin($opts,in=>'parental-origin',out=>'parental-origin.1.out',cmd=>'+parental-origin',args=>qq[-r 20:100 -p proband,father,mother -t del | grep -v ^#]); test_vcf_plugin($opts,in=>'parental-origin',out=>'parental-origin.2.out',cmd=>'+parental-origin',args=>qq[-r 20:101 -p proband,father,mother -t del | grep -v ^#]); test_vcf_plugin($opts,in=>'parental-origin',out=>'parental-origin.3.out',cmd=>'+parental-origin',args=>qq[-r 20:102 -p proband,father,mother -t del | grep -v ^#]); From 9174f820d4f93b5ecf219a55c78eb52d85f2af1e Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 27 Jan 2021 15:15:36 +0000 Subject: [PATCH 43/81] Add a new `consensus --mask-with` option. 
Resolves #1382 --- consensus.c | 142 +++++++++++++++++++++++++++++------------- doc/bcftools.txt | 10 ++- test/consensus.16.out | 20 ++++++ test/test.pl | 2 + 4 files changed, 126 insertions(+), 48 deletions(-) create mode 100644 test/consensus.16.out diff --git a/consensus.c b/consensus.c index d7c6de235..51c91d4e9 100644 --- a/consensus.c +++ b/consensus.c @@ -68,6 +68,17 @@ typedef struct } chain_t; +#define MASK_LC 1 +#define MASK_UC 2 +#define MASK_SKIP(x) (((x)->with!=MASK_LC && (x)->with!=MASK_UC) ? 1 : 0) +typedef struct +{ + char *fname, with; + regidx_t *idx; + regitr_t *itr; +} +mask_t; + typedef struct { kstring_t fa_buf; // buffered reference sequence @@ -88,8 +99,8 @@ typedef struct int nvcf_buf, rid; char *chr, *chr_prefix; - regidx_t *mask; - regitr_t *itr; + mask_t *mask; + int nmask; int chain_id; // chain_id, to provide a unique ID to each chain in the chain output chain_t *chain; // chain structure to store the sequence of ungapped blocks between the ref and alt sequences @@ -227,11 +238,13 @@ static void init_data(args_t *args) if ( bcf_hdr_nsamples(args->hdr) > 1 ) error("The --sample option is expected with --haplotype\n"); args->isample = 0; } - if ( args->mask_fname ) + int i; + for (i=0; i<args->nmask; i++) { - args->mask = regidx_init(args->mask_fname,NULL,NULL,0,NULL); - if ( !args->mask ) error("Failed to initialize mask regions\n"); - args->itr = regitr_init(args->mask); + mask_t *mask = &args->mask[i]; + mask->idx = regidx_init(mask->fname,NULL,NULL,0,NULL); + if ( !mask->idx ) error("Failed to initialize mask regions\n"); + mask->itr = regitr_init(mask->idx); } // In case we want to store the chains if ( args->chain_fname ) @@ -252,7 +265,23 @@ static void init_data(args_t *args) args->filter = filter_init(args->hdr, args->filter_str); args->rid = -1; } - +static void add_mask(args_t *args, char *fname) +{ + args->nmask++; + args->mask = (mask_t*)realloc(args->mask,args->nmask*sizeof(*args->mask)); + mask_t *mask = 
&args->mask[args->nmask-1]; + mask->fname = fname; + mask->with = 'N'; +} +static void add_mask_with(args_t *args, char *with) +{ + if ( !args->nmask ) error("The --mask-with option must follow --mask\n"); + mask_t *mask = &args->mask[args->nmask-1]; + if ( !strcasecmp(with,"uc") ) mask->with = MASK_UC; + else if ( !strcasecmp(with,"lc") ) mask->with = MASK_LC; + else if ( strlen(with)!=1 ) error("Expected \"lc\", \"uc\", or a single character with the --mask-with option\n"); + else mask->with = *with; +} static void destroy_data(args_t *args) { if (args->filter) filter_destroy(args->filter); @@ -263,8 +292,13 @@ static void destroy_data(args_t *args) free(args->vcf_buf); free(args->fa_buf.s); free(args->chr); - if ( args->mask ) regidx_destroy(args->mask); - if ( args->itr ) regitr_destroy(args->itr); + for (i=0; i<args->nmask; i++) + { + mask_t *mask = &args->mask[i]; + regidx_destroy(mask->idx); + regitr_destroy(mask->itr); + } + free(args->mask); if ( args->chain_fname ) if ( fclose(args->fp_chain) ) error("Close failed: %s\n", args->chain_fname); if ( fclose(args->fp_out) ) error("Close failed: %s\n", args->output_fname); @@ -447,15 +481,20 @@ static void apply_variant(args_t *args, bcf1_t *rec) if ( args->absent_allele ) apply_absent(args, rec->pos); if ( rec->n_allele==1 && !args->missing_allele && !args->absent_allele ) { return; } + int i; if ( args->mask ) { char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid); int start = rec->pos; int end = rec->pos + rec->rlen - 1; - if ( regidx_overlap(args->mask, chr,start,end,NULL) ) return; + for (i=0; i<args->nmask; i++) + { + mask_t *mask = &args->mask[i]; + if ( MASK_SKIP(mask) && regidx_overlap(mask->idx, chr,start,end,NULL) ) return; + } } - int i, ialt = 1; // the alternate allele + int ialt = 1; // the alternate allele if ( args->isample >= 0 ) { bcf_unpack(rec, BCF_UN_FMT); @@ -847,17 +886,27 @@ static void mask_region(args_t *args, char *seq, int len) { int start = args->fa_src_pos - len; int end = args->fa_src_pos; 
+ int i; - if ( !regidx_overlap(args->mask, args->chr,start,end, args->itr) ) return; - - int idx_start, idx_end, i; - while ( regitr_overlap(args->itr) ) + for (i=0; i<args->nmask; i++) { - idx_start = args->itr->beg - start; - idx_end = args->itr->end - start; - if ( idx_start < 0 ) idx_start = 0; - if ( idx_end >= len ) idx_end = len - 1; - for (i=idx_start; i<=idx_end; i++) seq[i] = 'N'; + mask_t *mask = &args->mask[i]; + if ( !regidx_overlap(mask->idx, args->chr,start,end, mask->itr) ) continue; + + int idx_start, idx_end, j; + while ( regitr_overlap(mask->itr) ) + { + idx_start = mask->itr->beg - start; + idx_end = mask->itr->end - start; + if ( idx_start < 0 ) idx_start = 0; + if ( idx_end >= len ) idx_end = len - 1; + if ( mask->with==MASK_UC ) + for (j=idx_start; j<=idx_end; j++) seq[j] = toupper(seq[j]); + else if ( mask->with==MASK_LC ) + for (j=idx_start; j<=idx_end; j++) seq[j] = tolower(seq[j]); + else + for (j=idx_start; j<=idx_end; j++) seq[j] = mask->with; + } } } @@ -968,31 +1017,32 @@ static void usage(args_t *args) fprintf(stderr, " --sample (and, optionally, --haplotype) option will apply genotype\n"); fprintf(stderr, " (or haplotype) calls from FORMAT/GT. 
The program ignores allelic depth\n"); fprintf(stderr, " information, such as INFO/AD or FORMAT/AD.\n"); - fprintf(stderr, "Usage: bcftools consensus [OPTIONS] \n"); + fprintf(stderr, "Usage: bcftools consensus [OPTIONS] \n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -c, --chain FILE write a chain file for liftover\n"); - fprintf(stderr, " -a, --absent CHAR replace positions absent from VCF with CHAR\n"); - fprintf(stderr, " -e, --exclude EXPR exclude sites for which the expression is true (see man page for details)\n"); - fprintf(stderr, " -f, --fasta-ref FILE reference sequence in fasta format\n"); - fprintf(stderr, " -H, --haplotype WHICH choose which allele to use from the FORMAT/GT field, note\n"); - fprintf(stderr, " the codes are case-insensitive:\n"); - fprintf(stderr, " 1: first allele from GT, regardless of phasing\n"); - fprintf(stderr, " 2: second allele from GT, regardless of phasing\n"); - fprintf(stderr, " R: REF allele in het genotypes\n"); - fprintf(stderr, " A: ALT allele\n"); - fprintf(stderr, " LR,LA: longer allele and REF/ALT if equal length\n"); - fprintf(stderr, " SR,SA: shorter allele and REF/ALT if equal length\n"); - fprintf(stderr, " 1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n"); - fprintf(stderr, " -i, --include EXPR select sites for which the expression is true (see man page for details)\n"); - fprintf(stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n"); - fprintf(stderr, " --mark-del CHAR instead of removing sequence, insert CHAR for deletions\n"); - fprintf(stderr, " --mark-ins uc|lc highlight inserted sequence in uppercase (uc) or lowercase (lc), leaving the rest as is\n"); - fprintf(stderr, " --mark-snv uc|lc highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest as is\n"); - fprintf(stderr, " -m, --mask FILE replace regions with N\n"); - fprintf(stderr, " -M, --missing CHAR output CHAR instead of skipping a missing genotype 
\"./.\"\n"); - fprintf(stderr, " -o, --output FILE write output to a file [standard output]\n"); - fprintf(stderr, " -p, --prefix STRING prefix to add to output sequence names\n"); - fprintf(stderr, " -s, --sample NAME apply variants of the given sample\n"); + fprintf(stderr, " -c, --chain FILE write a chain file for liftover\n"); + fprintf(stderr, " -a, --absent CHAR replace positions absent from VCF with CHAR\n"); + fprintf(stderr, " -e, --exclude EXPR exclude sites for which the expression is true (see man page for details)\n"); + fprintf(stderr, " -f, --fasta-ref FILE reference sequence in fasta format\n"); + fprintf(stderr, " -H, --haplotype WHICH choose which allele to use from the FORMAT/GT field, note\n"); + fprintf(stderr, " the codes are case-insensitive:\n"); + fprintf(stderr, " 1: first allele from GT, regardless of phasing\n"); + fprintf(stderr, " 2: second allele from GT, regardless of phasing\n"); + fprintf(stderr, " R: REF allele in het genotypes\n"); + fprintf(stderr, " A: ALT allele\n"); + fprintf(stderr, " LR,LA: longer allele and REF/ALT if equal length\n"); + fprintf(stderr, " SR,SA: shorter allele and REF/ALT if equal length\n"); + fprintf(stderr, " 1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n"); + fprintf(stderr, " -i, --include EXPR select sites for which the expression is true (see man page for details)\n"); + fprintf(stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n"); + fprintf(stderr, " --mark-del CHAR instead of removing sequence, insert CHAR for deletions\n"); + fprintf(stderr, " --mark-ins uc|lc highlight insertions in uppercase (uc) or lowercase (lc), leaving the rest as is\n"); + fprintf(stderr, " --mark-snv uc|lc highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest as is\n"); + fprintf(stderr, " -m, --mask FILE replace regions with N\n"); + fprintf(stderr, " --mask-with CHAR|lc|uc replace regions with N\n"); + fprintf(stderr, " -M, --missing 
CHAR output CHAR instead of skipping a missing genotype \"./.\"\n"); + fprintf(stderr, " -o, --output FILE write output to a file [standard output]\n"); + fprintf(stderr, " -p, --prefix STRING prefix to add to output sequence names\n"); + fprintf(stderr, " -s, --sample NAME apply variants of the given sample\n"); fprintf(stderr, "Examples:\n"); fprintf(stderr, " # Get the consensus for one region. The fasta header lines are then expected\n"); fprintf(stderr, " # in the form \">chr:from-to\".\n"); @@ -1011,6 +1061,7 @@ int main_consensus(int argc, char *argv[]) {"mark-del",required_argument,NULL,1}, {"mark-ins",required_argument,NULL,2}, {"mark-snv",required_argument,NULL,3}, + {"mask-with",1,0,4}, {"exclude",required_argument,NULL,'e'}, {"include",required_argument,NULL,'i'}, {"sample",1,0,'s'}, @@ -1048,7 +1099,8 @@ int main_consensus(int argc, char *argv[]) case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'f': args->ref_fname = optarg; break; - case 'm': args->mask_fname = optarg; break; + case 'm': add_mask(args,optarg); break; + case 4 : add_mask_with(args,optarg); break; case 'a': args->absent_allele = optarg[0]; if ( optarg[1]!=0 ) error("Expected single character with -a, got \"%s\"\n", optarg); diff --git a/doc/bcftools.txt b/doc/bcftools.txt index 27798bde0..f1a022afe 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -854,16 +854,20 @@ depth information, such as INFO/AD or FORMAT/AD. 
For that, consider using the instead of removing sequence, insert CHAR for deletions *--mark-ins* 'uc'|'lc':: - highlight inserted sequence in upper (uc) or lower (lc) case, leaving the rest of the sequence as is + highlight inserted sequence in uppercase (uc) or lowercase (lc), leaving the rest of the sequence as is *--mark-snv* 'uc'|'lc':: - highlight substitutions in upper (uc) or lower (lc) case, leaving the rest of the sequence as is + highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest of the sequence as is *-m, --mask* 'FILE':: - BED file or TAB file with regions to be replaced with N. See discussion + BED file or TAB file with regions to be replaced with N (the default) or as specified by + the next *--mask-with* option. See discussion of *--regions-file* in *<>* for file format details. +*--mask-with* 'CHAR'|'lc'|'uc':: + replace sequence from *--mask* with CHAR or change to lowercase (lc) or uppercase (uc) + *-M, --missing* 'CHAR':: instead of skipping the missing genotypes, output the character CHAR (e.g. 
"?") diff --git a/test/consensus.16.out b/test/consensus.16.out new file mode 100644 index 000000000..1b0e1b4ba --- /dev/null +++ b/test/consensus.16.out @@ -0,0 +1,20 @@ +>1:2-501 +TACMAWATRTGATAAAATMAAAAAGAACATAACCTACGTATCAACTAAAGTGGTTGTTTG +MAGAAAAGGAAGACTTAAAAAGAGTCAGTACTAACCTACATAATATATACAATGTTCATT +AAATAATAAAATGAGCTCATCATACTTAGGTCATCATAAATATATCTGAAATTCACAAAT +ATTGATCAAATGGTAAAATAGACAAGTAGATTTTAATAGGTTAAACAATTACTGATTCTC +TTGAAAGAATAAATTTAATATGAGACCTATTTCATTATAATGAACTCACAAATTAGAAAC +TTCACACTGGGGGCTGGAGAGATGGCTCAGTAGTTAAGAACACTGACTGCTCTTCTGAAG +GTCCTGAGTTCAAATCCCAGCAACCACATGGTGACTTACAACCATCTGTAATGACATCTG +xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx +TTTAAAAACAAAAAAAAAGAA +>2 +xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx +AAATTAGTGATTTTCCATATTCTTTAAGTCATTTTAGAGTAATGTGTTCTTAAGATTTCA +GAAAAACAAAAACTTGTGCTTTCCTGTTTGAAAAACAAACAGCTGTGGGGAATGGACGTA +CGTTGTCGGGACAGCCTTTTTATAAAATAATGTTGAGGCTTTGATACGTCAAAGxxxxxx +xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxTTTGCT +GCTGCCAATGACAGCACACCCTGGGAATGCCCCAACTACTTACTACAAAGCAGTGTTACA +TGGAGAAGATCTTCAAGAGTCTTTTTGCTAGATCTTTCCTTGGCTTTTGATGTGACTCCT +CTCAATAAAATCCACAGTAATATAGTGAGTGGTCTCCTGCTCCAAACCAGTATTYCAGAC +ACAGTTAATCCAGAC diff --git a/test/test.pl b/test/test.pl index 84f4e0bd2..7c6b736a9 100755 --- a/test/test.pl +++ b/test/test.pl @@ -597,6 +597,7 @@ test_vcf_consensus($opts,in=>'consensus',out=>'consensus.2.out',fa=>'consensus.fa',mask=>'consensus.tab',args=>'-H 1'); test_vcf_consensus_chain($opts,in=>'consensus',out=>'consensus.2.chain',chain=>'consensus.2.chain',fa=>'consensus.fa',mask=>'consensus.tab',args=>'-H 1'); test_vcf_consensus($opts,in=>'consensus',out=>'consensus.3.out',fa=>'consensus.fa',mask=>'consensus.tab',args=>'-I'); +test_vcf_consensus($opts,in=>'consensus',out=>'consensus.16.out',fa=>'consensus.fa',args=>'-I -m {PATH}/consensus.tab --mask-with X -m {PATH}/consensus.tab --mask-with lc'); 
test_vcf_consensus_chain($opts,in=>'consensus',out=>'consensus.3.chain',chain=>'consensus.3.chain',fa=>'consensus.fa',mask=>'consensus.tab',args=>'-I'); test_vcf_consensus($opts,in=>'consensus',out=>'consensus.4.out',fa=>'consensus.fa',args=>'-H 1'); test_vcf_consensus_chain($opts,in=>'consensus',out=>'consensus.4.chain',chain=>'consensus.4.chain',fa=>'consensus.fa',args=>'-H 1'); @@ -1466,6 +1467,7 @@ sub test_vcf_consensus { my ($opts,%args) = @_; bgzip_tabix_vcf($opts,$args{in}); + $args{args} =~ s/{PATH}/$$opts{path}/g; my $mask = $args{mask} ? "-m $$opts{path}/$args{mask}" : ''; my $chain = $args{chain} ? "-c $$opts{tmp}/$args{chain}" : ''; test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools consensus $$opts{tmp}/$args{in}.vcf.gz -f $$opts{path}/$args{fa} $args{args} $mask $chain"); From af547074b3f823ce324813d47aa9e793e41a4240 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 27 Jan 2021 16:18:36 +0000 Subject: [PATCH 44/81] Use hts_[sl]rand() functions instead of introducing PCG. Reverts and replaces 61bab7529f. 
--- Makefile | 8 +-- NEWS | 139 +++++++++++++++++++++++++++++++++++++++++++++ pcg.c | 116 ------------------------------------- pcg.h | 80 -------------------------- plugins/prune.c | 3 +- test/gtcheck.8.out | 8 +-- test/prune.1.7.out | 4 +- vcfbuf.c | 15 ++--- vcfbuf.h | 2 - vcfgtcheck.c | 7 ++- 10 files changed, 158 insertions(+), 224 deletions(-) delete mode 100644 pcg.c delete mode 100644 pcg.h diff --git a/Makefile b/Makefile index a1894b38b..86fab623f 100644 --- a/Makefile +++ b/Makefile @@ -42,8 +42,7 @@ OBJS = main.o vcfindex.o tabix.o \ regidx.o smpl_ilist.o csq.o vcfbuf.o \ mpileup.o bam2bcf.o bam2bcf_indel.o bam_sample.o \ vcfsort.o cols.o extsort.o dist.o \ - ccall.o em.o prob1.o kmin.o \ - pcg.o + ccall.o em.o prob1.o kmin.o PLUGIN_OBJS = vcfplugin.o prefix = /usr/local @@ -240,7 +239,7 @@ vcfcall.o: vcfcall.c $(htslib_vcf_h) $(htslib_kfunc_h) $(htslib_synced_bcf_reade vcfconcat.o: vcfconcat.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_thread_pool_h) $(bcftools_h) vcfconvert.o: vcfconvert.c $(htslib_faidx_h) $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_kseq_h) $(bcftools_h) $(filter_h) $(convert_h) $(tsv2vcf_h) vcffilter.o: vcffilter.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) rbuf.h -vcfgtcheck.o: vcfgtcheck.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_kbitset_h) $(bcftools_h) extsort.h pcg.h +vcfgtcheck.o: vcfgtcheck.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_kbitset_h) $(bcftools_h) extsort.h vcfindex.o: vcfindex.c $(htslib_vcf_h) $(htslib_tbx_h) $(htslib_kstring_h) $(htslib_bgzf_h) $(bcftools_h) vcfisec.o: vcfisec.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcftools_h) $(filter_h) vcfmerge.o: vcfmerge.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) 
$(htslib_faidx_h) regidx.h $(bcftools_h) vcmp.h $(htslib_khash_h) @@ -280,8 +279,7 @@ bam_sample.o: bam_sample.c $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_st version.o: version.h version.c hclust.o: hclust.c $(htslib_hts_h) $(htslib_kstring_h) $(bcftools_h) hclust.h HMM.o: HMM.c $(htslib_hts_h) HMM.h -vcfbuf.o: vcfbuf.c $(htslib_vcf_h) $(htslib_vcfutils_h) $(bcftools_h) $(vcfbuf_h) rbuf.h pcg.h -pcg.o: pcg.c +vcfbuf.o: vcfbuf.c $(htslib_vcf_h) $(htslib_vcfutils_h) $(bcftools_h) $(vcfbuf_h) rbuf.h extsort.o: extsort.c $(bcftools_h) extsort.h kheap.h smpl_ilist.o: smpl_ilist.c $(bcftools_h) $(smpl_ilist_h) csq.o: csq.c $(htslib_hts_h) $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_h) $(htslib_khash_str2int_h) $(htslib_kseq_h) $(htslib_faidx_h) $(bcftools_h) $(filter_h) regidx.h kheap.h $(smpl_ilist_h) rbuf.h diff --git a/NEWS b/NEWS index 2594a871f..a2b0eba64 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,144 @@ ## Release a.b +Changes affecting the whole of bcftools, or multiple commands: + +* The output file type (-O, --output-type) needs not to be specified anymore + and is determined from the output file name suffix. + +* Make F_MISSING in filtering expressions work for sites with multiple ALT alleles (#1343) + + +Changes affecting specific commands: + +* bcftools annotate: + + - New `--rename-annots` option to help fix broken VCFs (#1335) + + - New -C option allows to read a long list of options from a file to prevent very + long command lines. + + - New `append-missing` logic allows annotations to be added for each ALT allele in the + same order as they appear in the VCF. Note that this is not bullet proof. 
In order for + this to work: + + - the annotation file must have one line per ALT allele + + - fields must contain a single value as multiple values are appended as they are and + would break the correspondence between the alleles and values + +* bcftools concat: + + - Do not phase genotypes by mistake if they are not already phased with `-l` (#1346) + +* bcftools consensus: + + - New `--mask-with`, `--mark-del`, `--mark-ins`, `--mark-snv` (#1382, #1381, #1170) + + - Symbolic <DEL> should have only one REF base. If there are multiple, take POS+1 + as the first deleted base. + + - Make consensus work when the first base of the reference genome is deleted. In this + situation the VCF record has POS=1 and the first REF base cannot precede the event. + (#1330) + +* bcftools +contrast: + + - The NOVELGT annotation was previously not added when requested. + +* bcftools convert: + + - Make the --hapsample and --hapsample2vcf options consistent with each other + and with the documentation. + +* bcftools call: + + - Revamp of `call -G`, previously sample grouping by population was not truly independent + and could still be influenced by the presence of other sample groups. 
+ - Explicit --group-samples-tag option instead of the --group-samples TAG:file + functionality (#1370) + + - Optional addition of INFO/PV4 annotation with `call -a INFO/PV4` + + - Remove generation of useless HOB and ICB annotation; use `+fill-tags -- -t HWE,ExcHet` instead + + - The `call -f` option was renamed to `-a` to (1) make it consistent with `mpileup` and (2) to indicate + that it includes both INFO and FORMAT annotations, not just FORMAT as previously + + - Any sensible Number=R,Type=Integer annotation can be used with -G, such as AD or QS + + - Don't trim QUAL; although usefulness of this change is questionable for true probabilistic + interpretation (such high precision is unrealistic), using QUAL as a score rather than probability + is helpful and permits more fine-grained filtering + + - Fix a suspected bug in `call -F` in the worst case, for certain improve readability + + - `call -C trio` is temporarily disabled + +* bcftools +fill-tags: + + - MAF definition revised for multiallelic sites, the second most common allele is considered to be the + minor allele (#1313) + +* bcftools gtcheck: + + - support matching of a single sample against all other samples in the file with + `-s qry:sample -s gt:-`. This was previously not possible, either full cross-check mode had to be run or + a list of pairs/samples had to be created explicitly + +* bcftools merge: + + - Make `merge -R` behavior consistent with other commands and pull in overlapping + records with POS outside of the regions (#1374) + + - Bug fix (#1353) + +* bcftools mpileup: + + - Add new optional tag `mpileup -a FORMAT/QS` + +* bcftools +prune: + + - New options --random-seed and --nsites-per-win-mode (#1050) + +* bcftools +split-vep: + + - Transcript selection now works also on the raw CSQ/BCSQ annotation. 
+ + - Bug fix, samples were dropped on VCF input and VCF/BCF output (#1349) + +* bcftools stats: + + - Changes to QUAL and ts/tv plotting stats: avoid capping QUAL to + predefined bins, use an open-range logarithmic binning instead + + - plot dual ts/tv stats: per quality bin and cumulative as if threshold + applied on the whole dataset + +* bcftools +trio-dnm2: + + - Major revamp of +trio-dnm plugin, which is now deprecated and replaced with + +trio-dnm2. + + The original trio-dnm calling model used genotype likelihoods (PLs) as the + input for calling. However, that is flawed because PLs make assumptions which + are unsuitable for de novo calling: PL(RR) can become bigger than PL(RA) even + when the ALT allele is present in the parents. Note that this is true also + for other programs such as DeNovoGear which rely on the same samtools calculation. + + The new recommended workflow is + + bcftools mpileup -a AD,QS -f ref.fa -Ou proband.bam father.bam mother.bam | + bcftools call -mv -Ou | + bcftools +trio-dnm -p proband,father,mother -Oz -o output.vcf.gz + + This new version also implements the DeNovoGear model. The original behavior of trio-dnm + is no longer supported. + + For more details see http://samtools.github.io/bcftools/trio-dnm.pdf + + + ## Release 1.11 (22nd September 2020) diff --git a/pcg.c b/pcg.c deleted file mode 100644 index 89e00645c..000000000 --- a/pcg.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - * PCG Random Number Generation for C. - * - * Copyright 2014 Melissa O'Neill - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - * - * For additional information about the PCG random number generation scheme, - * including its license and other licensing options, visit - * - * http://www.pcg-random.org - */ - -/* - * This code is derived from the full C implementation, which is in turn - * derived from the canonical C++ PCG implementation. The C++ version - * has many additional features and is preferable if you can use C++ in - * your project. - */ - -#include "pcg.h" - -// state for global RNGs - -static pcg32_random_t pcg32_global = PCG32_INITIALIZER; - -// pcg32_srandom(initstate, initseq) -// pcg32_srandom_r(rng, initstate, initseq): -// Seed the rng. Specified in two parts, state initializer and a -// sequence selection constant (a.k.a. stream id) - -void pcg32_srandom_r(pcg32_random_t* rng, uint64_t initstate, uint64_t initseq) -{ - rng->state = 0U; - rng->inc = (initseq << 1u) | 1u; - pcg32_random_r(rng); - rng->state += initstate; - pcg32_random_r(rng); -} - -void pcg32_srandom(uint64_t seed, uint64_t seq) -{ - pcg32_srandom_r(&pcg32_global, seed, seq); -} - -// pcg32_random() -// pcg32_random_r(rng) -// Generate a uniformly distributed 32-bit random number - -uint32_t pcg32_random_r(pcg32_random_t* rng) -{ - uint64_t oldstate = rng->state; - rng->state = oldstate * 6364136223846793005ULL + rng->inc; - uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u; - uint32_t rot = oldstate >> 59u; - return (xorshifted >> rot) | (xorshifted << ((-rot) & 31)); -} - -uint32_t pcg32_random() -{ - return pcg32_random_r(&pcg32_global); -} - - -// pcg32_boundedrand(bound): -// pcg32_boundedrand_r(rng, bound): -// Generate a uniformly distributed number, r, where 0 <= r < bound - -uint32_t pcg32_boundedrand_r(pcg32_random_t* rng, uint32_t bound) -{ - // To avoid bias, we need to make the range of the RNG a multiple of - // bound, which we do by dropping output less than a threshold. 
- // A naive scheme to calculate the threshold would be to do - // - // uint32_t threshold = 0x100000000ull % bound; - // - // but 64-bit div/mod is slower than 32-bit div/mod (especially on - // 32-bit platforms). In essence, we do - // - // uint32_t threshold = (0x100000000ull-bound) % bound; - // - // because this version will calculate the same modulus, but the LHS - // value is less than 2^32. - - uint32_t threshold = -bound % bound; - - // Uniformity guarantees that this loop will terminate. In practice, it - // should usually terminate quickly; on average (assuming all bounds are - // equally likely), 82.25% of the time, we can expect it to require just - // one iteration. In the worst case, someone passes a bound of 2^31 + 1 - // (i.e., 2147483649), which invalidates almost 50% of the range. In - // practice, bounds are typically small and only a tiny amount of the range - // is eliminated. - for (;;) { - uint32_t r = pcg32_random_r(rng); - if (r >= threshold) - return r % bound; - } -} - - -uint32_t pcg32_boundedrand(uint32_t bound) -{ - return pcg32_boundedrand_r(&pcg32_global, bound); -} - diff --git a/pcg.h b/pcg.h deleted file mode 100644 index db13d8cad..000000000 --- a/pcg.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * PCG Random Number Generation for C. - * - * Copyright 2014 Melissa O'Neill - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - * For additional information about the PCG random number generation scheme, - * including its license and other licensing options, visit - * - * http://www.pcg-random.org - */ - -/* - * This code is derived from the full C implementation, which is in turn - * derived from the canonical C++ PCG implementation. The C++ version - * has many additional features and is preferable if you can use C++ in - * your project. - */ - -#ifndef PCG_BASIC_H_INCLUDED -#define PCG_BASIC_H_INCLUDED 1 - -#include - -#if __cplusplus -extern "C" { -#endif - -struct pcg_state_setseq_64 { // Internals are *Private*. - uint64_t state; // RNG state. All values are possible. - uint64_t inc; // Controls which RNG sequence (stream) is - // selected. Must *always* be odd. -}; -typedef struct pcg_state_setseq_64 pcg32_random_t; - -#define PCG32_RAND_MAX UINT32_MAX - -// If you *must* statically initialize it, here's one. - -#define PCG32_INITIALIZER { 0x853c49e6748fea9bULL, 0xda3e39cb94b95bdbULL } - -// pcg32_srandom(initstate, initseq) -// pcg32_srandom_r(rng, initstate, initseq): -// Seed the rng. Specified in two parts, state initializer and a -// sequence selection constant (a.k.a. 
stream id) - -void pcg32_srandom(uint64_t initstate, uint64_t initseq); -void pcg32_srandom_r(pcg32_random_t* rng, uint64_t initstate, - uint64_t initseq); - -// pcg32_random() -// pcg32_random_r(rng) -// Generate a uniformly distributed 32-bit random number - -uint32_t pcg32_random(void); -uint32_t pcg32_random_r(pcg32_random_t* rng); - -// pcg32_boundedrand(bound): -// pcg32_boundedrand_r(rng, bound): -// Generate a uniformly distributed number, r, where 0 <= r < bound - -uint32_t pcg32_boundedrand(uint32_t bound); -uint32_t pcg32_boundedrand_r(pcg32_random_t* rng, uint32_t bound); - -#if __cplusplus -} -#endif - -#endif // PCG_BASIC_H_INCLUDED diff --git a/plugins/prune.c b/plugins/prune.c index de4f63cd8..d23550082 100644 --- a/plugins/prune.c +++ b/plugins/prune.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include "bcftools.h" @@ -191,7 +192,7 @@ static void init_data(args_t *args) if ( args->rand_missing || (args->nsites_mode && !strcasecmp(args->nsites_mode,"rand")) ) { fprintf(stderr,"Using random seed: %d\n",args->rseed); - vcfbuf_set_opt(args->vcfbuf,double,RANDOM_SEED,args->rseed); + hts_srand48(args->rseed); } if ( args->rand_missing ) vcfbuf_set_opt(args->vcfbuf,int,LD_RAND_MISSING,1); if ( args->nsites ) diff --git a/test/gtcheck.8.out b/test/gtcheck.8.out index d7dc506cb..ef120c68d 100644 --- a/test/gtcheck.8.out +++ b/test/gtcheck.8.out @@ -1,7 +1,7 @@ DC A B 3 1.734223e+01 9 DC C D 6 1.075056e+01 9 DC E F 9 0.000000e+00 9 -DS 2 3 3 0 -DS 3 3 3 1 -DS 1 3 3 2 -DS 1 2 2 3 +DS 1 3 3 0 +DS 2 3 3 1 +DS 3 3 3 2 +DS 3 2 2 3 diff --git a/test/prune.1.7.out b/test/prune.1.7.out index 4e3841842..b8d53abf4 100644 --- a/test/prune.1.7.out +++ b/test/prune.1.7.out @@ -6,6 +6,6 @@ ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s1 s2 s3 1 102 . T A . . AF=0.2 GT 0/1 0/1 0/1 -1 103 . T A . . AF=0.1 GT 0/1 0/0 0/0 -1 106 . T A . . AF=0.1 GT 0/1 1/1 1/1 +1 104 . T A . . AF=0.3 GT 0/0 0/0 0/0 +1 105 . T A . . 
AF=0.2 GT 0/0 0/0 0/0 1 108 . T A . . AF=0.2 GT 0/1 1/1 0/1 diff --git a/vcfbuf.c b/vcfbuf.c index c0e720a83..982c658a8 100644 --- a/vcfbuf.c +++ b/vcfbuf.c @@ -27,9 +27,9 @@ #include #include #include +#include #include "bcftools.h" #include "vcfbuf.h" -#include "pcg.h" #include "rbuf.h" typedef struct @@ -82,7 +82,6 @@ struct _vcfbuf_t prune_t prune; overlap_t overlap; rmdup_t rmdup; - pcg32_random_t rng; }; vcfbuf_t *vcfbuf_init(bcf_hdr_t *hdr, int win) @@ -118,12 +117,6 @@ void vcfbuf_set(vcfbuf_t *buf, vcfbuf_opt_t key, void *value) if ( key==LD_MAX_LD ) { buf->ld.max[VCFBUF_LD_IDX_LD] = *((double*)value); return; } if ( key==LD_MAX_HD ) { buf->ld.max[VCFBUF_LD_IDX_HD] = *((double*)value); return; } - if ( key==RANDOM_SEED ) - { - uint64_t seed = *((uint64_t*)value); - pcg32_srandom_r(&buf->rng, seed, seed); - } - if ( key==VCFBUF_NSITES ) { buf->prune.max_sites = *((int*)value); @@ -215,7 +208,7 @@ static void _prune_sites(vcfbuf_t *buf, int flush_all) int eoff = flush_all ? 0 : 1; for (i=0; irbuf.n - eoff) * (double)pcg32_random_r(&buf->rng) / PCG32_RAND_MAX; + int j = (buf->rbuf.n - eoff) * hts_drand48(); rbuf_remove_kth(&buf->rbuf, vcfrec_t, j, buf->vcf); } return; @@ -442,7 +435,7 @@ static int _calc_r2_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec, vcfbuf_ld_t *l if ( aptr[j]==bcf_gt_missing ) { if ( !buf->ld.rand_missing ) break; - if ( (double)pcg32_random_r(&buf->rng)/PCG32_RAND_MAX >= aaf ) adsg += 1; + if ( hts_drand48() >= aaf ) adsg += 1; } else if ( bcf_gt_allele(aptr[j]) ) adsg += 1; an++; @@ -453,7 +446,7 @@ static int _calc_r2_ld(vcfbuf_t *buf, bcf1_t *arec, bcf1_t *brec, vcfbuf_ld_t *l if ( bptr[j]==bcf_gt_missing ) { if ( !buf->ld.rand_missing ) break; - if ( (double)pcg32_random_r(&buf->rng)/PCG32_RAND_MAX >= baf ) bdsg += 1; + if ( hts_drand48() >= baf ) bdsg += 1; } else if ( bcf_gt_allele(bptr[j]) ) bdsg += 1; bn++; diff --git a/vcfbuf.h b/vcfbuf.h index c011d03d8..d3be6c53c 100644 --- a/vcfbuf.h +++ b/vcfbuf.h @@ -44,8 +44,6 @@ typedef 
enum VCFBUF_NSITES_MODE, // one of: maxAF (keep sites with max AF), 1st (sites that come first), rand (pick randomly) VCFBUF_AF_TAG, // use this INFO tag with VCFBUF_NSITES - RANDOM_SEED, // initialize random seed generator used in LD_RAND_MISSING and VCFBUF_NSITES_MODE=rand - // LD related options LD_RAND_MISSING, // randomize rather than ignore missing genotypes LD_FILTER1, // exclude the next record inserted by vcfbuf_push() from LD analysis diff --git a/vcfgtcheck.c b/vcfgtcheck.c index 099b4aea4..0e9aebcd9 100644 --- a/vcfgtcheck.c +++ b/vcfgtcheck.c @@ -38,11 +38,11 @@ THE SOFTWARE. */ #include #include #include +#include #include #include #include "bcftools.h" #include "extsort.h" -#include "pcg.h" //#include "hclust.h" typedef struct @@ -181,13 +181,12 @@ static inline void diff_sites_reset(args_t *args) } static inline void diff_sites_push(args_t *args, int ndiff, int rid, int pos) { - static pcg32_random_t rng = PCG32_INITIALIZER; diff_sites_t *dat = (diff_sites_t*) malloc(args->diff_sites_size); memset(dat,0,sizeof(*dat)); // for debugging: prevent warnings about uninitialized memory coming from struct padding (not needed after rand added) dat->ndiff = ndiff; dat->rid = rid; dat->pos = pos; - dat->rand = pcg32_random_r(&rng); + dat->rand = hts_lrand48(); memcpy(dat->kbs_dat,args->kbs_diff->b,args->kbs_diff->n*sizeof(unsigned long)); extsort_push(args->es,dat); } @@ -233,6 +232,8 @@ static void init_samples(char *list, int list_is_file, int **smpl, int *nsmpl, b static void init_data(args_t *args) { + hts_srand48(0); + args->files = bcf_sr_init(); if ( args->regions && bcf_sr_set_regions(args->files, args->regions, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions); if ( args->targets && bcf_sr_set_targets(args->files, args->targets, args->targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets); From 6e58d535b921eaac06984c8fa8b3b23203373e37 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 2 
Feb 2021 08:40:38 +0000 Subject: [PATCH 45/81] Throw an error if -i/-e expression is given multiple times. Prevents errors #1388 --- consensus.c | 8 ++++++-- csq.c | 8 ++++++-- plugins/contrast.c | 8 ++++++-- plugins/fill-from-fasta.c | 10 +++++++--- plugins/guess-ploidy.c | 10 +++++++--- plugins/gvcfz.c | 8 ++++++-- plugins/indel-stats.c | 10 +++++++--- plugins/parental-origin.c | 10 +++++++--- plugins/prune.c | 8 ++++++-- plugins/remove-overlaps.c | 8 ++++++-- plugins/scatter.c | 8 ++++++-- plugins/setGT.c | 10 +++++++--- plugins/smpl-stats.c | 10 +++++++--- plugins/split-vep.c | 8 ++++++-- plugins/split.c | 8 ++++++-- plugins/trio-dnm2.c | 8 ++++++-- plugins/trio-stats.c | 10 +++++++--- vcfannotate.c | 8 ++++++-- vcfconvert.c | 8 ++++++-- vcffilter.c | 8 ++++++-- vcfplugin.c | 8 ++++++-- vcfquery.c | 10 +++++++--- vcfroh.c | 10 +++++++--- vcfstats.c | 10 +++++++--- vcfview.c | 11 +++++++---- 25 files changed, 161 insertions(+), 62 deletions(-) diff --git a/consensus.c b/consensus.c index 51c91d4e9..bcd49fae8 100644 --- a/consensus.c +++ b/consensus.c @@ -1096,8 +1096,12 @@ int main_consensus(int argc, char *argv[]) case 's': args->sample = optarg; break; case 'o': args->output_fname = optarg; break; case 'I': args->output_iupac = 1; break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'f': args->ref_fname = optarg; break; case 'm': add_mask(args,optarg); break; case 4 : add_mask_with(args,optarg); break; diff --git a/csq.c b/csq.c index 
f1b3aa4a9..1da646539 100644 --- a/csq.c +++ b/csq.c @@ -4178,8 +4178,12 @@ int main_csq(int argc, char *argv[]) default: error("The output type \"%s\" not recognised\n", optarg); } break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'r': regions_list = optarg; break; case 'R': regions_list = optarg; regions_is_file = 1; break; case 's': args->sample_list = optarg; break; diff --git a/plugins/contrast.c b/plugins/contrast.c index 81f914437..c8fce64ad 100644 --- a/plugins/contrast.c +++ b/plugins/contrast.c @@ -462,8 +462,12 @@ int run(int argc, char **argv) case 'a': args->annots_str = optarg; break; case '0': args->control_samples_str = optarg; break; case '1': args->case_samples_str = optarg; break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 't': args->targets = optarg; break; case 'T': args->targets = optarg; args->targets_is_file = 1; break; case 'r': args->regions = optarg; break; diff --git a/plugins/fill-from-fasta.c 
b/plugins/fill-from-fasta.c index d5fe1be2b..fdf4ff054 100644 --- a/plugins/fill-from-fasta.c +++ b/plugins/fill-from-fasta.c @@ -1,6 +1,6 @@ /* plugin/fill-from-fasta.c -- fill-from-fasta plugin. - Copyright (C) 2016 Genome Research Ltd. + Copyright (C) 2016-2021 Genome Research Ltd. Author: Shane McCarthy @@ -108,8 +108,12 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) { switch (c) { - case 'e': filter_str = optarg; filter_logic |= FLT_EXCLUDE; break; - case 'i': filter_str = optarg; filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + filter_str = optarg; filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + filter_str = optarg; filter_logic |= FLT_INCLUDE; break; case 'N': replace_nonACGTN = 1; break; case 'c': column = optarg; break; case 'f': ref_fname = optarg; break; diff --git a/plugins/guess-ploidy.c b/plugins/guess-ploidy.c index cfcb604fe..8f9cf7d06 100644 --- a/plugins/guess-ploidy.c +++ b/plugins/guess-ploidy.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2016 Genome Research Ltd. + Copyright (C) 2016-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -429,8 +429,12 @@ int run(int argc, char **argv) args->af_dflt = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: --AF-dflt %s\n", optarg); break; - case 2: args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 3: args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 2: + if ( args->filter_str ) error("Error: only one --include or --exclude expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 3: + if ( args->filter_str ) error("Error: only one --include or --exclude expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'i': args->include_indels = 1; break; case 'e': args->gt_err_prob = strtod(optarg,&tmp); diff --git a/plugins/gvcfz.c b/plugins/gvcfz.c index 07d826c23..ccacb8a10 100644 --- a/plugins/gvcfz.c +++ b/plugins/gvcfz.c @@ -330,8 +330,12 @@ int run(int argc, char **argv) switch (c) { case 'a': args->trim_alts = 1; break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'g': args->group_by = optarg; break; case 'o': args->output_fname = optarg; break; case 'O': diff --git a/plugins/indel-stats.c b/plugins/indel-stats.c index fd0a009ed..256190963 100644 --- a/plugins/indel-stats.c +++ b/plugins/indel-stats.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2018 Genome Research Ltd. 
+ Copyright (c) 2018-2021 Genome Research Ltd. Author: Petr Danecek @@ -716,8 +716,12 @@ int run(int argc, char **argv) case 3 : args->allow_alt2ref_DNMs = 1; break; case 'p': args->ped_fname = optarg; break; case 'c': args->csq_tag = optarg; break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 't': args->targets = optarg; break; case 'T': args->targets = optarg; args->targets_is_file = 1; break; case 'r': args->regions = optarg; break; diff --git a/plugins/parental-origin.c b/plugins/parental-origin.c index 7bbc02b82..e1271d4ec 100644 --- a/plugins/parental-origin.c +++ b/plugins/parental-origin.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2019 Genome Research Ltd. + Copyright (c) 2019-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -352,8 +352,12 @@ int run(int argc, char **argv) { switch (c) { - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 't': if ( !strcasecmp("dup",optarg) ) args->cnv_type = CNV_DUP; else if ( !strcasecmp("del",optarg) ) args->cnv_type = CNV_DEL; diff --git a/plugins/prune.c b/plugins/prune.c index d23550082..1a0b4ceb1 100644 --- a/plugins/prune.c +++ b/plugins/prune.c @@ -315,8 +315,12 @@ int run(int argc, char **argv) if ( tmp==optarg || *tmp ) error("Could not parse: --random-seed %s\n", optarg); break; case 'k': args->keep_sites = 1; break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'a': { int n, i; diff --git a/plugins/remove-overlaps.c b/plugins/remove-overlaps.c index 034bd49f2..6fe00f632 100644 --- a/plugins/remove-overlaps.c +++ b/plugins/remove-overlaps.c @@ -175,8 +175,12 @@ int run(int argc, char **argv) case 'd': args->rmdup = 1; break; case 'p': args->print_overlaps = 1; break; case 'v': args->verbose = 1; break; - 
case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'T': args->target_is_file = 1; // fall-through case 't': args->target = optarg; break; case 'R': args->region_is_file = 1; // fall-through diff --git a/plugins/scatter.c b/plugins/scatter.c index 7ce524fef..b45e7ee84 100644 --- a/plugins/scatter.c +++ b/plugins/scatter.c @@ -327,8 +327,12 @@ int run(int argc, char **argv) { switch (c) { - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 1 : args->record_cmd_line = 0; break; case 'o': args->output_dir = optarg; break; case 'O': diff --git a/plugins/setGT.c b/plugins/setGT.c index 0e62fa60c..dc2c6f326 100644 --- a/plugins/setGT.c +++ b/plugins/setGT.c @@ -1,6 +1,6 @@ /* plugins/setGT.c -- set gentoypes to given values - Copyright (C) 2015-2017 Genome Research Ltd. + Copyright (C) 2015-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -193,8 +193,12 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) { switch (c) { - case 'i': args->filter_str = optarg; args->filter_logic = FLT_INCLUDE; break; - case 'e': args->filter_str = optarg; args->filter_logic = FLT_EXCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'n': args->new_mask = 0; if ( strchr(optarg,'.') ) args->new_mask |= GT_MISSING; if ( strchr(optarg,'0') ) args->new_mask |= GT_REF; diff --git a/plugins/smpl-stats.c b/plugins/smpl-stats.c index d82ec5cdc..44257bfa9 100644 --- a/plugins/smpl-stats.c +++ b/plugins/smpl-stats.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2018 Genome Research Ltd. + Copyright (c) 2018-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -448,8 +448,12 @@ int run(int argc, char **argv) { switch (c) { - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 't': args->targets = optarg; break; case 'T': args->targets = optarg; args->targets_is_file = 1; break; case 'r': args->regions = optarg; break; diff --git a/plugins/split-vep.c b/plugins/split-vep.c index bf8c850d4..4f9a6b148 100644 --- a/plugins/split-vep.c +++ b/plugins/split-vep.c @@ -1052,8 +1052,12 @@ int run(int argc, char **argv) case 'S': args->severity = optarg; break; case 's': args->select = optarg; break; case 'l': args->list_hdr = 1; break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 't': args->targets = optarg; break; case 'T': args->targets = optarg; args->targets_is_file = 1; break; case 'r': args->regions = optarg; break; diff --git a/plugins/split.c b/plugins/split.c index f2a5c54bc..e969d46cb 100644 --- a/plugins/split.c +++ b/plugins/split.c @@ -602,8 +602,12 @@ int run(int argc, char **argv) 
{ case 1 : args->hts_opts = hts_readlist(optarg,0,&args->nhts_opts); break; case 'k': args->keep_tags = optarg; break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'T': args->target = optarg; args->target_is_file = 1; break; case 't': args->target = optarg; break; case 'R': args->region = optarg; args->region_is_file = 1; break; diff --git a/plugins/trio-dnm2.c b/plugins/trio-dnm2.c index 8b47855bd..38f80ce15 100644 --- a/plugins/trio-dnm2.c +++ b/plugins/trio-dnm2.c @@ -1161,8 +1161,12 @@ int run(int argc, char **argv) case 'X': args->chrX_list_str = optarg; break; case 'u': set_option(args,optarg); case 1 : args->force_ad = 1; break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 't': args->targets = optarg; break; case 'T': args->targets = optarg; args->targets_is_file = 1; break; case 'r': args->regions = optarg; break; diff --git a/plugins/trio-stats.c b/plugins/trio-stats.c index a15757fc8..7d1b522f2 100644 --- a/plugins/trio-stats.c +++ 
b/plugins/trio-stats.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2018-2020 Genome Research Ltd. + Copyright (c) 2018-2021 Genome Research Ltd. Author: Petr Danecek @@ -745,8 +745,12 @@ int run(int argc, char **argv) break; } case 'a': args->max_alt_trios = atoi(optarg); break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 't': args->targets = optarg; break; case 'T': args->targets = optarg; args->targets_is_file = 1; break; case 'r': args->regions = optarg; break; diff --git a/vcfannotate.c b/vcfannotate.c index 6ebe8cdb4..1c397c097 100644 --- a/vcfannotate.c +++ b/vcfannotate.c @@ -3089,8 +3089,12 @@ int main_vcfannotate(int argc, char *argv[]) default: error("The output type \"%s\" not recognised\n", optarg); }; break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'x': args->remove_annots = optarg; break; case 'a': args->targets_fname = optarg; break; case 'r': args->regions_list = optarg; break; diff --git a/vcfconvert.c b/vcfconvert.c 
index 80694b081..a48e85cfd 100644 --- a/vcfconvert.c +++ b/vcfconvert.c @@ -1488,8 +1488,12 @@ int main_vcfconvert(int argc, char *argv[]) }; while ((c = getopt_long(argc, argv, "?h:r:R:s:S:t:T:i:e:g:G:o:O:c:f:H:",loptions,NULL)) >= 0) { switch (c) { - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; args->regions_is_file = 1; break; case 't': args->targets_list = optarg; break; diff --git a/vcffilter.c b/vcffilter.c index 1ac555564..723bcdf14 100644 --- a/vcffilter.c +++ b/vcffilter.c @@ -510,8 +510,12 @@ int main_vcffilter(int argc, char *argv[]) case 'T': args->targets_list = optarg; targets_is_file = 1; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'S': if ( !strcmp(".",optarg) ) args->set_gts = SET_GTS_MISSING; else if ( !strcmp("0",optarg) ) 
args->set_gts = SET_GTS_REF; diff --git a/vcfplugin.c b/vcfplugin.c index 74a96e08b..c4ea52d61 100644 --- a/vcfplugin.c +++ b/vcfplugin.c @@ -663,8 +663,12 @@ int main_plugin(int argc, char *argv[]) default: error("The output type \"%s\" not recognised\n", optarg); }; break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 't': args->targets_list = optarg; break; diff --git a/vcfquery.c b/vcfquery.c index cabaf481d..6568c8208 100644 --- a/vcfquery.c +++ b/vcfquery.c @@ -1,6 +1,6 @@ /* vcfquery.c -- Extracts fields from VCF/BCF file. - Copyright (C) 2013-2020 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -297,8 +297,12 @@ int main_vcfquery(int argc, char *argv[]) args->format_str = str.s; break; } - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 't': args->targets_list = optarg; break; diff --git a/vcfroh.c b/vcfroh.c index f3ca9f544..15b84e66e 100644 --- a/vcfroh.c +++ b/vcfroh.c @@ -1,6 +1,6 @@ /* vcfroh.c -- HMM model for detecting runs of autozygosity. - Copyright (C) 2013-2020 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -1157,8 +1157,12 @@ int main_vcfroh(int argc, char *argv[]) args->dflt_AF = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: --AF-dflt %s\n", optarg); break; - case 3: args->filter_str = optarg; args->filter_logic = FLT_INCLUDE; break; - case 4: args->filter_str = optarg; args->filter_logic = FLT_EXCLUDE; break; + case 3 : + if ( args->filter_str ) error("Error: only one --include or --exclude expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 4 : + if ( args->filter_str ) error("Error: only one --include or --exclude expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; case 5: args->include_noalt_sites = 1; break; case 'o': args->output_fname = optarg; break; case 'O': diff --git a/vcfstats.c b/vcfstats.c index 3cd551176..601c557d9 100644 --- a/vcfstats.c +++ b/vcfstats.c @@ -1,6 +1,6 @@ /* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats. - Copyright (C) 2012-2020 Genome Research Ltd. + Copyright (C) 2012-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -1838,8 +1838,12 @@ int main_vcfstats(int argc, char *argv[]) case 's': args->samples_list = optarg; break; case 'S': args->samples_list = optarg; args->samples_is_file = 1; break; case 'I': args->split_by_id = 1; break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 'h': case '?': usage(); break; diff --git a/vcfview.c b/vcfview.c index 0e384badd..173865f77 100644 --- a/vcfview.c +++ b/vcfview.c @@ -1,6 +1,6 @@ /* vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files. - Copyright (C) 2013-2020 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. 
Author: Shane McCarthy @@ -640,9 +640,12 @@ int main_vcfview(int argc, char *argv[]) break; case 'v': args->include_types = optarg; break; case 'V': args->exclude_types = optarg; break; - case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; - case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; - + case 'e': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; + case 'i': + if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n"); + args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'c': { args->min_ac_type = ALLELE_NONREF; From 89d61e6421f6e1cb0b00f4acd1342c2d1bc7fa00 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 2 Feb 2021 11:34:57 +0000 Subject: [PATCH 46/81] Support for IUPAC codes in insertions and MNVs, plus a new -H I option. 
Resolves #1241,#1393 --- bcftools.h | 8 ++++++++ consensus.c | 31 +++++++++++++++++++++---------- doc/bcftools.txt | 3 +++ test/consensus.15.fa | 22 ++-------------------- test/consensus.15.vcf | 8 ++++++++ test/consensus.16.out | 2 +- test/consensus.17.out | 2 ++ test/consensus.3.out | 2 +- test/test.pl | 1 + 9 files changed, 47 insertions(+), 32 deletions(-) create mode 100644 test/consensus.15.vcf create mode 100644 test/consensus.17.out diff --git a/bcftools.h b/bcftools.h index 08e2a6768..fc5d07076 100644 --- a/bcftools.h +++ b/bcftools.h @@ -54,6 +54,14 @@ const char *hts_bcf_wmode2(int file_type, char *fname); void *smalloc(size_t size); // safe malloc +static inline int is_acgtn(char nt) +{ + if ( nt < 65 ) return 0; + if ( nt > 84 ) nt -= 32; // to uppercase + if ( nt=='A' || nt=='C' || nt=='G' || nt=='T' || nt=='N' ) return 1; + return 0; +} + static inline char gt2iupac(char a, char b) { static const char iupac[4][4] = { {'A','M','R','W'},{'M','C','S','Y'},{'R','S','G','K'},{'W','Y','K','T'} }; diff --git a/consensus.c b/consensus.c index bcd49fae8..d06ebd90a 100644 --- a/consensus.c +++ b/consensus.c @@ -570,15 +570,18 @@ static void apply_variant(args_t *args, bcf1_t *rec) } else jalt = ialt; - if ( ialt>=0 ) + if ( ialt==0 && jalt>0 ) ialt = jalt, jalt = 0; + if ( ialt>0 && ialt!=jalt ) { if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Invalid VCF, too few ALT alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp? 
+ i = 0; + while ( rec->d.allele[ialt][i] && rec->d.allele[jalt][i] ) { - char ial = rec->d.allele[ialt][0]; - char jal = rec->d.allele[jalt][0]; - if ( !ialt ) ialt = jalt; // only ialt is used, make sure 0/1 is not ignored - rec->d.allele[ialt][0] = gt2iupac(ial,jal); + char ial = rec->d.allele[ialt][i]; + char jal = rec->d.allele[jalt][i]; + if ( !is_acgtn(ial) || !is_acgtn(jal) ) break; + rec->d.allele[ialt][i] = gt2iupac(ial,jal); + i++; } } } @@ -633,11 +636,17 @@ static void apply_variant(args_t *args, bcf1_t *rec) } if ( rec->n_allele <= ialt ) error("Broken VCF, too few alts at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); } - else if ( args->output_iupac && rec->n_allele>1 && !rec->d.allele[0][1] && !rec->d.allele[1][1] ) + else if ( args->output_iupac && ialt>0 ) { - char ial = rec->d.allele[0][0]; - char jal = rec->d.allele[1][0]; - rec->d.allele[1][0] = gt2iupac(ial,jal); + i = 0; + while ( rec->d.allele[ialt][i] && rec->d.allele[0][i] ) + { + char ial = rec->d.allele[ialt][i]; + char jal = rec->d.allele[0][i]; + if ( !is_acgtn(ial) || !is_acgtn(jal) ) break; + rec->d.allele[ialt][i] = gt2iupac(ial,jal); + i++; + } } if ( rec->n_allele==1 && ialt!=-1 ) @@ -1029,6 +1038,7 @@ static void usage(args_t *args) fprintf(stderr, " 2: second allele from GT, regardless of phasing\n"); fprintf(stderr, " R: REF allele in het genotypes\n"); fprintf(stderr, " A: ALT allele\n"); + fprintf(stderr, " I: IUPAC code for all genotypes\n"); fprintf(stderr, " LR,LA: longer allele and REF/ALT if equal length\n"); fprintf(stderr, " SR,SA: shorter allele and REF/ALT if equal length\n"); fprintf(stderr, " 1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n"); @@ -1123,6 +1133,7 @@ int main_consensus(int argc, char *argv[]) else if ( !strcasecmp(optarg,"LA") ) args->allele |= PICK_LONG|PICK_ALT; else if ( !strcasecmp(optarg,"SR") ) args->allele |= PICK_SHORT|PICK_REF; else if ( !strcasecmp(optarg,"SA") ) args->allele |= 
PICK_SHORT|PICK_ALT; + else if ( !strcasecmp(optarg,"I") ) args->allele |= PICK_IUPAC; else if ( !strcasecmp(optarg,"1pIu") ) args->allele |= PICK_IUPAC, args->haplotype = 1; else if ( !strcasecmp(optarg,"2pIu") ) args->allele |= PICK_IUPAC, args->haplotype = 2; else diff --git a/doc/bcftools.txt b/doc/bcftools.txt index f1a022afe..ace7afc72 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -832,6 +832,9 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the 'A';; the ALT allele (in heterozygous genotypes) + 'I';; + IUPAC code for all genotypes + 'LR, LA';; the longer allele. If both have the same length, use the REF allele (LR), or the ALT allele (LA) diff --git a/test/consensus.15.fa b/test/consensus.15.fa index d9a9619d0..78de2ed5f 100644 --- a/test/consensus.15.fa +++ b/test/consensus.15.fa @@ -1,20 +1,2 @@ ->1:2-501 -taccatatgtgacatataaaaaagaacataacctacgtatcaactaaagtggttgtttg -cagaaaaggaagacttaaaaagagtcagtactaacctacataatatatacaatgttcatt -aaataataaaatgagctcatcatacttaggtcatcataaatatatctgaaattcacaaat -attgatcaaatggtaaaatagacaagtagattttaataggttaaacaattactgattctc -ttgaaagaataaatttaatatgagacctatttcattataatgaactcacaaattagaaac -ttcacactgggggctggagagatggctcagtagttaagaacactgactgctcttctgaag -gtcctgagttcaaatcccagcaaccacatggtgacttacaaccatctgtaatgacatctg -atgccctctggtgtgtctgaagacagctacagtgtacttacataaaataataaataaatc -tttaaaaacaaaaaaaaagaa ->2 -gaagatcttttccttattaaggatctgaagctctgtagatttgtattctattaaacatgg -agagattagtgattttccatattctttaagtcattttagagtaatgtgttcttaagataa -atcagaaaaacaaaaacttgtgctttcctgtttgaaaaacaaacagctgtggggaatggt -gtcgggacagcctttttataaaatttttctaaataatgttgaggctttgatacgtcaaag -ttatatttcaaatggaatcacttagacctcgtttctgagtgtcaatggccatattgggga -tttgctgctgccaatgacagcacaccctgggaatgccccaactacttactacaaagcagt -gttacatggagaagatcttcaagagtctttttgctagatctttccttggcttttgatgtg -actcctctcaataaaatccacagtaatatagtgagtggtctcctgctccaaaccagtatt -tcagacacagttaatccagac +>ref +ACGTACGT diff --git a/test/consensus.15.vcf b/test/consensus.15.vcf new file mode 
100644 index 000000000..1625b2c8f --- /dev/null +++ b/test/consensus.15.vcf @@ -0,0 +1,8 @@ +##fileformat=VCFv4.2 +##reference=file://some/path/human_g1k_v37.fasta +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample +ref 3 . GT CC . PASS . GT 0/1 +ref 5 . A C,CT . PASS . GT 0/2 +ref 7 . G GC,GT . PASS . GT 1/2 diff --git a/test/consensus.16.out b/test/consensus.16.out index 1b0e1b4ba..3921c98b1 100644 --- a/test/consensus.16.out +++ b/test/consensus.16.out @@ -10,7 +10,7 @@ xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx TTTAAAAACAAAAAAAAAGAA >2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx -AAATTAGTGATTTTCCATATTCTTTAAGTCATTTTAGAGTAATGTGTTCTTAAGATTTCA +ARATTAGTGATTTTCCATATTCTTTAAGTCATTTTAGAGTAATGTGTTCTTAAGATWTCA GAAAAACAAAAACTTGTGCTTTCCTGTTTGAAAAACAAACAGCTGTGGGGAATGGACGTA CGTTGTCGGGACAGCCTTTTTATAAAATAATGTTGAGGCTTTGATACGTCAAAGxxxxxx xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxTTTGCT diff --git a/test/consensus.17.out b/test/consensus.17.out new file mode 100644 index 000000000..ee676357c --- /dev/null +++ b/test/consensus.17.out @@ -0,0 +1,2 @@ +>ref +ACsymtCGyT diff --git a/test/consensus.3.out b/test/consensus.3.out index b3bfc3f6c..dad9c7d1b 100644 --- a/test/consensus.3.out +++ b/test/consensus.3.out @@ -10,7 +10,7 @@ NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN TTTAAAAACAAAAAAAAAGAA >2 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN -AAATTAGTGATTTTCCATATTCTTTAAGTCATTTTAGAGTAATGTGTTCTTAAGATTTCA +ARATTAGTGATTTTCCATATTCTTTAAGTCATTTTAGAGTAATGTGTTCTTAAGATWTCA GAAAAACAAAAACTTGTGCTTTCCTGTTTGAAAAACAAACAGCTGTGGGGAATGGACGTA CGTTGTCGGGACAGCCTTTTTATAAAATAATGTTGAGGCTTTGATACGTCAAAGNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTTTGCT diff --git a/test/test.pl b/test/test.pl index 7c6b736a9..b5a138d70 100755 --- a/test/test.pl +++ b/test/test.pl @@ -629,6 +629,7 @@ test_vcf_consensus($opts,in=>'consensus.13',out=>'consensus.13.out',fa=>'consensus.13.fa',args=>''); 
test_vcf_consensus($opts,in=>'consensus.14',out=>'consensus.14.out',fa=>'consensus.14.fa',args=>''); test_vcf_consensus($opts,in=>'consensus.12',out=>'consensus.15.out',fa=>'consensus.12.fa',args=>'--mark-del - --mark-ins uc --mark-snv uc'); +test_vcf_consensus($opts,in=>'consensus.15',out=>'consensus.17.out',fa=>'consensus.15.fa',args=>'-H I --mark-ins lc --mark-snv lc'); test_mpileup($opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.1.out',args=>q[-r17:100-150],test_list=>1); test_mpileup($opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.2.out',args=>q[-a DP,DV -r17:100-600]); # test files from samtools mpileup test suite test_mpileup($opts,in=>[qw(mpileup.1)],out=>'mpileup/mpileup.3.out',args=>q[-B --ff 0x14 -r17:1050-1060]); # test file converted to vcf from samtools mpileup test suite From 2bd08726e1d91d95facadcd879e804e9d6de498b Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 2 Feb 2021 16:16:00 +0000 Subject: [PATCH 47/81] Indicate which of the alleles is de novo and report the corresponding VAF --- plugins/trio-dnm2.c | 86 +++++++++++++++++++++++++--------- test/test.pl | 2 +- test/trio-dnm/trio-dnm.6.1.out | 2 + test/trio-dnm/trio-dnm.6.2.out | 4 +- test/trio-dnm/trio-dnm.6.vcf | 2 + 5 files changed, 72 insertions(+), 24 deletions(-) diff --git a/plugins/trio-dnm2.c b/plugins/trio-dnm2.c index 38f80ce15..15663156d 100644 --- a/plugins/trio-dnm2.c +++ b/plugins/trio-dnm2.c @@ -66,8 +66,9 @@ trio_t; typedef struct { // combines priors, mutation rates, genotype transmission probability; see init_priors() - double pprob[10][10][10]; // prior probability; the order is father,mother,child - uint8_t denovo[10][10][10]; // is the GT combination not compatible with normal inheritence (0) or is de novo (1) + double pprob[10][10][10]; // prior probability; the order is father,mother,child + uint8_t denovo[10][10][10]; // is the GT combination not compatible with normal inheritence (0) or is de novo (1) + uint8_t 
denovo_allele[10][10][10]; // which of the alleles is de novo for this configuration } priors_t; @@ -85,7 +86,7 @@ typedef struct trio_t *trio; int has_fmt_ad; int ntrio, mtrio; - int32_t *pl, *ad, *qs, *dnm_qual_int, *vaf; // input FMT/PL, AD, QS values, output DNM and VAF + int32_t *pl, *ad, *qs, *dnm_qual_int, *dnm_allele, *vaf; // input FMT/PL, AD, QS values, output DNM and VAF float *dnm_qual_float; int mpl, mad, mqs; double min_score; @@ -93,7 +94,9 @@ typedef struct double *pl3; // normalized PLs converted to probs for iFATHER,iMOTHER,iCHILD double *qs3; // QS converted to probs for iFATHER,iMOTHER,iCHILD int maprob, mpl3, mqs3, midx, *idx, force_ad, use_model; - char *dnm_score_tag; // the argument of --use tag, by default DNM:int + char *dnm_score_tag, // the argument of --use tag, by default DNM:log + *dnm_vaf_tag, + *dnm_allele_tag; int dnm_score_is_float; // given by e.g. --use tag DNM:float double mrate; // --use mrate, mutation rate double pnoise_abs,pnoise_frac; // --use pn|pnoise or --use pns @@ -133,11 +136,13 @@ static const char *usage_text(void) " -u, --use OPTION[=VALUE] Various options to tweak:\n" " DNG Use the original DeNovoGear model, implies -u dng-priors\n" " dng-priors Use the original DeNovoGear priors (including bugs in prior assignment)\n" - " mrate=NUM Mutation rate for DNG and AC-DNG models [-u mrate=1e-8]\n" + " mrate=NUM Mutation rate [-u mrate=1e-8]\n" " pn=FRAC[,NUM] Tolerance to parental noise or mosaicity, given as fraction of QS or number of reads [-u pn=0,0]\n" " pns=FRAC[,NUM] Same as `pn` but is not applied to alleles observed in both parents [-u pns=0.045,0]\n" " ppl Use parental genotype likelihoods (FMT/PL rather than FMT/QS)\n" " tag=TAG[:phred|log] Annotation to add, either as phred quality (int) or log-scaled (float) [-u tag=DNM:log]\n" + " vaf=TAG The tag name for variant allele fraction annotation to add [VAF]\n" + " va=TAG The tag name for variant allele annotation [VA]\n" " -X, --chrX LIST Regions with the chr X 
inheritance pattern or one of the predefined lists, exclude PARs [GRCh37]\n" " GRCh37 .. X:1-60000,chrX:1-60000,X:2699521-154931043,chrX:2699521-154931043\n" " GRCh38 .. X:1-9999,chrX:1-9999,X:2781480-155701381,chrX:2781480-155701381\n" @@ -406,7 +411,7 @@ static double init_mf_priors_chrXX(args_t *args, int fi, int mi) error("Fixme: %s:%d\n",__FILE__,__LINE__); return gt_prior; } -static void init_DNG_tprob_mprob(args_t *args, int fi, int mi, int ci, double *tprob, double *mprob) +static void init_DNG_tprob_mprob(args_t *args, int fi, int mi, int ci, double *tprob, double *mprob, int *denovo_allele) { int fa = seq1[fi]; int fb = seq2[fi]; @@ -419,6 +424,7 @@ static void init_DNG_tprob_mprob(args_t *args, int fi, int mi, int ci, double *t int nals_mfc = count_unique_alleles(3,gts,include_ref); *tprob = 1; // genotype transmission likelihood L(GC|GM,GF), 0 if not compatible with Mendelian inheritance *mprob = 1 - args->mrate; // probability of mutation + *denovo_allele = ca!=fa && ca!=fb && ca!=ma && ca!=mb ? ca : cb; if ( nals_mfc==4 ) *tprob = 0; // 4 unique alleles @@ -455,7 +461,7 @@ static void init_DNG_tprob_mprob(args_t *args, int fi, int mi, int ci, double *t *tprob = 0.5; } } -static void init_tprob_mprob(args_t *args, int fi, int mi, int ci, double *tprob, double *mprob) +static void init_tprob_mprob(args_t *args, int fi, int mi, int ci, double *tprob, double *mprob, int *denovo_allele) { int fa = seq1[fi]; int fb = seq2[fi]; @@ -464,6 +470,8 @@ static void init_tprob_mprob(args_t *args, int fi, int mi, int ci, double *tprob int ca = seq1[ci]; int cb = seq2[ci]; + *denovo_allele = ca!=fa && ca!=fb && ca!=ma && ca!=mb ? ca : cb; + // tprob .. genotype transmission probability L(GC|GM,GF), 0 if not compatible with Mendelian inheritance // mprob .. 
probability of mutation @@ -481,13 +489,15 @@ static void init_tprob_mprob(args_t *args, int fi, int mi, int ci, double *tprob else *mprob = args->mrate * args->mrate; } } -static void init_tprob_mprob_chrX(args_t *args, int mi, int ci, double *tprob, double *mprob) +static void init_tprob_mprob_chrX(args_t *args, int mi, int ci, double *tprob, double *mprob, int *denovo_allele) { int ma = seq1[mi]; int mb = seq2[mi]; int ca = seq1[ci]; int cb = seq2[ci]; + *denovo_allele = ca!=ma && ca!=mb ? ca : cb; + if ( ca!=cb ) // male cannot be heterozygous in X *mprob = 0, *tprob = 0; else if ( ca==ma || ca==mb ) // inherited @@ -499,7 +509,7 @@ static void init_tprob_mprob_chrX(args_t *args, int mi, int ci, double *tprob, d else // de novo *mprob = args->mrate, *tprob = 0; } -static void init_tprob_mprob_chrXX(args_t *args, int fi, int mi, int ci, double *tprob, double *mprob) +static void init_tprob_mprob_chrXX(args_t *args, int fi, int mi, int ci, double *tprob, double *mprob, int *denovo_allele) { int fa = seq1[fi]; int fb = seq2[fi]; @@ -508,6 +518,8 @@ static void init_tprob_mprob_chrXX(args_t *args, int fi, int mi, int ci, double int ca = seq1[ci]; int cb = seq2[ci]; + *denovo_allele = ca!=fa && ca!=fb && ca!=ma && ca!=mb ? 
ca : cb; + if ( fa!=fb ) // father cannot be heterozygous in X *mprob = 0, *tprob = 0; else if ( (ca==fa && (cb==ma||cb==mb)) || (cb==fa && (ca==ma||ca==mb)) ) @@ -537,6 +549,7 @@ static void init_priors(args_t *args, priors_t *priors, init_priors_t type) double gt_prior; // parent genotype probability L(GM,GF) double tprob; // genotype transmission likelihood L(GC|GM,GF), 0 if not compatible with Mendelian inheritance double mprob; // probability of mutation + int allele; // which of the alleles is de novo if ( args->use_dng_priors ) gt_prior = init_DNG_mf_priors(args,fi,mi,ci); else if ( type==autosomal ) @@ -549,16 +562,17 @@ static void init_priors(args_t *args, priors_t *priors, init_priors_t type) error("Can't happen\n"); if ( args->use_dng_priors ) - init_DNG_tprob_mprob(args,fi,mi,ci,&tprob,&mprob); + init_DNG_tprob_mprob(args,fi,mi,ci,&tprob,&mprob,&allele); else if ( type==autosomal ) - init_tprob_mprob(args,fi,mi,ci,&tprob,&mprob); + init_tprob_mprob(args,fi,mi,ci,&tprob,&mprob,&allele); else if ( type==chrX ) - init_tprob_mprob_chrX(args,mi,ci,&tprob,&mprob); + init_tprob_mprob_chrX(args,mi,ci,&tprob,&mprob,&allele); else if ( type==chrXX ) - init_tprob_mprob_chrXX(args,fi,mi,ci,&tprob,&mprob); + init_tprob_mprob_chrXX(args,fi,mi,ci,&tprob,&mprob,&allele); else error("Can't happen\n"); + priors->denovo_allele[fi][mi][ci] = tprob==0 ? allele : INT32_MAX; // the latter should never happen, making it fail deliberately priors->denovo[fi][mi][ci] = tprob==0 ? 1 : 0; priors->pprob[fi][mi][ci] = log(gt_prior * mprob * (tprob==0 ? 
1 : tprob)); } @@ -609,8 +623,9 @@ static void init_data(args_t *args) args->hdr_out = bcf_hdr_dup(args->hdr); bcf_hdr_printf(args->hdr_out, "##FORMAT=",args->dnm_score_tag,args->dnm_score_is_float?"Float":"Integer"); + bcf_hdr_printf(args->hdr_out, "##FORMAT=",args->dnm_allele_tag); if ( args->has_fmt_ad ) - bcf_hdr_append(args->hdr_out, "##FORMAT="); + bcf_hdr_printf(args->hdr_out, "##FORMAT=",args->dnm_vaf_tag); int i, n = 0; char **list; @@ -667,17 +682,21 @@ static void init_data(args_t *args) else args->dnm_qual_int = (int32_t*) malloc(sizeof(*args->dnm_qual_int)*bcf_hdr_nsamples(args->hdr)); args->vaf = (int32_t*) malloc(sizeof(*args->vaf)*bcf_hdr_nsamples(args->hdr)); + args->dnm_allele = (int32_t*) malloc(sizeof(*args->dnm_allele)*bcf_hdr_nsamples(args->hdr)); } static void destroy_data(args_t *args) { if ( args->filter ) filter_destroy(args->filter); regidx_destroy(args->chrX_idx); free(args->dnm_score_tag); + free(args->dnm_vaf_tag); + free(args->dnm_allele_tag); free(args->pl3); free(args->aprob); free(args->idx); free(args->dnm_qual_int); free(args->dnm_qual_float); + free(args->dnm_allele); free(args->vaf); free(args->trio); free(args->pl); @@ -786,8 +805,10 @@ static double process_trio_ACM(args_t *args, priors_t *priors, int nals, double if ( priors->denovo[fi][mi][ci] && max < val ) { max = val; - *al0 = cb; - *al1 = ca; + if ( priors->denovo_allele[fi][mi][ci] == ca ) + *al0 = cb, *al1 = ca; + else + *al0 = ca, *al1 = cb; } mi++; } @@ -836,8 +857,10 @@ static double process_trio_DNG(args_t *args, priors_t *priors, int nals, double if ( priors->denovo[fi][mi][ci] && max < val ) { max = val; - *al0 = cb; - *al1 = ca; + if ( priors->denovo_allele[fi][mi][ci] == ca ) + *al0 = cb, *al1 = ca; + else + *al0 = ca, *al1 = cb; } mi++; } @@ -970,6 +993,7 @@ static void process_record(args_t *args, bcf1_t *rec) for (i=0; idnm_qual_float[i]); else for (i=0; idnm_qual_int[i] = bcf_int32_missing; + for (i=0; idnm_allele[i] = bcf_int32_missing; for (i=0; intrio; 
i++) { if ( args->filter && !args->trio[i].pass ) continue; @@ -1050,6 +1074,7 @@ static void process_record(args_t *args, bcf1_t *rec) if ( score>255 ) score = 255; args->dnm_qual_int[ args->trio[i].idx[iCHILD] ] = round(score); } + args->dnm_allele[ args->trio[i].idx[iCHILD] ] = al1; } if ( n_ad ) @@ -1075,11 +1100,14 @@ static void process_record(args_t *args, bcf1_t *rec) else ret = bcf_update_format_int32(args->hdr_out,rec,args->dnm_score_tag,args->dnm_qual_int,nsmpl); if ( ret ) - error("Failed to write FORMAT/DNM at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); + error("Failed to write FORMAT/%s at %s:%"PRId64"\n", args->dnm_score_tag, bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); + ret = bcf_update_format_int32(args->hdr_out,rec,args->dnm_allele_tag,args->dnm_allele,nsmpl); + if ( ret ) + error("Failed to write FORMAT/%s at %s:%"PRId64"\n", args->dnm_allele_tag,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); if ( ad_set ) { - if ( bcf_update_format_int32(args->hdr_out,rec,"VAF",args->vaf,nsmpl)!=0 ) - error("Failed to write FORMAT/VAF at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); + if ( bcf_update_format_int32(args->hdr_out,rec,args->dnm_vaf_tag,args->vaf,nsmpl)!=0 ) + error("Failed to write FORMAT/%s at %s:%"PRId64"\n", args->dnm_vaf_tag,bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); } } if ( bcf_write(args->out_fh, args->hdr_out, rec)!=0 ) error("[%s] Error: cannot write to %s at %s:%"PRId64"\n", __func__,args->output_fname,bcf_seqname(args->hdr,rec),(int64_t)rec->pos+1); @@ -1119,6 +1147,18 @@ static void set_option(args_t *args, char *optarg) free(args->dnm_score_tag); args->dnm_score_tag = strdup(val); } + else if ( !strcasecmp(opt,"vaf") ) + { + if ( !val ) error("Error: expected value with -u vaf, e.g. -u vaf=VAF\n"); + free(args->dnm_vaf_tag); + args->dnm_vaf_tag = strdup(val); + } + else if ( !strcasecmp(opt,"va") ) + { + if ( !val ) error("Error: expected value with -u va, e.g. 
-u va=VA\n"); + free(args->dnm_allele_tag); + args->dnm_allele_tag = strdup(val); + } else error("Error: the option \"-u %s\" is not recognised\n",optarg); free(opt); } @@ -1127,7 +1167,9 @@ int run(int argc, char **argv) args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->argc = argc; args->argv = argv; args->output_fname = "-"; - args->dnm_score_tag = strdup("DNM:phred"); + args->dnm_score_tag = strdup("DNM:phred"); + args->dnm_vaf_tag = strdup("VAF"); + args->dnm_allele_tag = strdup("VA"); args->mrate = 1e-8; args->pnoise_frac = 0.045; args->pnoise_abs = 0; diff --git a/test/test.pl b/test/test.pl index b5a138d70..7b80b0890 100755 --- a/test/test.pl +++ b/test/test.pl @@ -495,7 +495,7 @@ test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.5',out=>'trio-dnm/trio-dnm.5.1.out',cmd=>'+trio-dnm2',args=>"-p proband,father,mother -u DNG -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.5',out=>'trio-dnm/trio-dnm.5.1.out',cmd=>'+trio-dnm2',args=>"-p proband,father,mother -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.6',out=>'trio-dnm/trio-dnm.6.1.out',cmd=>'+trio-dnm2',args=>"-p proband,father,mother -u DNG -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); # incorrect miss by DNG -test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.6',out=>'trio-dnm/trio-dnm.6.2.out',cmd=>'+trio-dnm2',args=>"-p proband,father,mother -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); +test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.6',out=>'trio-dnm/trio-dnm.6.2.out',cmd=>'+trio-dnm2',args=>"-p proband,father,mother -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\t[\\t%VA]\\n'"); test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.7',out=>'trio-dnm/trio-dnm.7.1.out',cmd=>'+trio-dnm2',args=>"-p proband,father,mother -u DNG -u tag=DNM:log | $$opts{bin}/bcftools query 
-f'[\\t%DNM]\\t[\\t%VAF]\\n'"); # incorrect miss, low PL test_vcf_plugin($opts,in=>'trio-dnm/trio-dnm.7',out=>'trio-dnm/trio-dnm.7.1.out',cmd=>'+trio-dnm2',args=>"-p proband,father,mother -u tag=DNM:log | $$opts{bin}/bcftools query -f'[\\t%DNM]\\t[\\t%VAF]\\n'"); test_vcf_plugin($opts,in=>'gvcfz',out=>'gvcfz.1.out',cmd=>'+gvcfz',args=>qq[-g 'PASS:GT!="alt"' -a | $$opts{bin}/bcftools query -f'%POS\\t%REF\\t%ALT\\t%END[\\t%GT][\\t%DP][\\t%GQ][\\t%RGQ]\\n']); diff --git a/test/trio-dnm/trio-dnm.6.1.out b/test/trio-dnm/trio-dnm.6.1.out index 318f7f294..06d167944 100644 --- a/test/trio-dnm/trio-dnm.6.1.out +++ b/test/trio-dnm/trio-dnm.6.1.out @@ -1 +1,3 @@ -5.94778 . . 70 0 0 + -5.94778 . . 70 0 0 + -13.1802 . . 30 0 0 diff --git a/test/trio-dnm/trio-dnm.6.2.out b/test/trio-dnm/trio-dnm.6.2.out index 717b76aa5..f264fff0a 100644 --- a/test/trio-dnm/trio-dnm.6.2.out +++ b/test/trio-dnm/trio-dnm.6.2.out @@ -1 +1,3 @@ - -3.16223e-05 . . 70 0 0 + -3.16223e-05 . . 70 0 0 1 . . + -3.16223e-05 . . 70 0 0 1 . . + -5.98923 . . 30 0 0 0 . . diff --git a/test/trio-dnm/trio-dnm.6.vcf b/test/trio-dnm/trio-dnm.6.vcf index 6a88b1ddd..bede63b81 100644 --- a/test/trio-dnm/trio-dnm.6.vcf +++ b/test/trio-dnm/trio-dnm.6.vcf @@ -9,3 +9,5 @@ ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT proband mother father 19 33584985 . C G,<*> 0 . . PL:DP:ADF:ADR:AD:QS 125,0,54,134,75,189:10:2,7,0:1,0,0:3,7,0:85,243,0 0,18,160,18,160,160:6:5,0,0:1,0,0:6,0,0:213,0,0 0,45,231,45,231,231:15:13,0,0:2,0,0:15,0,0:487,0,0 +19 33584985 . C G 0 . . PL:DP:ADF:ADR:AD:QS 125,0,54:10:2,7:1,0:3,7:85,243 0,18,160:6:5,0:1,0:6,0:213,0 0,45,231:15:13,0:2,0:15,0:487,0 +19 33584985 . C G 0 . . 
PL:DP:ADF:ADR:AD:QS 125,0,54:10:2,7:1,0:3,7:85,243 160,18,0:6:0,5:0,1:0,6:0,213 231,45,0:15:0,13:0,2:0,15:0,487 From dd1e1e50904e77d84ba78a110529e7e84ffe4538 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Fri, 29 Jan 2021 13:13:27 +0000 Subject: [PATCH 48/81] Reformat NEWS to be legible on an 80-column terminal Clarify -O/--output-type entry. Remove mention of `--group-samples TAG:file` as that was not present in the previous 1.11 release. --- NEWS | 104 ++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 56 insertions(+), 48 deletions(-) diff --git a/NEWS b/NEWS index a2b0eba64..df0f805df 100644 --- a/NEWS +++ b/NEWS @@ -2,10 +2,11 @@ Changes affecting the whole of bcftools, or multiple commands: -* The output file type (-O, --output-type) needs not to be specified anymore - and is determined from the output file name suffix. +* The output file type is determined from the output file name suffix, where + available, so the -O/--output-type option is often no longer necessary. -* Make F_MISSING in filtering expressions work for sites with multiple ALT alleles (#1343) +* Make F_MISSING in filtering expressions work for sites with multiple + ALT alleles (#1343) Changes affecting specific commands: @@ -14,32 +15,35 @@ Changes affecting specific commands: - New `--rename-annots` option to help fix broken VCFs (#1335) - - New -C option allows to read a long list of options from a file to prevent very - long command lines. + - New -C option allows to read a long list of options from a file to + prevent very long command lines. - - New `append-missing` logic allows annotations to be added for each ALT allele in the - same order as they appear in the VCF. Note that this is not bullet proof. In order for - this to work: + - New `append-missing` logic allows annotations to be added for each ALT + allele in the same order as they appear in the VCF. Note that this is + not bullet proof. 
In order for this to work: - the annotation file must have one line per ALT allele - - fields must contain a single value as multiple values are appended as they are and - would break the correspondence between the alleles and values + - fields must contain a single value as multiple values are appended + as they are and would break the correspondence between the alleles + and values * bcftools concat: - - Do not phase genotypes by mistake if they are not already phased with `-l` (#1346) + - Do not phase genotypes by mistake if they are not already phased + with `-l` (#1346) * bcftools consensus: - - New `--mask-with`, `--mark-del`, `--mark-ins`, `--mark-snv` (#1382, #1381, #1170) + - New `--mask-with`, `--mark-del`, `--mark-ins`, `--mark-snv` options + (#1382, #1381, #1170) - - Symbolic should have only one REF base. If there are multiple, take POS+1 - as the first deleted base. + - Symbolic should have only one REF base. If there are multiple, + take POS+1 as the first deleted base. - - Make consensus work when the first base of the reference genome is deleted. In this - situation the VCF record has POS=1 and the first REF base cannot precede the event. - (#1330) + - Make consensus work when the first base of the reference genome is + deleted. In this situation the VCF record has POS=1 and the first + REF base cannot precede the event. (#1330) * bcftools +contrast: @@ -47,49 +51,53 @@ Changes affecting specific commands: * bcftools convert: - - Make the --hapsample and --hapsample2vcf options consistent with each other - and with the documentation. + - Make the --hapsample and --hapsample2vcf options consistent with each + other and with the documentation. * bcftools call: - - Revamp of `call -G`, previously sample grouping by population was not truly independent - and could still be influenced by the presence of other sample groups. 
- - - Explicit --group-samples-tag option instead of the --group-samples TAG:file - functionality (#1370) + - Revamp of `call -G`, previously sample grouping by population was not + truly independent and could still be influenced by the presence of other + sample groups. - Optional addition of INFO/PV4 annotation with `call -a INFO/PV4` - - Remove generation of useless HOB and ICB annotation; use +fill-tags -- -t HWE,ExcHet` instead + - Remove generation of useless HOB and ICB annotation; + use `+fill-tags -- -t HWE,ExcHet` instead - - The `call -f` option was renamed to `-a` to (1) make it consistent with `mpileup` and (2) to indicate - that it includes both INFO and FORMAT annotations, not just FORMAT as previously + - The `call -f` option was renamed to `-a` to (1) make it consistent with + `mpileup` and (2) to indicate that it includes both INFO and FORMAT + annotations, not just FORMAT as previously - - Any sensible Number=R,Type=Integer annotation can be used with -G, such as AD or QS + - Any sensible Number=R,Type=Integer annotation can be used with -G, + such as AD or QS - - Don't trim QUAL; although usefuleness of this change is questionable for true probabilistic - interpretation (such high precision is unrealistic), using QUAL as a score rather than probability - is helpful and permits more fine-grained filtering + - Don't trim QUAL; although usefuleness of this change is questionable for + true probabilistic interpretation (such high precision is unrealistic), + using QUAL as a score rather than probability is helpful and permits more + fine-grained filtering - - Fix a suspected bug in `call -F` in the worst case, for certain improve readability + - Fix a suspected bug in `call -F` in the worst case, for certain improve + readability - `call -C trio` is temporarily disabled * bcftools +fill-tags: - - MAF definition revised for multiallelic sites, the second most common allele is considered to be the - minor allele (#1313) + - MAF definition revised for 
multiallelic sites, the second most common + allele is considered to be the minor allele (#1313) -* bcftools gtchecK: +* bcftools gtcheck: - - support matching of a single sample against all other samples in the file with - `-s qry:sample -s gt:-`. This was previously not possible, either full cross-check mode had to be run or - a list of pairs/samples had to be created explicitly + - support matching of a single sample against all other samples in the file + with `-s qry:sample -s gt:-`. This was previously not possible, either + full cross-check mode had to be run or a list of pairs/samples had to + be created explicitly * bcftools merge: - - Make `merge -R` behavior consistent with other commands and pull in overlapping - records with POS outside of the regions (#1374) + - Make `merge -R` behavior consistent with other commands and pull in + overlapping records with POS outside of the regions (#1374) - Bug fix (#1353) @@ -117,14 +125,15 @@ Changes affecting specific commands: * bcftools +trio-dnm2: - - Major revamp of +trio-dnm plugin, which is now deprecated and replaced with + - Major revamp of +trio-dnm plugin, which is now deprecated and replaced by +trio-dnm2. The original trio-dnm calling model used genotype likelihoods (PLs) as the - input for calling. However, that is flawed because PLs make assumptions which - are unsuitable for de novo calling: PL(RR) can become bigger than PL(RA) even - when the ALT allele is present in the parents. Note that this is true also - for other programs such as DeNovoGear which rely on the same samtools calculation. + input for calling. However, that is flawed because PLs make assumptions + which are unsuitable for de novo calling: PL(RR) can become bigger than + PL(RA) even when the ALT allele is present in the parents. Note that + this is true also for other programs such as DeNovoGear which rely on + the same samtools calculation. 
The new recommended workflow is @@ -132,13 +141,12 @@ Changes affecting specific commands: bcftools call -mv -Ou | bcftools +trio-dnm -p proband,father,mother -Oz -o output.vcf.gz - This new version also implements the DeNovoGear model. The original behavior of trio-dnm - is no longer supported. + This new version also implements the DeNovoGear model. The original + behavior of trio-dnm is no longer supported. For more details see http://samtools.github.io/bcftools/trio-dnm.pdf - ## Release 1.11 (22nd September 2020) From 09dca3e7627c4a15ff2768a7555735641b586142 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 3 Feb 2021 14:20:00 +0000 Subject: [PATCH 49/81] Use proper-sized max int value and avoid a compiler warning --- plugins/trio-dnm2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/trio-dnm2.c b/plugins/trio-dnm2.c index 15663156d..6869b3d10 100644 --- a/plugins/trio-dnm2.c +++ b/plugins/trio-dnm2.c @@ -572,7 +572,7 @@ static void init_priors(args_t *args, priors_t *priors, init_priors_t type) else error("Can't happen\n"); - priors->denovo_allele[fi][mi][ci] = tprob==0 ? allele : INT32_MAX; // the latter should never happen, making it fail deliberately + priors->denovo_allele[fi][mi][ci] = tprob==0 ? allele : UINT8_MAX; // the latter should never happen, making it fail deliberately priors->denovo[fi][mi][ci] = tprob==0 ? 1 : 0; priors->pprob[fi][mi][ci] = log(gt_prior * mprob * (tprob==0 ? 1 : tprob)); } From 436e78c93035f43f3d5649a8d72fc3d9fc57a5c0 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Thu, 4 Feb 2021 10:37:39 +0000 Subject: [PATCH 50/81] Update Makefile dependencies; include <strings.h> directly where needed Add dependencies on $(htslib_hts_os_h) due to recent random number generator changes, and on $(prob1_h) due to recent mcall.c work. Other source files that use strcase*() functions #include <strings.h> themselves, so do so for these source files too. (<strings.h> is often a byproduct of <string.h> but POSIX doesn't require that.)
--- Makefile | 6 +++--- plugins/trio-dnm2.c | 1 + vcfbuf.c | 1 + version.c | 1 + 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 86fab623f..90c7e8410 100644 --- a/Makefile +++ b/Makefile @@ -239,7 +239,7 @@ vcfcall.o: vcfcall.c $(htslib_vcf_h) $(htslib_kfunc_h) $(htslib_synced_bcf_reade vcfconcat.o: vcfconcat.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_thread_pool_h) $(bcftools_h) vcfconvert.o: vcfconvert.c $(htslib_faidx_h) $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_kseq_h) $(bcftools_h) $(filter_h) $(convert_h) $(tsv2vcf_h) vcffilter.o: vcffilter.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) rbuf.h -vcfgtcheck.o: vcfgtcheck.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_kbitset_h) $(bcftools_h) extsort.h +vcfgtcheck.o: vcfgtcheck.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_kbitset_h) $(htslib_hts_os_h) $(bcftools_h) extsort.h vcfindex.o: vcfindex.c $(htslib_vcf_h) $(htslib_tbx_h) $(htslib_kstring_h) $(htslib_bgzf_h) $(bcftools_h) vcfisec.o: vcfisec.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcftools_h) $(filter_h) vcfmerge.o: vcfmerge.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) regidx.h $(bcftools_h) vcmp.h $(htslib_khash_h) @@ -261,7 +261,7 @@ filter.o: filter.c $(htslib_khash_str2int_h) $(htslib_hts_defs_h) $(htslib_vcfut $(CC) $(CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CPPFLAGS) $(PERL_CFLAGS) -c -o $@ $< gvcf.o: gvcf.c $(gvcf_h) $(bcftools_h) kmin.o: kmin.c kmin.h -mcall.o: mcall.c $(htslib_kfunc_h) $(htslib_khash_str2int_h) $(call_h) +mcall.o: mcall.c $(htslib_kfunc_h) $(htslib_khash_str2int_h) $(call_h) $(prob1_h) prob1.o: prob1.c $(prob1_h) vcmp.o: vcmp.c $(htslib_hts_h) $(htslib_vcf_h) vcmp.h ploidy.o: ploidy.c 
$(htslib_khash_str2int_h) $(htslib_kseq_h) $(htslib_hts_h) $(bcftools_h) $(ploidy_h) @@ -279,7 +279,7 @@ bam_sample.o: bam_sample.c $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_st version.o: version.h version.c hclust.o: hclust.c $(htslib_hts_h) $(htslib_kstring_h) $(bcftools_h) hclust.h HMM.o: HMM.c $(htslib_hts_h) HMM.h -vcfbuf.o: vcfbuf.c $(htslib_vcf_h) $(htslib_vcfutils_h) $(bcftools_h) $(vcfbuf_h) rbuf.h +vcfbuf.o: vcfbuf.c $(htslib_vcf_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcftools_h) $(vcfbuf_h) rbuf.h extsort.o: extsort.c $(bcftools_h) extsort.h kheap.h smpl_ilist.o: smpl_ilist.c $(bcftools_h) $(smpl_ilist_h) csq.o: csq.c $(htslib_hts_h) $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_h) $(htslib_khash_str2int_h) $(htslib_kseq_h) $(htslib_faidx_h) $(bcftools_h) $(filter_h) regidx.h kheap.h $(smpl_ilist_h) rbuf.h diff --git a/plugins/trio-dnm2.c b/plugins/trio-dnm2.c index 6869b3d10..2ddae136d 100644 --- a/plugins/trio-dnm2.c +++ b/plugins/trio-dnm2.c @@ -26,6 +26,7 @@ #include #include +#include #include #include #include diff --git a/vcfbuf.c b/vcfbuf.c index 982c658a8..71916bb6b 100644 --- a/vcfbuf.c +++ b/vcfbuf.c @@ -25,6 +25,7 @@ */ #include +#include #include #include #include diff --git a/version.c b/version.c index b340373f8..d06889726 100644 --- a/version.c +++ b/version.c @@ -25,6 +25,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include #include +#include #include #include #include "bcftools.h" From a73a0c3da6598296aba7e24e83012bc28ef46ec2 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 9 Feb 2021 08:51:47 +0000 Subject: [PATCH 51/81] Autodetect file type, the `view` command was missed by 7496b17 --- vcfview.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vcfview.c b/vcfview.c index 173865f77..ce4c8108c 100644 --- a/vcfview.c +++ b/vcfview.c @@ -221,12 +221,10 @@ static void init_data(args_t *args) } // setup output + const char *tmp = hts_bcf_wmode2(args->output_type,args->fn_out); char modew[8]; - strcpy(modew, "w"); + strcpy(modew,tmp); if (args->clevel >= 0 && args->clevel <= 9) sprintf(modew + 1, "%d", args->clevel); - if (args->output_type==FT_BCF) strcat(modew, "bu"); // uncompressed BCF - else if (args->output_type & FT_BCF) strcat(modew, "b"); // compressed BCF - else if (args->output_type & FT_GZ) strcat(modew,"z"); // compressed VCF args->out = hts_open(args->fn_out ? args->fn_out : "-", modew); if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno)); if ( args->n_threads > 0) From 29fcdba5dbcce6dd63c714e2ad2702da1f0c863f Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 9 Feb 2021 09:58:45 +0000 Subject: [PATCH 52/81] Continuous integration test updates Complete migration to cirrus-ci by disabling travis. Rename .travis directory to vendor-neutral .ci_helpers Change badge in README.md to a cirrus-ci one, and also add badges for Appveyor and the Github downloads count. 
--- {.travis => .ci_helpers}/clone | 2 +- .cirrus.yml | 4 +- .gitattributes | 4 +- .travis.yml | 70 ---------------------------------- README.md | 4 +- 5 files changed, 9 insertions(+), 75 deletions(-) rename {.travis => .ci_helpers}/clone (90%) delete mode 100644 .travis.yml diff --git a/.travis/clone b/.ci_helpers/clone similarity index 90% rename from .travis/clone rename to .ci_helpers/clone index a561f9150..9913e7719 100755 --- a/.travis/clone +++ b/.ci_helpers/clone @@ -1,5 +1,5 @@ #!/bin/sh -# Usage: .travis/clone REPOSITORY [DIR] [BRANCH] +# Usage: .ci_helpers/clone REPOSITORY [DIR] [BRANCH] # # Creates a shallow clone, checking out the specified branch. If BRANCH is # omitted or if there is no branch with that name, checks out origin/HEAD diff --git a/.cirrus.yml b/.cirrus.yml index c1df245c7..37763567e 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -16,12 +16,12 @@ timeout_in: 10m # Note this only works on the users own forks. Once in the samtools # organisation the branch name becomes pull/. -# Logic for choosing which to use is in the .travis/clone script. +# Logic for choosing which to use is in the .ci_helpers/clone script. # Note we could also use "clone_script" if we want to replace the bcftools # clone with our own commands too. clone_template: &HTSLIB_CLONE htslib_clone_script: | - .travis/clone "git://github.com/${CIRRUS_REPO_OWNER}/htslib" "${HTSDIR}" "${CIRRUS_BRANCH}" + .ci_helpers/clone "git://github.com/${CIRRUS_REPO_OWNER}/htslib" "${HTSDIR}" "${CIRRUS_BRANCH}" #-------------------------------------------------- diff --git a/.gitattributes b/.gitattributes index 5c712451e..9d42c7c43 100644 --- a/.gitattributes +++ b/.gitattributes @@ -5,6 +5,8 @@ *.bam -text diff=bam # Omit these files from release tarballs. 
+/.appveyor.yml export-ignore +/.cirrus.yml export-ignore .git* export-ignore -.travis* export-ignore +.ci_helpers export-ignore README.md export-ignore diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index a882e5722..000000000 --- a/.travis.yml +++ /dev/null @@ -1,70 +0,0 @@ -# Control file for continuous integration testing at http://travis-ci.org/ - -language: c - -matrix: - include: - - os: linux - compiler: clang - env: USE_CONFIG=no - - os: linux - compiler: gcc - env: USE_CONFIG=no - # An unoptimised C99 build, for detecting non-static inline functions - - os: linux - compiler: gcc - env: CFLAGS="-std=gnu99 -O0" USE_CONFIG=no - - os: osx - compiler: clang - env: USE_CONFIG=no - # Test the configure script - - os: linux - compiler: gcc - env: USE_CONFIG=yes - - os: osx - compiler: clang - env: USE_CONFIG=yes - # An optimised build with address, leak and undefined behavior checking - - os: linux - compiler: clang - sudo: required - env: CFLAGS="-fsanitize=address" LDFLAGS="-fsanitize=address" USE_CONFIG=yes - -env: - global: - - HTSDIR=./htslib - -# For linux systems -addons: - apt: - packages: - - liblzma-dev - - libbz2-dev - - libgsl0-dev - -# For MacOSX systems -before_install: - - | - # Removing GSL dependant parts as they are not tested anyway - # if [ "$TRAVIS_OS_NAME" == "osx" ]; then - # brew update && \ - # brew install xz gsl - # fi - -before_script: - # Clone samtools/htslib (or another repository, as specified by a Travis CI - # repository $HTSREPO setting) and check out a corresponding branch with the - # same name, if any, or otherwise the default branch. 
- - .travis/clone ${HTSREPO:-git://github.com/samtools/htslib.git} $HTSDIR $TRAVIS_BRANCH - -script: | - if test "$USE_CONFIG" = "yes"; then - ( cd "$HTSDIR" && autoreconf -i) && \ - autoreconf -i && \ - ./configure && \ - make && \ - make test - else - make plugindir=$TRAVIS_BUILD_DIR/plugins -e && \ - make -e test-plugins - fi diff --git a/README.md b/README.md index b5e09ebb6..7af3bc82c 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,6 @@ -[![Build Status](https://travis-ci.org/samtools/bcftools.svg?branch=develop)](https://travis-ci.org/samtools/bcftools) +[![Build Status](https://api.cirrus-ci.com/github/samtools/bcftools.svg?branch=develop)](https://api.cirrus-ci.com/github/samtools/bcftools) +[![Build status](https://ci.appveyor.com/api/projects/status/yanx5wnsdsqm4pay?svg=true)](https://ci.appveyor.com/project/samtools/bcftools) +[![Github All Releases](https://img.shields.io/github/downloads/samtools/bcftools/total.svg)](https://github.com/samtools/bcftools/releases/latest) This is the official development repository for BCFtools. It contains all the vcf* commands which previously lived in the htslib repository (such as vcfcheck, vcfmerge, vcfisec, etc.) From 1b9892fd5abf64eb75d30c1e5e076e95945571ce Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 9 Feb 2021 14:27:39 +0000 Subject: [PATCH 53/81] Do not recommend `mpileup -C 50`, its efficacy has not been proved --- doc/bcftools.txt | 3 +-- mpileup.c | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/bcftools.txt b/doc/bcftools.txt index ace7afc72..5d61abf52 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -1779,8 +1779,7 @@ multiple regions and many alignment files are processed. Coefficient for downgrading mapping quality for reads containing excessive mismatches. Given a read with a phred-scaled probability q of being generated from the mapped position, the new mapping quality is - about sqrt((INT-q)/INT)*INT. 
A zero value disables this functionality; if - enabled, the recommended value for BWA is 50. [0] + about sqrt((INT-q)/INT)*INT. A zero value (the default) disables this functionality. *-d, --max-depth* 'INT':: At a position, read maximally 'INT' reads per input file. Note that diff --git a/mpileup.c b/mpileup.c index 75ba5873d..0bfb72495 100644 --- a/mpileup.c +++ b/mpileup.c @@ -863,7 +863,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) " -A, --count-orphans do not discard anomalous read pairs\n" " -b, --bam-list FILE list of input BAM filenames, one per line\n" " -B, --no-BAQ disable BAQ (per-Base Alignment Quality)\n" -" -C, --adjust-MQ INT adjust mapping quality; recommended:50, disable:0 [0]\n" +" -C, --adjust-MQ INT adjust mapping quality [0]\n" " -d, --max-depth INT max raw per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth); fprintf(fp, " -E, --redo-BAQ recalculate BAQ on the fly, ignore existing BQs\n" From 6cf24c66932098d19413174d13e02c62b3efdbbb Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 10 Feb 2021 07:22:00 +0000 Subject: [PATCH 54/81] Check the source tag type Throw an error if it does not match VCF specification rather than silently ignoring the problem. Resolves #1398 --- plugins/tag2tag.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/plugins/tag2tag.c b/plugins/tag2tag.c index a356a8444..3ca30e73a 100644 --- a/plugins/tag2tag.c +++ b/plugins/tag2tag.c @@ -1,6 +1,6 @@ /* plugins/tag2tag.c -- convert between similar tags - Copyright (C) 2014-2016 Genome Research Ltd. + Copyright (C) 2014-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -93,14 +93,15 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) }; int c; char *src_tag = "GP"; + int src_type = BCF_HT_REAL; while ((c = getopt_long(argc, argv, "?hrt:",loptions,NULL)) >= 0) { switch (c) { - case 1 : src_tag = "GP"; mode = GP_TO_GL; break; - case 2 : src_tag = "GL"; mode = GL_TO_PL; break; - case 3 : src_tag = "GP"; mode = GP_TO_GT; break; - case 4 : src_tag = "PL"; mode = PL_TO_GL; break; + case 1 : src_tag = "GP"; mode = GP_TO_GL; src_type = BCF_HT_REAL; break; + case 2 : src_tag = "GL"; mode = GL_TO_PL; src_type = BCF_HT_REAL; break; + case 3 : src_tag = "GP"; mode = GP_TO_GT; src_type = BCF_HT_REAL; break; + case 4 : src_tag = "PL"; mode = PL_TO_GL; src_type = BCF_HT_INT; break; case 'r': drop_source_tag = 1; break; case 't': thresh = atof(optarg); break; case 'h': @@ -127,6 +128,8 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) int tag_id; if ( (tag_id=bcf_hdr_id2int(in_hdr,BCF_DT_ID,src_tag))<0 || !bcf_hdr_idinfo_exists(in_hdr,BCF_HL_FMT,tag_id) ) error("The source tag does not exist: %s\n", src_tag); + if ( bcf_hdr_id2type(in_hdr,BCF_HL_FMT,tag_id) != src_type ) + error("The source tag type does not match the VCF specification, expected Type=%s. Use `bcftools reheader` to fix.\n",src_type==BCF_HT_REAL?"Float":"Integer"); return 0; } From ac7c21d5728d04171b5ce97ea1cf6217ae74e1d5 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 10 Feb 2021 09:03:25 +0000 Subject: [PATCH 55/81] Fix N_PASS and F_PASS to behave according to expectation when reverse logic is used. Note that this fix has the side effect of `query` (or programs like `+trio-stats`) behaving differently with these expressions, operating now in site-oriented rather than sample-oriented mode. 
For example, the new behavior could be: ``` bcftools query -f'[%POS %SAMPLE %GT\n]' -i'N_PASS(GT="alt")==1' 11 A 0/0 11 B 0/0 11 C 1/1 ``` while previously the same expression would return: ``` 11 C 1/1 ``` Note that the original mode can be mimicked by splitting the filtering into two steps: ``` bcftools view -i'N_PASS(GT="alt")==1' | bcftools query -f'[%POS %SAMPLE %GT\n]' -i'GT="alt"' ``` Resolves #1397 --- NEWS | 14 ++++++++++++++ filter.c | 30 +++++++----------------------- test/query.63.out | 3 +++ test/query.80.out | 2 ++ test/query.81.out | 4 ++++ test/query.filter.11.vcf | 8 ++++++++ test/test.pl | 4 ++++ 7 files changed, 42 insertions(+), 23 deletions(-) create mode 100644 test/query.80.out create mode 100644 test/query.81.out create mode 100644 test/query.filter.11.vcf diff --git a/NEWS b/NEWS index df0f805df..8ec7e47d6 100644 --- a/NEWS +++ b/NEWS @@ -8,6 +8,20 @@ Changes affecting the whole of bcftools, or multiple commands: * Make F_MISSING in filtering expressions work for sites with multiple ALT alleles (#1343) +* Fix N_PASS and F_PASS to behave according to expectation when reverse + logic is used (#1397). This fix has the side effect of `query` (or + programs like `+trio-stats`) behaving differently with these expressions, + operating now in site-oritented rather than sample-oriented mode. 
For + example, the new behavior could be: + bcftools query -f'[%POS %SAMPLE %GT\n]' -i'N_PASS(GT="alt")==1' + 11 A 0/0 + 11 B 0/0 + 11 C 1/1 + while previously the same expression would return: + 11 C 1/1 + The original mode can be mimicked by splitting the filtering into two steps: + bcftools view -i'N_PASS(GT="alt")==1' | \ + bcftools query -f'[%POS %SAMPLE %GT\n]' -i'GT="alt"' Changes affecting specific commands: diff --git a/filter.c b/filter.c index bdd7f2b21..ea0fb99d6 100644 --- a/filter.c +++ b/filter.c @@ -1088,34 +1088,18 @@ static int func_npass(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stac if ( nstack==0 ) error("Error parsing the expression\n"); token_t *tok = stack[nstack - 1]; if ( !tok->nsamples ) error("The function %s works with FORMAT fields\n", rtok->tag); - - rtok->nsamples = tok->nsamples; - memcpy(rtok->pass_samples, tok->pass_samples, rtok->nsamples*sizeof(*rtok->pass_samples)); - assert(tok->usmpl); - if ( !rtok->usmpl ) - { - rtok->usmpl = (uint8_t*) malloc(tok->nsamples*sizeof(*rtok->usmpl)); - memcpy(rtok->usmpl, tok->usmpl, tok->nsamples*sizeof(*rtok->usmpl)); - } int i, npass = 0; - for (i=0; insamples; i++) + for (i=0; insamples; i++) { - if ( !rtok->usmpl[i] ) continue; - if ( rtok->pass_samples[i] ) npass++; + if ( !tok->usmpl[i] ) continue; + if ( tok->pass_samples[i] ) npass++; } - - hts_expand(double,rtok->nsamples,rtok->mvalues,rtok->values); - double value = rtok->tag[0]=='N' ? npass : (line->n_sample ? 1.0*npass/line->n_sample : 0); - rtok->nval1 = 1; - rtok->nvalues = rtok->nsamples; - - // Set per-sample status so that `query -i 'F_PASS(GT!="mis" & GQ >= 20) > 0.5'` or +trio-stats - // consider only the passing site AND samples. The values for failed samples is set to -1 so - // that it can never conflict with valid expressions. - for (i=0; insamples; i++) - rtok->values[i] = rtok->pass_samples[i] ? 
value : -1; + hts_expand(double,1,rtok->mvalues,rtok->values); + rtok->nsamples = 0; + rtok->nvalues = 1; + rtok->values[0] = rtok->tag[0]=='N' ? npass : (line->n_sample ? 1.0*npass/line->n_sample : 0); return 1; } diff --git a/test/query.63.out b/test/query.63.out index afeebcbcd..1050a041b 100644 --- a/test/query.63.out +++ b/test/query.63.out @@ -1,3 +1,6 @@ +3157410 C 21 3157410 D 11 +3184885 C 22 3184885 D 12 +3212016 C 91 3212016 D 11 diff --git a/test/query.80.out b/test/query.80.out new file mode 100644 index 000000000..c6f19387c --- /dev/null +++ b/test/query.80.out @@ -0,0 +1,2 @@ +11 A 0/0 +11 B 1/1 diff --git a/test/query.81.out b/test/query.81.out new file mode 100644 index 000000000..0792f0118 --- /dev/null +++ b/test/query.81.out @@ -0,0 +1,4 @@ +10 A 0/0 +10 B 0/0 +12 A 1/1 +12 B 1/1 diff --git a/test/query.filter.11.vcf b/test/query.filter.11.vcf new file mode 100644 index 000000000..8f547631f --- /dev/null +++ b/test/query.filter.11.vcf @@ -0,0 +1,8 @@ +##fileformat=VCFv4.3 +##FORMAT= +##contig= +##reference=ref.fa +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B +1 10 . G T . . . GT 0/0 0/0 +1 11 . G T . . . GT 0/0 1/1 +1 12 . G T . . . 
GT 1/1 1/1 diff --git a/test/test.pl b/test/test.pl index 7b80b0890..2c9d23acc 100755 --- a/test/test.pl +++ b/test/test.pl @@ -182,6 +182,10 @@ test_vcf_query($opts,in=>'query.negative',out=>'query.62.out',args=>q[-f'%POS\\t%TAG2\\n' -i'(TAG2>=-129 && TAG2<=-120) || (TAG2>=-32769 && TAG2<=-32760)']); test_vcf_query($opts,in=>'query.negative',out=>'query.62.out',args=>q[-f'%POS\\t%TAGV2\\n' -i'(TAGV2>=-129 && TAGV2<=-120) || (TAGV2>=-32769 && TAGV2<=-32760)']); test_vcf_query($opts,in=>'query',out=>'query.63.out',args=>q[-f'[%POS\\t%SAMPLE\\t%GQ\\n]' -i'N_PASS(GQ<20)==1']); +test_vcf_query($opts,in=>'query.filter.11',out=>'query.80.out',args=>q[-f'[%POS\\t%SAMPLE\\t%GT\\n]' -i'N_PASS(GT="alt")==1']); +test_vcf_query($opts,in=>'query.filter.11',out=>'query.81.out',args=>q[-f'[%POS\\t%SAMPLE\\t%GT\\n]' -i'N_PASS(GT="alt")!=1']); +test_vcf_query($opts,in=>'query.filter.11',out=>'query.81.out',args=>q[-f'[%POS\\t%SAMPLE\\t%GT\\n]' -e'N_PASS(GT="alt")==1']); +test_vcf_query($opts,in=>'query.filter.11',out=>'query.80.out',args=>q[-f'[%POS\\t%SAMPLE\\t%GT\\n]' -e'N_PASS(GT="alt")!=1']); test_vcf_query($opts,in=>'query',out=>'query.64.out',args=>q[-f'%CHROM\\t%POS\\t%INFO\\t%FORMAT\\n' -s D,C]); test_vcf_query($opts,in=>'query.pbinom.1',out=>'query.65.out',args=>q[-f'[%POS %SAMPLE %GT %AD %PBINOM(AD)\\n]' -i'phred(binom(FMT/AD))>=0']); test_vcf_query($opts,in=>'query.filter.6',out=>'query.66.out',args=>q[-f'%POS\\n' -i'POS==16777217 || POS==33554432 || POS=118673904']); From 200bbba92502a81bef4398872ef1e37b4239c148 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 10 Feb 2021 11:47:10 +0000 Subject: [PATCH 56/81] Replace special characters with underscore when generating file names When splitting a VCF by sample, the output files are named by sample names whenever possible. However, this creates problem when sample names contain special characters. Therefore characters space, tab, "/", "\", and ":" are now replaced with underscore. 
In case of name clashes, a unique numeric suffix (e.g. "-1") is appended. This commit also allows file suffixes to be present in the -S, -G files. Resolves #1404 --- plugins/split.c | 59 +++++++++++++++++++++++++++++++++++----------- test/split.1.1.out | 18 +++++++++----- test/split.1.2.out | 9 ++++--- test/split.1.3.out | 12 +++++++--- test/split.1.4.out | 12 +++++++--- test/split.1.5.out | 12 +++++++--- test/split.1.6.out | 13 ++++++++++ test/split.1.7.out | 15 ++++++++---- test/split.1.8.out | 13 ++++++++++ test/split.2.1.out | 30 +++++++++++++++++++++++ test/split.2.vcf | 8 +++++++ test/test.pl | 10 ++++++-- 12 files changed, 173 insertions(+), 38 deletions(-) create mode 100644 test/split.1.6.out create mode 100644 test/split.1.8.out create mode 100644 test/split.2.1.out create mode 100644 test/split.2.vcf diff --git a/plugins/split.c b/plugins/split.c index e969d46cb..1f283919d 100644 --- a/plugins/split.c +++ b/plugins/split.c @@ -63,6 +63,7 @@ typedef struct int ninfo_tags, minfo_tags, nfmt_tags, mfmt_tags, keep_info, keep_fmt; int argc, region_is_file, target_is_file, output_type; char **argv, *region, *target, *fname, *output_dir, *keep_tags, *samples_fname, *groups_fname; + void *unique_fnames; bcf_hdr_t *hdr_in, *hdr_out; bcf_srs_t *sr; subset_t *sets; @@ -80,8 +81,9 @@ static const char *usage_text(void) { return "\n" - "About: Split VCF by sample, creating single- or multi-sample VCFs.\n" - "\n" + "About: Split VCF by sample, creating single- or multi-sample VCFs. 
The output files are named\n" + " by sample names whenever possible, with the characters from the set [ \\t:/\\] replaced\n" + " with \"_\", and a unique numeric suffix added in case of name clashes.\n" "Usage: bcftools +split [Options]\n" "Plugin options:\n" " -e, --exclude EXPR exclude sites for which the expression is true (applied on the outputs)\n" @@ -113,8 +115,8 @@ static const char *usage_text(void) " \n" " # Optional third column to provide output file base name, use dash \"-\"\n" " # to keep sample names unchanged\n" - " sample1 new-name1 output1\n" - " sample2,sample3 - output2\n" + " sample1 new-name1 file1\n" + " sample2,sample3 - file2\n" " \n" " -t, --targets REGION similar to -r but streams rather than index-jumps\n" " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" @@ -137,6 +139,29 @@ static const char *usage_text(void) void mkdir_p(const char *fmt, ...); +static char *create_unique_file_name(args_t *args, const char *template) +{ + kstring_t str = {0,0,0}; + kputs(template, &str); + char *ptr = str.s; + while ( *ptr ) + { + if ( *ptr==':' || *ptr=='\\' || *ptr=='/' || *ptr==' ' || *ptr=='\t' ) *ptr = '_'; + ptr++; + } + size_t ori_len = str.l; + int id = 0; + if ( !args->unique_fnames ) args->unique_fnames = khash_str2int_init(); + while ( khash_str2int_has_key(args->unique_fnames,str.s) ) + { + str.l = ori_len; + kputc('-', &str); + kputw(++id, &str); + } + khash_str2int_inc(args->unique_fnames, strdup(str.s)); + return str.s; +} + void init_subsets(args_t *args) { int i,j, nsmpl = bcf_hdr_nsamples(args->hdr_in); @@ -150,7 +175,7 @@ void init_subsets(args_t *args) set->nsmpl = 1; set->smpl = (int*) calloc(1, sizeof(*set->smpl)); set->smpl[0] = i; - set->fname = strdup(args->hdr_in->samples[i]); + set->fname = create_unique_file_name(args, args->hdr_in->samples[i]); } } else if ( args->samples_fname ) @@ -222,7 +247,7 @@ void init_subsets(args_t *args) error("Expected the same number of samples in the first and 
second column: %s\n",files[i]); } if ( j ) - set->fname = strdup(set->rename[0]); + set->fname = create_unique_file_name(args, set->rename[0]); else { free(set->rename); @@ -234,11 +259,11 @@ void init_subsets(args_t *args) if ( *ptr ) // optional third column with file name { free(set->fname); - set->fname = strdup(ptr); + set->fname = create_unique_file_name(args, ptr); } if ( !set->fname ) - set->fname = strdup(args->hdr_in->samples[set->smpl[0]]); + set->fname = create_unique_file_name(args, args->hdr_in->samples[set->smpl[0]]); args->nsets++; } @@ -317,7 +342,7 @@ void init_subsets(args_t *args) set->rename = (char**) realloc(set->rename,set->nsmpl*sizeof(*set->rename)); set->smpl[set->nsmpl-1] = idx; set->rename[set->nsmpl-1] = strdup(rename?rename:samples[i]); - if ( !set->fname) set->fname = strdup(beg); + if ( !set->fname) set->fname = create_unique_file_name(args, beg); if ( !tmp ) break; beg = ptr + 1; } @@ -327,6 +352,7 @@ void init_subsets(args_t *args) free(str.s); free(samples); } + if ( args->unique_fnames ) khash_str2int_destroy_free(args->unique_fnames); } static void init_data(args_t *args) @@ -423,12 +449,17 @@ static void init_data(args_t *args) str.l = 0; kputs(args->output_dir, &str); if ( str.s[str.l-1] != '/' ) kputc('/', &str); - int k, l = str.l; kputs(set->fname, &str); - for (k=l; koutput_type & FT_BCF ) kputs(".bcf", &str); - else if ( args->output_type & FT_GZ ) kputs(".vcf.gz", &str); - else kputs(".vcf", &str); + char *suffix = NULL; + if ( args->output_type & FT_BCF ) suffix = "bcf"; + else if ( args->output_type & FT_GZ ) suffix = ".vcf.gz"; + else suffix = ".vcf"; + int len = strlen(set->fname); + if ( len >= 4 && !strcasecmp(".bcf",set->fname+len-4) ) suffix = NULL; + if ( len >= 4 && !strcasecmp(".vcf",set->fname+len-4) ) suffix = NULL; + if ( len >= 7 && !strcasecmp(".vcf.gz",set->fname+len-7) ) suffix = NULL; + if ( len >= 8 && !strcasecmp(".vcf.bgz",set->fname+len-8) ) suffix = NULL; + if ( suffix ) kputs(suffix, &str); 
set->fh = hts_open(str.s, hts_bcf_wmode2(args->output_type,str.s)); if ( set->fh == NULL ) error("[%s] Error: cannot write to \"%s\": %s\n", __func__, str.s, strerror(errno)); if ( args->hts_opts ) diff --git a/test/split.1.1.out b/test/split.1.1.out index 040d87346..45a75b722 100644 --- a/test/split.1.1.out +++ b/test/split.1.1.out @@ -1,24 +1,30 @@ -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A1 +A1.vcf +A1 22 10 . C A . . . GT ./. 22 22 . A C . . . GT 0/0 22 23 . C A . . . GT 0/1 -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A2 +A2.vcf +A2 22 10 . C A . . . GT 0/1 22 22 . A C . . . GT 0/0 22 23 . C A . . . GT 0/0 -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A3 +A3.vcf +A3 22 10 . C A . . . GT 0/0 22 22 . A C . . . GT 0/0 22 23 . C A . . . GT 0/0 -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT B1 +B1.vcf +B1 22 10 . C A . . . GT ./. 22 22 . A C . . . GT 0/0 22 23 . C A . . . GT 0/1 -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT B2 +B2.vcf +B2 22 10 . C A . . . GT 0/0 22 22 . A C . . . GT 0/0 22 23 . C A . . . GT 0/0 -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT C +C.vcf +C 22 10 . C A . . . GT ./. 22 22 . A C . . . GT 0/0 22 23 . C A . . . GT 1/0 diff --git a/test/split.1.2.out b/test/split.1.2.out index dd85bab45..554f36b71 100644 --- a/test/split.1.2.out +++ b/test/split.1.2.out @@ -1,12 +1,15 @@ -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A +A.vcf +A 22 10 . C A . . . GT ./. 22 22 . A C . . . GT 0/0 22 23 . C A . . . GT 0/1 -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT B +B.vcf +B 22 10 . C A . . . GT ./. 22 22 . A C . . . GT 0/0 22 23 . C A . . . GT 0/1 -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT C +C.vcf +C 22 10 . C A . . . GT ./. 22 22 . A C . . . GT 0/0 22 23 . C A . . . GT 1/0 diff --git a/test/split.1.3.out b/test/split.1.3.out index 46989c4c1..ce0f304b0 100644 --- a/test/split.1.3.out +++ b/test/split.1.3.out @@ -1,12 +1,18 @@ -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT a3 a2 a1 +a3.vcf +a3 +a2 +a1 22 10 . C A . . . GT 0/0 0/1 ./. 
22 22 . A C . . . GT 0/0 0/0 0/0 22 23 . C A . . . GT 0/0 0/0 0/1 -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT b2 b1 +b2.vcf +b2 +b1 22 10 . C A . . . GT 0/0 ./. 22 22 . A C . . . GT 0/0 0/0 22 23 . C A . . . GT 0/0 0/1 -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT c +c.vcf +c 22 10 . C A . . . GT ./. 22 22 . A C . . . GT 0/0 22 23 . C A . . . GT 1/0 diff --git a/test/split.1.4.out b/test/split.1.4.out index c6da58121..cc816c82c 100644 --- a/test/split.1.4.out +++ b/test/split.1.4.out @@ -1,4 +1,10 @@ -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT a3 a2 a1 -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT b2 b1 -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT c +a3.vcf +a3 +a2 +a1 +b2.vcf +b2 +b1 +c.vcf +c 22 23 . C A . . . GT 1/0 diff --git a/test/split.1.5.out b/test/split.1.5.out index bb134b08e..4e0e6b432 100644 --- a/test/split.1.5.out +++ b/test/split.1.5.out @@ -1,7 +1,13 @@ -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT a3 a2 a1 +a3.vcf +a3 +a2 +a1 22 10 . C A . . . GT 0/0 0/1 ./. 22 23 . C A . . . GT 0/0 0/0 0/1 -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT b2 b1 +b2.vcf +b2 +b1 22 23 . C A . . . GT 0/0 0/1 -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT c +c.vcf +c 22 23 . C A . . . GT 1/0 diff --git a/test/split.1.6.out b/test/split.1.6.out new file mode 100644 index 000000000..4157100a5 --- /dev/null +++ b/test/split.1.6.out @@ -0,0 +1,13 @@ +file1.vcf +a3 +a2 +a1 +22 10 . C A . . . GT 0/0 0/1 ./. +22 23 . C A . . . GT 0/0 0/0 0/1 +file2.vcf +b2 +b1 +22 23 . C A . . . GT 0/0 0/1 +file3.vcf +c +22 23 . C A . . . GT 1/0 diff --git a/test/split.1.7.out b/test/split.1.7.out index 8c6cf7b16..d5495d19f 100644 --- a/test/split.1.7.out +++ b/test/split.1.7.out @@ -1,16 +1,23 @@ -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT a3 C +A.vcf +a3 +C 22 10 . C A . . . GT 0/0 ./. 22 22 . A C . . . GT 0/0 0/0 22 23 . C A . . . GT 0/0 1/0 -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A1 +A1.vcf +A1 22 10 . C A . . . GT ./. 22 22 . A C . . . GT 0/0 22 23 . C A . . . 
GT 0/1 -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT a2 +A2.vcf +a2 22 10 . C A . . . GT 0/1 22 22 . A C . . . GT 0/0 22 23 . C A . . . GT 0/0 -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT b1 b2 C +B.vcf +b1 +b2 +C 22 10 . C A . . . GT ./. 0/0 ./. 22 22 . A C . . . GT 0/0 0/0 0/0 22 23 . C A . . . GT 0/1 0/0 1/0 diff --git a/test/split.1.8.out b/test/split.1.8.out new file mode 100644 index 000000000..3ca1ccb93 --- /dev/null +++ b/test/split.1.8.out @@ -0,0 +1,13 @@ +file1.vcf +a3 +a2 +a1 +22 10 . C A . . . GT 0/0 0/1 ./. +22 23 . C A . . . GT 0/0 0/0 0/1 +file2.vcf.gz +b2 +b1 +22 23 . C A . . . GT 0/0 0/1 +file3.bcf +c +22 23 . C A . . . GT 1/0 diff --git a/test/split.2.1.out b/test/split.2.1.out new file mode 100644 index 000000000..b99e4c78e --- /dev/null +++ b/test/split.2.1.out @@ -0,0 +1,30 @@ +A1_A1-1.vcf +A1/A1-1 +22 10 . C A . . . GT ./. +22 22 . A C . . . GT 0/0 +22 23 . C A . . . GT 0/1 +A1_A1-2.vcf +A1:A1 +22 10 . C A . . . GT 0/0 +22 22 . A C . . . GT 0/0 +22 23 . C A . . . GT 0/0 +A1_A1.vcf +A1\A1 +22 10 . C A . . . GT 0/1 +22 22 . A C . . . GT 0/0 +22 23 . C A . . . GT 0/0 +B1.vcf +B1 +22 10 . C A . . . GT ./. +22 22 . A C . . . GT 0/0 +22 23 . C A . . . GT 0/1 +B2.vcf +B2 +22 10 . C A . . . GT 0/0 +22 22 . A C . . . GT 0/0 +22 23 . C A . . . GT 0/0 +C.vcf +C +22 10 . C A . . . GT ./. +22 22 . A C . . . GT 0/0 +22 23 . C A . . . GT 1/0 diff --git a/test/split.2.vcf b/test/split.2.vcf new file mode 100644 index 000000000..5f495107f --- /dev/null +++ b/test/split.2.vcf @@ -0,0 +1,8 @@ +##fileformat=VCFv4.2 +##reference=ref.fasta +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A1/A1-1 A1\A1 A1:A1 B1 B2 C +22 10 . C A . . . GT ./. 0/1 0/0 ./. 0/0 ./. +22 22 . A C . . . GT 0/0 0/0 0/0 0/0 0/0 0/0 +22 23 . C A . . . 
GT 0/1 0/0 0/0 0/1 0/0 1/0 diff --git a/test/test.pl b/test/test.pl index 2c9d23acc..769f94340 100755 --- a/test/test.pl +++ b/test/test.pl @@ -549,8 +549,9 @@ test_plugin_split($opts,in=>'split.1',out=>'split.1.3.out',tmp=>'split.1.3',args=>'-S {PATH}/split.smpl.1.3.txt'); test_plugin_split($opts,in=>'split.1',out=>'split.1.4.out',tmp=>'split.1.4',args=>q[-S {PATH}/split.smpl.1.3.txt -i 'GT[0]="alt"']); test_plugin_split($opts,in=>'split.1',out=>'split.1.5.out',tmp=>'split.1.5',args=>q[-S {PATH}/split.smpl.1.3.txt -i 'GT="alt"']); -test_plugin_split($opts,in=>'split.1',out=>'split.1.5.out',tmp=>'split.1.6',args=>q[-S {PATH}/split.smpl.1.4.txt -i 'GT="alt"']); +test_plugin_split($opts,in=>'split.1',out=>'split.1.6.out',tmp=>'split.1.6',args=>q[-S {PATH}/split.smpl.1.4.txt -i 'GT="alt"']); test_plugin_split($opts,in=>'split.1',out=>'split.1.7.out',tmp=>'split.1.7',args=>q[-G {PATH}/split.grp.1.1.txt]); +test_plugin_split($opts,in=>'split.2',out=>'split.2.1.out',tmp=>'split.2.1',args=>q[]); test_plugin_scatter($opts,in=>'scatter.1',out=>'scatter.1.1.out',tmp=>'scatter.1.1',args=>q[-n 3]); test_plugin_scatter($opts,in=>'scatter.1',out=>'scatter.1.2.out',tmp=>'scatter.1.2',args=>q[-s 21,22]); test_plugin_scatter($opts,in=>'scatter.1',out=>'scatter.1.3.out',tmp=>'scatter.1.3',args=>q[-s 21,22 -x X]); @@ -1661,7 +1662,12 @@ sub test_plugin_split closedir($dh) or failed($opts,$test,"Close failed: $$opts{tmp}/$args{tmp}"); my $files = join(' ',@files); - test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools +split $$opts{path}/$args{in}.vcf -o $$opts{tmp}/$args{tmp} $args{args} && cd $$opts{tmp}/$args{tmp} && cat $files | grep -v ^##"); + test_cmd($opts,%args, + cmd=> + "$$opts{bin}/bcftools +split $$opts{path}/$args{in}.vcf -o $$opts{tmp}/$args{tmp} $args{args} " . + " && cd $$opts{tmp}/$args{tmp} " . 
+ " && for f in $files; do echo \$f; $$opts{bin}/bcftools query -l \$f; $$opts{bin}/bcftools view -H \$f; done" + ); } sub test_plugin_scatter { From ba6508a9f2781d7b25e67e32cc06412b1d887b65 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Wed, 10 Feb 2021 13:47:09 +0000 Subject: [PATCH 57/81] Fix typo so MinGW users' $CPPFLAGS aren't ignored --- configure.ac | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/configure.ac b/configure.ac index 7c3cde36c..7bf448793 100644 --- a/configure.ac +++ b/configure.ac @@ -129,11 +129,11 @@ AS_IF([test "$enable_bcftools_plugins" != "no"], [dnl PLATFORM=MSYS PLUGIN_EXT=.dll HTSLIB_DLL=hts.dll.a - # This also sets __USE_MINGW_ANSI_STDIO which in turn makes PRId64, + # This also sets __USE_MINGW_ANSI_STDIO which in turn makes PRId64, # %lld and %z printf formats work. It also enforces the snprintf to # be C99 compliant so it returns the correct values (in kstring.c). - CPPFLAGS="$CPPCFLAGS -D_XOPEN_SOURCE=600"], - + CPPFLAGS="$CPPFLAGS -D_XOPEN_SOURCE=600"], + [*-darwin* | *-Darwin*],[dnl host_result="Darwin dylib" PLATFORM=Darwin @@ -271,7 +271,7 @@ case $host_alias in # This also sets __USE_MINGW_ANSI_STDIO which in turn makes PRId64, # %lld and %z printf formats work. It also enforces the snprintf to # be C99 compliant so it returns the correct values (in kstring.c). - CPPFLAGS="$CPPCFLAGS -D_XOPEN_SOURCE=600" + CPPFLAGS="$CPPFLAGS -D_XOPEN_SOURCE=600" ;; esac From fb972997b010a848b822c394930def30e0fc5b63 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 11 Feb 2021 08:48:30 +0000 Subject: [PATCH 58/81] Make `consensus -I` and `-H I` work with arbitrary ploidy. 
Resolves #1400 --- bcftools.h | 49 ++++++++++--------- consensus.c | 109 ++++++++++++++++++++++++------------------ doc/bcftools.txt | 4 +- test/consensus.16.out | 2 +- test/consensus.16.vcf | 22 +++++++++ test/consensus.18.out | 20 ++++++++ test/consensus.3.out | 2 +- test/test.pl | 2 + 8 files changed, 138 insertions(+), 72 deletions(-) create mode 100644 test/consensus.16.vcf create mode 100644 test/consensus.18.out diff --git a/bcftools.h b/bcftools.h index fc5d07076..c189ce3a0 100644 --- a/bcftools.h +++ b/bcftools.h @@ -1,6 +1,6 @@ /* bcftools.h -- utility function declarations. - Copyright (C) 2013 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -54,30 +54,35 @@ const char *hts_bcf_wmode2(int file_type, char *fname); void *smalloc(size_t size); // safe malloc -static inline int is_acgtn(char nt) +static inline int iupac2bitmask(char iupac) { - if ( nt < 65 ) return 0; - if ( nt > 84 ) nt -= 32; // to uppercase - if ( nt=='A' || nt=='C' || nt=='G' || nt=='T' || nt=='N' ) return 1; - return 0; + const int A = 1; + const int C = 2; + const int G = 4; + const int T = 8; + if ( iupac >= 97 ) iupac -= 32; + if ( iupac == 'A' ) return A; + if ( iupac == 'C' ) return C; + if ( iupac == 'G' ) return G; + if ( iupac == 'T' ) return T; + if ( iupac == 'M' ) return A|C; + if ( iupac == 'R' ) return A|G; + if ( iupac == 'W' ) return A|T; + if ( iupac == 'S' ) return C|G; + if ( iupac == 'Y' ) return C|T; + if ( iupac == 'K' ) return G|T; + if ( iupac == 'V' ) return A|C|G; + if ( iupac == 'H' ) return A|C|T; + if ( iupac == 'D' ) return A|G|T; + if ( iupac == 'B' ) return C|G|T; + if ( iupac == 'N' ) return A|C|G|T; + return -1; } - -static inline char gt2iupac(char a, char b) +static inline char bitmask2iupac(int bitmask) { - static const char iupac[4][4] = { {'A','M','R','W'},{'M','C','S','Y'},{'R','S','G','K'},{'W','Y','K','T'} }; - if ( a>='a' ) a -= 'a' - 'A'; - if ( b>='a' ) b -= 'a' - 'A'; - if ( a=='A' ) a = 0; - else 
if ( a=='C' ) a = 1; - else if ( a=='G' ) a = 2; - else if ( a=='T' ) a = 3; - else return 'N'; - if ( b=='A' ) b = 0; - else if ( b=='C' ) b = 1; - else if ( b=='G' ) b = 2; - else if ( b=='T' ) b = 3; - else return 'N'; - return iupac[(int)a][(int)b]; + const char iupac[16] = {'.','A','C','M','G','R','S','V','T','W','Y','H','K','D','B','N'}; + if ( bitmask <= 0 || bitmask > 15 ) return 0; + return iupac[bitmask]; } static inline int iupac_consistent(char iupac, char nt) diff --git a/consensus.c b/consensus.c index d06ebd90a..bd5535e56 100644 --- a/consensus.c +++ b/consensus.c @@ -116,6 +116,8 @@ typedef struct FILE *fp_chain; char **argv; int argc, output_iupac, haplotype, allele, isample, napplied; + uint8_t *iupac_bitmask; + int miupac_bitmask; char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname, missing_allele, absent_allele; char mark_del, mark_ins, mark_snv; } @@ -284,6 +286,7 @@ static void add_mask_with(args_t *args, char *with) } static void destroy_data(args_t *args) { + free(args->iupac_bitmask); if (args->filter) filter_destroy(args->filter); bcf_sr_destroy(args->files); int i; @@ -481,7 +484,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) if ( args->absent_allele ) apply_absent(args, rec->pos); if ( rec->n_allele==1 && !args->missing_allele && !args->absent_allele ) { return; } - int i; + int i,j; if ( args->mask ) { char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid); @@ -508,6 +511,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) enum { use_hap, use_iupac, pick_one } action = use_hap; if ( args->allele==PICK_IUPAC ) { + if ( !args->haplotype ) action = use_iupac; if ( !bcf_gt_is_phased(ptr[0]) && !bcf_gt_is_phased(ptr[fmt->n-1]) ) action = use_iupac; } else if ( args->output_iupac ) action = use_iupac; @@ -546,44 +550,40 @@ static void apply_variant(args_t *args, bcf1_t *rec) } else if ( action==use_iupac ) { - ialt = ptr[0]; - if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) - { - if ( 
!args->missing_allele ) return; - ialt = -1; - } - else - ialt = bcf_gt_allele(ialt); - - int jalt; - if ( fmt->n>1 ) + ialt = -1; + int is_missing = 0, alen = 0, mlen = 0, fallback_alt = -1; + for (i=0; in; i++) { - jalt = ptr[1]; - if ( bcf_gt_is_missing(jalt) ) + if ( bcf_gt_is_missing(ptr[i]) ) { is_missing = 1; continue; } + if ( ptr[i]==bcf_int8_vector_end ) break; + int jalt = bcf_gt_allele(ptr[i]); + if ( jalt >= rec->n_allele ) error("Invalid VCF, too few ALT alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); + if ( fallback_alt <= 0 ) fallback_alt = jalt; + + int l = strlen(rec->d.allele[jalt]); + for (j=0; jd.allele[jalt][j]) < 0 ) break; + if ( j mlen ) { - if ( !args->missing_allele ) return; - ialt = -1; + hts_expand(uint8_t,l,args->miupac_bitmask,args->iupac_bitmask); + for (j=mlen; jiupac_bitmask[j] = 0; + mlen = l; } - else if ( jalt==bcf_int32_vector_end ) jalt = ialt; - else - jalt = bcf_gt_allele(jalt); - } - else jalt = ialt; - - if ( ialt==0 && jalt>0 ) ialt = jalt, jalt = 0; - if ( ialt>0 && ialt!=jalt ) - { - if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Invalid VCF, too few ALT alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - i = 0; - while ( rec->d.allele[ialt][i] && rec->d.allele[jalt][i] ) + if ( jalt>0 && l>alen ) { - char ial = rec->d.allele[ialt][i]; - char jal = rec->d.allele[jalt][i]; - if ( !is_acgtn(ial) || !is_acgtn(jal) ) break; - rec->d.allele[ialt][i] = gt2iupac(ial,jal); - i++; + alen = l; + ialt = jalt; } + for (j=0; jiupac_bitmask[j] |= iupac2bitmask(rec->d.allele[jalt][j]); } + if ( alen > 0 ) + for (j=0; jd.allele[ialt][j] = bitmask2iupac(args->iupac_bitmask[j]); + else if ( fallback_alt >= 0 ) + ialt = fallback_alt; + else if ( is_missing && !args->missing_allele ) return; } else { @@ -636,17 +636,34 @@ static void apply_variant(args_t *args, bcf1_t *rec) } if ( rec->n_allele <= ialt ) error("Broken VCF, too few alts at %s:%"PRId64"\n", 
bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); } - else if ( args->output_iupac && ialt>0 ) + else if ( args->output_iupac && rec->n_allele>1 ) { - i = 0; - while ( rec->d.allele[ialt][i] && rec->d.allele[0][i] ) + int ialt, alen = 0, mlen = 0; + for (i=0; in_allele; i++) { - char ial = rec->d.allele[ialt][i]; - char jal = rec->d.allele[0][i]; - if ( !is_acgtn(ial) || !is_acgtn(jal) ) break; - rec->d.allele[ialt][i] = gt2iupac(ial,jal); - i++; + int l = strlen(rec->d.allele[i]); + for (j=0; jd.allele[i][j]) < 0 ) break; + if ( j mlen ) + { + hts_expand(uint8_t,l,args->miupac_bitmask,args->iupac_bitmask); + for (j=mlen; jiupac_bitmask[j] = 0; + mlen = l; + } + if ( i>0 && l>alen ) + { + alen = l; + ialt = i; + } + for (j=0; jiupac_bitmask[j] |= iupac2bitmask(rec->d.allele[i][j]); } + if ( alen > 0 ) + for (j=0; jd.allele[ialt][j] = bitmask2iupac(args->iupac_bitmask[j]); + else + ialt = 1; } if ( rec->n_allele==1 && ialt!=-1 ) @@ -790,9 +807,9 @@ static void apply_variant(args_t *args, bcf1_t *rec) } error( "The fasta sequence does not match the REF allele at %s:%"PRId64":\n" - " .vcf: [%s] <- (REF)\n" - " .vcf: [%s] <- (ALT)\n" - " .fa: [%s]%c%s\n", + " REF .vcf: [%s]\n" + " ALT .vcf: [%s]\n" + " REF .fa : [%s]%c%s\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1, rec->d.allele[0], alt_allele, args->fa_buf.s+idx, tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:"" ); @@ -1047,8 +1064,8 @@ static void usage(args_t *args) fprintf(stderr, " --mark-del CHAR instead of removing sequence, insert CHAR for deletions\n"); fprintf(stderr, " --mark-ins uc|lc highlight insertions in uppercase (uc) or lowercase (lc), leaving the rest as is\n"); fprintf(stderr, " --mark-snv uc|lc highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest as is\n"); - fprintf(stderr, " -m, --mask FILE replace regions with N\n"); - fprintf(stderr, " --mask-with CHAR|lc|uc replace regions with N\n"); + fprintf(stderr, " -m, --mask FILE replace regions according to the next 
--mask-with option. The default is --mask-with N\n"); + fprintf(stderr, " --mask-with CHAR|uc|lc replace with CHAR (skips overlapping variants); change to uppercase (uc) or lowercase (lc)\n"); fprintf(stderr, " -M, --missing CHAR output CHAR instead of skipping a missing genotype \"./.\"\n"); fprintf(stderr, " -o, --output FILE write output to a file [standard output]\n"); fprintf(stderr, " -p, --prefix STRING prefix to add to output sequence names\n"); diff --git a/doc/bcftools.txt b/doc/bcftools.txt index 5d61abf52..66c5e1932 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -817,7 +817,7 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the *-f, --fasta-ref* 'FILE':: reference sequence in fasta format -*-H, --haplotype* '1'|'2'|'R'|'A'|'LR'|'LA'|'SR'|'SA'|'1pIu'|'2pIu':: +*-H, --haplotype* '1'|'2'|'R'|'A'|'I'|'LR'|'LA'|'SR'|'SA'|'1pIu'|'2pIu':: choose which allele from the FORMAT/GT field to use (the codes are case-insensitive): '1';; @@ -869,7 +869,7 @@ depth information, such as INFO/AD or FORMAT/AD. For that, consider using the format details. *--mask-with* 'CHAR'|'lc'|'uc':: - replace sequence from *--mask* with CHAR or change to lowercase (lc) or uppercase (uc) + replace sequence from *--mask* with CHAR, skipping overlapping variants, or change to lowercase (lc) or uppercase (uc) *-M, --missing* 'CHAR':: instead of skipping the missing genotypes, output the character CHAR (e.g. 
"?") diff --git a/test/consensus.16.out b/test/consensus.16.out index 3921c98b1..30a95daa8 100644 --- a/test/consensus.16.out +++ b/test/consensus.16.out @@ -16,5 +16,5 @@ CGTTGTCGGGACAGCCTTTTTATAAAATAATGTTGAGGCTTTGATACGTCAAAGxxxxxx xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxTTTGCT GCTGCCAATGACAGCACACCCTGGGAATGCCCCAACTACTTACTACAAAGCAGTGTTACA TGGAGAAGATCTTCAAGAGTCTTTTTGCTAGATCTTTCCTTGGCTTTTGATGTGACTCCT -CTCAATAAAATCCACAGTAATATAGTGAGTGGTCTCCTGCTCCAAACCAGTATTYCAGAC +CTCAATAAAATCCACAGTAATATAGTGAGTGGTCTCCTGCTCCAAACCAGTATTHCAGAC ACAGTTAATCCAGAC diff --git a/test/consensus.16.vcf b/test/consensus.16.vcf new file mode 100644 index 000000000..65220b0a5 --- /dev/null +++ b/test/consensus.16.vcf @@ -0,0 +1,22 @@ +##fileformat=VCFv4.2 +##FORMAT= +##reference=file://some/path/human_g1k_v37.fasta +##INFO= +##ALT= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA001 +1 5 . C a . PASS . GT 0/1 +1 5 . C t . PASS . GT 0/1 +1 7 . T a . PASS . GT 0/1 +1 10 . G a . PASS . GT 0/1 +1 12 . GACA ga . PASS . GT 0/1 +1 16 . T taaa . PASS . GT 0/1 +1 19 . A c . PASS . GT 0/1 +1 61 . C a . PASS . GT 0/1 +2 61 . AGAG aa . PASS . GT 0/1 +2 119 . AAA t . PASS . GT 0/1 +2 179 . G gacgtacgt . PASS . GT 0/1 +2 200 . A . PASS END=210 GT 1/0 +2 481 . T c,a . PASS . GT 0/1/2 +2 484 . G cc,aa . PASS . 
GT 0/1/2 diff --git a/test/consensus.18.out b/test/consensus.18.out new file mode 100644 index 000000000..1d6f22961 --- /dev/null +++ b/test/consensus.18.out @@ -0,0 +1,20 @@ +>1:2-501 +TACMAWATRTGATAAAATMAAAAAGAACATAACCTACGTATCAACTAAAGTGGTTGTTTG +MAGAAAAGGAAGACTTAAAAAGAGTCAGTACTAACCTACATAATATATACAATGTTCATT +AAATAATAAAATGAGCTCATCATACTTAGGTCATCATAAATATATCTGAAATTCACAAAT +ATTGATCAAATGGTAAAATAGACAAGTAGATTTTAATAGGTTAAACAATTACTGATTCTC +TTGAAAGAATAAATTTAATATGAGACCTATTTCATTATAATGAACTCACAAATTAGAAAC +TTCACACTGGGGGCTGGAGAGATGGCTCAGTAGTTAAGAACACTGACTGCTCTTCTGAAG +GTCCTGAGTTCAAATCCCAGCAACCACATGGTGACTTACAACCATCTGTAATGACATCTG +ATGCCCTCTGGTGTGTCTGAAGACAGCTACAGTGTACTTACATAAAATAATAAATAAATC +TTTAAAAACAAAAAAAAAGAA +>2 +GAAGATCTTTTCCTTATTAAGGATCTGAAGCTCTGTAGATTTGTATTCTATTAAACATGG +ARATTAGTGATTTTCCATATTCTTTAAGTCATTTTAGAGTAATGTGTTCTTAAGATWTCA +GAAAAACAAAAACTTGTGCTTTCCTGTTTGAAAAACAAACAGCTGTGGGGAATGGACGTA +CGTTGTCGGGACAGCCTTTTTATAAAATAATGTTGAGGCTTTGATACGTCAAAGTTATAT +TTCAAATGGAATCACTTAGACCTCGTTTCTGAGTGTCAATGGCCATATTGGGGATTTGCT +GCTGCCAATGACAGCACACCCTGGGAATGCCCCAACTACTTACTACAAAGCAGTGTTACA +TGGAGAAGATCTTCAAGAGTCTTTTTGCTAGATCTTTCCTTGGCTTTTGATGTGACTCCT +CTCAATAAAATCCACAGTAATATAGTGAGTGGTCTCCTGCTCCAAACCAGTATTHCAVMA +CACAGTTAATCCAGAC diff --git a/test/consensus.3.out b/test/consensus.3.out index dad9c7d1b..65f53708e 100644 --- a/test/consensus.3.out +++ b/test/consensus.3.out @@ -16,5 +16,5 @@ CGTTGTCGGGACAGCCTTTTTATAAAATAATGTTGAGGCTTTGATACGTCAAAGNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTTTGCT GCTGCCAATGACAGCACACCCTGGGAATGCCCCAACTACTTACTACAAAGCAGTGTTACA TGGAGAAGATCTTCAAGAGTCTTTTTGCTAGATCTTTCCTTGGCTTTTGATGTGACTCCT -CTCAATAAAATCCACAGTAATATAGTGAGTGGTCTCCTGCTCCAAACCAGTATTYCAGAC +CTCAATAAAATCCACAGTAATATAGTGAGTGGTCTCCTGCTCCAAACCAGTATTHCAGAC ACAGTTAATCCAGAC diff --git a/test/test.pl b/test/test.pl index 769f94340..d4df585a6 100755 --- a/test/test.pl +++ b/test/test.pl @@ -635,6 +635,8 @@ 
test_vcf_consensus($opts,in=>'consensus.14',out=>'consensus.14.out',fa=>'consensus.14.fa',args=>''); test_vcf_consensus($opts,in=>'consensus.12',out=>'consensus.15.out',fa=>'consensus.12.fa',args=>'--mark-del - --mark-ins uc --mark-snv uc'); test_vcf_consensus($opts,in=>'consensus.15',out=>'consensus.17.out',fa=>'consensus.15.fa',args=>'-H I --mark-ins lc --mark-snv lc'); +test_vcf_consensus($opts,in=>'consensus.16',out=>'consensus.18.out',fa=>'consensus.fa',args=>'-I'); +test_vcf_consensus($opts,in=>'consensus.16',out=>'consensus.18.out',fa=>'consensus.fa',args=>'-H I'); test_mpileup($opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.1.out',args=>q[-r17:100-150],test_list=>1); test_mpileup($opts,in=>[qw(mpileup.1 mpileup.2 mpileup.3)],out=>'mpileup/mpileup.2.out',args=>q[-a DP,DV -r17:100-600]); # test files from samtools mpileup test suite test_mpileup($opts,in=>[qw(mpileup.1)],out=>'mpileup/mpileup.3.out',args=>q[-B --ff 0x14 -r17:1050-1060]); # test file converted to vcf from samtools mpileup test suite From b7b1b7c4f62a2c8aeff69f087444b0c4f7ce85f3 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 11 Feb 2021 10:16:50 +0000 Subject: [PATCH 59/81] Fix wrong check of strchr() return value, avoid segfault with -u tag=DNG --- plugins/trio-dnm2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/trio-dnm2.c b/plugins/trio-dnm2.c index 2ddae136d..4f0e0a8a9 100644 --- a/plugins/trio-dnm2.c +++ b/plugins/trio-dnm2.c @@ -583,7 +583,7 @@ static void init_priors(args_t *args, priors_t *priors, init_priors_t type) static void init_data(args_t *args) { char *ptr = strchr(args->dnm_score_tag,':'); - if ( *ptr ) + if ( ptr ) { if ( ptr==args->dnm_score_tag ) error("Error: could not parse --use tag=%s\n",ptr); *ptr = 0; From cd1f213e091ea11219f2e9759faab97eb9327006 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 16 Feb 2021 13:37:27 +0000 Subject: [PATCH 60/81] Tests for #1414. 
The pullrequest https://github.com/samtools/htslib/pull/1234 should fix --- test/annotate22.vcf | 6 ++++++ test/annotate30.out | 6 ++++++ test/test.pl | 1 + 3 files changed, 13 insertions(+) create mode 100644 test/annotate22.vcf create mode 100644 test/annotate30.out diff --git a/test/annotate22.vcf b/test/annotate22.vcf new file mode 100644 index 000000000..8763649b6 --- /dev/null +++ b/test/annotate22.vcf @@ -0,0 +1,6 @@ +##fileformat=VCFv4.2 +##contig= +##FORMAT= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SM +chr1 833602 . G A . . XX=1234 XX 1234 diff --git a/test/annotate30.out b/test/annotate30.out new file mode 100644 index 000000000..bbb1e2f77 --- /dev/null +++ b/test/annotate30.out @@ -0,0 +1,6 @@ +##fileformat=VCFv4.2 +##FILTER= +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SM +chr1 833602 . G A . . XX=1234 XX 1234 diff --git a/test/test.pl b/test/test.pl index d4df585a6..7cfe6e1c2 100755 --- a/test/test.pl +++ b/test/test.pl @@ -425,6 +425,7 @@ test_vcf_annotate($opts,in=>'annotate.missing-append',tab=>'annotate.missing-append',out=>'annotate.missing-append.1.out',args=>'-c CHROM,POS,REF,ALT,STR,INT,FLT -l STR:append-missing,INT:append-missing,FLT:append-missing'); test_vcf_annotate($opts,in=>'annotate9',tab=>'annots9',out=>'annotate9.out',args=>'-c CHROM,POS,REF,ALT,+ID'); test_vcf_annotate($opts,in=>'annotate21',out=>'annotate29.out',args=>'--rename-annots {PATH}/annotate21.txt'); +test_vcf_annotate($opts,in=>'annotate22',vcf=>'annotate22',out=>'annotate30.out',args=>'-c FMT/XX,INFO/XX -x FMT/XX,INFO/XX'); test_vcf_plugin($opts,in=>'plugin1',out=>'missing2ref.out',cmd=>'+missing2ref --no-version'); test_vcf_plugin($opts,in=>'plugin1',out=>'missing2ref.out',cmd=>'+setGT --no-version',args=>'-- -t . -n 0'); test_vcf_plugin($opts,in=>'setGT',out=>'setGT.1.out',cmd=>'+setGT --no-version',args=>'-- -t q -n 0 -i \'GT~"." 
&& FMT/DP=30 && GQ=150\''); From 249dfc649e76b81e803f007404e043b2e3279cfa Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 16 Feb 2021 14:33:49 +0000 Subject: [PATCH 61/81] Fix transfer of ID The annotate -c INFO/TAG:=ID worked without -h but would silently ignore the request when -h was provided: did work: bcftools annotate -a annot.vcf.gz -c INFO/TAG:=ID did not work: bcftools annotate -a annot.vcf.gz -c INFO/TAG:=ID -h header.txt The same problem affected transfer of FILTER Fixes #1415 --- vcfannotate.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vcfannotate.c b/vcfannotate.c index 1c397c097..87ab5675c 100644 --- a/vcfannotate.c +++ b/vcfannotate.c @@ -2354,14 +2354,12 @@ static void init_columns(args_t *args) // transferring ID column into a new INFO tag tmp.l = 0; ksprintf(&tmp,"##INFO=",key_dst); - col->getter = vcf_getter_id2str; } else if ( !strcasecmp("FILTER",key_src) && !explicit_src_info ) { // transferring FILTER column into a new INFO tag tmp.l = 0; ksprintf(&tmp,"##INFO=",key_dst); - col->getter = vcf_getter_filter2str; } else { @@ -2390,6 +2388,11 @@ static void init_columns(args_t *args) error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_src, args->targets_fname); assert( bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) ); } + if ( args->tgts_is_vcf ) + { + if ( !strcasecmp("ID",key_src) && !explicit_src_info ) col->getter = vcf_getter_id2str; + else if ( !strcasecmp("FILTER",key_src) && !explicit_src_info ) col->getter = vcf_getter_filter2str; + } col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id); switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id) ) { From 411f50b6e63af7b47c416ad51c9f44ad6c105157 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 17 Feb 2021 12:48:44 +0000 Subject: [PATCH 62/81] Throw an error on incorrectly formatted --AF-file files. 
Likely resolves #1417 --- vcfroh.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vcfroh.c b/vcfroh.c index 15b84e66e..8e95c9a79 100644 --- a/vcfroh.c +++ b/vcfroh.c @@ -658,8 +658,10 @@ static void flush_viterbi(args_t *args, int ismpl) } } -int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq) +int read_AF(args_t *args, bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq) { + if ( tgt->nals < 2 ) + error("Expected two comma-separated alleles (REF,ALT) in the third column of %s, found:\n\t%s\n", args->af_fname,tgt->line.s); if ( tgt->nals != line->n_allele ) return -1; // number of alleles does not match int i; @@ -839,7 +841,7 @@ int process_line(args_t *args, bcf1_t *line, int ial) else if ( args->af_fname ) { // Read AF from a file - ret = read_AF(args->files->targets, line, &alt_freq); + ret = read_AF(args, args->files->targets, line, &alt_freq); } else if ( args->dflt_AF > 0 ) { From 207a48e602a3857b10268dda805961293e4ba64a Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 19 Feb 2021 15:58:54 +0000 Subject: [PATCH 63/81] Add more make targets to use when building HTSlib with BCFtools * check-all, test-all to run both HTSlib and BCFtools tests * distclean-all, testclean-all to clean up HTSlib as well as BCFtools (clean-all already exists) --- Makefile | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 90c7e8410..929fa5920 100644 --- a/Makefile +++ b/Makefile @@ -302,6 +302,9 @@ check-plugins test-plugins: $(PROGRAMS) $(TEST_PROGRAMS) $(BGZIP) $(TABIX) plugi ./test/test-regidx REF_PATH=: ./test/test.pl --plugins --exec bgzip=$(BGZIP) --exec tabix=$(TABIX) --htsdir=$(HTSDIR) $${TEST_OPTS:-} +# test HTSlib as well, where it is built alongside BCFtools +check-all test-all: test-htslib test + test/test-rbuf.o: test/test-rbuf.c rbuf.h test/test-rbuf: test/test-rbuf.o @@ -353,10 +356,15 @@ distclean: clean clean-all: clean clean-htslib +distclean-all: distclean 
distclean-htslib + +testclean-all: testclean testclean-htslib + tags: ctags -f TAGS *.[ch] plugins/*.[ch] force: -.PHONY: all check clean clean-all clean-plugins distclean force install -.PHONY: print-version tags test testclean plugins docs +.PHONY: all check check-all clean clean-all clean-plugins +.PHONY: distclean distclean-all force install +.PHONY: print-version tags test test-all testclean testclean-all plugins docs From 2612b3a8ea074ae0fcb1fec308e281f432219111 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 24 Feb 2021 14:59:11 +0000 Subject: [PATCH 64/81] Support for FORMAT/VAF,VAF1 annotations. Resolves #1422 --- NEWS | 3 ++ doc/bcftools.txt | 2 ++ plugins/fill-tags.c | 80 +++++++++++++++++++++++++++++++++++++----- test/fill-tags-VAF.out | 9 +++++ test/fill-tags-VAF.vcf | 6 ++++ test/test.pl | 1 + 6 files changed, 93 insertions(+), 8 deletions(-) create mode 100644 test/fill-tags-VAF.out create mode 100644 test/fill-tags-VAF.vcf diff --git a/NEWS b/NEWS index 8ec7e47d6..4441e7b31 100644 --- a/NEWS +++ b/NEWS @@ -101,6 +101,9 @@ Changes affecting specific commands: - MAF definition revised for multiallelic sites, the second most common allele is considered to be the minor allele (#1313) + - New FORMAT/VAF, VAF1 annotations to set the fraction of alternate reads + provided FORMAT/AD is present + * bcftools gtcheck: - support matching of a single sample against all other samples in the file diff --git a/doc/bcftools.txt b/doc/bcftools.txt index 66c5e1932..86ea82b98 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -2189,6 +2189,8 @@ By default, appropriate system directories are searched for installed plugins. - INFO/MAF Number:A Type:Float .. Minor Allele frequency - INFO/NS Number:1 Type:Integer .. Number of samples with data - INFO/TYPE Number:. Type:String .. The record type (REF,SNP,MNP,INDEL,etc) + - FORMAT/VAF Number:A Type:Float .. 
The fraction of reads with the alternate allele, requires FORMAT/AD or ADF+ADR + - FORMAT/VAF1 Number:1 Type:Float .. The same as FORMAT/VAF but for all alternate alleles cumulatively - TAG=func(TAG) Number:1 Type:Integer .. Experimental support for user-defined expressions such as "DP=sum(DP)" *fix-ploidy*:: diff --git a/plugins/fill-tags.c b/plugins/fill-tags.c index f1e90c083..bc718035d 100644 --- a/plugins/fill-tags.c +++ b/plugins/fill-tags.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2015-2020 Genome Research Ltd. + Copyright (c) 2015-2021 Genome Research Ltd. Author: Petr Danecek @@ -51,6 +51,8 @@ #define SET_FUNC (1<<10) #define SET_END (1<<11) #define SET_TYPE (1<<12) +#define SET_VAF (1<<13) +#define SET_VAF1 (1<<14) typedef struct _args_t args_t; typedef struct _ftf_t ftf_t; @@ -100,7 +102,7 @@ static args_t *args; const char *about(void) { - return "Set INFO tags AF, AC, AC_Hemi, AC_Hom, AC_Het, AN, ExcHet, HWE, MAF, NS and more.\n"; + return "Set INFO tags AF, AC, AC_Hemi, AC_Hom, AC_Het, AN, ExcHet, HWE, MAF, NS; FORMAT/VAF and more.\n"; } const char *usage(void) @@ -108,7 +110,8 @@ const char *usage(void) return "\n" "About: Set INFO tags AF, AC, AC_Hemi, AC_Hom, AC_Het, AN, ExcHet, HWE, MAF, NS\n" - " or custom INFO/TAG=func(FMT/TAG), use -l for detailed description\n" + " FORMAT tag VAF, custom INFO/TAG=func(FMT/TAG).\n" + " See examples below, run with -l for detailed description.\n" "Usage: bcftools +fill-tags [General Options] -- [Plugin Options]\n" "Options:\n" " run \"bcftools plugin\" for a list of common options\n" @@ -126,7 +129,7 @@ const char *usage(void) " # Fill INFO/AN and INFO/AC\n" " bcftools +fill-tags in.bcf -Ob -o out.bcf -- -t AN,AC\n" "\n" - " # Fill all available tags\n" + " # Fill (almost) all available tags\n" " bcftools +fill-tags in.bcf -Ob -o out.bcf -- -t all\n" "\n" " # Calculate HWE for sample groups (possibly multiple) read from a file\n" @@ -134,6 +137,9 @@ const char *usage(void) "\n" " # Calculate total read 
depth (INFO/DP) from per-sample depths (FORMAT/DP)\n" " bcftools +fill-tags in.bcf -Ob -o out.bcf -- -t 'DP=sum(DP)'\n" + "\n" + " # Annotate with allelic fraction\n" + " bcftools +fill-tags in.bcf -Ob -o out.bcf -- -t FORMAT/VAF\n" "\n"; } @@ -396,6 +402,8 @@ int parse_tags(args_t *args, const char *str) else if ( !strcasecmp(tags[i],"MAF") ) { flag |= SET_MAF; args->unpack |= BCF_UN_FMT; } else if ( !strcasecmp(tags[i],"HWE") ) { flag |= SET_HWE; args->unpack |= BCF_UN_FMT; } else if ( !strcasecmp(tags[i],"ExcHet") ) { flag |= SET_ExcHet; args->unpack |= BCF_UN_FMT; } + else if ( !strcasecmp(tags[i],"VAF") || !strcasecmp(tags[i],"FORMAT/VAF") ) { flag |= SET_VAF; args->unpack |= BCF_UN_FMT; } + else if ( !strcasecmp(tags[i],"VAF1") || !strcasecmp(tags[i],"FORMAT/VAF1") ) { flag |= SET_VAF1; args->unpack |= BCF_UN_FMT; } else if ( !strcasecmp(tags[i],"END") ) flag |= SET_END; else if ( !strcasecmp(tags[i],"TYPE") ) flag |= SET_TYPE; else if ( !strcasecmp(tags[i],"F_MISSING") ) { flag |= parse_expr_float(args,NULL,"F_MISSING"); args->unpack |= BCF_UN_FMT; } @@ -427,6 +435,8 @@ void list_tags(void) "INFO/MAF Number:1 Type:Float .. Frequency of the second most common allele\n" "INFO/NS Number:1 Type:Integer .. Number of samples with data\n" "INFO/TYPE Number:. Type:String .. The record type (REF,SNP,MNP,INDEL,etc)\n" + "FORMAT/VAF Number:A Type:Float .. The fraction of reads with the alternate allele, requires FORMAT/AD or ADF+ADR\n" + "FORMAT/VAF1 Number:1 Type:Float .. The same as FORMAT/VAF but for all alternate alleles cumulatively\n" "TAG=func(TAG) Number:1 Type:Integer .. Experimental support for user-defined\n" " expressions such as \"DP=sum(DP)\". 
This is currently very basic, to be extended.\n" ); @@ -484,6 +494,8 @@ int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) if ( args->tags & SET_END ) bcf_hdr_printf(args->out_hdr, "##INFO="); if ( args->tags & SET_TYPE ) bcf_hdr_printf(args->out_hdr, "##INFO="); if ( args->tags & SET_ExcHet ) hdr_append(args, "##INFO="); + if ( args->tags & SET_VAF ) bcf_hdr_append(args->out_hdr, "##FORMAT=tags & SET_VAF1 ) bcf_hdr_append(args->out_hdr, "##FORMAT=fb ) return -1; return 0; } -bcf1_t *process_fmt(bcf1_t *rec) +static void process_fmt(bcf1_t *rec) { bcf_unpack(rec, BCF_UN_FMT); @@ -601,7 +613,7 @@ bcf1_t *process_fmt(bcf1_t *rec) bcf_fmt_t *fmt_gt = NULL; for (i=0; in_fmt; i++) if ( rec->d.fmt[i].id==args->gt_id ) { fmt_gt = &rec->d.fmt[i]; break; } - if ( !fmt_gt ) return rec; // no GT tag + if ( !fmt_gt ) return; // no GT tag hts_expand(int32_t,rec->n_allele, args->miarr, args->iarr); hts_expand(float,rec->n_allele*2, args->mfarr, args->farr); @@ -819,13 +831,65 @@ bcf1_t *process_fmt(bcf1_t *rec) } } } +} +static void process_vaf(bcf1_t *rec, int mode) +{ + int nsmpl = bcf_hdr_nsamples(args->in_hdr); + int nval = args->niarr / nsmpl; + int nval1 = (mode & SET_VAF) ? rec->n_allele - 1 : 1; + int nfarr = nval1 * nsmpl; + hts_expand(float,nfarr,args->mfarr,args->farr); + int i,j; + for (i=0; iiarr + i*nval; + float *dst = args->farr + i*nval1; + float sum = 0; + for (j=0; jout_hdr,rec,(mode & SET_VAF) ? 
"VAF" : "VAF1", args->farr, nfarr)!=0 ) + error("Error occurred while updating %s at %s:%"PRId64"\n", args->str.s,bcf_seqname(args->in_hdr,rec),(int64_t) rec->pos+1); +} +static void process_vaf_vaf1(bcf1_t *rec) +{ + if ( !(args->tags & (SET_VAF|SET_VAF1)) ) return; + if ( rec->n_allele <= 1 ) return; - return rec; + args->niarr = bcf_get_format_int32(args->in_hdr, rec, "AD", &args->iarr, &args->miarr); + if ( args->niarr <= 0 ) + error("Could not read FORMAT/AD annotation at %s:%"PRIhts_pos"\n",bcf_seqname(args->in_hdr,rec),rec->pos+1); + + int nsmpl = bcf_hdr_nsamples(args->in_hdr); + if ( args->niarr != nsmpl*rec->n_allele ) return; // incorrect number of values (possibly all missing) + + if ( args->tags & SET_VAF ) process_vaf(rec, SET_VAF); + if ( args->tags & SET_VAF1 ) process_vaf(rec, SET_VAF1); } bcf1_t *process(bcf1_t *rec) { - if ( args->unpack & BCF_UN_FMT ) process_fmt(rec); + if ( args->unpack & BCF_UN_FMT ) + { + process_fmt(rec); + process_vaf_vaf1(rec); + } if ( args->tags & SET_END ) { diff --git a/test/fill-tags-VAF.out b/test/fill-tags-VAF.out new file mode 100644 index 000000000..abca307d2 --- /dev/null +++ b/test/fill-tags-VAF.out @@ -0,0 +1,9 @@ +##fileformat=VCFv4.2 +##FILTER= +##FORMAT= +##contig= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE +chr1 10153 . AC A,C,G . . . AD:VAF:VAF1 1,1,1,1:0.25,0.25,0.25:0.75 +chr1 10153 . AC A,C,G . . . AD . diff --git a/test/fill-tags-VAF.vcf b/test/fill-tags-VAF.vcf new file mode 100644 index 000000000..e868c7e4b --- /dev/null +++ b/test/fill-tags-VAF.vcf @@ -0,0 +1,6 @@ +##fileformat=VCFv4.2 +##FORMAT= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE +chr1 10153 . AC A,C,G . . . AD 1,1,1,1 +chr1 10153 . AC A,C,G . . . AD . 
diff --git a/test/test.pl b/test/test.pl index 7cfe6e1c2..7639ace01 100755 --- a/test/test.pl +++ b/test/test.pl @@ -452,6 +452,7 @@ test_vcf_plugin($opts,in=>'fill-tags-hemi',out=>'fill-tags-hemi.2.out',cmd=>'+fill-tags --no-version',args=>'-- -d'); test_vcf_plugin($opts,in=>'fill-tags-hwe',out=>'fill-tags-hwe.out',cmd=>'+fill-tags --no-version'); test_vcf_plugin($opts,in=>'fill-tags-AN0',out=>'fill-tags-AN0.out',cmd=>'+fill-tags --no-version',args=>'-- -t all,END,TYPE,F_MISSING'); +test_vcf_plugin($opts,in=>'fill-tags-VAF',out=>'fill-tags-VAF.out',cmd=>'+fill-tags --no-version',args=>'-- -t VAF,VAF1'); test_vcf_plugin($opts,in=>'view',out=>'view.GTisec.out',cmd=>'+GTisec',args=>' | grep -v bcftools'); test_vcf_plugin($opts,in=>'view',out=>'view.GTisec.H.out',cmd=>'+GTisec',args=>'-- -H | grep -v bcftools'); test_vcf_plugin($opts,in=>'view',out=>'view.GTisec.Hm.out',cmd=>'+GTisec',args=>'-- -Hm | grep -v bcftools'); From edba0c338d02357a8226a33860116eaf920c9c22 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 24 Feb 2021 15:03:34 +0000 Subject: [PATCH 65/81] Fix a typo, thanks to @tbenavi1. Resolves #1424 --- doc/bcftools.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/bcftools.txt b/doc/bcftools.txt index 86ea82b98..5464096d9 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -1632,7 +1632,7 @@ Print a list of records which are present in A and B but not in C and D Merge multiple VCF/BCF files from non-overlapping sample sets to create one multi-sample file. For example, when merging file 'A.vcf.gz' containing samples 'S1', 'S2' and 'S3' and file 'B.vcf.gz' containing samples 'S3' and -'S4', the output file will contain four samples named 'S1', 'S2', 'S3', '2:S3' +'S4', the output file will contain five samples named 'S1', 'S2', 'S3', '2:S3' and 'S4'. 
Note that it is responsibility of the user to ensure that the sample names are From d16d5b43e833217a3b2c441e69bebb1e882c8ec3 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 24 Feb 2021 11:54:23 +0100 Subject: [PATCH 66/81] Add new --atomize and --old-rec-tag options - The `-a, --atomize` option allows decomposing complex variants, for example MNVs into consecutive SNVs - The `--old-rec-tag` option indicates the original variant Resolves #1052 and #128 --- Makefile | 6 +- NEWS | 7 + abuf.c | 665 +++++++++++++++++++++++++++++++++++++ abuf.h | 78 +++++ doc/bcftools.1 | 97 ++++-- doc/bcftools.html | 281 +++++++++------- doc/bcftools.txt | 32 ++ test/atomize.split.1.1.out | 46 +++ test/atomize.split.1.2.out | 46 +++ test/atomize.split.1.vcf | 34 ++ test/atomize.split.2.1.out | 9 + test/atomize.split.2.2.out | 8 + test/atomize.split.2.vcf | 6 + test/test.pl | 4 + vcfnorm.c | 321 +++++++++++------- 15 files changed, 1368 insertions(+), 272 deletions(-) create mode 100644 abuf.c create mode 100644 abuf.h create mode 100644 test/atomize.split.1.1.out create mode 100644 test/atomize.split.1.2.out create mode 100644 test/atomize.split.1.vcf create mode 100644 test/atomize.split.2.1.out create mode 100644 test/atomize.split.2.2.out create mode 100644 test/atomize.split.2.vcf diff --git a/Makefile b/Makefile index 90c7e8410..6662fd8db 100644 --- a/Makefile +++ b/Makefile @@ -41,7 +41,7 @@ OBJS = main.o vcfindex.o tabix.o \ vcfcnv.o HMM.o consensus.o ploidy.o bin.o hclust.o version.o \ regidx.o smpl_ilist.o csq.o vcfbuf.o \ mpileup.o bam2bcf.o bam2bcf_indel.o bam_sample.o \ - vcfsort.o cols.o extsort.o dist.o \ + vcfsort.o cols.o extsort.o dist.o abuf.o \ ccall.o em.o prob1.o kmin.o PLUGIN_OBJS = vcfplugin.o @@ -229,6 +229,7 @@ ploidy_h = ploidy.h regidx.h prob1_h = prob1.h $(htslib_vcf_h) $(call_h) smpl_ilist_h = smpl_ilist.h $(htslib_vcf_h) vcfbuf_h = vcfbuf.h $(htslib_vcf_h) +abuf_h = abuf.h $(htslib_vcf_h) bam2bcf_h = bam2bcf.h $(htslib_hts_h) $(htslib_vcf_h) 
bam_sample_h = bam_sample.h $(htslib_sam_h) @@ -243,7 +244,7 @@ vcfgtcheck.o: vcfgtcheck.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htsli vcfindex.o: vcfindex.c $(htslib_vcf_h) $(htslib_tbx_h) $(htslib_kstring_h) $(htslib_bgzf_h) $(bcftools_h) vcfisec.o: vcfisec.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcftools_h) $(filter_h) vcfmerge.o: vcfmerge.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) regidx.h $(bcftools_h) vcmp.h $(htslib_khash_h) -vcfnorm.o: vcfnorm.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_faidx_h) $(htslib_khash_str2int_h) $(bcftools_h) rbuf.h +vcfnorm.o: vcfnorm.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_faidx_h) $(htslib_khash_str2int_h) $(bcftools_h) rbuf.h abuf.h vcfquery.o: vcfquery.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_str2int_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(convert_h) vcfroh.o: vcfroh.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(bcftools_h) HMM.h $(smpl_ilist_h) $(filter_h) vcfcnv.o: vcfcnv.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(htslib_khash_str2int_h) $(bcftools_h) HMM.h rbuf.h @@ -280,6 +281,7 @@ version.o: version.h version.c hclust.o: hclust.c $(htslib_hts_h) $(htslib_kstring_h) $(bcftools_h) hclust.h HMM.o: HMM.c $(htslib_hts_h) HMM.h vcfbuf.o: vcfbuf.c $(htslib_vcf_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcftools_h) $(vcfbuf_h) rbuf.h +abuf.o: abuf.c $(htslib_vcf_h) $(bcftools_h) rbuf.h abuf.h extsort.o: extsort.c $(bcftools_h) extsort.h kheap.h smpl_ilist.o: smpl_ilist.c $(bcftools_h) $(smpl_ilist_h) csq.o: csq.c $(htslib_hts_h) $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_h) $(htslib_khash_str2int_h) $(htslib_kseq_h) $(htslib_faidx_h) $(bcftools_h) $(filter_h) regidx.h kheap.h $(smpl_ilist_h) rbuf.h diff --git a/NEWS b/NEWS index 
8ec7e47d6..34c8ce87d 100644 --- a/NEWS +++ b/NEWS @@ -119,6 +119,13 @@ Changes affecting specific commands: - Add new optional tag `mpileup -a FORMAT/QS` +* bcftools norm: + + - New `-a, --atomize` functionality to decompose complex variants, + for example MNVs into consecutive SNVs + + - New option `--old-rec-tag` to indicate the original variant + * bcftools +prune: - New options --random-seed and --nsites-per-win-mode (#1050) diff --git a/abuf.c b/abuf.c new file mode 100644 index 000000000..57eb81748 --- /dev/null +++ b/abuf.c @@ -0,0 +1,665 @@ +/* The MIT License + + Copyright (c) 2021 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+ + */ + +#include +#include +#include +#include +#include "bcftools.h" +#include "abuf.h" +#include "rbuf.h" + +typedef struct +{ + kstring_t ref, alt; + int ial; // the index of the original ALT allele, 1-based + int beg, end; // 0-based inclusive offsets to ref,alt +} +atom_t; + +typedef struct +{ + bcf1_t *rec; + int nori, nout; // number of ALTs in the input, and VCF rows on output + uint8_t *tbl; // nori columns, nout rows + uint8_t *overlaps; // is the star allele needed for this variant? + atom_t **atoms; + int matoms, mtbl, moverlaps; + char *info_tag; +} +split_t; + +struct _abuf_t +{ + abuf_opt_t mode; + split_t split; + atom_t *atoms; + int natoms, matoms; + const bcf_hdr_t *hdr; + bcf_hdr_t *out_hdr; + bcf1_t **vcf; // dimensions stored in rbuf + rbuf_t rbuf; + + kstring_t tmps; + void *tmp, *tmp2; + int32_t *gt, *tmpi; + int ngt, mgt, ntmpi, mtmpi, mtmp, mtmp2; + int star_allele; +}; + +abuf_t *abuf_init(const bcf_hdr_t *hdr, abuf_opt_t mode) +{ + if ( mode!=SPLIT ) error("todo\n"); + abuf_t *buf = (abuf_t*) calloc(1,sizeof(abuf_t)); + buf->hdr = hdr; + buf->out_hdr = (bcf_hdr_t*) hdr; + buf->mode = mode; + buf->star_allele = 1; + rbuf_init(&buf->rbuf, 0); + return buf; +} + +void abuf_destroy(abuf_t *buf) +{ + int i; + for (i=0; imatoms; i++) + { + free(buf->atoms[i].ref.s); + free(buf->atoms[i].alt.s); + } + free(buf->atoms); + free(buf->split.atoms); + free(buf->split.overlaps); + free(buf->split.tbl); + for (i=0; irbuf.m; i++) + if ( buf->vcf[i] ) bcf_destroy(buf->vcf[i]); + free(buf->vcf); + free(buf->gt); + free(buf->tmpi); + free(buf->tmp); + free(buf->tmp2); + free(buf->tmps.s); + free(buf); +} + +void abuf_set(abuf_t *buf, abuf_opt_t key, void *value) +{ + if ( key==BCF_HDR ) { buf->out_hdr = *((bcf_hdr_t**)value); return; } + if ( key==INFO_TAG ) + { + buf->split.info_tag = *((char**)value); + bcf_hdr_printf(buf->out_hdr,"##INFO=",buf->split.info_tag); + return; + } + if ( key==STAR_ALLELE ) { buf->star_allele = *((int*)value); return; } +} 
+ +/* + Split alleles into primitivs, e.g. + CC>TT becomes C>T,C>T + GCGT>GTGA becomes C>T,T>A + + There is no sequence alignment, just trimming and hungry matching + from left side. +*/ +static void _atomize_allele(abuf_t *buf, bcf1_t *rec, int ial) +{ + // Trim identical sequence from right + char *ref = rec->d.allele[0]; + char *alt = rec->d.allele[ial]; + int rlen = strlen(ref); + int alen = strlen(alt); + while ( rlen>1 && alen>1 && ref[rlen-1]==alt[alen-1] ) rlen--, alen--; + int Mlen = rlen > alen ? rlen : alen; + + atom_t *atom = NULL; + int i; + for (i=0; ialt); + if ( refb!='-' ) { kputc(refb, &atom->ref); atom->end++; } + } + else + { + buf->natoms++; + hts_expand0(atom_t,buf->natoms,buf->matoms,buf->atoms); + atom = &buf->atoms[buf->natoms-1]; + atom->ref.l = 0; + atom->alt.l = 0; + kputc(refb, &atom->ref); + kputc(altb, &atom->alt); + atom->beg = atom->end = i; + atom->ial = ial; + } + continue; + } + if ( i+1>=rlen || i+1>=alen ) // is the next base a deletion? + { + buf->natoms++; + hts_expand0(atom_t,buf->natoms,buf->matoms,buf->atoms); + atom = &buf->atoms[buf->natoms-1]; + atom->ref.l = 0; + atom->alt.l = 0; + kputc(refb, &atom->ref); + kputc(altb, &atom->alt); + atom->beg = atom->end = i; + atom->ial = ial; + } + } +} +static int _atoms_inconsistent(const atom_t *a, const atom_t *b) +{ + if ( a->beg < b->beg ) return -1; + if ( a->beg > b->beg ) return 1; + int rcmp = strcasecmp(a->ref.s,b->ref.s); + if ( rcmp ) return rcmp; + return strcasecmp(a->alt.s,b->alt.s); +} +/* + For reproducibility of tests on different platforms, we need to guarantee the same order of identical + atoms originating from different source ALTs. Even though they are consistent, different values can be + picked for VCF annotations as currently the values from the one that comes first are used. 
+*/ +static int _cmp_atoms(const void *aptr, const void *bptr) +{ + const atom_t *a = (const atom_t*) aptr; + const atom_t *b = (const atom_t*) bptr; + int rcmp = _atoms_inconsistent(a,b); + if ( rcmp ) return rcmp; + if ( a->ial < b->ial ) return -1; + if ( a->ial > b->ial ) return 1; + return 0; +} +static void _split_table_init(abuf_t *buf, bcf1_t *rec, int natoms) +{ + buf->split.rec = rec; + buf->split.nori = rec->n_allele - 1; + buf->split.nout = 0; + hts_expand(uint8_t,buf->split.nori*natoms,buf->split.mtbl,buf->split.tbl); + hts_expand(atom_t*,natoms,buf->split.matoms,buf->split.atoms); + hts_expand(uint8_t,natoms,buf->split.moverlaps,buf->split.overlaps); + memset(buf->split.overlaps,0,sizeof(*buf->split.overlaps)*natoms); +} +static void _split_table_new(abuf_t *buf, atom_t *atom) +{ + int i, iout = buf->split.nout++; + buf->split.atoms[iout] = atom; + uint8_t *ptr = buf->split.tbl + iout*buf->split.nori; + for (i=0; isplit.nori; i++) ptr[i] = 0; + ptr[atom->ial-1] = 1; +} +static void _split_table_overlap(abuf_t *buf, int iout, atom_t *atom) +{ + uint8_t *ptr = buf->split.tbl + iout*buf->split.nori; + ptr[atom->ial-1] = _atoms_inconsistent(atom,buf->split.atoms[iout]) ? 
2 : 1; + buf->split.overlaps[iout] = 1; +} +#if 0 +static void _split_table_print(abuf_t *buf) +{ + int i,j; + for (i=0; isplit.nout; i++) + { + atom_t *atom = buf->split.atoms[i]; + uint8_t *ptr = buf->split.tbl + i*buf->split.nori; + fprintf(stderr,"%d\t%s\t%s",(int)buf->split.rec->pos+1+atom->beg,atom->ref.s,atom->alt.s); + for (j=0; jsplit.nori; j++) fprintf(stderr,"\t%d",(int)ptr[j]); + fprintf(stderr,"\n"); + } +} +static void _split_table_print_atoms(abuf_t *buf) +{ + int i; + for (i=0; inatoms; i++) + { + atom_t *atom = &buf->atoms[i]; + fprintf(stderr,"atom%d %p: ialt=%d %s>%s %d-%d\n",i,atom,atom->ial,atom->ref.s,atom->alt.s,atom->beg,atom->end); + } +} +#endif +static inline uint8_t _has_star_allele(abuf_t *buf, int iout) +{ + if ( !buf->star_allele ) return 0; + return buf->split.overlaps[iout]; +} +static inline int _split_table_get_ial(abuf_t *buf, int irow, int ial) +{ + if ( !ial ) return ial; + return buf->split.tbl[irow*buf->split.nori + ial - 1]; +} +static void _split_table_set_chrom_qual(abuf_t *buf) +{ + int iout,j; + bcf1_t *rec = buf->split.rec; + for (iout=0; ioutsplit.nout; iout++) + { + rbuf_expand0(&buf->rbuf, bcf1_t*, buf->rbuf.n+1, buf->vcf); + j = rbuf_append(&buf->rbuf); + if ( !buf->vcf[j] ) buf->vcf[j] = bcf_init1(); + bcf1_t *out = buf->vcf[j]; + bcf_clear1(out); + + atom_t *atom = buf->split.atoms[iout]; + out->rid = rec->rid; + out->pos = rec->pos + atom->beg; + bcf_update_id(buf->out_hdr, out, rec->d.id); + + const char *als[3]; + als[0] = atom->ref.s; + als[1] = atom->alt.s; + als[2] = "*"; + int nals = _has_star_allele(buf,iout) ? 
3 : 2; + bcf_update_alleles(buf->out_hdr, out, als, nals); + + if ( bcf_float_is_missing(rec->qual) ) + bcf_float_set_missing(out->qual); + else + out->qual = rec->qual; + + bcf_update_filter(buf->out_hdr, out, rec->d.flt, rec->d.n_flt); + } +} +static void _split_table_set_info(abuf_t *buf, bcf_info_t *info) +{ + const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,info->key); + int type = bcf_hdr_id2type(buf->hdr,BCF_HL_INFO,info->key); + int len = bcf_hdr_id2length(buf->hdr,BCF_HL_INFO,info->key); + if ( len==BCF_VL_G ) return; // todo: Number=G INFO tags + if ( type==BCF_HT_STR && len!=BCF_VL_FIXED && len!=BCF_VL_VAR ) return; // todo: Number=A,R,G for strings + if ( type==BCF_HT_LONG ) return; // todo: 64bit integers + + bcf1_t *rec = buf->split.rec; + int mtmp = ( type==BCF_HT_INT || type==BCF_HT_REAL ) ? buf->mtmp/4 : buf->mtmp; + int nval = bcf_get_info_values(buf->hdr,rec,tag,&buf->tmp,&mtmp,type); + if ( type==BCF_HT_INT || type==BCF_HT_REAL ) buf->mtmp = mtmp*4; + + if ( (len==BCF_VL_A && nval != rec->n_allele - 1) || (len==BCF_VL_R && nval != rec->n_allele) ) + error("Incorrect number of values at %s:%"PRIhts_pos" .. 
tag=INFO/%s Number=%c nAlleles=%d nValues=%d\n", + bcf_seqname(buf->hdr,rec),rec->pos+1,tag,len==BCF_VL_A?'A':'R',rec->n_allele,nval); + + if ( buf->mtmp2 < buf->mtmp ) + { + buf->tmp2 = realloc(buf->tmp2, buf->mtmp); + if ( !buf->tmp2 ) error("Failed to alloc %d bytes\n", buf->mtmp); + buf->mtmp2 = buf->mtmp; + } + + int32_t missing = bcf_int32_missing; + void *missing_ptr = (void*)&missing; + if ( type==BCF_HT_REAL ) bcf_float_set_missing(*((float*)missing_ptr)); + + int iout; + for (iout=0; ioutsplit.nout; iout++) + { + bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,iout)]; + int star_allele = _has_star_allele(buf,iout); + int ret = 0; + if ( len==BCF_VL_FIXED || len==BCF_VL_VAR ) + ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp, nval, type); + else if ( len==BCF_VL_A ) + { + int iori = buf->split.atoms[iout]->ial - 1; + assert( ioritmp2,buf->tmp+4*iori,4); + if ( star_allele ) + memcpy(buf->tmp2+4,missing_ptr,4); + ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, 1 + star_allele, type); + } + else if ( len==BCF_VL_R ) + { + int iori = buf->split.atoms[iout]->ial; + assert( iori < nval ); + memcpy(buf->tmp2,buf->tmp,4); + memcpy(buf->tmp2+4,buf->tmp+4*iori,4); + if ( star_allele ) + memcpy(buf->tmp2+8,missing_ptr,4); + ret = bcf_update_info(buf->out_hdr, out, tag, buf->tmp2, 2 + star_allele, type); + } + if ( ret!=0 ) error("An error occurred while updating INFO/%s\n",tag); + } +} +static void _split_table_set_history(abuf_t *buf) +{ + int i,j; + bcf1_t *rec = buf->split.rec; + buf->tmps.l = 0; + ksprintf(&buf->tmps,"%s|%"PRIhts_pos"|%s|",bcf_seqname(buf->hdr,rec),rec->pos+1,rec->d.allele[0]); + for (i=1; in_allele; i++) + { + kputs(rec->d.allele[i],&buf->tmps); + if ( i+1n_allele ) kputc(',',&buf->tmps); + else kputc(',',&buf->tmps); + } + int len = buf->tmps.l; + buf->tmps.s[buf->tmps.l-1] = '|'; + + for (i=0; isplit.nout; i++) + { + buf->tmps.l = len; + bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,i)]; + uint8_t *ptr = buf->split.tbl + 
i*buf->split.nori; + for (j=0; jsplit.nori; j++) + { + if ( ptr[j]!=1 ) continue; + kputw(j+1,&buf->tmps); + kputc(',',&buf->tmps); + } + buf->tmps.s[--buf->tmps.l] = 0; + if ( (bcf_update_info_string(buf->out_hdr, out, buf->split.info_tag, buf->tmps.s))!=0 ) + error("An error occurred while updating INFO/%s\n",buf->split.info_tag); + } +} +static void _split_table_set_gt(abuf_t *buf) +{ + int nsmpl = bcf_hdr_nsamples(buf->hdr); + if ( !nsmpl ) return; + + bcf1_t *rec = buf->split.rec; + buf->ngt = bcf_get_genotypes(buf->hdr, rec, &buf->gt, &buf->mgt); + if ( buf->ngt<=0 ) return; + else + hts_expand(int32_t,buf->ngt,buf->mtmpi,buf->tmpi); + + int iout,i,j; + for (iout=0; ioutsplit.nout; iout++) + { + bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,iout)]; + int star_allele = _has_star_allele(buf,iout); + int max_ploidy = buf->ngt/nsmpl; + int32_t *src = buf->gt, *dst = buf->tmpi; + for (i=0; i=rec->n_allele ) + error("Out-of-bounds genotypes at %s:%"PRIhts_pos"\n",bcf_seqname(buf->hdr,rec),rec->pos+1); + int ial = _split_table_get_ial(buf,iout,iori); + if ( ial==2 && !star_allele ) + dst[j] = bcf_gt_missing; + else + dst[j] = bcf_gt_is_phased(src[j]) ? 
bcf_gt_phased(ial) : bcf_gt_unphased(ial); + } + src += max_ploidy; + dst += max_ploidy; + } + bcf_update_genotypes(buf->out_hdr,out,buf->tmpi,buf->ngt); + } +} +static void _split_table_set_format(abuf_t *buf, bcf_fmt_t *fmt) +{ + int nsmpl = bcf_hdr_nsamples(buf->hdr); + if ( !nsmpl ) return; + + const char *tag = bcf_hdr_int2id(buf->hdr,BCF_DT_ID,fmt->id); + if ( tag[0]=='G' && tag[1]=='T' && !tag[2] ) // FORMAT/GT + { + _split_table_set_gt(buf); + return; + } + + int type = bcf_hdr_id2type(buf->hdr,BCF_HL_FMT,fmt->id); + int len = bcf_hdr_id2length(buf->hdr,BCF_HL_FMT,fmt->id); + if ( type==BCF_HT_STR && len!=BCF_VL_FIXED && len!=BCF_VL_VAR ) return; // todo: Number=A,R,G for strings + if ( type==BCF_HT_LONG ) return; // todo: 64bit integers + + const int num_size = 4; + assert( num_size==sizeof(int32_t) && num_size==sizeof(float) ); + int32_t missing = bcf_int32_missing; + void *missing_ptr = (void*)&missing; + if ( type==BCF_HT_REAL ) bcf_float_set_missing(*((float*)missing_ptr)); + + bcf1_t *rec = buf->split.rec; + int mtmp = ( type==BCF_HT_INT || type==BCF_HT_REAL ) ? buf->mtmp/num_size : buf->mtmp; + int nval = bcf_get_format_values(buf->hdr,rec,tag,&buf->tmp,&mtmp,type); + if ( type==BCF_HT_INT || type==BCF_HT_REAL ) buf->mtmp = mtmp*num_size; + + if ( len==BCF_VL_G && nval!=nsmpl*rec->n_allele && nval!=nsmpl*rec->n_allele*(rec->n_allele+1)/2 ) return; // not haploid nor diploid + + if ( (len==BCF_VL_A && nval != nsmpl*(rec->n_allele - 1)) || (len==BCF_VL_R && nval != nsmpl*rec->n_allele) ) + error("Incorrect number of values at %s:%"PRIhts_pos" .. 
tag=FORMAT/%s Number=%c nAlleles=%d nValues=%d\n", + bcf_seqname(buf->hdr,rec),rec->pos+1,tag,len==BCF_VL_A?'A':'R',rec->n_allele,nval); + + // Increase buffer size to accommodate star allele + mtmp = buf->mtmp; + if ( (len==BCF_VL_A || len==BCF_VL_R) && mtmp < num_size*(nval+nsmpl) ) mtmp = num_size*(nval+nsmpl); + else if ( len==BCF_VL_G && mtmp < num_size*(nval+nsmpl*3) ) mtmp = num_size*(nval+nsmpl*3); + + if ( buf->mtmp2 < mtmp ) + { + buf->tmp2 = realloc(buf->tmp2, mtmp); + if ( !buf->tmp2 ) error("Failed to alloc %d bytes\n", mtmp); + buf->mtmp2 = mtmp; + } + + int nval1 = nval / nsmpl; + int iout, i, j; + for (iout=0; ioutsplit.nout; iout++) + { + int star_allele = _has_star_allele(buf,iout); + bcf1_t *out = buf->vcf[rbuf_kth(&buf->rbuf,iout)]; + int ret = 0; + if ( len==BCF_VL_FIXED || len==BCF_VL_VAR ) + ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp, nval, type); + else if ( len==BCF_VL_A ) + { + int iori = buf->split.atoms[iout]->ial - 1; + assert( ioritmp + nval1*num_size*i; + void *dst = buf->tmp2 + num_size*i*(star_allele+1); + memcpy(dst,src+iori*num_size,num_size); + if ( star_allele ) + memcpy(dst+num_size,missing_ptr,num_size); + } + ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nsmpl*(star_allele+1), type); + } + else if ( len==BCF_VL_R ) + { + int iori = buf->split.atoms[iout]->ial - 1; + assert( ioritmp + nval1*num_size*i; + void *dst = buf->tmp2 + num_size*i*(star_allele+2); + memcpy(dst,src,num_size); + memcpy(dst+num_size,src+iori*num_size,num_size); + if ( star_allele ) + memcpy(dst+num_size*2,missing_ptr,num_size); + } + ret = bcf_update_format(buf->out_hdr, out, tag, buf->tmp2, nsmpl*(star_allele+2), type); + } + else if ( len==BCF_VL_G ) + { + int iori = buf->split.atoms[iout]->ial; + int i01 = bcf_alleles2gt(0,iori); + int i11 = bcf_alleles2gt(iori,iori); + assert( ioritmp + i*nval1; \ + type_t *dst = (type_t*)buf->tmp2 + i*3*(1+star_allele); \ + int n=0; /* determine ploidy of this genotype */ \ + while ( ntmp + 
i*nval1; \ + memcpy(dst++,src,sizeof(type)); \ + int nmiss = 0, nend = 0; \ + if ( n==rec->n_allele ) /* haploid */ \ + { \ + memcpy(dst++,src+iori,sizeof(type)); \ + if ( star_allele ) { nmiss = 1; nend = 3; } \ + else nend = 1; \ + } \ + else if ( n==nval1 ) \ + { \ + memcpy(dst++,src+i01,sizeof(type)); \ + memcpy(dst++,src+i11,sizeof(type)); \ + if ( star_allele ) nmiss = 3; \ + } \ + else if ( n==1 && is_missing ) \ + { \ + if ( star_allele ) nend = 5; \ + else nend = 2; \ + } \ + else \ + error("Incorrect number of values at %s:%"PRIhts_pos" .. tag=FORMAT/%s Number=G nAlleles=%d nValues=%d, %d-th sample\n", \ + bcf_seqname(buf->hdr,rec),rec->pos+1,tag,rec->n_allele,n,i+1); \ + for (j=0; jout_hdr, out, tag, buf->tmp2, 3*(1+star_allele)*nsmpl, type); + } + if ( ret!=0 ) error("An error occurred while updating FORMAT/%s\n",tag); + } +} +static inline int _is_acgtn(char *seq) +{ + while ( *seq ) + { + char c = toupper(*seq); + if ( c!='A' && c!='C' && c!='G' && c!='T' && c!='N' ) return 0; + seq++; + } + return 1; +} +/* + The atomization works as follows: + - Atomize each alternate allele separately by leaving out sequence identical to the reference. No + alignment is performed, just greedy trimming of the end, then from left. This operation returns + a list of atoms (atom_t) which carry fragments of REF,ALT and their positions as 0-based offsets + to the original REF allele + - Sort atoms by POS, REF and ALT. Each unique atom (POS+REF+ALT) forms a new VCF record, each + with a single ALT. + - For each new VCF record determine how to translate the original allele index (iori) to this new + record: + - 1: the original allele matches the atom + - 0: the original allele does not overlap this atom or the overlapping part matches the REF + allele + - 2 (or equivalently "."): there is a mismatch between the original allele and the atom + The mapping is encoded in a table with columns corresponding to the original ALTs and rows + to the new POS+ALTs (atoms). 
The table is initialized to 0, then we set 1's for matching + atoms and 2's for overlapping mismatching atoms. + + Note that different ALT alleles can result in the same atom (the same output line) and this code + does not know how to reconcile possibly conflicting VCF annotations. This could be improved + and merge logic provided, similarly to `merge -l`. For example, the allelic depths (AD) should + be summed for the same atomized output allele. However, this level of complexity is not addressed + in this initial draft. Higher priority for now is to provide the inverse "join" operation. +*/ +void _abuf_split(abuf_t *buf, bcf1_t *rec) +{ + int i,j; + for (i=1; in_allele; i++) + { + if ( _is_acgtn(rec->d.allele[i]) ) continue; + rbuf_expand0(&buf->rbuf, bcf1_t*, buf->rbuf.n+1, buf->vcf); + int j = rbuf_append(&buf->rbuf); + if ( buf->vcf[j] ) bcf_destroy(buf->vcf[j]); + buf->vcf[j] = bcf_dup(rec); + return; + } + + buf->natoms = 0; + for (i=1; in_allele; i++) _atomize_allele(buf,rec,i); + qsort(buf->atoms,buf->natoms,sizeof(*buf->atoms),_cmp_atoms); + _split_table_init(buf,rec,buf->natoms); + for (i=0; inatoms; i++) + { + if ( i && !_atoms_inconsistent(&buf->atoms[i-1],&buf->atoms[i]) ) continue; + _split_table_new(buf, &buf->atoms[i]); // add a new unique output atom + } + for (i=0; inatoms; i++) + { + // Looping over sorted list of all atoms with possible duplicates from different source ALT alleles + atom_t *atom = &buf->atoms[i]; + for (j=0; jsplit.nout; j++) + { + atom_t *out = buf->split.atoms[j]; + if ( atom == out ) continue; // table already set to 1 + if ( atom->beg > out->end ) continue; // cannot overlap this output atom + if ( atom->end < out->beg ) break; // this atom is ahead of all subsequent output records + _split_table_overlap(buf, j, atom); + } + } + assert( !buf->rbuf.n ); // all records should be flushed first in the SPLIT mode + + // Create the output records, transferring all annotations: + // CHROM-QUAL + _split_table_set_chrom_qual(buf); + 
+ // INFO + for (i=0; in_info; i++) + _split_table_set_info(buf, &rec->d.info[i]); + + // Set INFO tag with the original result + if ( buf->split.info_tag ) + _split_table_set_history(buf); + + // FORMAT + for (i=0; in_fmt; i++) + _split_table_set_format(buf, &rec->d.fmt[i]); +} + +void abuf_push(abuf_t *buf, bcf1_t *rec) +{ + bcf_unpack(rec, BCF_UN_ALL); + if ( buf->mode==SPLIT ) _abuf_split(buf,rec); +} + +bcf1_t *abuf_flush(abuf_t *buf, int flush_all) +{ + int i; + + if ( buf->rbuf.n==0 ) return NULL; + if ( flush_all ) goto ret; + +ret: + i = rbuf_shift(&buf->rbuf); + return buf->vcf[i]; +} + diff --git a/abuf.h b/abuf.h new file mode 100644 index 000000000..5fc1e0099 --- /dev/null +++ b/abuf.h @@ -0,0 +1,78 @@ +/* The MIT License + + Copyright (c) 2021 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+ + */ + +/* + Atomize/deatomize complex variants +*/ + +#ifndef __ABUF_H__ +#define __ABUF_H__ + +#include + +typedef struct _abuf_t abuf_t; + +// Modes of operation +typedef enum +{ + NONE, + + // mode of operation, to be passed to abuf_init + SPLIT, + JOIN, + + BCF_HDR, // should the records be annotated, a writable bcf header is required + INFO_TAG, // set BCF_HDR first + STAR_ALLELE // 1: use STAR allele (the default), 0: set overlaps to missing +} +abuf_opt_t; + +#define abuf_set_opt(buf,type,key,value) { type tmp = value; abuf_set(buf, key, (void*)&tmp); } +void abuf_set(abuf_t *buf, abuf_opt_t key, void *value); + +/* + * abuf_init() - init buffer + * @win: number of sites (>0) or bp (<0) + */ +abuf_t *abuf_init(const bcf_hdr_t *hdr, abuf_opt_t mode); +void abuf_destroy(abuf_t *buf); + +/* + * abuf_push() - Push a new site for analysis + */ +void abuf_push(abuf_t *buf, bcf1_t *rec); + +/* + * abuf_flush() - Return next buffered record + * @flush_all: Set to 1 if no more overlapping records are coming (e.g. end of chromosome or end of file), + * the buffer can be emptied. + * return: The next atomized/deatomized VCF record or NULL if no record is ready. The returned + * structure will be cleaned by abuf. 
+ */ +bcf1_t *abuf_flush(abuf_t *buf, int flush_all); + +#endif + diff --git a/doc/bcftools.1 b/doc/bcftools.1 index 0fcecdf65..6cac6e4db 100644 --- a/doc/bcftools.1 +++ b/doc/bcftools.1 @@ -1,13 +1,13 @@ '\" t .\" Title: bcftools .\" Author: [see the "AUTHORS" section] -.\" Generator: DocBook XSL Stylesheets v1.76.1 -.\" Date: 2020-11-25 16:08 GMT +.\" Generator: DocBook XSL Stylesheets vsnapshot +.\" Date: 2021-02-23 10:44 CET .\" Manual: \ \& .\" Source: \ \& .\" Language: English .\" -.TH "BCFTOOLS" "1" "2020\-11\-25 16:08 GMT" "\ \&" "\ \&" +.TH "BCFTOOLS" "1" "2021\-02\-23 10:44 CET" "\ \&" "\ \&" .\" ----------------------------------------------------------------- .\" * Define some portability stuff .\" ----------------------------------------------------------------- @@ -41,7 +41,7 @@ Most commands accept VCF, bgzipped VCF and BCF with filetype detected automatica BCFtools is designed to work on a stream\&. It regards an input file "\-" as the standard input (stdin) and outputs to the standard output (stdout)\&. Several commands can thus be combined with Unix pipes\&. .SS "VERSION" .sp -This manual page was last updated \fB2020\-11\-25 16:08 GMT\fR and refers to bcftools git version \fB1\&.11\-24\-g9718479+\fR\&. +This manual page was last updated \fB2021\-02\-23 10:44 CET\fR and refers to bcftools git version \fB1\&.2\-1248\-g3910e40+\fR\&. .SS "BCF1" .sp The BCF1 format output by versions of samtools <= 0\&.1\&.19 is \fBnot\fR compatible with this version of bcftools\&. To read BCF1 files one can use the view command from old versions of bcftools packaged with samtools versions <= 0\&.1\&.19 to convert to VCF, which can then be read by this version of bcftools\&. @@ -70,7 +70,6 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} - \fBannotate\fR \&.\&. edit VCF files, add or remove annotations .RE @@ -83,7 +82,6 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. 
F .sp -1 .IP \(bu 2.3 .\} - \fBcall\fR \&.\&. SNP/indel calling (former "view") .RE @@ -96,7 +94,6 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} - \fBcnv\fR \&.\&. Copy Number Variation caller .RE @@ -109,7 +106,6 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} - \fBconcat\fR \&.\&. concatenate VCF/BCF files from the same set of samples .RE @@ -122,7 +118,6 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} - \fBconsensus\fR \&.\&. create consensus sequence by applying VCF variants .RE @@ -135,7 +130,6 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} - \fBconvert\fR \&.\&. convert VCF/BCF to other formats and back .RE @@ -148,7 +142,6 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} - \fBcsq\fR \&.\&. haplotype aware consequence caller .RE @@ -161,7 +154,6 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} - \fBfilter\fR \&.\&. filter VCF/BCF files using fixed thresholds .RE @@ -174,7 +166,6 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} - \fBgtcheck\fR \&.\&. check sample concordance, detect sample swaps and contamination .RE @@ -187,7 +178,6 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} - \fBindex\fR \&.\&. index VCF/BCF .RE @@ -200,7 +190,6 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} - \fBisec\fR \&.\&. intersections of VCF/BCF files .RE @@ -213,7 +202,6 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} - \fBmerge\fR \&.\&. 
merge VCF/BCF files files from non\-overlapping sample sets .RE @@ -226,7 +214,6 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} - \fBmpileup\fR \&.\&. multi\-way pileup producing genotype likelihoods .RE @@ -239,7 +226,6 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} - \fBnorm\fR \&.\&. normalize indels .RE @@ -252,7 +238,6 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} - \fBplugin\fR \&.\&. run user\-defined plugin .RE @@ -265,7 +250,6 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} - \fBpolysomy\fR \&.\&. detect contaminations and whole\-chromosome aberrations .RE @@ -278,7 +262,6 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} - \fBquery\fR \&.\&. transform VCF/BCF into user\-defined formats .RE @@ -291,7 +274,6 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} - \fBreheader\fR \&.\&. modify VCF/BCF header, change sample names .RE @@ -304,7 +286,6 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} - \fBroh\fR \&.\&. identify runs of homo/auto\-zygosity .RE @@ -317,7 +298,6 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} - \fBsort\fR \&.\&. sort VCF/BCF files .RE @@ -330,7 +310,6 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} - \fBstats\fR \&.\&. produce VCF/BCF stats (former vcfcheck) .RE @@ -343,7 +322,6 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} - \fBview\fR \&.\&. 
subset, filter and convert VCF and BCF files .RE @@ -359,7 +337,6 @@ Some helper scripts are bundled with the bcftools code\&. .sp -1 .IP \(bu 2.3 .\} - \fBplot\-vcfstats\fR \&.\&. plots the output of \fBstats\fR @@ -1383,7 +1360,7 @@ is true\&. For valid expressions see reference sequence in fasta format .RE .PP -\fB\-H, \-\-haplotype\fR \fI1\fR|\fI2\fR|\fIR\fR|\fIA\fR|\fILR\fR|\fILA\fR|\fISR\fR|\fISA\fR|\fI1pIu\fR|\fI2pIu\fR +\fB\-H, \-\-haplotype\fR \fI1\fR|\fI2\fR|\fIR\fR|\fIA\fR|\fII\fR|\fILR\fR|\fILA\fR|\fISR\fR|\fISA\fR|\fI1pIu\fR|\fI2pIu\fR .RS 4 choose which allele from the FORMAT/GT field to use (the codes are case\-insensitive): .PP @@ -1407,6 +1384,11 @@ the REF allele (in heterozygous genotypes) the ALT allele (in heterozygous genotypes) .RE .PP +\fII\fR +.RS 4 +IUPAC code for all genotypes +.RE +.PP \fILR, LA\fR .RS 4 the longer allele\&. If both have the same length, use the REF allele (LR), or the ALT allele (LA) @@ -1446,15 +1428,39 @@ is true\&. For valid expressions see output variants in the form of IUPAC ambiguity codes .RE .PP +\fB\-\-mark\-del\fR \fICHAR\fR +.RS 4 +instead of removing sequence, insert CHAR for deletions +.RE +.PP +\fB\-\-mark\-ins\fR \fIuc\fR|\fIlc\fR +.RS 4 +highlight inserted sequence in uppercase (uc) or lowercase (lc), leaving the rest of the sequence as is +.RE +.PP +\fB\-\-mark\-snv\fR \fIuc\fR|\fIlc\fR +.RS 4 +highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest of the sequence as is +.RE +.PP \fB\-m, \-\-mask\fR \fIFILE\fR .RS 4 -BED file or TAB file with regions to be replaced with N\&. See discussion of +BED file or TAB file with regions to be replaced with N (the default) or as specified by the next +\fB\-\-mask\-with\fR +option\&. See discussion of \fB\-\-regions\-file\fR in \fBCommon Options\fR for file format details\&. 
.RE .PP +\fB\-\-mask\-with\fR \fICHAR\fR|\fIlc\fR|\fIuc\fR +.RS 4 +replace sequence from +\fB\-\-mask\fR +with CHAR, skipping overlapping variants, or change to lowercase (lc) or uppercase (uc) +.RE +.PP \fB\-M, \-\-missing\fR \fICHAR\fR .RS 4 instead of skipping the missing genotypes, output the character CHAR (e\&.g\&. "?") @@ -2793,7 +2799,7 @@ Disable probabilistic realignment for the computation of base alignment quality .PP \fB\-C, \-\-adjust\-MQ\fR \fIINT\fR .RS 4 -Coefficient for downgrading mapping quality for reads containing excessive mismatches\&. Given a read with a phred\-scaled probability q of being generated from the mapped position, the new mapping quality is about sqrt((INT\-q)/INT)*INT\&. A zero value disables this functionality; if enabled, the recommended value for BWA is 50\&. [0] +Coefficient for downgrading mapping quality for reads containing excessive mismatches\&. Given a read with a phred\-scaled probability q of being generated from the mapped position, the new mapping quality is about sqrt((INT\-q)/INT)*INT\&. A zero value (the default) disables this functionality\&. .RE .PP \fB\-d, \-\-max\-depth\fR \fIINT\fR @@ -3108,6 +3114,35 @@ Call SNPs and short INDELs, then mark low quality sites and sites with the read .sp Left\-align and normalize indels, check if REF alleles match the reference, split multiallelic sites into multiple rows; recover multiallelics from multiple rows\&. Left\-alignment and normalization will only be applied if the \fB\-\-fasta\-ref\fR option is supplied\&. .PP +\fB\-a, \-\-atomize\fR \fI\&.\fR|\fI*\fR +.RS 4 +Decompose complex variants (e\&.g\&. split MNVs into consecutive SNVs)\&. Alleles missing because of an overlapping variant can be set either to missing (\&.) or to the star allele (*), as recommended by the VCF specification\&.
IMPORTANT: Note that the asterisk is expanded by the shell and must be put in quotes or escaped by a backslash: +.RE +.sp +.if n \{\ +.RS 4 +.\} +.nf + # Before atomization: + 100 CC C,GG 1/2 + + # After: + # bcftools norm \-a \&. + 100 C G \&./1 + 100 CC C 1/\&. + 101 C G \&./1 + + # After: + # bcftools norm \-a \*(Aq*\*(Aq + # bcftools norm \-a \e* + 100 C G,* 2/1 + 100 CC C,* 1/2 + 101 C G,* 2/1 +.fi +.if n \{\ +.RE +.\} +.PP \fB\-c, \-\-check\-ref\fR \fIe\fR|\fIw\fR|\fIx\fR|\fIs\fR .RS 4 what to do when incorrect or missing REF allele is encountered: exit (\fIe\fR), warn (\fIw\fR), exclude (\fIx\fR), or set/fix (\fIs\fR) bad sites\&. The @@ -3744,7 +3779,7 @@ for more\&. convert between similar tags, such as GL and GP .RE .PP -\fBtrio\-dnm\fR +\fBtrio\-dnm2\fR .RS 4 screen variants for possible de\-novo mutations in trios .RE diff --git a/doc/bcftools.html b/doc/bcftools.html index 95b873b91..8923e0a10 100644 --- a/doc/bcftools.html +++ b/doc/bcftools.html @@ -1,6 +1,5 @@ - -bcftools

Name

bcftools — utilities for variant calling and manipulating VCFs and BCFs.

Synopsis

bcftools [--version|--version-only] [--help] [COMMAND] [OPTIONS]

DESCRIPTION

BCFtools is a set of utilities that manipulate variant calls in the Variant +bcftools

Name

bcftools — utilities for variant calling and manipulating VCFs and BCFs.

Synopsis

bcftools [--version|--version-only] [--help] [COMMAND] [OPTIONS]

DESCRIPTION

BCFtools is a set of utilities that manipulate variant calls in the Variant Call Format (VCF) and its binary counterpart BCF. All commands work transparently with both VCFs and BCFs, both uncompressed and BGZF-compressed.

Most commands accept VCF, bgzipped VCF and BCF with filetype detected automatically even when streaming from a pipe. Indexed VCF and BCF @@ -10,17 +9,17 @@ (Note that files with non-standard index names can be accessed as e.g. "bcftools view -r X:2928329 file.vcf.gz##idx##non-standard-index-name".)

BCFtools is designed to work on a stream. It regards an input file "-" as the standard input (stdin) and outputs to the standard output (stdout). Several -commands can thus be combined with Unix pipes.

VERSION

This manual page was last updated 2020-11-25 16:08 GMT and refers to bcftools git version 1.11-24-g9718479+.

BCF1

The BCF1 format output by versions of samtools <= 0.1.19 is not +commands can thus be combined with Unix pipes.

VERSION

This manual page was last updated 2021-02-23 10:44 CET and refers to bcftools git version 1.2-1248-g3910e40+.

BCF1

The BCF1 format output by versions of samtools <= 0.1.19 is not compatible with this version of bcftools. To read BCF1 files one can use the view command from old versions of bcftools packaged with samtools versions <= 0.1.19 to convert to VCF, which can then be read by -this version of bcftools.

    samtools-0.1.19/bcftools/bcftools view file.bcf1 | bcftools view

VARIANT CALLING

See bcftools call for variant calling from the output of the +this version of bcftools.

    samtools-0.1.19/bcftools/bcftools view file.bcf1 | bcftools view

VARIANT CALLING

See bcftools call for variant calling from the output of the samtools mpileup command. In versions of samtools <= 0.1.19 calling was done with bcftools view. Users are now required to choose between the old samtools calling model (-c/--consensus-caller) and the new multiallelic calling model (-m/--multiallelic-caller). The multiallelic calling model -is recommended for most tasks.

LIST OF COMMANDS

For a full list of available commands, run bcftools without arguments. For a full -list of available options, run bcftools COMMAND without arguments.

  • +is recommended for most tasks.

LIST OF COMMANDS

For a full list of available commands, run bcftools without arguments. For a full +list of available options, run bcftools COMMAND without arguments.

  • annotate .. edit VCF files, add or remove annotations
  • call .. SNP/indel calling (former "view") @@ -64,10 +63,10 @@ stats .. produce VCF/BCF stats (former vcfcheck)
  • view .. subset, filter and convert VCF and BCF files -

LIST OF SCRIPTS

Some helper scripts are bundled with the bcftools code.

  • +

LIST OF SCRIPTS

Some helper scripts are bundled with the bcftools code.

COMMANDS AND OPTIONS

Common Options

The following options are common to many bcftools commands. See usage for -specific commands to see if they apply.

+

COMMANDS AND OPTIONS

Common Options

The following options are common to many bcftools commands. See usage for +specific commands to see if they apply.

FILE
Files can be both VCF or BCF, uncompressed or BGZF-compressed. The file "-" @@ -83,7 +82,7 @@ matching positions (bcftools isec -c all), or only sites with matching variant type (bcftools isec -c snps  -c indels), or only sites with all alleles identical (bcftools isec -c none). -

+

none
only records with identical REF and ALT alleles are compatible @@ -181,7 +180,7 @@ option is used; see bcftools view documentation). To use updated tags for the subset in another command one can pipe from view into that command. For example: -
    bcftools view -Ou -s sample1,sample2 file.vcf | bcftools query -f %INFO/AC\t%INFO/AN\n
+
    bcftools view -Ou -s sample1,sample2 file.vcf | bcftools query -f %INFO/AC\t%INFO/AN\n
-S, --samples-file FILE
File of sample names to include or exclude if prefixed with "^". @@ -198,7 +197,7 @@ sample3 F

If the second column is not present, the sex "F" is assumed. With bcftools call -C trio, PED file is expected. The program ignores the first column and the last indicates sex (1=male, 2=female), for example:

    ignored_column  daughterA fatherA  motherA  2
-    ignored_column  sonB      fatherB  motherB  1
+ ignored_column sonB fatherB motherB 1
-t, --targets [^]chr|chr:pos|chr:from-to|chr:from-[,…]
Similar as -r, --regions, but the next position is accessed by streaming the @@ -222,12 +221,12 @@ be comma-separated list of alleles, starting with the reference allele. Note that the file must be compressed and index. Such a file can be easily created from a VCF using: -
    bcftools query -f'%CHROM\t%POS\t%REF,%ALT\n' file.vcf | bgzip -c > als.tsv.gz && tabix -s1 -b2 -e2 als.tsv.gz
+
    bcftools query -f'%CHROM\t%POS\t%REF,%ALT\n' file.vcf | bgzip -c > als.tsv.gz && tabix -s1 -b2 -e2 als.tsv.gz
--threads INT
Use multithreading with INT worker threads. The option is currently used only for the compression of the output stream, only when --output-type is b or z. Default: 0. -

bcftools annotate [OPTIONS] FILE

Add or remove annotations.

+

bcftools annotate [OPTIONS] FILE

Add or remove annotations.

-a, --annotations file
Bgzip-compressed and tabix-indexed file with annotations. The file @@ -251,7 +250,7 @@
    # Sample annotation file with columns CHROM, POS, STRING_TAG, NUMERIC_TAG
     1  752566  SomeString      5
     1  798959  SomeOtherString 6
-    # etc.
+ # etc.
--collapse snps|indels|both|all|some|none
Controls how to match records from the annotation file to the target VCF. @@ -309,14 +308,14 @@
Lines to append to the VCF header, see also -c, --columns and -a, --annotations. For example:
    ##INFO=<ID=NUMERIC_TAG,Number=1,Type=Integer,Description="Example header line">
-    ##INFO=<ID=STRING_TAG,Number=1,Type=String,Description="Yet another header line">
+ ##INFO=<ID=STRING_TAG,Number=1,Type=String,Description="Yet another header line">
-I, --set-id [+]FORMAT
assign ID on the fly. The format is the same as in the query command (see below). By default all existing IDs are replaced. If the format string is preceded by "+", only missing IDs will be set. For example, one can use -
    bcftools annotate --set-id +'%CHROM\_%POS\_%REF\_%FIRST_ALT' file.vcf
+
    bcftools annotate --set-id +'%CHROM\_%POS\_%REF\_%FIRST_ALT' file.vcf
-i, --include EXPRESSION
include only sites for which EXPRESSION is true. For valid expressions see @@ -432,10 +431,10 @@ # Annotate from a bed file (0-based coordinates, half-closed, half-open intervals) bcftools annotate -a annots.bed.gz -h annots.hdr -c CHROM,FROM,TO,TAG input.vcf - # For more examples see http://samtools.github.io/bcftools/howtos/annotate.html

bcftools call [OPTIONS] FILE

This command replaces the former bcftools view caller. Some of the original + # For more examples see http://samtools.github.io/bcftools/howtos/annotate.html

bcftools call [OPTIONS] FILE

This command replaces the former bcftools view caller. Some of the original functionality has been temporarily lost in the process of transition under htslib, but will be added back on popular -demand. The original calling model can be invoked with the -c option.

File format options:

+demand. The original calling model can be invoked with the -c option.

File format options:

--no-version
see Common Options @@ -468,7 +467,7 @@ MT 1 16569 M 1 MT 1 16569 F 1 * * * M 2 - * * * F 2
+ * * * F 2
-r, --regions chr|chr:pos|chr:from-to|chr:from-[,…]
see Common Options @@ -496,7 +495,7 @@ --threads INT
see Common Options -

Input/output options:

+

Input/output options:

-A, --keep-alts
output all alternate alleles present in the alignments even if they do not @@ -526,7 +525,7 @@ ##INFO=<ID=REF_AC,Number=A,Type=Integer,Description="Allele count in reference genotypes for each ALT allele"> # Now before calling, stream the raw mpileup output through `bcftools annotate` to add the frequencies - bcftools mpileup [...] -Ou | bcftools annotate -a AFs.tab.gz -h AFs.hdr -c CHROM,POS,REF,ALT,REF_AN,REF_AC -Ou | bcftools call -mv -F REF_AN,REF_AC [...]
+ bcftools mpileup [...] -Ou | bcftools annotate -a AFs.tab.gz -h AFs.hdr -c CHROM,POS,REF,ALT,REF_AN,REF_AC -Ou | bcftools call -mv -F REF_AN,REF_AC [...]
-G, --group-samples FILE|-
by default, all samples are assumed to come from a single population. This option allows to group samples @@ -557,13 +556,13 @@ -v, --variants-only
output variant sites only -

Consensus/variant calling options:

+

Consensus/variant calling options:

-c, --consensus-caller
the original samtools/bcftools calling method (conflicts with -m)
-C, --constrain alleles|trio -
+
alleles
call genotypes given alleles. See also -T, --targets-file. @@ -611,10 +610,10 @@ -Y, --chromosome-Y
haploid output for males and skips females (requires PED file with -s) -

bcftools cnv [OPTIONS] FILE

Copy number variation caller, requires a VCF annotated with the Illumina’s +

bcftools cnv [OPTIONS] FILE

Copy number variation caller, requires a VCF annotated with the Illumina’s B-allele frequency (BAF) and Log R Ratio intensity (LRR) values. The HMM considers the following copy number states: CN 2 (normal), 1 (single-copy -loss), 0 (complete loss), 3 (single-copy gain).

General Options:

+loss), 0 (complete loss), 3 (single-copy gain).

General Options:

-c, --control-sample string
optional control sample name. If given, pairwise calling is performed @@ -653,7 +652,7 @@ -T, --targets-file FILE
see Common Options -

HMM Options:

+

HMM Options:

-a, --aberrant float[,float]
fraction of aberrant cells in query and control. The hallmark of @@ -702,13 +701,13 @@
the HMM probability of transition to another copy number state. Increasing this values leads to smaller and more frequent calls. -

bcftools concat [OPTIONS] FILE1 FILE2 […]

Concatenate or combine VCF/BCF files. All source files must have the same sample +

bcftools concat [OPTIONS] FILE1 FILE2 […]

Concatenate or combine VCF/BCF files. All source files must have the same sample columns appearing in the same order. Can be used, for example, to concatenate chromosome VCFs into one VCF, or combine a SNP VCF and an indel VCF into one. The input files must be sorted by chr and position. The files must be given in the correct order to produce sorted VCF on output unless the -a, --allow-overlaps option is specified. With the --naive option, the files -are concatenated without being recompressed, which is very fast..

+are concatenated without being recompressed, which is very fast..

-a, --allow-overlaps
First coordinate of the next file can precede last record of the current file. @@ -775,13 +774,13 @@ --threads INT
see Common Options -

bcftools consensus [OPTIONS] FILE

Create consensus sequence by applying VCF variants to a reference fasta file. +

bcftools consensus [OPTIONS] FILE

Create consensus sequence by applying VCF variants to a reference fasta file. By default, the program will apply all ALT variants to the reference fasta to obtain the consensus sequence. Using the --sample (and, optionally, --haplotype) option will apply genotype (haplotype) calls from FORMAT/GT. Note that the program does not act as a primitive variant caller and ignores allelic depth information, such as INFO/AD or FORMAT/AD. For that, consider using the -setGT plugin.

+setGT plugin.

-c, --chain FILE
write a chain file for liftover @@ -795,10 +794,10 @@
reference sequence in fasta format
--H, --haplotype 1|2|R|A|LR|LA|SR|SA|1pIu|2pIu +-H, --haplotype 1|2|R|A|I|LR|LA|SR|SA|1pIu|2pIu

choose which allele from the FORMAT/GT field to use (the codes are case-insensitive): -

+

1
the first allele, regardless of phasing @@ -815,6 +814,10 @@
the ALT allele (in heterozygous genotypes)
+I +
+ IUPAC code for all genotypes +
LR, LA
the longer allele. If both have the same length, use the REF allele (LR), or the ALT allele (LA) @@ -836,12 +839,29 @@
output variants in the form of IUPAC ambiguity codes
+--mark-del CHAR +
+ instead of removing sequence, insert CHAR for deletions +
+--mark-ins uc|lc +
+ highlight inserted sequence in uppercase (uc) or lowercase (lc), leaving the rest of the sequence as is +
+--mark-snv uc|lc +
+ highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest of the sequence as is +
-m, --mask FILE
- BED file or TAB file with regions to be replaced with N. See discussion + BED file or TAB file with regions to be replaced with N (the default) or as specified by + the next --mask-with option. See discussion of --regions-file in Common Options for file format details.
+--mask-with CHAR|lc|uc +
+ replace sequence from --mask with CHAR, skipping overlapping variants, or change to lowercase (lc) or uppercase (uc) +
-M, --missing CHAR
instead of skipping the missing genotypes, output the character CHAR (e.g. "?") @@ -858,7 +878,7 @@ # Create consensus for one region. The fasta header lines are then expected # in the form ">chr:from-to". - samtools faidx ref.fa 8:11870-11890 | bcftools consensus in.vcf.gz -o out.fa

bcftools convert [OPTIONS] FILE

VCF input options:

+ samtools faidx ref.fa 8:11870-11890 | bcftools consensus in.vcf.gz -o out.fa

bcftools convert [OPTIONS] FILE

VCF input options:

-e, --exclude EXPRESSION
exclude sites for which EXPRESSION is true. For valid expressions see @@ -892,7 +912,7 @@ -T, --targets-file FILE
see Common Options -

VCF output options:

+

VCF output options:

--no-version
see Common Options @@ -908,7 +928,7 @@ --threads INT
see Common Options -

GEN/SAMPLE conversion:

+

GEN/SAMPLE conversion:

-G, --gensample2vcf prefix or gen-file,sample-file
convert IMPUTE2 output to VCF. The second column must be of the form @@ -933,7 +953,7 @@ ID_1 ID_2 missing 0 0 0 sample1 sample1 0 - sample2 sample2 0
+ sample2 sample2 0
--tag STRING
tag to take values for .gen file: GT,PL,GL,GP @@ -946,11 +966,11 @@
output sex column in the sample file. The FILE format is
    MaleSample    M
-    FemaleSample  F
+ FemaleSample F
--vcf-ids
output VCF IDs in the second column instead of CHROM:POS_REF_ALT -

gVCF conversion:

+

gVCF conversion:

--gvcf2vcf
convert gVCF to VCF, expanding REF blocks into sites. Note that @@ -962,7 +982,7 @@ -f, --fasta-ref file
reference sequence in fasta format. Must be indexed with samtools faidx -

HAP/SAMPLE conversion:

+

HAP/SAMPLE conversion:

--hapsample2vcf prefix or hap-file,sample-file
convert from hap/sample format to VCF. The columns of .hap file are @@ -974,7 +994,7 @@ ---- 1:111485207_G_A rsID1 111485207 G A 0 1 0 0 1:111494194_C_T rsID2 111494194 C T 0 1 0 0 - 1:111495231_A_<DEL>_111495784 rsID3 111495231 A <DEL> 0 0 1 0
+ 1:111495231_A_<DEL>_111495784 rsID3 111495231 A <DEL> 0 0 1 0
--hapsample prefix or hap-file,sample-file
convert from VCF to hap/sample format used by IMPUTE2 and SHAPEIT. @@ -993,11 +1013,11 @@
output sex column in the sample file. The FILE format is
    MaleSample    M
-    FemaleSample  F
+ FemaleSample F
--vcf-ids
output VCF IDs instead of "CHROM:POS_REF_ALT" IDs -

HAP/LEGEND/SAMPLE conversion:

+

HAP/LEGEND/SAMPLE conversion:

-H, --haplegendsample2vcf prefix or hap-file,legend-file,sample-file
convert from hap/legend/sample format used by IMPUTE2 to VCF, see @@ -1025,7 +1045,7 @@ ------- sample population group sex sample1 sample1 sample1 2 - sample2 sample2 sample2 2
+ sample2 sample2 sample2 2
--haploid2diploid
with -h option converts haploid genotypes to homozygous diploid @@ -1037,11 +1057,11 @@
output sex column in the sample file. The FILE format is
    MaleSample    M
-    FemaleSample  F
+ FemaleSample F
--vcf-ids
output VCF IDs instead of "CHROM:POS_REF_ALT" IDs -

TSV conversion:

+

TSV conversion:

--tsv2vcf file
convert from TSV (tab-separated values) format (such as generated by @@ -1070,7 +1090,7 @@
file of sample names. See Common Options

Example:

# Convert 23andme results into VCF
-bcftools convert -c ID,CHROM,POS,AA -s SampleName -f 23andme-ref.fa --tsv2vcf 23andme.txt -Oz -o out.vcf.gz

bcftools csq [OPTIONS] FILE

Haplotype aware consequence predictor which correctly handles combined +bcftools convert -c ID,CHROM,POS,AA -s SampleName -f 23andme-ref.fa --tsv2vcf 23andme.txt -Oz -o out.vcf.gz

bcftools csq [OPTIONS] FILE

Haplotype aware consequence predictor which correctly handles combined variants such as MNPs split over multiple VCF records, SNPs separated by an intron (but adjacent in the spliced transcript) or nearby frame-shifting indels which in combination in fact are not frame-shifting.

The output VCF is annotated with INFO/BCSQ and FORMAT/BCSQ tag (configurable @@ -1088,7 +1108,7 @@ with the --local-csq option.

If conflicting (overlapping) variants within one haplotype are detected, a warning will be emitted and predictions will be based on only the first variant in the analysis.

Symbolic alleles are not supported. They will remain unannotated in the -output VCF and are ignored for the prediction analysis.

+output VCF and are ignored for the prediction analysis.

-c, --custom-tag STRING
use this custom tag to store consequences rather than the default BCSQ tag @@ -1144,7 +1164,7 @@ 1 ignored_field three_prime_UTR 21 2054 . - . Parent=transcript:TranscriptId 1 ignored_field exon 21 2148 . - . Parent=transcript:TranscriptId 1 ignored_field CDS 21 2148 . - 1 Parent=transcript:TranscriptId - 1 ignored_field five_prime_UTR 210 2148 . - . Parent=transcript:TranscriptId
+ 1 ignored_field five_prime_UTR 210 2148 . - . Parent=transcript:TranscriptId
-i, --include EXPRESSION
include only sites for which EXPRESSION is true. For valid expressions see @@ -1178,7 +1198,7 @@ -p, --phase a|m|r|R|s

how to handle unphased heterozygous genotypes: -

+

a
take GTs as is, create haplotypes regardless of phase (0/1 → 0|1) @@ -1257,7 +1277,7 @@ BCSQ=stop_gained|C2orf83|ENST00000264387|-|141W>141*|228476140C>T # The consequence type of a variant downstream from a stop are prefixed with * - BCSQ=*missense|PER3|ENST00000361923|+|1028M>1028T|7890117T>C

bcftools filter [OPTIONS] FILE

Apply fixed-threshold filters.

+ BCSQ=*missense|PER3|ENST00000361923|+|1028M>1028T|7890117T>C

bcftools filter [OPTIONS] FILE

Apply fixed-threshold filters.

-e, --exclude EXPRESSION
exclude sites for which EXPRESSION is true. For valid expressions see @@ -1275,7 +1295,7 @@ Here the positions 1 and 6 are filtered, 0 and 7 are not: 0123-456789 ref .G.G-..G.. - ins .A.GT..A..
+ ins .A.GT..A..
-G, --IndelGap INT
filter clusters of indels separated by INT or fewer base pairs allowing @@ -1288,7 +1308,7 @@ And similarly here, the second is filtered: 01 23 456 78 ref .A-.A-..A-.. - ins .AT.AT..AT..
+ ins .AT.AT..AT..
-i, --include EXPRESSION
include only sites for which EXPRESSION is true. For valid expressions see @@ -1343,10 +1363,10 @@ --threads INT
see Common Options -

bcftools gtcheck [OPTIONS] [-g genotypes.vcf.gz] query.vcf.gz

Checks sample identity. The program can operate in two modes. If the -g +

bcftools gtcheck [OPTIONS] [-g genotypes.vcf.gz] query.vcf.gz

Checks sample identity. The program can operate in two modes. If the -g option is given, the identity of samples from query.vcf.gz is checked against the samples in the -g file. -Without the -g option, multi-sample cross-check of samples in query.vcf.gz is performed.

+Without the -g option, multi-sample cross-check of samples in query.vcf.gz is performed.

--distinctive-sites NUM[,MEM[,DIR]]
Find sites that can distinguish between at least NUM sample pairs. If the number is smaller or equal to 1, @@ -1409,7 +1429,7 @@ List of query samples or -g samples. If neither -s nor -S are given, all possible sample pair combinations are compared

-S, --samples-file [qry|gt]:'FILE' File with the query or -g samples to compare. If neither -s nor -S are given, all possible sample - pair combinations are compared

+ pair combinations are compared

-t, --targets file
see Common Options @@ -1430,13 +1450,13 @@ bcftools gtcheck -s gt:a1,a2,a3 -s qry:b1,b2 -g A.bcf B.bcf # Compare only two pairs a1,b1 and a1,b2 - bcftools gtcheck -p a1,b1,a1,b2 -g A.bcf B.bcf

bcftools index [OPTIONS] in.bcf|in.vcf.gz

Creates index for bgzip compressed VCF/BCF files for random access. CSI + bcftools gtcheck -p a1,b1,a1,b2 -g A.bcf B.bcf

bcftools index [OPTIONS] in.bcf|in.vcf.gz

Creates index for bgzip compressed VCF/BCF files for random access. CSI (coordinate-sorted index) is created by default. The CSI format supports indexing of chromosomes up to length 2^31. TBI (tabix index) index files, which support chromosome lengths up to 2^29, can be created by using the -t/--tbi option or using the tabix program packaged with htslib. When loading an index file, bcftools will try -the CSI first and then the TBI.

Indexing options:

+the CSI first and then the TBI.

Indexing options:

-c, --csi
generate CSI-format index for VCF/BCF files [default] @@ -1461,7 +1481,7 @@ --threads INT
see Common Options -

Stats options:

+

Stats options:

-n, --nrecords
print the number of records based on the CSI or TBI index files @@ -1472,10 +1492,10 @@ Output format is three tab-delimited columns listing the contig name, contig length (. if unknown) and number of records for the contig. Contigs with zero records are not printed. -

bcftools isec [OPTIONS] A.vcf.gz B.vcf.gz […]

Creates intersections, unions and complements of VCF files. Depending +

bcftools isec [OPTIONS] A.vcf.gz B.vcf.gz […]

Creates intersections, unions and complements of VCF files. Depending on the options, the program can output records from one (or more) files which have (or do not have) corresponding records with the same position -in the other files.

+in the other files.

-c, --collapse snps|indels|both|all|some|none
see Common Options @@ -1540,9 +1560,9 @@
list of input files to output given as 1-based indices. With -p and no -w, all files are written. -

Examples:

Create intersection and complements of two sets saving the output in dir/*

    bcftools isec -p dir A.vcf.gz B.vcf.gz

Filter sites in A (require INFO/MAF>=0.01) and B (require INFO/dbSNP) but not in C, +

Examples:

Create intersection and complements of two sets saving the output in dir/*

    bcftools isec -p dir A.vcf.gz B.vcf.gz

Filter sites in A (require INFO/MAF>=0.01) and B (require INFO/dbSNP) but not in C, and create an intersection, including only sites which appear in at least two of -the files after filters have been applied

    bcftools isec -e'MAF<0.01' -i'dbSNP=1' -e- A.vcf.gz B.vcf.gz C.vcf.gz -n +2 -p dir

Extract and write records from A shared by both A and B using exact allele match

    bcftools isec -p dir -n=2 -w1 A.vcf.gz B.vcf.gz

Extract records private to A or B comparing by position only

    bcftools isec -p dir -n-1 -c all A.vcf.gz B.vcf.gz

Print a list of records which are present in A and B but not in C and D

    bcftools isec -n~1100 -c all A.vcf.gz B.vcf.gz C.vcf.gz D.vcf.gz

bcftools merge [OPTIONS] A.vcf.gz B.vcf.gz […]

Merge multiple VCF/BCF files from non-overlapping sample sets to create one +the files after filters have been applied

    bcftools isec -e'MAF<0.01' -i'dbSNP=1' -e- A.vcf.gz B.vcf.gz C.vcf.gz -n +2 -p dir

Extract and write records from A shared by both A and B using exact allele match

    bcftools isec -p dir -n=2 -w1 A.vcf.gz B.vcf.gz

Extract records private to A or B comparing by position only

    bcftools isec -p dir -n-1 -c all A.vcf.gz B.vcf.gz

Print a list of records which are present in A and B but not in C and D

    bcftools isec -n~1100 -c all A.vcf.gz B.vcf.gz C.vcf.gz D.vcf.gz

bcftools merge [OPTIONS] A.vcf.gz B.vcf.gz […]

Merge multiple VCF/BCF files from non-overlapping sample sets to create one multi-sample file. For example, when merging file A.vcf.gz containing samples S1, S2 and S3 and file B.vcf.gz containing samples S3 and S4, the output file will contain four samples named S1, S2, S3, 2:S3 @@ -1550,7 +1570,7 @@ unique across all files. If they are not, the program will exit with an error unless the option --force-samples is given. The sample names can be also given explicitly using the --print-header and --use-header options.

Note that only records from different files can be merged, never from the same file. -For "vertical" merge take a look at bcftools concat or bcftools norm -m instead.

+For "vertical" merge take a look at bcftools concat or bcftools norm -m instead.

--force-samples
if the merged files contain duplicate samples names, proceed anyway. @@ -1620,7 +1640,7 @@ -m indels .. allow multiallelic indel records -m both .. both SNP and indel records can be multiallelic -m all .. SNP records can be merged with indel records --m id .. merge by ID
+-m id .. merge by ID
--no-index
the option allows to merge files without indexing them first. In order for this @@ -1650,7 +1670,7 @@ --threads INT
see Common Options -

bcftools mpileup [OPTIONS] -f ref.fa in.bam [in2.bam […]]

Generate VCF or BCF containing genotype likelihoods for one or multiple +

bcftools mpileup [OPTIONS] -f ref.fa in.bam [in2.bam […]]

Generate VCF or BCF containing genotype likelihoods for one or multiple alignment (BAM or CRAM) files. This is based on the original samtools mpileup command (with the -v or -g options) producing genotype likelihoods in VCF or BCF format, but not the textual pileup @@ -1669,7 +1689,7 @@ index is used to find chromosome 20 and then it is filtered for the regions listed in the BED file. Also note that the -r option can be much slower than -t with many regions and can require more memory when -multiple regions and many alignment files are processed.

Input options

+multiple regions and many alignment files are processed.

Input options

-6, --illumina1.3+
Assume the quality is in the Illumina 1.3+ encoding. @@ -1694,8 +1714,7 @@ Coefficient for downgrading mapping quality for reads containing excessive mismatches. Given a read with a phred-scaled probability q of being generated from the mapped position, the new mapping quality is - about sqrt((INT-q)/INT)*INT. A zero value disables this functionality; if - enabled, the recommended value for BWA is 50. [0] + about sqrt((INT-q)/INT)*INT. A zero value (the default) disables this functionality.
-d, --max-depth INT
@@ -1742,7 +1761,7 @@ RG_ID_5 FILE_1.bam SAMPLE_A RG_ID_6 FILE_2.bam SAMPLE_A * FILE_3.bam SAMPLE_C - ? FILE_3.bam SAMPLE_D
+ ? FILE_3.bam SAMPLE_D
-q, -min-MQ INT
Minimum mapping quality for an alignment to be used [0] @@ -1798,7 +1817,7 @@ -x, --ignore-overlaps
Disable read-pair overlap detection. -

Output options

+

Output options

-a, --annotate LIST
Comma-separated list of FORMAT and INFO tags to output. (case-insensitive, @@ -1820,7 +1839,7 @@ FORMAT/DP4 .. Deprecated in favor of FORMAT/ADF and FORMAT/ADR; Number of high-quality ref-forward, ref-reverse, alt-forward and alt-reverse bases (Number=4,Type=Integer) FORMAT/DPR .. Deprecated in favor of FORMAT/AD; Number of high-quality bases for each observed allele (Number=R,Type=Integer) -INFO/DPR .. Deprecated in favor of INFO/AD; Number of high-quality bases for each observed allele (Number=R,Type=Integer)
+INFO/DPR .. Deprecated in favor of INFO/AD; Number of high-quality bases for each observed allele (Number=R,Type=Integer)
-g, --gvcf INT[,…]
output gVCF blocks of homozygous REF calls, with depth (DP) ranges @@ -1851,7 +1870,7 @@ --threads INT
see Common Options -

Options for SNP/INDEL genotype likelihood computation

+

Options for SNP/INDEL genotype likelihood computation

-e, --ext-prob INT
Phred-scaled gap extension sequencing error probability. Reducing INT @@ -1896,7 +1915,7 @@ indel candidates are obtained. It is recommended to collect indel candidates from sequencing technologies that have low indel error rate such as ILLUMINA [all] -

Examples:

Call SNPs and short INDELs, then mark low quality sites and sites with the read +

Examples:

Call SNPs and short INDELs, then mark low quality sites and sites with the read depth exceeding a limit. (The read depth should be adjusted to about twice the average read depth as higher read depths usually indicate problematic regions which are often enriched for artefacts.) One may consider to add -C50 to @@ -1904,10 +1923,32 @@ mismatches. Applying this option usually helps for BWA-backtrack alignments, but may not other aligners.

    bcftools mpileup -Ou -f ref.fa aln.bam | \
     bcftools call -Ou -mv | \
-    bcftools filter -s LowQual -e '%QUAL<20 || DP>100' > var.flt.vcf

bcftools norm [OPTIONS] file.vcf.gz

Left-align and normalize indels, check if REF alleles match the reference, + bcftools filter -s LowQual -e '%QUAL<20 || DP>100' > var.flt.vcf

bcftools norm [OPTIONS] file.vcf.gz

Left-align and normalize indels, check if REF alleles match the reference, split multiallelic sites into multiple rows; recover multiallelics from multiple rows. Left-alignment and normalization will only be applied if -the --fasta-ref option is supplied.

+the --fasta-ref option is supplied.

+-a, --atomize .|* +
+ Decompose complex variants (e.g. split MNVs into consecutive SNVs). + Alleles missing because of an overlapping variant can be set either + to missing (.) or to the star allele (*), as recommended by + the VCF specification. IMPORTANT: Note that the asterisk is expanded + by the shell and must be put in quotes or escaped by a backslash: +
    # Before atomization:
+    100  CC  C,GG   1/2
+
+    # After:
+    #   bcftools norm -a .
+    100  C       G      ./1
+    100  CC      C      1/.
+    101  C       G      ./1
+
+    # After:
+    #   bcftools norm -a '*'
+    #   bcftools norm -a \*
+    100  C       G,*    2/1
+    100  CC      C,*    1/2
+    101  C       G,*    2/1
-c, --check-ref e|w|x|s
what to do when incorrect or missing REF allele is encountered: @@ -2002,13 +2043,13 @@
maximum distance between two records to consider when locally sorting variants which changed position during the realignment -

bcftools [plugin NAME|+NAME] [OPTIONS] FILE — [PLUGIN OPTIONS]

A common framework for various utilities. The plugins can be used +

bcftools [plugin NAME|+NAME] [OPTIONS] FILE — [PLUGIN OPTIONS]

A common framework for various utilities. The plugins can be used the same way as normal commands only their name is prefixed with "+". Most plugins accept two types of parameters: general options shared by all plugins followed by a separator, and a list of plugin-specific options. There are some exceptions to this rule, some plugins do not accept the common options and implement their own parameters. Therefore please pay attention to -the usage examples that each plugin comes with.

VCF input options:

+the usage examples that each plugin comes with.

VCF input options:

-e, --exclude EXPRESSION
exclude sites for which EXPRESSION is true. For valid expressions see @@ -2034,7 +2075,7 @@ -T, --targets-file file
see Common Options -

VCF output options:

+

VCF output options:

--no-version
see Common Options @@ -2050,7 +2091,7 @@ --threads INT
see Common Options -

Plugin options:

+

Plugin options:

-h, --help
list plugin’s options @@ -2071,7 +2112,7 @@ -V, --version
print version string and exit -

List of plugins coming with the distribution:

+

List of plugins coming with the distribution:

ad-bias
find positions with wildly varying ALT allele frequency (Fisher test on FMT/AD) @@ -2104,7 +2145,7 @@

runs a basic association test, per-site or in a region, and checks for novel alleles and genotypes in two groups of samples. Adds the following INFO annotations: -

  • +

    • PASSOC .. Fisher’s exact test probability of genotypic association (REF vs non-REF allele)
    • FASSOC .. proportion of non-REF allele in controls and cases @@ -2131,7 +2172,7 @@ fill-tags

      set various INFO tags. The list of tags supported in this version: -

      • +

        • INFO/AC Number:A Type:Integer .. Allele count in genotypes
        • INFO/AC_Hom Number:A Type:Integer .. Allele counts in homozygous genotypes @@ -2250,7 +2291,7 @@
          convert between similar tags, such as GL and GP
          -trio-dnm +trio-dnm2
          screen variants for possible de-novo mutations in trios
          @@ -2266,7 +2307,7 @@ variantkey-hex
          generate unsorted VariantKey-RSid index files in hexadecimal format -

Examples:

# List options common to all plugins
+

Examples:

# List options common to all plugins
 bcftools plugin
 
 # List available plugins
@@ -2291,11 +2332,11 @@
 bcftools +missing2ref in.vcf
 
 # Replace missing genotypes with 0|0
-bcftools +missing2ref in.vcf -- -p

Plugins troubleshooting:

Things to check if your plugin does not show up in the bcftools plugin -l output:

  • +bcftools +missing2ref in.vcf -- -p

Plugins troubleshooting:

Things to check if your plugin does not show up in the bcftools plugin -l output:

  • Run with the -v option for verbose output: bcftools plugin -lv
  • Does the environment variable BCFTOOLS_PLUGINS include the correct path? -

Plugins API:

// Short description used by 'bcftools plugin -l'
+

Plugins API:

// Short description used by 'bcftools plugin -l'
 const char *about(void);
 
 // Longer description used by 'bcftools +name -h'
@@ -2310,10 +2351,10 @@
 bcf1_t *process(bcf1_t *rec);
 
 // Called after all lines have been processed to clean up
-void destroy(void);

bcftools polysomy [OPTIONS] file.vcf.gz

Detect number of chromosomal copies in VCFs annotates with the Illumina’s +void destroy(void);

bcftools polysomy [OPTIONS] file.vcf.gz

Detect the number of chromosomal copies in VCFs annotated with Illumina’s B-allele frequency (BAF) values. Note that this command is not compiled in by default, see the section Optional Compilation with GSL in the INSTALL -file for help.

General options:

+file for help.

General options:

-o, --output-dir path
output directory @@ -2342,7 +2383,7 @@
verbose debugging output which gives hints about the thresholds and decisions made by the program. Note that the exact output can change between versions. -

Algorithm options:

+

Algorithm options:

-b, --peak-size float
the minimum peak size considered as a good match can be from the interval [0,1] @@ -2374,7 +2415,7 @@
a heuristics to filter failed fits where the expected peak symmetry is violated. The float is from the interval [0,1] and larger is stricter -

bcftools query [OPTIONS] file.vcf.gz [file.vcf.gz […]]

Extracts fields from VCF or BCF files and outputs them in user-defined format.

+

bcftools query [OPTIONS] file.vcf.gz [file.vcf.gz […]]

Extracts fields from VCF or BCF files and outputs them in user-defined format.

-e, --exclude EXPRESSION
exclude sites for which EXPRESSION is true. For valid expressions see @@ -2433,7 +2474,7 @@ -v, --vcf-list FILE
process multiple VCFs listed in the file -

Format:

%CHROM          The CHROM column (similarly also other columns: POS, ID, REF, ALT, QUAL, FILTER)
+

Format:

%CHROM          The CHROM column (similarly also other columns: POS, ID, REF, ALT, QUAL, FILTER)
 %END            End position of the REF allele
 %END0           End position of the REF allele in 0-based coordinates
 %FIRST_ALT      Alias for %ALT{0}
@@ -2454,7 +2495,7 @@
 %TYPE           Variant type (REF, SNP, MNP, INDEL, BND, OTHER)
 []              Format fields must be enclosed in brackets to loop over all samples
 \n              new line
-\t              tab character
Everything else is printed verbatim.

Examples:

# Print chromosome, position, ref allele and the first alternate allele
+\t              tab character
Everything else is printed verbatim.

Examples:

# Print chromosome, position, ref allele and the first alternate allele
 bcftools query -f '%CHROM  %POS  %REF  %ALT{0}\n' file.vcf.gz
# Similar to above, but use tabs instead of spaces, add sample name and genotype
 bcftools query -f '%CHROM\t%POS\t%REF\t%ALT[\t%SAMPLE=%GT]\n' file.vcf.gz
# Print FORMAT/GT fields followed by FORMAT/GT fields
 bcftools query -f 'GQ:[ %GQ] \t GT:[ %GT]\n' file.vcf
# Make a BED file: chr, pos (0-based), end pos (1-based), id
@@ -2464,7 +2505,7 @@
 bcftools query -i'GT="het"' -f'[%CHROM:%POS %SAMPLE %GT %pbinom(AD)\n]' file.vcf
# Print the second value of AC field if bigger than 10. Note the (unfortunate) difference in
 # index subscript notation: formatting expressions (-f) uses "{}" while filtering expressions
 # (-i) use "[]". This is for historic reasons and backward-compatibility.
-bcftools query -f '%AC{1}\n' -i 'AC[1]>10' file.vcf.gz

bcftools reheader [OPTIONS] file.vcf.gz

Modify header of VCF/BCF files, change sample names.

+bcftools query -f '%AC{1}\n' -i 'AC[1]>10' file.vcf.gz

bcftools reheader [OPTIONS] file.vcf.gz

Modify header of VCF/BCF files, change sample names.

-f, --fai FILE
add to the header contig names and their lengths from the provided fasta index file (.fai). @@ -2491,8 +2532,8 @@ --threads INT
see Common Options -

bcftools roh [OPTIONS] file.vcf.gz

A program for detecting runs of homo/autozygosity. Only bi-allelic sites -are considered.

The HMM model:

Notation:
+

bcftools roh [OPTIONS] file.vcf.gz

A program for detecting runs of homo/autozygosity. Only bi-allelic sites +are considered.

The HMM model:

Notation:
   D  = Data, AZ = autozygosity, HW = Hardy-Weinberg (non-autozygosity),
   f  = non-ref allele frequency
 
@@ -2509,7 +2550,7 @@
   HWi = P_i(HW)
 
   P_{i+1}(AZ) = oAZ * max[(1 - tAZ * ci) * AZ{i-1} , tAZ * ci * (1-AZ{i-1})]
-  P_{i+1}(HW) = oHW * max[(1 - tHW * ci) * (1-AZ{i-1}) , tHW * ci * AZ{i-1}]

General Options:

+ P_{i+1}(HW) = oHW * max[(1 - tHW * ci) * (1-AZ{i-1}) , tHW * ci * AZ{i-1}]

General Options:

--AF-dflt FLOAT
in case allele frequency is not known, use the FLOAT. By default, sites where @@ -2528,7 +2569,7 @@ bgzip and indexed with tabix -s1 -b2 -e2. Sites which are not present in the FILE or have different reference or alternate allele will be skipped. Note that such a file can be easily created from a VCF using: -
    bcftools query -f'%CHROM\t%POS\t%REF,%ALT\t%INFO/TAG\n' file.vcf | bgzip -c > freqs.tab.gz
+
    bcftools query -f'%CHROM\t%POS\t%REF,%ALT\t%INFO/TAG\n' file.vcf | bgzip -c > freqs.tab.gz
-b, --buffer-size INT[,INT]
when the entire many-sample file cannot fit into memory, a sliding @@ -2617,7 +2658,7 @@ -T, --targets-file file
see Common Options -

HMM Options:

+

HMM Options:

-a, --hw-to-az FLOAT
P(AZ|HW) transition probability from AZ (autozygous) to HW (Hardy-Weinberg) state @@ -2630,7 +2671,7 @@
estimate HMM parameters using Baum-Welch algorithm, using the convergence threshold FLOAT, e.g. 1e-10 (experimental) -

bcftools sort [OPTIONS] file.bcf

+

bcftools sort [OPTIONS] file.bcf

-m, --max-mem FLOAT[kMG]
Maximum memory to use. Approximate, affects the number of temporary files written @@ -2648,7 +2689,7 @@ -T, --temp-dir DIR
Use this directory to store temporary files -

bcftools stats [OPTIONS] A.vcf.gz [B.vcf.gz]

Parses VCF or BCF and produces text file stats which is suitable for machine +

bcftools stats [OPTIONS] A.vcf.gz [B.vcf.gz]

Parses VCF or BCF and produces text file stats which is suitable for machine processing and can be plotted using plot-vcfstats. When two files are given, the program generates separate stats for intersection and the complements. By default only sites are compared, -s/-S must given to include also sample @@ -2658,7 +2699,7 @@ etc. are printed. When two VCF files are given, then stats such as concordance (Genotype concordance by non-reference allele frequency, Genotype concordance by sample, Non-Reference Discordance) -and correlation are also printed. Per-site discordance (PSD) is also printed in --verbose mode.

+and correlation are also printed. Per-site discordance (PSD) is also printed in --verbose mode.

--af-bins LIST|FILE
comma separated list of allele frequency bins (e.g. 0.1,0.5,1) @@ -2695,7 +2736,7 @@ tab-delimited file with exons for indel frameshifts statistics. The columns of the file are CHR, FROM, TO, with 1-based, inclusive, positions. The file is BGZF-compressed and indexed with tabix -
    tabix -s1 -b2 -e3 file.gz
+
    tabix -s1 -b2 -e3 file.gz
-f, --apply-filters LIST
see Common Options @@ -2745,8 +2786,8 @@ -v, --verbose
produce verbose per-site and per-sample output -

bcftools view [OPTIONS] file.vcf.gz [REGION […]]

View, subset and filter VCF or BCF files by position and filtering expression. -Convert between VCF and BCF. Former bcftools subset.

Output options

+

bcftools view [OPTIONS] file.vcf.gz [REGION […]]

View, subset and filter VCF or BCF files by position and filtering expression. +Convert between VCF and BCF. Former bcftools subset.

Output options

-G, --drop-genotypes
drop individual genotype information (after subsetting if -s option is set) @@ -2772,7 +2813,7 @@
see Common Options

-o, --output FILE: - output file name. If not present, the default is to print to standard output (stdout).

+ output file name. If not present, the default is to print to standard output (stdout).

-r, --regions chr|chr:pos|chr:from-to|chr:from-[,…]
see Common Options @@ -2792,7 +2833,7 @@ --threads INT
see Common Options -

Subset options:

+

Subset options:

-a, --trim-alt-alleles
remove alleles not seen in the genotype fields from the ALT column. Note that if no alternate allele @@ -2817,7 +2858,7 @@
see Common Options. Note that it is possible to create multiple subsets simultaneously using the split plugin. -

Filter options:

Note that filter options below dealing with counting the number of alleles +

Filter options:

Note that filter options below dealing with counting the number of alleles will, for speed, first check for the values of AC and AN in the INFO column to avoid parsing all the genotype (FORMAT/GT) fields in the VCF. This means that a filter like --min-af 0.1 will be calculated from INFO/AC and INFO/AN @@ -2828,7 +2869,7 @@ and some are inherently ambiguous, for example allele counts can be taken from the INFO column when present but calculated on the fly when absent. Therefore it is strongly recommended to spell out the required order explicitly by separating such commands into two steps. (Make sure to use the -O u option -when piping!)

+when piping!)

-c, --min-ac INT[:nref|:alt1|:minor|:major|:'nonmajor']
minimum allele count (INFO/AC) of sites to be printed. @@ -2939,10 +2980,10 @@ -X, --exclude-private
exclude sites where only the subset samples carry an non-reference allele -

bcftools help [COMMAND] | bcftools --help [COMMAND]

Display a brief usage message listing the bcftools commands available. +

bcftools help [COMMAND] | bcftools --help [COMMAND]

Display a brief usage message listing the bcftools commands available. If the name of a command is also given, e.g., bcftools help view, the detailed -usage message for that particular command is displayed.

bcftools [--version|-v]

Display the version numbers and copyright information for bcftools and the -important libraries used by bcftools.

bcftools [--version-only]

Display the full bcftools version number in a machine-readable format.

EXPRESSIONS

These filtering expressions are accepted by most of the commands.

Valid expressions may contain:

  • +usage message for that particular command is displayed.

bcftools [--version|-v]

Display the version numbers and copyright information for bcftools and the +important libraries used by bcftools.

bcftools [--version-only]

Display the full bcftools version number in a machine-readable format.

EXPRESSIONS

These filtering expressions are accepted by most of the commands.

Valid expressions may contain:

  • numerical constants, string constants, file names (this is currently supported only to filter by the ID column)

    1, 1.0, 1e-4
    @@ -3052,7 +3093,7 @@
     the section Optional Compilation with Perl in the INSTALL file for help
     and misc/demo-flt.pl for a working example. The demo defined the perl subroutine
     "severity" which can be invoked from the command line as follows:
    -

    perl:path/to/script.pl; perl.severity(INFO/CSQ) > 3

Notes:

  • +

    perl:path/to/script.pl; perl.severity(INFO/CSQ) > 3

Notes:

  • String comparisons and regular expressions are case-insensitive
  • Comma in strings is interpreted as a separator and when multiple values are compared, the OR logic is used. @@ -3076,9 +3117,9 @@ -e 'TAG[0]!=1' .. true

Examples:

MIN(DV)>5       .. selects the whole site, evaluates min across all values and samples
SMPL_MIN(DV)>5  .. selects matching samples, evaluates within samples
MIN(DV/DP)>0.3
MIN(DP)>10 & MIN(DV)>3
FMT/DP>10  & FMT/GQ>10 .. both conditions must be satisfied within one sample
FMT/DP>10 && FMT/GQ>10 .. the conditions can be satisfied in different samples
QUAL>10 |  FMT/GQ>10   .. true for sites with QUAL>10 or a sample with GQ>10, but selects only samples with GQ>10
QUAL>10 || FMT/GQ>10   .. true for sites with QUAL>10 or a sample with GQ>10, plus selects all samples at such sites
TYPE="snp" && QUAL>=10 && (DP4[2]+DP4[3] > 2)
COUNT(GT="hom")=0      .. no homozygous genotypes at the site
AVG(GQ)>50             .. average (arithmetic mean) of genotype qualities bigger than 50
ID=@file       .. selects lines with ID present in the file
ID!=@~/file    .. skip lines with ID present in the ~/file
MAF[0]<0.05    .. select rare variants at 5% cutoff
POS>=100   .. restrict your range query, e.g. 20:100-200 to strictly sites with POS in that range.

Shell expansion:

Note that expressions must often be quoted because some characters have special meaning in the shell. An example of expression enclosed in single quotes which cause -that the whole expression is passed to the program as intended:

bcftools view -i '%ID!="." & MAF[0]<0.01'

Please refer to the documentation of your shell for details.

SCRIPTS AND OPTIONS

plot-vcfstats [OPTIONS] file.vchk […]

Script for processing output of bcftools stats. It can merge +that the whole expression is passed to the program as intended:

bcftools view -i '%ID!="." & MAF[0]<0.01'

Please refer to the documentation of your shell for details.

SCRIPTS AND OPTIONS

plot-vcfstats [OPTIONS] file.vchk […]

Script for processing output of bcftools stats. It can merge results from multiple outputs (useful when running the stats for each -chromosome separately), plots graphs and creates a PDF presentation.

+chromosome separately), plots graphs and creates a PDF presentation.

-m, --merge
Merge vcfstats files to STDOUT, skip plotting. @@ -3116,15 +3157,15 @@ bcftools stats -s - > file.vchk
# Plot the stats
 plot-vcfstats -p outdir file.vchk
# The final looks can be customized by editing the generated
 # 'outdir/plot.py' script and re-running manually
-cd outdir && python plot.py && pdflatex summary.tex

PERFORMANCE

HTSlib was designed with BCF format in mind. When parsing VCF files, all records +cd outdir && python plot.py && pdflatex summary.tex

PERFORMANCE

HTSlib was designed with BCF format in mind. When parsing VCF files, all records are internally converted into BCF representation. Simple operations, like removing a single column from a VCF file, can be therefore done much faster with standard UNIX commands, such as awk or cut. Therefore it is recommended to use BCF as input/output format whenever possible to avoid -large overhead of the VCF → BCF → VCF conversion.

BUGS

Please report any bugs you encounter on the github website: http://github.com/samtools/bcftools

AUTHORS

Heng Li from the Sanger Institute wrote the original C version of htslib, +large overhead of the VCF → BCF → VCF conversion.

BUGS

Please report any bugs you encounter on the github website: http://github.com/samtools/bcftools

AUTHORS

Heng Li from the Sanger Institute wrote the original C version of htslib, samtools and bcftools. Bob Handsaker from the Broad Institute implemented the BGZF library. Petr Danecek, Shane McCarthy and John Marshall are maintaining and further developing bcftools. Many other people contributed to the program and to the file format specifications, both directly and indirectly by -providing patches, testing and reporting bugs. We thank them all.

RESOURCES

BCFtools GitHub website: http://github.com/samtools/bcftools

Samtools GitHub website: http://github.com/samtools/samtools

HTSlib GitHub website: http://github.com/samtools/htslib

File format specifications: http://samtools.github.io/hts-specs

BCFtools documentation: http://samtools.github.io/bcftools

BCFtools wiki page: https://github.com/samtools/bcftools/wiki

COPYING

The MIT/Expat License or GPL License, see the LICENSE document for details. -Copyright (c) Genome Research Ltd.

+providing patches, testing and reporting bugs. We thank them all.

RESOURCES

BCFtools GitHub website: http://github.com/samtools/bcftools

Samtools GitHub website: http://github.com/samtools/samtools

HTSlib GitHub website: http://github.com/samtools/htslib

File format specifications: http://samtools.github.io/hts-specs

BCFtools documentation: http://samtools.github.io/bcftools

BCFtools wiki page: https://github.com/samtools/bcftools/wiki

COPYING

The MIT/Expat License or GPL License, see the LICENSE document for details. +Copyright (c) Genome Research Ltd.

\ No newline at end of file diff --git a/doc/bcftools.txt b/doc/bcftools.txt index 66c5e1932..837e6a733 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -1983,6 +1983,34 @@ split multiallelic sites into multiple rows; recover multiallelics from multiple rows. Left-alignment and normalization will only be applied if the *<>* option is supplied. +*-a, --atomize*:: + Decompose complex variants, e.g. split MNVs into consecutive SNVs. + See also *--atom-overlaps* and *--old-rec-tag*. + +*--atom-overlaps* '.'|'*':: + Alleles missing because of an overlapping variant can be set either + to missing (.) or to the star allele (*), as recommended by + the VCF specification. IMPORTANT: Note that the asterisk is expanded + by the shell and must be put in quotes or escaped by a backslash: +---- + # Before atomization: + 100 CC C,GG 1/2 + + # After: + # bcftools norm -a --atom-overlaps . + 100 C G ./1 + 100 CC C 1/. + 101 C G ./1 + + # After: + # bcftools norm -a --atom-overlaps '*' + # bcftools norm -a --atom-overlaps \* + 100 C G,* 2/1 + 100 CC C,* 1/2 + 101 C G,* 2/1 + +---- + *-c, --check-ref* 'e'|'w'|'x'|'s':: what to do when incorrect or missing REF allele is encountered: exit ('e'), warn ('w'), exclude ('x'), or set/fix ('s') bad sites. @@ -2032,6 +2060,10 @@ the *<>* option is supplied. reference '-f'. The '-N' option will not turn on indel normalisation as the '-f' option normally implies +*--old-rec-tag* 'STR':: + Add INFO/STR annotation with the original record. The format of the + annotation is CHROM|POS|REF|ALT|USED_ALT_IDX. 
+ *-o, --output* 'FILE':: see *<>* diff --git a/test/atomize.split.1.1.out b/test/atomize.split.1.1.out new file mode 100644 index 000000000..20b6d25ec --- /dev/null +++ b/test/atomize.split.1.1.out @@ -0,0 +1,46 @@ +##fileformat=VCFv4.3 +##FILTER= +##contig= +##contig= +##FILTER= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 S3 S4 +11 101 . G C,* . . OLD_REC=11|101|GCGT|G,GCGA,GTGA,CCGT|4 GT 2 0 0 1 +11 101 . GCGT G,* . . OLD_REC=11|101|GCGT|G,GCGA,GTGA,CCGT|1 GT 1 2 2 2 +11 102 . C T,* . . OLD_REC=11|101|GCGT|G,GCGA,GTGA,CCGT|3 GT 2 0 1 0 +11 104 . T A,* . . OLD_REC=11|101|GCGT|G,GCGA,GTGA,CCGT|2,3 GT 2 1 1 0 +11 201 . C G,* . . OLD_REC=11|201|CC|GG,GT|1,2 GT 0 1 1 0 +11 202 . C G,* . . OLD_REC=11|201|CC|GG,GT|1 GT 0 1 2 0 +11 202 . C T,* . . OLD_REC=11|201|CC|GG,GT|2 GT 0 2 1 0 +12 101 rs101 G C,* 199 flt INDEL;AN=4;AC=4,.;DP=19;ISTR=SomeString;XRF=0,4,.;XRI=0,4,.;XAF=4,.;XAI=444,.;OLD_REC=12|101|GCGT|G,GCGA,GTGA,CCGT|4 GT 0/2 2/0 0/0 0/1 +12 101 rs101 GCGT G,* 199 flt INDEL;AN=4;AC=1,.;DP=19;ISTR=SomeString;XRF=0,10,.;XRI=0,1111,.;XAF=10,.;XAI=1111,.;OLD_REC=12|101|GCGT|G,GCGA,GTGA,CCGT|1 GT 0/1 1/2 2/2 2/2 +12 102 rs101 C T,* 199 flt INDEL;AN=4;AC=3,.;DP=19;ISTR=SomeString;XRF=0,300000,.;XRI=0,3333,.;XAF=3,.;XAI=33,.;OLD_REC=12|101|GCGT|G,GCGA,GTGA,CCGT|3 GT 0/2 2/0 0/1 0/0 +12 104 rs101 T A,* 199 flt INDEL;AN=4;AC=2,.;DP=19;ISTR=SomeString;XRF=0,200,.;XRI=0,2222,.;XAF=200000,.;XAI=22,.;OLD_REC=12|101|GCGT|G,GCGA,GTGA,CCGT|2,3 GT 0/2 2/1 1/1 1/0 +12 201 . C G,* . . OLD_REC=12|201|CC|GG,GT|1,2 GT:FSTR:FFI:FFF:FAF:FAI:FRF:FRI:PL 0/0:xx:0:0:1.1,.:1,.:0,0,.:0,0,.:0,1,2,.,.,. 1:yy:11:1.1:1.1,.:1,.:0,0,.:0,0,.:0,2,. 1/1:zz:22:2.2:1.1,.:1,.:0,0,.:0,0,.:0,1,2,.,.,. 0:.:.:.:.,.:.,.:.,.,.:.,.,.:. +12 202 . 
C G,* . . OLD_REC=12|201|CC|GG,GT|1 GT:FSTR:FFI:FFF:FAF:FAI:FRF:FRI:PL 0/0:xx:0:0:1.1,.:1,.:0,0,.:0,0,.:0,1,2,.,.,. 1:yy:11:1.1:1.1,.:1,.:0,0,.:0,0,.:0,2,. 2/2:zz:22:2.2:1.1,.:1,.:0,0,.:0,0,.:0,1,2,.,.,. 0:.:.:.:.,.:.,.:.,.,.:.,.,.:. +12 202 . C T,* . . OLD_REC=12|201|CC|GG,GT|2 GT:FSTR:FFI:FFF:FAF:FAI:FRF:FRI:PL 0/0:xx:0:0:2.2,.:2,.:0,1.1,.:0,1,.:0,3,5,.,.,. 2:yy:11:1.1:2.2,.:2,.:0,1.1,.:0,1,.:0,5,. 1/1:zz:22:2.2:2.2,.:2,.:0,1.1,.:0,1,.:0,3,5,.,.,. 0:.:.:.:::.:.:. diff --git a/test/atomize.split.1.2.out b/test/atomize.split.1.2.out new file mode 100644 index 000000000..6d1b45054 --- /dev/null +++ b/test/atomize.split.1.2.out @@ -0,0 +1,46 @@ +##fileformat=VCFv4.3 +##FILTER= +##contig= +##contig= +##FILTER= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 S3 S4 +11 101 . G C . . OLD_REC=11|101|GCGT|G,GCGA,GTGA,CCGT|4 GT . 0 0 1 +11 101 . GCGT G . . OLD_REC=11|101|GCGT|G,GCGA,GTGA,CCGT|1 GT 1 . . . +11 102 . C T . . OLD_REC=11|101|GCGT|G,GCGA,GTGA,CCGT|3 GT . 0 1 0 +11 104 . T A . . OLD_REC=11|101|GCGT|G,GCGA,GTGA,CCGT|2,3 GT . 1 1 0 +11 201 . C G . . OLD_REC=11|201|CC|GG,GT|1,2 GT 0 1 1 0 +11 202 . C G . . OLD_REC=11|201|CC|GG,GT|1 GT 0 1 . 0 +11 202 . C T . . OLD_REC=11|201|CC|GG,GT|2 GT 0 . 1 0 +12 101 rs101 G C 199 flt INDEL;AN=4;AC=4;DP=19;ISTR=SomeString;XRF=0,4;XRI=0,4;XAF=4;XAI=444;OLD_REC=12|101|GCGT|G,GCGA,GTGA,CCGT|4 GT 0/. ./0 0/0 0/1 +12 101 rs101 GCGT G 199 flt INDEL;AN=4;AC=1;DP=19;ISTR=SomeString;XRF=0,10;XRI=0,1111;XAF=10;XAI=1111;OLD_REC=12|101|GCGT|G,GCGA,GTGA,CCGT|1 GT 0/1 1/. ./. ./. +12 102 rs101 C T 199 flt INDEL;AN=4;AC=3;DP=19;ISTR=SomeString;XRF=0,300000;XRI=0,3333;XAF=3;XAI=33;OLD_REC=12|101|GCGT|G,GCGA,GTGA,CCGT|3 GT 0/. 
./0 0/1 0/0 +12 104 rs101 T A 199 flt INDEL;AN=4;AC=2;DP=19;ISTR=SomeString;XRF=0,200;XRI=0,2222;XAF=200000;XAI=22;OLD_REC=12|101|GCGT|G,GCGA,GTGA,CCGT|2,3 GT 0/. ./1 1/1 1/0 +12 201 . C G . . OLD_REC=12|201|CC|GG,GT|1,2 GT:FSTR:FFI:FFF:FAF:FAI:FRF:FRI:PL 0/0:xx:0:0:1.1:1:0,0:0,0:0,1,2 1:yy:11:1.1:1.1:1:0,0:0,0:0,2 1/1:zz:22:2.2:1.1:1:0,0:0,0:0,1,2 0:.:.:.:.:.:.,.:.,.:. +12 202 . C G . . OLD_REC=12|201|CC|GG,GT|1 GT:FSTR:FFI:FFF:FAF:FAI:FRF:FRI:PL 0/0:xx:0:0:1.1:1:0,0:0,0:0,1,2 1:yy:11:1.1:1.1:1:0,0:0,0:0,2 ./.:zz:22:2.2:1.1:1:0,0:0,0:0,1,2 0:.:.:.:.:.:.,.:.,.:. +12 202 . C T . . OLD_REC=12|201|CC|GG,GT|2 GT:FSTR:FFI:FFF:FAF:FAI:FRF:FRI:PL 0/0:xx:0:0:2.2:2:0,1.1:0,1:0,3,5 .:yy:11:1.1:2.2:2:0,1.1:0,1:0,5 1/1:zz:22:2.2:2.2:2:0,1.1:0,1:0,3,5 0:.:.:.:::.:.:. diff --git a/test/atomize.split.1.vcf b/test/atomize.split.1.vcf new file mode 100644 index 000000000..3489a323e --- /dev/null +++ b/test/atomize.split.1.vcf @@ -0,0 +1,34 @@ +##fileformat=VCFv4.3 +##contig= +##contig= +##FILTER= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 S3 S4 +11 101 . GCGT G,GCGA,GTGA,CCGT . . . GT 1 2 3 4 +11 201 . CC GG,GT . . . GT 0 1 2 0 +12 101 rs101 GCGT G,GCGA,GTGA,CCGT 199 flt INDEL;AN=4;AC=1,2,3,4;DP=19;ISTR=SomeString;XRF=0,1e+01,2e+02,300000,4;XRI=0,1111,2222,3333,4;XRS=000,AAA,BBB,DDD,xx;XAF=1e+01,200000,3,4.0;XAI=1111,22,33,444;XAS=AAA,DDD,xx,zzz;XGF=1e+01,2e+02,3e+03,500000,.,9e+09,7;XGI=1111,2222,3333,5555,.,9999,7;XGS=A,B,C,E,.,F,x GT 0/1 1/2 2/3 2/4 +12 201 . CC GG,GT . . . 
GT:FSTR:FFI:FFF:FAF:FAI:FAS:FRF:FRI:FRS:PL 0/0:xx:00:0.0:1.1,2.2:1,2:a,b:0.0,1.1,2.2:0,1,2:a,b,c:0,1,2,3,4,5 1:yy:11:1.1:1.1,2.2:1,2:a,b:0.0,1.1,2.2:0,1,2:a,b,c:0,2,5 2/2:zz:22:2.2:1.1,2.2:1,2:a,b:0.0,1.1,2.2:0,1,2:a,b,c:0,1,2,3,4,5 0 diff --git a/test/atomize.split.2.1.out b/test/atomize.split.2.1.out new file mode 100644 index 000000000..a9e563b54 --- /dev/null +++ b/test/atomize.split.2.1.out @@ -0,0 +1,9 @@ +##fileformat=VCFv4.3 +##FILTER= +##contig= +##FORMAT= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample +1 100 . C G . . OLD_REC=1|100|CC|GG|1 GT 0/1 +1 101 . C G . . OLD_REC=1|100|CC|GG|1 GT 0/1 +1 110 . C T,<*> . . . GT 0/1 diff --git a/test/atomize.split.2.2.out b/test/atomize.split.2.2.out new file mode 100644 index 000000000..09d4e25eb --- /dev/null +++ b/test/atomize.split.2.2.out @@ -0,0 +1,8 @@ +##fileformat=VCFv4.3 +##FILTER= +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample +1 100 . C G . . . GT ./1 +1 100 . CC C . . . GT 1/. +1 101 . C G . . . GT ./1 diff --git a/test/atomize.split.2.vcf b/test/atomize.split.2.vcf new file mode 100644 index 000000000..ef43b6f3d --- /dev/null +++ b/test/atomize.split.2.vcf @@ -0,0 +1,6 @@ +##fileformat=VCFv4.3 +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample +1 100 . CC GG . . . GT 0/1 +1 110 . C T,<*> . . . GT 0/1 diff --git a/test/test.pl b/test/test.pl index d4df585a6..ba3a26f4d 100755 --- a/test/test.pl +++ b/test/test.pl @@ -232,6 +232,10 @@ test_vcf_norm($opts,in=>'norm.2',fai=>'norm.2',out=>'norm.2.out',args=>''); test_vcf_norm($opts,in=>'norm.iupac',fai=>'norm.iupac',out=>'norm.iupac.out',args=>'-c s'); test_vcf_norm($opts,in=>'norm.3',fai=>'norm.3',out=>'norm.3.out',args=>'-c s'); +test_vcf_norm($opts,in=>'atomize.split.1',out=>'atomize.split.1.1.out',args=>'--atomize --old-rec-tag OLD_REC'); +test_vcf_norm($opts,in=>'atomize.split.1',out=>'atomize.split.1.2.out',args=>'--atomize --atom-overlaps . 
--old-rec-tag OLD_REC'); +test_vcf_norm($opts,in=>'atomize.split.2',out=>'atomize.split.2.1.out',args=>'--atomize --old-rec-tag OLD_REC'); +test_vcf_norm($opts,in=>'atomize.split.2',out=>'atomize.split.2.1.out',args=>'--atomize --atom-overlaps . --old-rec-tag OLD_REC'); test_vcf_view($opts,in=>'view',out=>'view.1.out',args=>'-aUc1 -C1 -s NA00002 -v snps',reg=>''); test_vcf_view($opts,in=>'view',out=>'view.2.out',args=>'-f PASS -Xks NA00003',reg=>'-r20,Y'); test_vcf_view($opts,in=>'view',out=>'view.3.out',args=>'-xs NA00003',reg=>''); diff --git a/vcfnorm.c b/vcfnorm.c index 24e68e2a6..356252a00 100644 --- a/vcfnorm.c +++ b/vcfnorm.c @@ -39,6 +39,7 @@ THE SOFTWARE. */ #include #include "bcftools.h" #include "rbuf.h" +#include "abuf.h" #define CHECK_REF_EXIT 1 #define CHECK_REF_WARN 2 @@ -85,13 +86,13 @@ typedef struct int32_t *int32_arr; int ntmp_arr1, ntmp_arr2, nint32_arr; kstring_t *tmp_str; - kstring_t *tmp_als, tmp_als_str; + kstring_t *tmp_als, tmp_kstr; int ntmp_als; rbuf_t rbuf; int buf_win; // maximum distance between two records to consider int aln_win; // the realignment window size (maximum repeat size) bcf_srs_t *files; // using the synced reader only for -r option - bcf_hdr_t *hdr; + bcf_hdr_t *hdr, *out_hdr; cmpals_t cmpals_in, cmpals_out; faidx_t *fai; struct { int tot, set, swap; } nref; @@ -99,6 +100,11 @@ typedef struct int argc, rmdup, output_type, n_threads, check_ref, strict_filter, do_indels; int nchanged, nskipped, nsplit, ntotal, mrows_op, mrows_collapse, parsimonious; int record_cmd_line, force, force_warned, keep_sum_ad; + abuf_t *abuf; + abuf_opt_t atomize; + int use_star_allele; + char *old_rec_tag; + htsFile *out; } args_t; @@ -159,7 +165,7 @@ static void fix_ref(args_t *args, bcf1_t *line) line->d.allele[0][0] = ref[0]; args->nref.set++; free(ref); - bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele); + bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele); return; } @@ -173,7 
+179,7 @@ static void fix_ref(args_t *args, bcf1_t *line) if ( has_non_acgtn ) { args->nref.set++; - bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele); + bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele); if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; } } @@ -196,7 +202,7 @@ static void fix_ref(args_t *args, bcf1_t *line) if ( fix ) { args->nref.set++; - bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele); + bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele); if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; } } @@ -218,7 +224,7 @@ static void fix_ref(args_t *args, bcf1_t *line) kputc(',',&str); kputs(line->d.allele[i],&str); } - bcf_update_alleles_str(args->hdr,line,str.s); + bcf_update_alleles_str(args->out_hdr,line,str.s); free(ref); free(str.s); return; @@ -234,7 +240,7 @@ static void fix_ref(args_t *args, bcf1_t *line) else kputs(line->d.allele[j],&str); } - bcf_update_alleles_str(args->hdr,line,str.s); + bcf_update_alleles_str(args->out_hdr,line,str.s); args->nref.swap++; free(ref); free(str.s); @@ -252,7 +258,7 @@ static void fix_ref(args_t *args, bcf1_t *line) else if ( gts[j]==bcf_gt_unphased(i) ) gts[j] = bcf_gt_unphased(0); else if ( gts[j]==bcf_gt_phased(i) ) gts[j] = bcf_gt_phased(0); } - bcf_update_genotypes(args->hdr,line,gts,ngts); + bcf_update_genotypes(args->out_hdr,line,gts,ngts); // update AC int nac = bcf_get_info_int32(args->hdr, line, "AC", &args->tmp_arr1, &ntmp); @@ -261,7 +267,7 @@ static void fix_ref(args_t *args, bcf1_t *line) { int32_t *ac = (int32_t*)args->tmp_arr1; ac[i-1] = ni; - bcf_update_info_int32(args->hdr, line, "AC", ac, nac); + bcf_update_info_int32(args->out_hdr, line, "AC", ac, nac); } } @@ -287,7 +293,7 @@ static void fix_dup_alt(args_t *args, bcf1_t *line) if ( !args->tmp_arr1[i] ) continue; line->d.allele[j++] = line->d.allele[i]; } - 
bcf_update_alleles(args->hdr, line, (const char**)line->d.allele, nals); + bcf_update_alleles(args->out_hdr, line, (const char**)line->d.allele, nals); // update genotypes @@ -305,7 +311,36 @@ static void fix_dup_alt(args_t *args, bcf1_t *line) gts[i] = bcf_gt_is_phased(gts[i]) ? bcf_gt_phased(ial_new) : bcf_gt_unphased(ial_new); changed = 1; } - if ( changed ) bcf_update_genotypes(args->hdr,line,gts,ngts); + if ( changed ) bcf_update_genotypes(args->out_hdr,line,gts,ngts); +} + +static void set_old_rec_tag(args_t *args, bcf1_t *dst, bcf1_t *src, int ialt) +{ + if ( !args->old_rec_tag ) return; + + // only update if the tag is not present already, there can be multiple normalization steps + int i, id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, args->old_rec_tag); + bcf_unpack(dst, BCF_UN_INFO); + for (i=0; in_info; i++) + { + bcf_info_t *inf = &dst->d.info[i]; + if ( inf && inf->key == id ) return; + } + + args->tmp_kstr.l = 0; + ksprintf(&args->tmp_kstr,"%s|%"PRIhts_pos"|%s|",bcf_seqname(args->hdr,src),src->pos+1,src->d.allele[0]); + for (i=1; in_allele; i++) + { + kputs(src->d.allele[i],&args->tmp_kstr); + if ( i+1n_allele ) kputc(',',&args->tmp_kstr); + } + if ( ialt>0 ) + { + args->tmp_kstr.s[args->tmp_kstr.l-1] = '|'; + kputw(ialt,&args->tmp_kstr); + } + if ( (bcf_update_info_string(args->out_hdr, dst, args->old_rec_tag, args->tmp_kstr.s))!=0 ) + error("An error occurred while updating INFO/%s\n",args->old_rec_tag); } #define ERR_DUP_ALLELE -2 @@ -352,7 +387,7 @@ static int realign(args_t *args, bcf1_t *line) if ( line->rlen > 1 ) { line->d.allele[0][1] = 0; - bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele); + bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele); } return ERR_OK; } @@ -382,7 +417,7 @@ static int realign(args_t *args, bcf1_t *line) } // trim from right - int ori_pos = line->pos; + int new_pos = line->pos; while (1) { // is the rightmost base identical in all alleles? 
@@ -393,7 +428,7 @@ static int realign(args_t *args, bcf1_t *line) if ( als[i].l < min_len ) min_len = als[i].l; } if ( i!=line->n_allele ) break; // there are differences, cannot be trimmed - if ( min_len<=1 && line->pos==0 ) break; + if ( min_len<=1 && new_pos==0 ) break; int pad_from_left = 0; for (i=0; in_allele; i++) // trim all alleles @@ -403,10 +438,10 @@ static int realign(args_t *args, bcf1_t *line) } if ( pad_from_left ) { - int npad = line->pos >= args->aln_win ? args->aln_win : line->pos; + int npad = new_pos >= args->aln_win ? args->aln_win : new_pos; free(ref); - ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos-npad, line->pos-1, &nref); - if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos-npad+1); + ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, new_pos-npad, new_pos-1, &nref); + if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) new_pos-npad+1); replace_iupac_codes(ref,nref); for (i=0; in_allele; i++) { @@ -415,7 +450,7 @@ static int realign(args_t *args, bcf1_t *line) memcpy(als[i].s,ref,npad); als[i].l += npad; } - line->pos -= npad; + new_pos -= npad; } } free(ref); @@ -441,32 +476,36 @@ static int realign(args_t *args, bcf1_t *line) memmove(als[i].s,als[i].s+ntrim_left,als[i].l-ntrim_left); als[i].l -= ntrim_left; } - line->pos += ntrim_left; + new_pos += ntrim_left; } // Have the alleles changed? 
als[0].s[ als[0].l ] = 0; // in order for strcmp to work - if ( ori_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK; + if ( new_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK; + + set_old_rec_tag(args, line, line, 0); // Create new block of alleles and update - args->tmp_als_str.l = 0; + args->tmp_kstr.l = 0; for (i=0; in_allele; i++) { - if (i>0) kputc(',',&args->tmp_als_str); - kputsn(als[i].s,als[i].l,&args->tmp_als_str); + if (i>0) kputc(',',&args->tmp_kstr); + kputsn(als[i].s,als[i].l,&args->tmp_kstr); } - args->tmp_als_str.s[ args->tmp_als_str.l ] = 0; - bcf_update_alleles_str(args->hdr,line,args->tmp_als_str.s); + args->tmp_kstr.s[ args->tmp_kstr.l ] = 0; + bcf_update_alleles_str(args->out_hdr,line,args->tmp_kstr.s); args->nchanged++; // Update INFO/END if necessary int new_reflen = strlen(line->d.allele[0]); - if ( (ori_pos!=line->pos || reflen!=new_reflen) && bcf_get_info_int32(args->hdr, line, "END", &args->int32_arr, &args->nint32_arr)==1 ) + if ( (new_pos!=line->pos || reflen!=new_reflen) && bcf_get_info_int32(args->hdr, line, "END", &args->int32_arr, &args->nint32_arr)==1 ) { // bcf_update_alleles_str() messed up rlen because line->pos changed. 
This will be fixed by bcf_update_info_int32() + line->pos = new_pos; args->int32_arr[0] = line->pos + new_reflen; - bcf_update_info_int32(args->hdr, line, "END", args->int32_arr, 1); + bcf_update_info_int32(args->out_hdr, line, "END", args->int32_arr, 1); } + line->pos = new_pos; return ERR_OK; } @@ -496,13 +535,13 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int } \ if ( args->force ) \ { \ - bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \ + bcf_update_info_##type(args->out_hdr,dst,tag,NULL,0); \ return; \ } \ error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \ tag,bcf_seqname(args->hdr,src),(int64_t) src->pos+1,src->n_allele-1,ret); \ } \ - bcf_update_info_##type(args->hdr,dst,tag,vals+ialt,1); \ + bcf_update_info_##type(args->out_hdr,dst,tag,vals+ialt,1); \ } \ else if ( len==BCF_VL_R ) \ { \ @@ -518,7 +557,7 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int } \ if ( args->force ) \ { \ - bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \ + bcf_update_info_##type(args->out_hdr,dst,tag,NULL,0); \ return; \ } \ error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \ @@ -535,7 +574,7 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int { \ if ( ialt!=0 ) vals[1] = vals[ialt+1]; \ } \ - bcf_update_info_##type(args->hdr,dst,tag,vals,2); \ + bcf_update_info_##type(args->out_hdr,dst,tag,vals,2); \ } \ else if ( len==BCF_VL_G ) \ { \ @@ -551,7 +590,7 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int } \ if ( args->force ) \ { \ - bcf_update_info_##type(args->hdr,dst,tag,NULL,0); \ + bcf_update_info_##type(args->out_hdr,dst,tag,NULL,0); \ return; \ } \ error("Error: wrong number of fields in INFO/%s at %s:%"PRId64", expected %d, found %d\n", \ @@ -562,10 +601,10 @@ static void split_info_numeric(args_t *args, bcf1_t *src, bcf_info_t *info, int vals[1] = 
vals[bcf_alleles2gt(0,ialt+1)]; \ vals[2] = vals[bcf_alleles2gt(ialt+1,ialt+1)]; \ } \ - bcf_update_info_##type(args->hdr,dst,tag,vals,3); \ + bcf_update_info_##type(args->out_hdr,dst,tag,vals,3); \ } \ else \ - bcf_update_info_##type(args->hdr,dst,tag,vals,ret); \ + bcf_update_info_##type(args->out_hdr,dst,tag,vals,ret); \ } switch (bcf_hdr_id2type(args->hdr,BCF_HL_INFO,info->key)) { @@ -618,7 +657,7 @@ static void split_info_string(args_t *args, bcf1_t *src, bcf_info_t *info, int i STR_MOVE_NTH(str.s,tmp,str.s+str.l,ialt,len); if ( len<0 ) return; // wrong number of fields: skip str.s[len] = 0; - bcf_update_info_string(args->hdr,dst,tag,str.s); + bcf_update_info_string(args->out_hdr,dst,tag,str.s); } else if ( len==BCF_VL_R ) { @@ -629,7 +668,7 @@ static void split_info_string(args_t *args, bcf1_t *src, bcf_info_t *info, int i STR_MOVE_NTH(&str.s[len],tmp,str.s+str.l,ialt,len); if ( len<0 ) return; // wrong number of fields: skip str.s[len] = 0; - bcf_update_info_string(args->hdr,dst,tag,str.s); + bcf_update_info_string(args->out_hdr,dst,tag,str.s); } else if ( len==BCF_VL_G ) { @@ -644,16 +683,16 @@ static void split_info_string(args_t *args, bcf1_t *src, bcf_info_t *info, int i STR_MOVE_NTH(&str.s[len],tmp,str.s+str.l,iaa-i0a-1,len); if ( len<0 ) return; // wrong number of fields: skip str.s[len] = 0; - bcf_update_info_string(args->hdr,dst,tag,str.s); + bcf_update_info_string(args->out_hdr,dst,tag,str.s); } else - bcf_update_info_string(args->hdr,dst,tag,str.s); + bcf_update_info_string(args->out_hdr,dst,tag,str.s); } static void split_info_flag(args_t *args, bcf1_t *src, bcf_info_t *info, int ialt, bcf1_t *dst) { const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key); int ret = bcf_get_info_flag(args->hdr,src,tag,&args->tmp_arr1,&args->ntmp_arr1); - bcf_update_info_flag(args->hdr,dst,tag,NULL,ret); + bcf_update_info_flag(args->out_hdr,dst,tag,NULL,ret); } static void split_format_genotype(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int ialt, bcf1_t 
*dst) @@ -679,7 +718,7 @@ static void split_format_genotype(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int } gt += ngts; } - bcf_update_genotypes(args->hdr,dst,args->tmp_arr1,ngts*nsmpl); + bcf_update_genotypes(args->out_hdr,dst,args->tmp_arr1,ngts*nsmpl); } static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int ialt, bcf1_t *dst) { @@ -695,7 +734,7 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int int i,j, nsmpl = bcf_hdr_nsamples(args->hdr); \ if ( nvals==nsmpl ) /* all values are missing */ \ { \ - bcf_update_format_##type(args->hdr,dst,tag,vals,nsmpl); \ + bcf_update_format_##type(args->out_hdr,dst,tag,vals,nsmpl); \ return; \ } \ if ( len==BCF_VL_A ) \ @@ -712,7 +751,7 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int } \ if ( args->force ) \ { \ - bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \ + bcf_update_format_##type(args->out_hdr,dst,tag,NULL,0); \ return; \ } \ error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d\n", \ @@ -726,7 +765,7 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int dst_vals += 1; \ src_vals += nvals; \ } \ - bcf_update_format_##type(args->hdr,dst,tag,vals,nsmpl); \ + bcf_update_format_##type(args->out_hdr,dst,tag,vals,nsmpl); \ } \ else if ( len==BCF_VL_R ) \ { \ @@ -742,7 +781,7 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int } \ if ( args->force ) \ { \ - bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \ + bcf_update_format_##type(args->out_hdr,dst,tag,NULL,0); \ return; \ } \ error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d, found %d\n", \ @@ -772,7 +811,7 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int src_vals += nvals; \ } \ } \ - bcf_update_format_##type(args->hdr,dst,tag,vals,nsmpl*2); \ + bcf_update_format_##type(args->out_hdr,dst,tag,vals,nsmpl*2); \ } \ else if ( 
len==BCF_VL_G ) \ { \ @@ -788,7 +827,7 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int } \ if ( args->force ) \ { \ - bcf_update_format_##type(args->hdr,dst,tag,NULL,0); \ + bcf_update_format_##type(args->out_hdr,dst,tag,NULL,0); \ return; \ } \ error("Error at %s:%"PRId64", the tag %s has wrong number of fields\n", bcf_seqname(args->hdr,src),(int64_t) src->pos+1,bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id)); \ @@ -819,10 +858,10 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int dst_vals += all_haploid ? 2 : 3; \ src_vals += nvals; \ } \ - bcf_update_format_##type(args->hdr,dst,tag,vals,all_haploid ? nsmpl*2 : nsmpl*3); \ + bcf_update_format_##type(args->out_hdr,dst,tag,vals,all_haploid ? nsmpl*2 : nsmpl*3); \ } \ else \ - bcf_update_format_##type(args->hdr,dst,tag,vals,nvals); \ + bcf_update_format_##type(args->out_hdr,dst,tag,vals,nvals); \ } switch (bcf_hdr_id2type(args->hdr,BCF_HL_FMT,fmt->id)) { @@ -869,7 +908,7 @@ static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int i ptr += blen; } if ( maxlenhdr,dst,tag,str.s,nsmpl*maxlen); + bcf_update_format_char(args->out_hdr,dst,tag,str.s,nsmpl*maxlen); } else if ( len==BCF_VL_R ) { @@ -887,7 +926,7 @@ static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int i ptr += blen; } if ( maxlenhdr,dst,tag,str.s,nsmpl*maxlen); + bcf_update_format_char(args->out_hdr,dst,tag,str.s,nsmpl*maxlen); } else if ( len==BCF_VL_G ) { @@ -915,7 +954,7 @@ static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int i } if ( args->force ) { - bcf_update_format_char(args->hdr,dst,tag,NULL,0); + bcf_update_format_char(args->out_hdr,dst,tag,NULL,0); return; } error("Error: wrong number of fields in FMT/%s at %s:%"PRId64", expected %d or %d, found %d\n", @@ -946,13 +985,12 @@ static void split_format_string(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int i ptr += blen; } if ( maxlenhdr,dst,tag,str.s,nsmpl*maxlen); + 
bcf_update_format_char(args->out_hdr,dst,tag,str.s,nsmpl*maxlen); } else - bcf_update_format_char(args->hdr,dst,tag,str.s,str.l); + bcf_update_format_char(args->out_hdr,dst,tag,str.s,str.l); } - static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line) { int i; @@ -985,11 +1023,11 @@ static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line) // Not quite sure how to handle IDs, they can be assigned to a specific // ALT. For now we leave the ID unchanged for all. - bcf_update_id(args->hdr, dst, line->d.id ? line->d.id : "."); + bcf_update_id(args->out_hdr, dst, line->d.id ? line->d.id : "."); tmp.l = rlen; kputs(line->d.allele[i+1],&tmp); - bcf_update_alleles_str(args->hdr,dst,tmp.s); + bcf_update_alleles_str(args->out_hdr,dst,tmp.s); if ( line->d.n_flt ) bcf_update_filter(args->hdr, dst, line->d.flt, line->d.n_flt); @@ -1002,6 +1040,7 @@ static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line) else if ( type==BCF_HT_FLAG ) split_info_flag(args, line, info, i, dst); else split_info_string(args, line, info, i, dst); } + set_old_rec_tag(args, dst, line, i + 1); // 1-based indexes dst->n_sample = line->n_sample; for (j=0; jn_fmt; j++) @@ -1065,7 +1104,7 @@ static void merge_info_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_inf vals[ args->maps[i].map[k+1] - 1 ] = vals2[k]; \ } \ } \ - bcf_update_info_##type(args->hdr,dst,tag,args->tmp_arr1,nvals); \ + bcf_update_info_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals); \ } \ else if ( len==BCF_VL_R ) \ { \ @@ -1089,7 +1128,7 @@ static void merge_info_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_inf vals[ args->maps[i].map[k] ] = vals2[k]; \ } \ } \ - bcf_update_info_##type(args->hdr,dst,tag,args->tmp_arr1,nvals); \ + bcf_update_info_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals); \ } \ else if ( len==BCF_VL_G ) \ { \ @@ -1123,10 +1162,10 @@ static void merge_info_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_inf } \ } \ } \ - 
bcf_update_info_##type(args->hdr,dst,tag,args->tmp_arr1,nvals); \ + bcf_update_info_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals); \ } \ else \ - bcf_update_info_##type(args->hdr,dst,tag,vals,nvals_ori); \ + bcf_update_info_##type(args->out_hdr,dst,tag,vals,nvals_ori); \ } switch (bcf_hdr_id2type(args->hdr,BCF_HL_INFO,info->key)) { @@ -1139,7 +1178,7 @@ static void merge_info_flag(args_t *args, bcf1_t **lines, int nlines, bcf_info_t { const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,info->key); int ret = bcf_get_info_flag(args->hdr,lines[0],tag,&args->tmp_arr1,&args->ntmp_arr1); - bcf_update_info_flag(args->hdr,dst,tag,NULL,ret); + bcf_update_info_flag(args->out_hdr,dst,tag,NULL,ret); } int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c static void merge_info_string(args_t *args, bcf1_t **lines, int nlines, bcf_info_t *info, bcf1_t *dst) @@ -1167,7 +1206,7 @@ static void merge_info_string(args_t *args, bcf1_t **lines, int nlines, bcf_info str.s[str.l] = 0; args->tmp_arr1 = (uint8_t*) str.s; args->ntmp_arr1 = str.m; - bcf_update_info_string(args->hdr,dst,tag,str.s); + bcf_update_info_string(args->out_hdr,dst,tag,str.s); } else if ( len==BCF_VL_G ) { @@ -1194,12 +1233,12 @@ static void merge_info_string(args_t *args, bcf1_t **lines, int nlines, bcf_info str.s[str.l] = 0; args->tmp_arr1 = (uint8_t*) str.s; args->ntmp_arr1 = str.m; - bcf_update_info_string(args->hdr,dst,tag,str.s); + bcf_update_info_string(args->out_hdr,dst,tag,str.s); } else { bcf_get_info_string(args->hdr,lines[0],tag,&args->tmp_arr1,&args->ntmp_arr1); - bcf_update_info_string(args->hdr,dst,tag,args->tmp_arr1); + bcf_update_info_string(args->out_hdr,dst,tag,args->tmp_arr1); } } static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_fmt_t *fmt, bcf1_t *dst) @@ -1242,7 +1281,7 @@ static void merge_format_genotype(args_t *args, bcf1_t **lines, int nlines, bcf_ gt2 += ngts; } } - 
bcf_update_genotypes(args->hdr,dst,args->tmp_arr1,ngts*nsmpl); + bcf_update_genotypes(args->out_hdr,dst,args->tmp_arr1,ngts*nsmpl); } static int diploid_to_haploid(int size, int nsmpl, int nals, uint8_t *vals) { @@ -1295,7 +1334,7 @@ static void merge_format_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_f vals2 += nvals2; \ } \ } \ - bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \ + bcf_update_format_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \ } \ else if ( len==BCF_VL_R ) \ { \ @@ -1323,7 +1362,7 @@ static void merge_format_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_f vals2 += nvals2; \ } \ } \ - bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \ + bcf_update_format_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \ } \ else if ( len==BCF_VL_G ) \ { \ @@ -1402,10 +1441,10 @@ static void merge_format_numeric(args_t *args, bcf1_t **lines, int nlines, bcf_f vals2 += nvals;\ }\ }\ - bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \ + bcf_update_format_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals*nsmpl); \ } \ else \ - bcf_update_format_##type(args->hdr,dst,tag,args->tmp_arr1,nvals_ori*nsmpl); \ + bcf_update_format_##type(args->out_hdr,dst,tag,args->tmp_arr1,nvals_ori*nsmpl); \ } switch (bcf_hdr_id2type(args->hdr,BCF_HL_FMT,fmt->id)) { @@ -1422,7 +1461,7 @@ static void merge_format_string(args_t *args, bcf1_t **lines, int nlines, bcf_fm if ( len!=BCF_VL_A && len!=BCF_VL_R && len!=BCF_VL_G ) { int nret = bcf_get_format_char(args->hdr,lines[0],tag,&args->tmp_arr1,&args->ntmp_arr1); - bcf_update_format_char(args->hdr,dst,tag,args->tmp_arr1,nret); + bcf_update_format_char(args->out_hdr,dst,tag,args->tmp_arr1,nret); return; } @@ -1534,7 +1573,7 @@ static void merge_format_string(args_t *args, bcf1_t **lines, int nlines, bcf_fm } args->ntmp_arr2 = str.m; args->tmp_arr2 = (uint8_t*)str.s; - bcf_update_format_char(args->hdr,dst,tag,str.s,str.l); + 
bcf_update_format_char(args->out_hdr,dst,tag,str.s,str.l); } char **merge_alleles(char **a, int na, int *map, char **b, int *nb, int *mb); // see vcfmerge.c @@ -1555,7 +1594,7 @@ static void merge_biallelics_to_multiallelic(args_t *args, bcf1_t *dst, bcf1_t * dst->qual = lines[i]->qual; } - bcf_update_id(args->hdr, dst, lines[0]->d.id); + bcf_update_id(args->out_hdr, dst, lines[0]->d.id); // Merge and set the alleles, create a mapping from source allele indexes to dst idxs hts_expand0(map_t,nlines,args->mmaps,args->maps); // a mapping for each line @@ -1569,20 +1608,20 @@ static void merge_biallelics_to_multiallelic(args_t *args, bcf1_t *dst, bcf1_t * } for (i=1; id.id[0]!='.' || lines[i]->d.id[1]) bcf_add_id(args->hdr, dst, lines[i]->d.id); + if (lines[i]->d.id[0]!='.' || lines[i]->d.id[1]) bcf_add_id(args->out_hdr, dst, lines[i]->d.id); args->maps[i].nals = lines[i]->n_allele; hts_expand(int,args->maps[i].nals,args->maps[i].mals,args->maps[i].map); args->als = merge_alleles(lines[i]->d.allele, lines[i]->n_allele, args->maps[i].map, args->als, &args->nals, &args->mals); if ( !args->als ) error("Failed to merge alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,dst),(int64_t) dst->pos+1); } - bcf_update_alleles(args->hdr, dst, (const char**)args->als, args->nals); + bcf_update_alleles(args->out_hdr, dst, (const char**)args->als, args->nals); for (i=0; inals; i++) { free(args->als[i]); args->als[i] = NULL; } - if ( lines[0]->d.n_flt ) bcf_update_filter(args->hdr, dst, lines[0]->d.flt, lines[0]->d.n_flt); + if ( lines[0]->d.n_flt ) bcf_update_filter(args->out_hdr, dst, lines[0]->d.flt, lines[0]->d.n_flt); for (i=1; id.n_flt; j++) { @@ -1590,13 +1629,13 @@ static void merge_biallelics_to_multiallelic(args_t *args, bcf1_t *dst, bcf1_t * // otherwise accumulate FILTERs if (lines[i]->d.flt[j] == bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PASS")) { if (args->strict_filter) { - bcf_update_filter(args->hdr, dst, lines[i]->d.flt, lines[i]->d.n_flt); + 
bcf_update_filter(args->out_hdr, dst, lines[i]->d.flt, lines[i]->d.n_flt); break; } else continue; } - bcf_add_filter(args->hdr, dst, lines[i]->d.flt[j]); + bcf_add_filter(args->out_hdr, dst, lines[i]->d.flt[j]); } } @@ -1766,7 +1805,7 @@ static void flush_buffer(args_t *args, htsFile *file, int n) if ( mrows_ready_to_flush(args, args->lines[k]) ) { while ( (line=mrows_flush(args)) ) - if ( bcf_write1(file, args->hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( bcf_write1(file, args->out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); } int merge = 1; if ( args->mrows_collapse!=COLLAPSE_BOTH && args->mrows_collapse!=COLLAPSE_ANY ) @@ -1799,12 +1838,12 @@ static void flush_buffer(args_t *args, htsFile *file, int n) prev_type |= line_type; if ( args->rmdup & BCF_SR_PAIR_EXACT ) cmpals_add(&args->cmpals_out, args->lines[k]); } - if ( bcf_write1(file, args->hdr, args->lines[k])!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( bcf_write1(file, args->out_hdr, args->lines[k])!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); } if ( args->mrows_op==MROWS_MERGE && !args->rbuf.n ) { while ( (line=mrows_flush(args)) ) - if ( bcf_write1(file, args->hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + if ( bcf_write1(file, args->out_hdr, line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); } } @@ -1819,6 +1858,10 @@ static void init_data(args_t *args) else args->keep_sum_ad = -1; + args->out_hdr = bcf_hdr_dup(args->hdr); + if ( args->old_rec_tag ) + bcf_hdr_printf(args->out_hdr,"##INFO=",args->old_rec_tag); + rbuf_init(&args->rbuf, 100); args->lines = (bcf1_t**) calloc(args->rbuf.m, sizeof(bcf1_t*)); if ( args->ref_fname ) @@ -1832,6 +1875,14 @@ static void init_data(args_t *args) args->tmp_str = (kstring_t*) calloc(bcf_hdr_nsamples(args->hdr),sizeof(kstring_t)); 
args->diploid = (uint8_t*) malloc(bcf_hdr_nsamples(args->hdr)); } + if ( args->atomize==SPLIT ) + { + args->abuf = abuf_init(args->hdr, SPLIT); + abuf_set_opt(args->abuf, bcf_hdr_t*, BCF_HDR, args->out_hdr); + if ( args->old_rec_tag ) + abuf_set_opt(args->abuf, const char*, INFO_TAG, args->old_rec_tag); + abuf_set_opt(args->abuf, int, STAR_ALLELE, args->use_star_allele); + } } static void destroy_data(args_t *args) @@ -1856,7 +1907,7 @@ static void destroy_data(args_t *args) for (i=0; intmp_als; i++) free(args->tmp_als[i].s); free(args->tmp_als); - free(args->tmp_als_str.s); + free(args->tmp_kstr.s); if ( args->tmp_str ) { for (i=0; ihdr); i++) free(args->tmp_str[i].s); @@ -1868,15 +1919,16 @@ static void destroy_data(args_t *args) free(args->tmp_arr1); free(args->tmp_arr2); free(args->diploid); + if ( args->abuf ) abuf_destroy(args->abuf); + bcf_hdr_destroy(args->out_hdr); if ( args->mrow_out ) bcf_destroy1(args->mrow_out); if ( args->fai ) fai_destroy(args->fai); if ( args->mseq ) free(args->seq); } -static void normalize_line(args_t *args, bcf1_t **line_ptr) +static void normalize_line(args_t *args, bcf1_t *line) { - bcf1_t *line = *line_ptr; if ( args->fai ) { if ( args->check_ref & CHECK_REF_FIX ) fix_ref(args, line); @@ -1906,8 +1958,8 @@ static void normalize_line(args_t *args, bcf1_t **line_ptr) rbuf_expand0(&args->rbuf,bcf1_t*,args->rbuf.n+1,args->lines); int i,j; i = j = rbuf_append(&args->rbuf); - if ( !args->lines[i] ) args->lines[i] = bcf_init1(); - SWAP(bcf1_t*, (*line_ptr), args->lines[i]); + if ( args->lines[i] ) bcf_destroy(args->lines[i]); + args->lines[i] = bcf_dup(line); while ( rbuf_prev(&args->rbuf,&i) ) { if ( args->lines[i]->pos > args->lines[j]->pos ) SWAP(bcf1_t*, args->lines[i], args->lines[j]); @@ -1915,21 +1967,38 @@ static void normalize_line(args_t *args, bcf1_t **line_ptr) } } +static bcf1_t *next_atomized_line(args_t *args) +{ + bcf1_t *rec = NULL; + if ( args->atomize==SPLIT ) + { + rec = abuf_flush(args->abuf, 0); + if ( rec ) 
return rec; + } + + if ( !bcf_sr_next_line(args->files) ) return NULL; + + if ( args->atomize==SPLIT ) + { + abuf_push(args->abuf,bcf_sr_get_line(args->files,0)); + return abuf_flush(args->abuf, 0); + } + return bcf_sr_get_line(args->files,0); +} static void normalize_vcf(args_t *args) { - htsFile *out = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname)); - if ( out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); + args->out = hts_open(args->output_fname, hts_bcf_wmode2(args->output_type,args->output_fname)); + if ( args->out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) - hts_set_opt(out, HTS_OPT_THREAD_POOL, args->files->p); - if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_norm"); - if ( bcf_hdr_write(out, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p); + if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_norm"); + if ( bcf_hdr_write(args->out, args->out_hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname); + bcf1_t *line; int prev_rid = -1, prev_pos = -1, prev_type = 0; - while ( bcf_sr_next_line(args->files) ) + while ( (line = next_atomized_line(args)) ) { args->ntotal++; - - bcf1_t *line = args->files->readers[0].buffer[0]; if ( args->rmdup ) { int line_type = bcf_get_variant_types(line); @@ -1953,7 +2022,7 @@ static void normalize_vcf(args_t *args) // still on the same chromosome? 
int i,j,ilast = rbuf_last(&args->rbuf); - if ( ilast>=0 && line->rid != args->lines[ilast]->rid ) flush_buffer(args, out, args->rbuf.n); // new chromosome + if ( ilast>=0 && line->rid != args->lines[ilast]->rid ) flush_buffer(args, args->out, args->rbuf.n); // new chromosome int split = 0; if ( args->mrows_op==MROWS_SPLIT ) @@ -1968,13 +2037,13 @@ static void normalize_vcf(args_t *args) args->nsplit++; split_multiallelic_to_biallelics(args, line); for (j=0; jntmp_lines; j++) - normalize_line(args, &args->tmp_lines[j]); + normalize_line(args, args->tmp_lines[j]); } else split = 0; } if ( !split ) - normalize_line(args, &args->files->readers[0].buffer[0]); + normalize_line(args, line); // find out how many sites to flush ilast = rbuf_last(&args->rbuf); @@ -1984,10 +2053,10 @@ static void normalize_vcf(args_t *args) if ( args->lines[ilast]->pos - args->lines[i]->pos < args->buf_win ) break; j++; } - if ( j>0 ) flush_buffer(args, out, j); + if ( j>0 ) flush_buffer(args, args->out, j); } - flush_buffer(args, out, args->rbuf.n); - if ( hts_close(out)!=0 ) error("[%s] Error: close failed .. %s\n", __func__,args->output_fname); + flush_buffer(args, args->out, args->rbuf.n); + if ( hts_close(args->out)!=0 ) error("[%s] Error: close failed .. 
%s\n", __func__,args->output_fname); fprintf(stderr,"Lines total/split/realigned/skipped:\t%d/%d/%d/%d\n", args->ntotal,args->nsplit,args->nchanged,args->nskipped); if ( args->check_ref & CHECK_REF_FIX ) @@ -2003,24 +2072,27 @@ static void usage(void) fprintf(stderr, "Usage: bcftools norm [options] \n"); fprintf(stderr, "\n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -c, --check-ref check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n"); - fprintf(stderr, " -D, --remove-duplicates remove duplicate lines of the same type.\n"); - fprintf(stderr, " -d, --rm-dup remove duplicate snps|indels|both|all|exact\n"); - fprintf(stderr, " -f, --fasta-ref reference sequence\n"); - fprintf(stderr, " --force try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n"); - fprintf(stderr, " --keep-sum keep vector sum constant when splitting multiallelics (see github issue #360)\n"); - fprintf(stderr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n"); - fprintf(stderr, " --no-version do not append version and command line to the header\n"); - fprintf(stderr, " -N, --do-not-normalize do not normalize indels (with -m or -c s)\n"); - fprintf(stderr, " -o, --output write output to a file [standard output]\n"); - fprintf(stderr, " -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); - fprintf(stderr, " -r, --regions restrict to comma-separated list of regions\n"); - fprintf(stderr, " -R, --regions-file restrict to regions listed in a file\n"); - fprintf(stderr, " -s, --strict-filter when merging (-m+), merged site is PASS only if all sites being merged PASS\n"); - fprintf(stderr, " -t, --targets similar to -r but streams rather than index-jumps\n"); - fprintf(stderr, " -T, --targets-file similar to -R but streams rather than index-jumps\n"); - fprintf(stderr, " --threads use multithreading 
with worker threads [0]\n"); - fprintf(stderr, " -w, --site-win buffer for sorting lines which changed position during realignment [1000]\n"); + fprintf(stderr, " -a, --atomize Decompose complex variants (e.g. MNVs become consecutive SNVs)\n"); + fprintf(stderr, " --atom-overlaps '*'|. Use the star allele (*) for overlapping alleles or set to missing (.) [*]\n"); + fprintf(stderr, " -c, --check-ref e|w|x|s Check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n"); + fprintf(stderr, " -D, --remove-duplicates Remove duplicate lines of the same type.\n"); + fprintf(stderr, " -d, --rm-dup TYPE Remove duplicate snps|indels|both|all|exact\n"); + fprintf(stderr, " -f, --fasta-ref FILE Reference sequence\n"); + fprintf(stderr, " --force Try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n"); + fprintf(stderr, " --keep-sum TAG,.. Keep vector sum constant when splitting multiallelics (see github issue #360)\n"); + fprintf(stderr, " -m, --multiallelics -|+TYPE Split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n"); + fprintf(stderr, " --no-version Do not append version and command line to the header\n"); + fprintf(stderr, " -N, --do-not-normalize Do not normalize indels (with -m or -c s)\n"); + fprintf(stderr, " --old-rec-tag STR Annotate modified records with INFO/STR indicating the original variant\n"); + fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n"); + fprintf(stderr, " -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); + fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); + fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); + fprintf(stderr, " -s, --strict-filter When merging (-m+), merged site is PASS only if all sites being merged PASS\n"); + fprintf(stderr, " -t, --targets REGION Similar to -r but streams 
rather than index-jumps\n"); + fprintf(stderr, " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"); + fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); + fprintf(stderr, " -w, --site-win INT Buffer for sorting lines which changed position during realignment [1000]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Examples:\n"); fprintf(stderr, " # normalize and left-align indels\n"); @@ -2048,11 +2120,15 @@ int main_vcfnorm(int argc, char *argv[]) args->do_indels = 1; int region_is_file = 0; int targets_is_file = 0; + args->use_star_allele = 1; static struct option loptions[] = { {"help",no_argument,NULL,'h'}, {"force",no_argument,NULL,7}, + {"atomize",no_argument,NULL,'a'}, + {"atom-overlaps",required_argument,NULL,11}, + {"old-rec-tag",required_argument,NULL,12}, {"keep-sum",required_argument,NULL,10}, {"fasta-ref",required_argument,NULL,'f'}, {"do-not-normalize",no_argument,NULL,'N'}, @@ -2073,7 +2149,7 @@ int main_vcfnorm(int argc, char *argv[]) {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sN",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "hr:R:f:w:Dd:o:O:c:m:t:T:sNa",loptions,NULL)) >= 0) { switch (c) { case 10: // possibly generalize this also to INFO/AD and other tags @@ -2081,6 +2157,13 @@ int main_vcfnorm(int argc, char *argv[]) error("Error: only --keep-sum AD is currently supported. See https://github.com/samtools/bcftools/issues/360 for more.\n"); args->keep_sum_ad = 1; // this will be set to the header id or -1 in init_data break; + case 'a': args->atomize = SPLIT; break; + case 11 : + if ( optarg[0]=='*' ) args->use_star_allele = 1; + else if ( optarg[0]=='.' ) args->use_star_allele = 0; + else error("Invalid argument to --atom-overlaps. 
Perhaps you wanted: \"--atom-overlaps '*'\"?\n"); + break; + case 12 : args->old_rec_tag = optarg; break; case 'N': args->do_indels = 0; break; case 'd': if ( !strcmp("snps",optarg) ) args->rmdup = BCF_SR_PAIR_SNPS; @@ -2152,7 +2235,7 @@ int main_vcfnorm(int argc, char *argv[]) } else fname = argv[optind]; - if ( !args->ref_fname && !args->mrows_op && !args->rmdup ) error("Expected -f, -m, -D or -d option\n"); + if ( !args->ref_fname && !args->mrows_op && !args->rmdup && args->atomize==NONE ) error("Expected -a, -f, -m, -D or -d option\n"); if ( !args->check_ref && args->ref_fname ) args->check_ref = CHECK_REF_EXIT; if ( args->check_ref && !args->ref_fname ) error("Expected --fasta-ref with --check-ref\n"); From d1f749477cd02afb4e38283038fc50bcda138110 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 25 Feb 2021 13:20:14 +0000 Subject: [PATCH 67/81] Prevent invalid memory access caused by forgotten break statement in csq_stage (but not in hap_stage_vcf). Fixes #1429. Possibly also #1433 and #1428 --- NEWS | 5 +++++ csq.c | 3 ++- doc/bcftools.txt | 4 ++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/NEWS b/NEWS index 1141add3e..de8d796ef 100644 --- a/NEWS +++ b/NEWS @@ -96,6 +96,11 @@ Changes affecting specific commands: - `call -C trio` is temporarily disabled +* bcftools csq: + + - Fix a bug wich caused incorrect FORMAT/BCSQ formatting at sites with too + many per-sample consequences + * bcftools +fill-tags: - MAF definition revised for multiallelic sites, the second most common diff --git a/csq.c b/csq.c index 1da646539..7a1a826a7 100644 --- a/csq.c +++ b/csq.c @@ -3749,6 +3749,7 @@ void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec) fprintf(stderr," The limit can be increased by setting the --ncsq parameter. 
This warning is printed only once.\n"); args->ncsq_small_warned = 1; } + break; } if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32; vrec->smpl[i*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32); @@ -4060,7 +4061,7 @@ static const char *usage(void) " -b, --brief-predictions annotate with abbreviated protein-changing predictions\n" " -c, --custom-tag use this tag instead of the default BCSQ\n" " -l, --local-csq localized predictions, consider only one VCF record at a time\n" - " -n, --ncsq maximum number of consequences to consider per site [16]\n" + " -n, --ncsq maximum number of per-haplotype consequences to consider for each site [16]\n" " -p, --phase how to handle unphased heterozygous genotypes: [r]\n" " a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n" " m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n" diff --git a/doc/bcftools.txt b/doc/bcftools.txt index a52afc863..4f6ac8260 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -1119,7 +1119,7 @@ with the *-c* option). The latter is a bitmask of indexes to INFO/BCSQ, with interleaved haplotypes. See the usage examples below for using the %TBCSQ converter in *query* for extracting a more human readable form from this bitmask. The construction of the bitmask limits the number of consequences -that can be referenced in the FORMAT/BCSQ tags. By default this is 16, but +that can be referenced per sample in the FORMAT/BCSQ tags. By default this is 16, but if more are required, see the *--ncsq* option. The program requires on input a VCF/BCF file, the reference genome in fasta @@ -1202,7 +1202,7 @@ output VCF and are ignored for the prediction analysis. only one VCF record at a time *-n, --ncsq* 'INT':: - maximum number of consequences to consider per site. The INFO/BCSQ column includes + maximum number of per-haplotype consequences to consider for each site. 
The INFO/BCSQ column includes all consequences, but only the first 'INT' will be referenced by the FORMAT/BCSQ fields. The default value is 16 which corresponds to one integer per diploid sample. Note that increasing the value leads to increased memory and is rarely necessary. From 6f3f892618376abef380d02af8d577648b04109d Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 25 Feb 2021 13:49:13 +0000 Subject: [PATCH 68/81] Update man page and INSTALL instructions --- INSTALL | 2 +- doc/bcftools.1 | 76 ++++++++++++-- doc/bcftools.html | 254 ++++++++++++++++++++++++---------------------- 3 files changed, 202 insertions(+), 130 deletions(-) diff --git a/INSTALL b/INSTALL index 91ff53d31..9fffc9506 100644 --- a/INSTALL +++ b/INSTALL @@ -3,7 +3,7 @@ For the impatient The latest source code can be downloaded from github and compiled using: - git clone git://github.com/samtools/htslib.git + git clone --recurse-submodules git://github.com/samtools/htslib.git git clone git://github.com/samtools/bcftools.git cd bcftools # The following is optional: diff --git a/doc/bcftools.1 b/doc/bcftools.1 index 6cac6e4db..78711909a 100644 --- a/doc/bcftools.1 +++ b/doc/bcftools.1 @@ -1,13 +1,13 @@ '\" t .\" Title: bcftools .\" Author: [see the "AUTHORS" section] -.\" Generator: DocBook XSL Stylesheets vsnapshot -.\" Date: 2021-02-23 10:44 CET +.\" Generator: DocBook XSL Stylesheets v1.76.1 +.\" Date: 2021-02-25 13:46 GMT .\" Manual: \ \& .\" Source: \ \& .\" Language: English .\" -.TH "BCFTOOLS" "1" "2021\-02\-23 10:44 CET" "\ \&" "\ \&" +.TH "BCFTOOLS" "1" "2021\-02\-25 13:46 GMT" "\ \&" "\ \&" .\" ----------------------------------------------------------------- .\" * Define some portability stuff .\" ----------------------------------------------------------------- @@ -41,7 +41,7 @@ Most commands accept VCF, bgzipped VCF and BCF with filetype detected automatica BCFtools is designed to work on a stream\&. 
It regards an input file "\-" as the standard input (stdin) and outputs to the standard output (stdout)\&. Several commands can thus be combined with Unix pipes\&. .SS "VERSION" .sp -This manual page was last updated \fB2021\-02\-23 10:44 CET\fR and refers to bcftools git version \fB1\&.2\-1248\-g3910e40+\fR\&. +This manual page was last updated \fB2021\-02\-25 13:46 GMT\fR and refers to bcftools git version \fB1\&.11\-85\-gd1f7494+\fR\&. .SS "BCF1" .sp The BCF1 format output by versions of samtools <= 0\&.1\&.19 is \fBnot\fR compatible with this version of bcftools\&. To read BCF1 files one can use the view command from old versions of bcftools packaged with samtools versions <= 0\&.1\&.19 to convert to VCF, which can then be read by this version of bcftools\&. @@ -70,6 +70,7 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} + \fBannotate\fR \&.\&. edit VCF files, add or remove annotations .RE @@ -82,6 +83,7 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} + \fBcall\fR \&.\&. SNP/indel calling (former "view") .RE @@ -94,6 +96,7 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} + \fBcnv\fR \&.\&. Copy Number Variation caller .RE @@ -106,6 +109,7 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} + \fBconcat\fR \&.\&. concatenate VCF/BCF files from the same set of samples .RE @@ -118,6 +122,7 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} + \fBconsensus\fR \&.\&. create consensus sequence by applying VCF variants .RE @@ -130,6 +135,7 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} + \fBconvert\fR \&.\&. 
convert VCF/BCF to other formats and back .RE @@ -142,6 +148,7 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} + \fBcsq\fR \&.\&. haplotype aware consequence caller .RE @@ -154,6 +161,7 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} + \fBfilter\fR \&.\&. filter VCF/BCF files using fixed thresholds .RE @@ -166,6 +174,7 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} + \fBgtcheck\fR \&.\&. check sample concordance, detect sample swaps and contamination .RE @@ -178,6 +187,7 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} + \fBindex\fR \&.\&. index VCF/BCF .RE @@ -190,6 +200,7 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} + \fBisec\fR \&.\&. intersections of VCF/BCF files .RE @@ -202,6 +213,7 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} + \fBmerge\fR \&.\&. merge VCF/BCF files files from non\-overlapping sample sets .RE @@ -214,6 +226,7 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} + \fBmpileup\fR \&.\&. multi\-way pileup producing genotype likelihoods .RE @@ -226,6 +239,7 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} + \fBnorm\fR \&.\&. normalize indels .RE @@ -238,6 +252,7 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} + \fBplugin\fR \&.\&. run user\-defined plugin .RE @@ -250,6 +265,7 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} + \fBpolysomy\fR \&.\&. 
detect contaminations and whole\-chromosome aberrations .RE @@ -262,6 +278,7 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} + \fBquery\fR \&.\&. transform VCF/BCF into user\-defined formats .RE @@ -274,6 +291,7 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} + \fBreheader\fR \&.\&. modify VCF/BCF header, change sample names .RE @@ -286,6 +304,7 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} + \fBroh\fR \&.\&. identify runs of homo/auto\-zygosity .RE @@ -298,6 +317,7 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} + \fBsort\fR \&.\&. sort VCF/BCF files .RE @@ -310,6 +330,7 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} + \fBstats\fR \&.\&. produce VCF/BCF stats (former vcfcheck) .RE @@ -322,6 +343,7 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F .sp -1 .IP \(bu 2.3 .\} + \fBview\fR \&.\&. subset, filter and convert VCF and BCF files .RE @@ -337,6 +359,7 @@ Some helper scripts are bundled with the bcftools code\&. .sp -1 .IP \(bu 2.3 .\} + \fBplot\-vcfstats\fR \&.\&. plots the output of \fBstats\fR @@ -1877,7 +1900,7 @@ bcftools convert \-c ID,CHROM,POS,AA \-s SampleName \-f 23andme\-ref\&.fa \-\-ts .sp Haplotype aware consequence predictor which correctly handles combined variants such as MNPs split over multiple VCF records, SNPs separated by an intron (but adjacent in the spliced transcript) or nearby frame\-shifting indels which in combination in fact are not frame\-shifting\&. .sp -The output VCF is annotated with INFO/BCSQ and FORMAT/BCSQ tag (configurable with the \fB\-c\fR option)\&. The latter is a bitmask of indexes to INFO/BCSQ, with interleaved haplotypes\&. 
See the usage examples below for using the %TBCSQ converter in \fBquery\fR for extracting a more human readable form from this bitmask\&. The construction of the bitmask limits the number of consequences that can be referenced in the FORMAT/BCSQ tags\&. By default this is 16, but if more are required, see the \fB\-\-ncsq\fR option\&. +The output VCF is annotated with INFO/BCSQ and FORMAT/BCSQ tag (configurable with the \fB\-c\fR option)\&. The latter is a bitmask of indexes to INFO/BCSQ, with interleaved haplotypes\&. See the usage examples below for using the %TBCSQ converter in \fBquery\fR for extracting a more human readable form from this bitmask\&. The construction of the bitmask limits the number of consequences that can be referenced per sample in the FORMAT/BCSQ tags\&. By default this is 16, but if more are required, see the \fB\-\-ncsq\fR option\&. .sp The program requires on input a VCF/BCF file, the reference genome in fasta format (\fB\-\-fasta\-ref\fR) and genomic features in the GFF3 format downloadable from the Ensembl website (\fB\-\-gff\-annot\fR), and outputs an annotated VCF/BCF file\&. Currently, only Ensembl GFF3 files are supported\&. .sp @@ -1975,7 +1998,7 @@ switch off haplotype\-aware calling, run localized predictions considering only .PP \fB\-n, \-\-ncsq\fR \fIINT\fR .RS 4 -maximum number of consequences to consider per site\&. The INFO/BCSQ column includes all consequences, but only the first +maximum number of per\-haplotype consequences to consider for each site\&. The INFO/BCSQ column includes all consequences, but only the first \fIINT\fR will be referenced by the FORMAT/BCSQ fields\&. The default value is 16 which corresponds to one integer per diploid sample\&. Note that increasing the value leads to increased memory and is rarely necessary\&. 
.RE @@ -2620,7 +2643,7 @@ Print a list of records which are present in A and B but not in C and D .RE .SS "bcftools merge [\fIOPTIONS\fR] \fIA\&.vcf\&.gz\fR \fIB\&.vcf\&.gz\fR [\&...]" .sp -Merge multiple VCF/BCF files from non\-overlapping sample sets to create one multi\-sample file\&. For example, when merging file \fIA\&.vcf\&.gz\fR containing samples \fIS1\fR, \fIS2\fR and \fIS3\fR and file \fIB\&.vcf\&.gz\fR containing samples \fIS3\fR and \fIS4\fR, the output file will contain four samples named \fIS1\fR, \fIS2\fR, \fIS3\fR, \fI2:S3\fR and \fIS4\fR\&. +Merge multiple VCF/BCF files from non\-overlapping sample sets to create one multi\-sample file\&. For example, when merging file \fIA\&.vcf\&.gz\fR containing samples \fIS1\fR, \fIS2\fR and \fIS3\fR and file \fIB\&.vcf\&.gz\fR containing samples \fIS3\fR and \fIS4\fR, the output file will contain five samples named \fIS1\fR, \fIS2\fR, \fIS3\fR, \fI2:S3\fR and \fIS4\fR\&. .sp Note that it is responsibility of the user to ensure that the sample names are unique across all files\&. If they are not, the program will exit with an error unless the option \fB\-\-force\-samples\fR is given\&. The sample names can be also given explicitly using the \fB\-\-print\-header\fR and \fB\-\-use\-header\fR options\&. .sp @@ -3114,9 +3137,17 @@ Call SNPs and short INDELs, then mark low quality sites and sites with the read .sp Left\-align and normalize indels, check if REF alleles match the reference, split multiallelic sites into multiple rows; recover multiallelics from multiple rows\&. Left\-alignment and normalization will only be applied if the \fB\-\-fasta\-ref\fR option is supplied\&. .PP -\fB\-a, \-\-atomize\fR \fI\&.\fR|\fI*\fR +\fB\-a, \-\-atomize\fR +.RS 4 +Decompose complex variants, e\&.g\&. split MNVs into consecutive SNVs\&. See also +\fB\-\-atom\-overlaps\fR +and +\fB\-\-old\-rec\-tag\fR\&. +.RE +.PP +\fB\-\-atom\-overlaps\fR \fI\&.\fR|\fI*\fR .RS 4 -Decompose complex variants (e\&.g\&. 
split MNVs into consecutive SNVs)\&. Alleles missing because of an overlapping variant can be set either to missing (\&.) or to the star alele (*), as recommended by the VCF specification\&. IMPORTANT: Note that asterisk is expaneded by shell and must be put in quotes or escaped by a backslash: +Alleles missing because of an overlapping variant can be set either to missing (\&.) or to the star alele (*), as recommended by the VCF specification\&. IMPORTANT: Note that asterisk is expaneded by shell and must be put in quotes or escaped by a backslash: .RE .sp .if n \{\ @@ -3220,6 +3251,11 @@ option will not turn on indel normalisation as the option normally implies .RE .PP +\fB\-\-old\-rec\-tag\fR \fISTR\fR +.RS 4 +Add INFO/STR annotation with the original record\&. The format of the annotation is CHROM|POS|REF|ALT|USED_ALT_IDX\&. +.RE +.PP \fB\-o, \-\-output\fR \fIFILE\fR .RS 4 see @@ -3661,6 +3697,28 @@ INFO/TYPE Number:\&. Type:String \&.\&. The record type (REF,SNP,MNP,INDEL,etc) .sp -1 .IP \(bu 2.3 .\} +FORMAT/VAF Number:A Type:Float \&.\&. The fraction of reads with the alternate allele, requires FORMAT/AD or ADF+ADR +.RE +.sp +.RS 4 +.ie n \{\ +\h'-04'\(bu\h'+03'\c +.\} +.el \{\ +.sp -1 +.IP \(bu 2.3 +.\} +FORMAT/VAF1 Number:1 Type:Float \&.\&. The same as FORMAT/VAF but for all alternate alleles cumulatively +.RE +.sp +.RS 4 +.ie n \{\ +\h'-04'\(bu\h'+03'\c +.\} +.el \{\ +.sp -1 +.IP \(bu 2.3 +.\} TAG=func(TAG) Number:1 Type:Integer \&.\&. Experimental support for user\-defined expressions such as "DP=sum(DP)" .RE .RE diff --git a/doc/bcftools.html b/doc/bcftools.html index 8923e0a10..499b6e94e 100644 --- a/doc/bcftools.html +++ b/doc/bcftools.html @@ -1,5 +1,6 @@ -bcftools

Name

bcftools — utilities for variant calling and manipulating VCFs and BCFs.

Synopsis

bcftools [--version|--version-only] [--help] [COMMAND] [OPTIONS]

DESCRIPTION

BCFtools is a set of utilities that manipulate variant calls in the Variant + +bcftools

Name

bcftools — utilities for variant calling and manipulating VCFs and BCFs.

Synopsis

bcftools [--version|--version-only] [--help] [COMMAND] [OPTIONS]

DESCRIPTION

BCFtools is a set of utilities that manipulate variant calls in the Variant Call Format (VCF) and its binary counterpart BCF. All commands work transparently with both VCFs and BCFs, both uncompressed and BGZF-compressed.

Most commands accept VCF, bgzipped VCF and BCF with filetype detected automatically even when streaming from a pipe. Indexed VCF and BCF @@ -9,17 +10,17 @@ (Note that files with non-standard index names can be accessed as e.g. "bcftools view -r X:2928329 file.vcf.gz##idx##non-standard-index-name".)

BCFtools is designed to work on a stream. It regards an input file "-" as the standard input (stdin) and outputs to the standard output (stdout). Several -commands can thus be combined with Unix pipes.

VERSION

This manual page was last updated 2021-02-23 10:44 CET and refers to bcftools git version 1.2-1248-g3910e40+.

BCF1

The BCF1 format output by versions of samtools <= 0.1.19 is not +commands can thus be combined with Unix pipes.

VERSION

This manual page was last updated 2021-02-25 13:46 GMT and refers to bcftools git version 1.11-85-gd1f7494+.

BCF1

The BCF1 format output by versions of samtools <= 0.1.19 is not compatible with this version of bcftools. To read BCF1 files one can use the view command from old versions of bcftools packaged with samtools versions <= 0.1.19 to convert to VCF, which can then be read by -this version of bcftools.

    samtools-0.1.19/bcftools/bcftools view file.bcf1 | bcftools view

VARIANT CALLING

See bcftools call for variant calling from the output of the +this version of bcftools.

    samtools-0.1.19/bcftools/bcftools view file.bcf1 | bcftools view

VARIANT CALLING

See bcftools call for variant calling from the output of the samtools mpileup command. In versions of samtools <= 0.1.19 calling was done with bcftools view. Users are now required to choose between the old samtools calling model (-c/--consensus-caller) and the new multiallelic calling model (-m/--multiallelic-caller). The multiallelic calling model -is recommended for most tasks.

LIST OF COMMANDS

For a full list of available commands, run bcftools without arguments. For a full -list of available options, run bcftools COMMAND without arguments.

  • +is recommended for most tasks.

LIST OF COMMANDS

For a full list of available commands, run bcftools without arguments. For a full +list of available options, run bcftools COMMAND without arguments.

  • annotate .. edit VCF files, add or remove annotations
  • call .. SNP/indel calling (former "view") @@ -63,10 +64,10 @@ stats .. produce VCF/BCF stats (former vcfcheck)
  • view .. subset, filter and convert VCF and BCF files -

LIST OF SCRIPTS

Some helper scripts are bundled with the bcftools code.

  • +

LIST OF SCRIPTS

Some helper scripts are bundled with the bcftools code.

COMMANDS AND OPTIONS

Common Options

The following options are common to many bcftools commands. See usage for -specific commands to see if they apply.

+

COMMANDS AND OPTIONS

Common Options

The following options are common to many bcftools commands. See usage for +specific commands to see if they apply.

FILE
Files can be both VCF or BCF, uncompressed or BGZF-compressed. The file "-" @@ -82,7 +83,7 @@ matching positions (bcftools isec -c all), or only sites with matching variant type (bcftools isec -c snps  -c indels), or only sites with all alleles identical (bcftools isec -c none). -

+

none
only records with identical REF and ALT alleles are compatible @@ -180,7 +181,7 @@ option is used; see bcftools view documentation). To use updated tags for the subset in another command one can pipe from view into that command. For example: -
    bcftools view -Ou -s sample1,sample2 file.vcf | bcftools query -f %INFO/AC\t%INFO/AN\n
+
    bcftools view -Ou -s sample1,sample2 file.vcf | bcftools query -f %INFO/AC\t%INFO/AN\n
-S, --samples-file FILE
File of sample names to include or exclude if prefixed with "^". @@ -197,7 +198,7 @@ sample3 F

If the second column is not present, the sex "F" is assumed. With bcftools call -C trio, a PED file is expected. The program ignores the first column and the last column indicates sex (1=male, 2=female), for example:

    ignored_column  daughterA fatherA  motherA  2
-    ignored_column  sonB      fatherB  motherB  1
+ ignored_column sonB fatherB motherB 1
-t, --targets [^]chr|chr:pos|chr:from-to|chr:from-[,…]
Similar to -r, --regions, but the next position is accessed by streaming the @@ -221,12 +222,12 @@ be comma-separated list of alleles, starting with the reference allele. Note that the file must be compressed and indexed. Such a file can be easily created from a VCF using: -
    bcftools query -f'%CHROM\t%POS\t%REF,%ALT\n' file.vcf | bgzip -c > als.tsv.gz && tabix -s1 -b2 -e2 als.tsv.gz
+
    bcftools query -f'%CHROM\t%POS\t%REF,%ALT\n' file.vcf | bgzip -c > als.tsv.gz && tabix -s1 -b2 -e2 als.tsv.gz
--threads INT
Use multithreading with INT worker threads. The option is currently used only for the compression of the output stream, only when --output-type is b or z. Default: 0. -

bcftools annotate [OPTIONS] FILE

Add or remove annotations.

+

bcftools annotate [OPTIONS] FILE

Add or remove annotations.

-a, --annotations file
Bgzip-compressed and tabix-indexed file with annotations. The file @@ -250,7 +251,7 @@
    # Sample annotation file with columns CHROM, POS, STRING_TAG, NUMERIC_TAG
     1  752566  SomeString      5
     1  798959  SomeOtherString 6
-    # etc.
+ # etc.
--collapse snps|indels|both|all|some|none
Controls how to match records from the annotation file to the target VCF. @@ -308,14 +309,14 @@
Lines to append to the VCF header, see also -c, --columns and -a, --annotations. For example:
    ##INFO=<ID=NUMERIC_TAG,Number=1,Type=Integer,Description="Example header line">
-    ##INFO=<ID=STRING_TAG,Number=1,Type=String,Description="Yet another header line">
+ ##INFO=<ID=STRING_TAG,Number=1,Type=String,Description="Yet another header line">
-I, --set-id [+]FORMAT
assign ID on the fly. The format is the same as in the query command (see below). By default all existing IDs are replaced. If the format string is preceded by "+", only missing IDs will be set. For example, one can use -
    bcftools annotate --set-id +'%CHROM\_%POS\_%REF\_%FIRST_ALT' file.vcf
+
    bcftools annotate --set-id +'%CHROM\_%POS\_%REF\_%FIRST_ALT' file.vcf
-i, --include EXPRESSION
include only sites for which EXPRESSION is true. For valid expressions see @@ -431,10 +432,10 @@ # Annotate from a bed file (0-based coordinates, half-closed, half-open intervals) bcftools annotate -a annots.bed.gz -h annots.hdr -c CHROM,FROM,TO,TAG input.vcf - # For more examples see http://samtools.github.io/bcftools/howtos/annotate.html

bcftools call [OPTIONS] FILE

This command replaces the former bcftools view caller. Some of the original + # For more examples see http://samtools.github.io/bcftools/howtos/annotate.html

bcftools call [OPTIONS] FILE

This command replaces the former bcftools view caller. Some of the original functionality has been temporarily lost in the process of transition under htslib, but will be added back on popular -demand. The original calling model can be invoked with the -c option.

File format options:

+demand. The original calling model can be invoked with the -c option.

File format options:

--no-version
see Common Options @@ -467,7 +468,7 @@ MT 1 16569 M 1 MT 1 16569 F 1 * * * M 2 - * * * F 2
+ * * * F 2
-r, --regions chr|chr:pos|chr:from-to|chr:from-[,…]
see Common Options @@ -495,7 +496,7 @@ --threads INT
see Common Options -

Input/output options:

+

Input/output options:

-A, --keep-alts
output all alternate alleles present in the alignments even if they do not @@ -525,7 +526,7 @@ ##INFO=<ID=REF_AC,Number=A,Type=Integer,Description="Allele count in reference genotypes for each ALT allele"> # Now before calling, stream the raw mpileup output through `bcftools annotate` to add the frequencies - bcftools mpileup [...] -Ou | bcftools annotate -a AFs.tab.gz -h AFs.hdr -c CHROM,POS,REF,ALT,REF_AN,REF_AC -Ou | bcftools call -mv -F REF_AN,REF_AC [...]
+ bcftools mpileup [...] -Ou | bcftools annotate -a AFs.tab.gz -h AFs.hdr -c CHROM,POS,REF,ALT,REF_AN,REF_AC -Ou | bcftools call -mv -F REF_AN,REF_AC [...]
-G, --group-samples FILE|-
by default, all samples are assumed to come from a single population. This option allows to group samples @@ -556,13 +557,13 @@ -v, --variants-only
output variant sites only -

Consensus/variant calling options:

+

Consensus/variant calling options:

-c, --consensus-caller
the original samtools/bcftools calling method (conflicts with -m)
-C, --constrain alleles|trio -
+
alleles
call genotypes given alleles. See also -T, --targets-file. @@ -610,10 +611,10 @@ -Y, --chromosome-Y
haploid output for males and skips females (requires PED file with -s) -

bcftools cnv [OPTIONS] FILE

Copy number variation caller, requires a VCF annotated with the Illumina’s +

bcftools cnv [OPTIONS] FILE

Copy number variation caller, requires a VCF annotated with the Illumina’s B-allele frequency (BAF) and Log R Ratio intensity (LRR) values. The HMM considers the following copy number states: CN 2 (normal), 1 (single-copy -loss), 0 (complete loss), 3 (single-copy gain).

General Options:

+loss), 0 (complete loss), 3 (single-copy gain).

General Options:

-c, --control-sample string
optional control sample name. If given, pairwise calling is performed @@ -652,7 +653,7 @@ -T, --targets-file FILE
see Common Options -

HMM Options:

+

HMM Options:

-a, --aberrant float[,float]
fraction of aberrant cells in query and control. The hallmark of @@ -701,13 +702,13 @@
the HMM probability of transition to another copy number state. Increasing this value leads to smaller and more frequent calls. -

bcftools concat [OPTIONS] FILE1 FILE2 […]

Concatenate or combine VCF/BCF files. All source files must have the same sample +

bcftools concat [OPTIONS] FILE1 FILE2 […]

Concatenate or combine VCF/BCF files. All source files must have the same sample columns appearing in the same order. Can be used, for example, to concatenate chromosome VCFs into one VCF, or combine a SNP VCF and an indel VCF into one. The input files must be sorted by chr and position. The files must be given in the correct order to produce sorted VCF on output unless the -a, --allow-overlaps option is specified. With the --naive option, the files -are concatenated without being recompressed, which is very fast..

+are concatenated without being recompressed, which is very fast.

-a, --allow-overlaps
First coordinate of the next file can precede last record of the current file. @@ -774,13 +775,13 @@ --threads INT
see Common Options -

bcftools consensus [OPTIONS] FILE

Create consensus sequence by applying VCF variants to a reference fasta file. +

bcftools consensus [OPTIONS] FILE

Create consensus sequence by applying VCF variants to a reference fasta file. By default, the program will apply all ALT variants to the reference fasta to obtain the consensus sequence. Using the --sample (and, optionally, --haplotype) option will apply genotype (haplotype) calls from FORMAT/GT. Note that the program does not act as a primitive variant caller and ignores allelic depth information, such as INFO/AD or FORMAT/AD. For that, consider using the -setGT plugin.

+setGT plugin.

-c, --chain FILE
write a chain file for liftover @@ -797,7 +798,7 @@ -H, --haplotype 1|2|R|A|I|LR|LA|SR|SA|1pIu|2pIu

choose which allele from the FORMAT/GT field to use (the codes are case-insensitive): -

+

1
the first allele, regardless of phasing @@ -878,7 +879,7 @@ # Create consensus for one region. The fasta header lines are then expected # in the form ">chr:from-to". - samtools faidx ref.fa 8:11870-11890 | bcftools consensus in.vcf.gz -o out.fa

bcftools convert [OPTIONS] FILE

VCF input options:

+ samtools faidx ref.fa 8:11870-11890 | bcftools consensus in.vcf.gz -o out.fa

bcftools convert [OPTIONS] FILE

VCF input options:

-e, --exclude EXPRESSION
exclude sites for which EXPRESSION is true. For valid expressions see @@ -912,7 +913,7 @@ -T, --targets-file FILE
see Common Options -

VCF output options:

+

VCF output options:

--no-version
see Common Options @@ -928,7 +929,7 @@ --threads INT
see Common Options -

GEN/SAMPLE conversion:

+

GEN/SAMPLE conversion:

-G, --gensample2vcf prefix or gen-file,sample-file
convert IMPUTE2 output to VCF. The second column must be of the form @@ -953,7 +954,7 @@ ID_1 ID_2 missing 0 0 0 sample1 sample1 0 - sample2 sample2 0
+ sample2 sample2 0
--tag STRING
tag to take values for .gen file: GT,PL,GL,GP @@ -966,11 +967,11 @@
output sex column in the sample file. The FILE format is
    MaleSample    M
-    FemaleSample  F
+ FemaleSample F
--vcf-ids
output VCF IDs in the second column instead of CHROM:POS_REF_ALT -

gVCF conversion:

+

gVCF conversion:

--gvcf2vcf
convert gVCF to VCF, expanding REF blocks into sites. Note that @@ -982,7 +983,7 @@ -f, --fasta-ref file
reference sequence in fasta format. Must be indexed with samtools faidx -

HAP/SAMPLE conversion:

+

HAP/SAMPLE conversion:

--hapsample2vcf prefix or hap-file,sample-file
convert from hap/sample format to VCF. The columns of .hap file are @@ -994,7 +995,7 @@ ---- 1:111485207_G_A rsID1 111485207 G A 0 1 0 0 1:111494194_C_T rsID2 111494194 C T 0 1 0 0 - 1:111495231_A_<DEL>_111495784 rsID3 111495231 A <DEL> 0 0 1 0
+ 1:111495231_A_<DEL>_111495784 rsID3 111495231 A <DEL> 0 0 1 0
--hapsample prefix or hap-file,sample-file
convert from VCF to hap/sample format used by IMPUTE2 and SHAPEIT. @@ -1013,11 +1014,11 @@
output sex column in the sample file. The FILE format is
    MaleSample    M
-    FemaleSample  F
+ FemaleSample F
--vcf-ids
output VCF IDs instead of "CHROM:POS_REF_ALT" IDs -

HAP/LEGEND/SAMPLE conversion:

+

HAP/LEGEND/SAMPLE conversion:

-H, --haplegendsample2vcf prefix or hap-file,legend-file,sample-file
convert from hap/legend/sample format used by IMPUTE2 to VCF, see @@ -1045,7 +1046,7 @@ ------- sample population group sex sample1 sample1 sample1 2 - sample2 sample2 sample2 2
+ sample2 sample2 sample2 2
--haploid2diploid
with -h option converts haploid genotypes to homozygous diploid @@ -1057,11 +1058,11 @@
output sex column in the sample file. The FILE format is
    MaleSample    M
-    FemaleSample  F
+ FemaleSample F
--vcf-ids
output VCF IDs instead of "CHROM:POS_REF_ALT" IDs -

TSV conversion:

+

TSV conversion:

--tsv2vcf file
convert from TSV (tab-separated values) format (such as generated by @@ -1090,7 +1091,7 @@
file of sample names. See Common Options

Example:

# Convert 23andme results into VCF
-bcftools convert -c ID,CHROM,POS,AA -s SampleName -f 23andme-ref.fa --tsv2vcf 23andme.txt -Oz -o out.vcf.gz

bcftools csq [OPTIONS] FILE

Haplotype aware consequence predictor which correctly handles combined +bcftools convert -c ID,CHROM,POS,AA -s SampleName -f 23andme-ref.fa --tsv2vcf 23andme.txt -Oz -o out.vcf.gz

bcftools csq [OPTIONS] FILE

Haplotype aware consequence predictor which correctly handles combined variants such as MNPs split over multiple VCF records, SNPs separated by an intron (but adjacent in the spliced transcript) or nearby frame-shifting indels which in combination in fact are not frame-shifting.

The output VCF is annotated with INFO/BCSQ and FORMAT/BCSQ tag (configurable @@ -1098,7 +1099,7 @@ interleaved haplotypes. See the usage examples below for using the %TBCSQ converter in query for extracting a more human readable form from this bitmask. The construction of the bitmask limits the number of consequences -that can be referenced in the FORMAT/BCSQ tags. By default this is 16, but +that can be referenced per sample in the FORMAT/BCSQ tags. By default this is 16, but if more are required, see the --ncsq option.

The program requires on input a VCF/BCF file, the reference genome in fasta format (--fasta-ref) and genomic features in the GFF3 format downloadable from the Ensembl website (--gff-annot), and outputs an annotated VCF/BCF @@ -1108,7 +1109,7 @@ with the --local-csq option.

If conflicting (overlapping) variants within one haplotype are detected, a warning will be emitted and predictions will be based on only the first variant in the analysis.

Symbolic alleles are not supported. They will remain unannotated in the -output VCF and are ignored for the prediction analysis.

+output VCF and are ignored for the prediction analysis.

-c, --custom-tag STRING
use this custom tag to store consequences rather than the default BCSQ tag @@ -1164,7 +1165,7 @@ 1 ignored_field three_prime_UTR 21 2054 . - . Parent=transcript:TranscriptId 1 ignored_field exon 21 2148 . - . Parent=transcript:TranscriptId 1 ignored_field CDS 21 2148 . - 1 Parent=transcript:TranscriptId - 1 ignored_field five_prime_UTR 210 2148 . - . Parent=transcript:TranscriptId
+ 1 ignored_field five_prime_UTR 210 2148 . - . Parent=transcript:TranscriptId
-i, --include EXPRESSION
include only sites for which EXPRESSION is true. For valid expressions see @@ -1177,7 +1178,7 @@
-n, --ncsq INT
- maximum number of consequences to consider per site. The INFO/BCSQ column includes + maximum number of per-haplotype consequences to consider for each site. The INFO/BCSQ column includes all consequences, but only the first INT will be referenced by the FORMAT/BCSQ fields. The default value is 16 which corresponds to one integer per diploid sample. Note that increasing the value leads to increased memory and is rarely necessary. @@ -1198,7 +1199,7 @@ -p, --phase a|m|r|R|s

how to handle unphased heterozygous genotypes: -

+

a
take GTs as is, create haplotypes regardless of phase (0/1 → 0|1) @@ -1277,7 +1278,7 @@ BCSQ=stop_gained|C2orf83|ENST00000264387|-|141W>141*|228476140C>T # The consequence type of a variant downstream from a stop are prefixed with * - BCSQ=*missense|PER3|ENST00000361923|+|1028M>1028T|7890117T>C

bcftools filter [OPTIONS] FILE

Apply fixed-threshold filters.

+ BCSQ=*missense|PER3|ENST00000361923|+|1028M>1028T|7890117T>C

bcftools filter [OPTIONS] FILE

Apply fixed-threshold filters.

-e, --exclude EXPRESSION
exclude sites for which EXPRESSION is true. For valid expressions see @@ -1295,7 +1296,7 @@ Here the positions 1 and 6 are filtered, 0 and 7 are not: 0123-456789 ref .G.G-..G.. - ins .A.GT..A..
+ ins .A.GT..A..
-G, --IndelGap INT
filter clusters of indels separated by INT or fewer base pairs allowing @@ -1308,7 +1309,7 @@ And similarly here, the second is filtered: 01 23 456 78 ref .A-.A-..A-.. - ins .AT.AT..AT..
+ ins .AT.AT..AT..
-i, --include EXPRESSION
include only sites for which EXPRESSION is true. For valid expressions see @@ -1363,10 +1364,10 @@ --threads INT
see Common Options -

bcftools gtcheck [OPTIONS] [-g genotypes.vcf.gz] query.vcf.gz

Checks sample identity. The program can operate in two modes. If the -g +

bcftools gtcheck [OPTIONS] [-g genotypes.vcf.gz] query.vcf.gz

Checks sample identity. The program can operate in two modes. If the -g option is given, the identity of samples from query.vcf.gz is checked against the samples in the -g file. -Without the -g option, multi-sample cross-check of samples in query.vcf.gz is performed.

+Without the -g option, multi-sample cross-check of samples in query.vcf.gz is performed.

--distinctive-sites NUM[,MEM[,DIR]]
Find sites that can distinguish between at least NUM sample pairs. If the number is smaller or equal to 1, @@ -1429,7 +1430,7 @@ List of query samples or -g samples. If neither -s nor -S are given, all possible sample pair combinations are compared

-S, --samples-file [qry|gt]:'FILE' File with the query or -g samples to compare. If neither -s nor -S are given, all possible sample - pair combinations are compared

+ pair combinations are compared

-t, --targets file
see Common Options @@ -1450,13 +1451,13 @@ bcftools gtcheck -s gt:a1,a2,a3 -s qry:b1,b2 -g A.bcf B.bcf # Compare only two pairs a1,b1 and a1,b2 - bcftools gtcheck -p a1,b1,a1,b2 -g A.bcf B.bcf

bcftools index [OPTIONS] in.bcf|in.vcf.gz

Creates index for bgzip compressed VCF/BCF files for random access. CSI + bcftools gtcheck -p a1,b1,a1,b2 -g A.bcf B.bcf

bcftools index [OPTIONS] in.bcf|in.vcf.gz

Creates index for bgzip compressed VCF/BCF files for random access. CSI (coordinate-sorted index) is created by default. The CSI format supports indexing of chromosomes up to length 2^31. TBI (tabix index) index files, which support chromosome lengths up to 2^29, can be created by using the -t/--tbi option or using the tabix program packaged with htslib. When loading an index file, bcftools will try -the CSI first and then the TBI.

Indexing options:

+the CSI first and then the TBI.

Indexing options:

-c, --csi
generate CSI-format index for VCF/BCF files [default] @@ -1481,7 +1482,7 @@ --threads INT
see Common Options -

Stats options:

+

Stats options:

-n, --nrecords
print the number of records based on the CSI or TBI index files @@ -1492,10 +1493,10 @@ Output format is three tab-delimited columns listing the contig name, contig length (. if unknown) and number of records for the contig. Contigs with zero records are not printed. -

bcftools isec [OPTIONS] A.vcf.gz B.vcf.gz […]

Creates intersections, unions and complements of VCF files. Depending +

bcftools isec [OPTIONS] A.vcf.gz B.vcf.gz […]

Creates intersections, unions and complements of VCF files. Depending on the options, the program can output records from one (or more) files which have (or do not have) corresponding records with the same position -in the other files.

+in the other files.

-c, --collapse snps|indels|both|all|some|none
see Common Options @@ -1560,17 +1561,17 @@
list of input files to output given as 1-based indices. With -p and no -w, all files are written. -

Examples:

Create intersection and complements of two sets saving the output in dir/*

    bcftools isec -p dir A.vcf.gz B.vcf.gz

Filter sites in A (require INFO/MAF>=0.01) and B (require INFO/dbSNP) but not in C, +

Examples:

Create intersection and complements of two sets saving the output in dir/*

    bcftools isec -p dir A.vcf.gz B.vcf.gz

Filter sites in A (require INFO/MAF>=0.01) and B (require INFO/dbSNP) but not in C, and create an intersection, including only sites which appear in at least two of -the files after filters have been applied

    bcftools isec -e'MAF<0.01' -i'dbSNP=1' -e- A.vcf.gz B.vcf.gz C.vcf.gz -n +2 -p dir

Extract and write records from A shared by both A and B using exact allele match

    bcftools isec -p dir -n=2 -w1 A.vcf.gz B.vcf.gz

Extract records private to A or B comparing by position only

    bcftools isec -p dir -n-1 -c all A.vcf.gz B.vcf.gz

Print a list of records which are present in A and B but not in C and D

    bcftools isec -n~1100 -c all A.vcf.gz B.vcf.gz C.vcf.gz D.vcf.gz

bcftools merge [OPTIONS] A.vcf.gz B.vcf.gz […]

Merge multiple VCF/BCF files from non-overlapping sample sets to create one +the files after filters have been applied

    bcftools isec -e'MAF<0.01' -i'dbSNP=1' -e- A.vcf.gz B.vcf.gz C.vcf.gz -n +2 -p dir

Extract and write records from A shared by both A and B using exact allele match

    bcftools isec -p dir -n=2 -w1 A.vcf.gz B.vcf.gz

Extract records private to A or B comparing by position only

    bcftools isec -p dir -n-1 -c all A.vcf.gz B.vcf.gz

Print a list of records which are present in A and B but not in C and D

    bcftools isec -n~1100 -c all A.vcf.gz B.vcf.gz C.vcf.gz D.vcf.gz

bcftools merge [OPTIONS] A.vcf.gz B.vcf.gz […]

Merge multiple VCF/BCF files from non-overlapping sample sets to create one multi-sample file. For example, when merging file A.vcf.gz containing samples S1, S2 and S3 and file B.vcf.gz containing samples S3 and -S4, the output file will contain four samples named S1, S2, S3, 2:S3 +S4, the output file will contain five samples named S1, S2, S3, 2:S3 and S4.

Note that it is the responsibility of the user to ensure that the sample names are unique across all files. If they are not, the program will exit with an error unless the option --force-samples is given. The sample names can also be given explicitly using the --print-header and --use-header options.

Note that only records from different files can be merged, never from the same file. -For "vertical" merge take a look at bcftools concat or bcftools norm -m instead.

+For "vertical" merge take a look at bcftools concat or bcftools norm -m instead.

--force-samples
if the merged files contain duplicate sample names, proceed anyway. @@ -1640,7 +1641,7 @@ -m indels .. allow multiallelic indel records -m both .. both SNP and indel records can be multiallelic -m all .. SNP records can be merged with indel records --m id .. merge by ID
+-m id .. merge by ID
--no-index
the option allows to merge files without indexing them first. In order for this @@ -1670,7 +1671,7 @@ --threads INT
see Common Options -

bcftools mpileup [OPTIONS] -f ref.fa in.bam [in2.bam […]]

Generate VCF or BCF containing genotype likelihoods for one or multiple +

bcftools mpileup [OPTIONS] -f ref.fa in.bam [in2.bam […]]

Generate VCF or BCF containing genotype likelihoods for one or multiple alignment (BAM or CRAM) files. This is based on the original samtools mpileup command (with the -v or -g options) producing genotype likelihoods in VCF or BCF format, but not the textual pileup @@ -1689,7 +1690,7 @@ index is used to find chromosome 20 and then it is filtered for the regions listed in the BED file. Also note that the -r option can be much slower than -t with many regions and can require more memory when -multiple regions and many alignment files are processed.

Input options

+multiple regions and many alignment files are processed.

Input options

-6, --illumina1.3+
Assume the quality is in the Illumina 1.3+ encoding. @@ -1761,7 +1762,7 @@ RG_ID_5 FILE_1.bam SAMPLE_A RG_ID_6 FILE_2.bam SAMPLE_A * FILE_3.bam SAMPLE_C - ? FILE_3.bam SAMPLE_D
+ ? FILE_3.bam SAMPLE_D
-q, --min-MQ INT
Minimum mapping quality for an alignment to be used [0] @@ -1817,7 +1818,7 @@ -x, --ignore-overlaps
Disable read-pair overlap detection. -

Output options

+

Output options

-a, --annotate LIST
Comma-separated list of FORMAT and INFO tags to output. (case-insensitive, @@ -1839,7 +1840,7 @@ FORMAT/DP4 .. Deprecated in favor of FORMAT/ADF and FORMAT/ADR; Number of high-quality ref-forward, ref-reverse, alt-forward and alt-reverse bases (Number=4,Type=Integer) FORMAT/DPR .. Deprecated in favor of FORMAT/AD; Number of high-quality bases for each observed allele (Number=R,Type=Integer) -INFO/DPR .. Deprecated in favor of INFO/AD; Number of high-quality bases for each observed allele (Number=R,Type=Integer)
+INFO/DPR .. Deprecated in favor of INFO/AD; Number of high-quality bases for each observed allele (Number=R,Type=Integer)
-g, --gvcf INT[,…]
output gVCF blocks of homozygous REF calls, with depth (DP) ranges @@ -1870,7 +1871,7 @@ --threads INT
see Common Options -

Options for SNP/INDEL genotype likelihood computation

+

Options for SNP/INDEL genotype likelihood computation

-e, --ext-prob INT
Phred-scaled gap extension sequencing error probability. Reducing INT @@ -1915,7 +1916,7 @@ indel candidates are obtained. It is recommended to collect indel candidates from sequencing technologies that have low indel error rate such as ILLUMINA [all] -

Examples:

Call SNPs and short INDELs, then mark low quality sites and sites with the read +

Examples:

Call SNPs and short INDELs, then mark low quality sites and sites with the read depth exceeding a limit. (The read depth should be adjusted to about twice the average read depth as higher read depths usually indicate problematic regions which are often enriched for artefacts.) One may consider adding -C50 to @@ -1923,13 +1924,17 @@ mismatches. Applying this option usually helps for BWA-backtrack alignments, but may not for other aligners.

    bcftools mpileup -Ou -f ref.fa aln.bam | \
     bcftools call -Ou -mv | \
-    bcftools filter -s LowQual -e '%QUAL<20 || DP>100' > var.flt.vcf

bcftools norm [OPTIONS] file.vcf.gz

Left-align and normalize indels, check if REF alleles match the reference, + bcftools filter -s LowQual -e '%QUAL<20 || DP>100' > var.flt.vcf

bcftools norm [OPTIONS] file.vcf.gz

Left-align and normalize indels, check if REF alleles match the reference, split multiallelic sites into multiple rows; recover multiallelics from multiple rows. Left-alignment and normalization will only be applied if -the --fasta-ref option is supplied.

--a, --atomize .|* +the --fasta-ref option is supplied.

+-a, --atomize +
+ Decompose complex variants, e.g. split MNVs into consecutive SNVs. + See also --atom-overlaps and --old-rec-tag. +
+--atom-overlaps .|*
- Decompose complex variants (e.g. split MNVs into consecutive SNVs). Alleles missing because of an overlapping variant can be set either to missing (.) or to the star allele (*), as recommended by the VCF specification. IMPORTANT: Note that asterisk is expanded @@ -1948,7 +1953,7 @@ # bcftools norm -a \* 100 C G,* 2/1 100 CC C,* 1/2 - 101 C G,* 2/1
+ 101 C G,* 2/1
-c, --check-ref e|w|x|s
what to do when incorrect or missing REF allele is encountered: @@ -2007,6 +2012,11 @@ reference -f. The -N option will not turn on indel normalisation as the -f option normally implies
+--old-rec-tag STR +
+ Add INFO/STR annotation with the original record. The format of the + annotation is CHROM|POS|REF|ALT|USED_ALT_IDX. +
-o, --output FILE
see Common Options @@ -2043,13 +2053,13 @@
maximum distance between two records to consider when locally sorting variants which changed position during the realignment -

bcftools [plugin NAME|+NAME] [OPTIONS] FILE — [PLUGIN OPTIONS]

A common framework for various utilities. The plugins can be used +

bcftools [plugin NAME|+NAME] [OPTIONS] FILE — [PLUGIN OPTIONS]

A common framework for various utilities. The plugins can be used the same way as normal commands only their name is prefixed with "+". Most plugins accept two types of parameters: general options shared by all plugins followed by a separator, and a list of plugin-specific options. There are some exceptions to this rule, some plugins do not accept the common options and implement their own parameters. Therefore please pay attention to -the usage examples that each plugin comes with.

VCF input options:

+the usage examples that each plugin comes with.

VCF input options:

-e, --exclude EXPRESSION
exclude sites for which EXPRESSION is true. For valid expressions see @@ -2075,7 +2085,7 @@ -T, --targets-file file
see Common Options -

VCF output options:

+

VCF output options:

--no-version
see Common Options @@ -2091,7 +2101,7 @@ --threads INT
see Common Options -

Plugin options:

+

Plugin options:

-h, --help
list plugin’s options @@ -2112,7 +2122,7 @@ -V, --version
print version string and exit -

List of plugins coming with the distribution:

+

List of plugins coming with the distribution:

ad-bias
find positions with wildly varying ALT allele frequency (Fisher test on FMT/AD) @@ -2145,7 +2155,7 @@

runs a basic association test, per-site or in a region, and checks for novel alleles and genotypes in two groups of samples. Adds the following INFO annotations: -

  • +

    • PASSOC .. Fisher’s exact test probability of genotypic association (REF vs non-REF allele)
    • FASSOC .. proportion of non-REF allele in controls and cases @@ -2172,7 +2182,7 @@ fill-tags

      set various INFO tags. The list of tags supported in this version: -

      • +

        • INFO/AC Number:A Type:Integer .. Allele count in genotypes
        • INFO/AC_Hom Number:A Type:Integer .. Allele counts in homozygous genotypes @@ -2199,6 +2209,10 @@
        • INFO/TYPE Number:. Type:String .. The record type (REF,SNP,MNP,INDEL,etc)
        • +FORMAT/VAF Number:A Type:Float .. The fraction of reads with the alternate allele, requires FORMAT/AD or ADF+ADR +
        • +FORMAT/VAF1 Number:1 Type:Float .. The same as FORMAT/VAF but for all alternate alleles cumulatively +
        • TAG=func(TAG) Number:1 Type:Integer .. Experimental support for user-defined expressions such as "DP=sum(DP)"
      fix-ploidy @@ -2307,7 +2321,7 @@ variantkey-hex
      generate unsorted VariantKey-RSid index files in hexadecimal format -

Examples:

# List options common to all plugins
+

Examples:

# List options common to all plugins
 bcftools plugin
 
 # List available plugins
@@ -2332,11 +2346,11 @@
 bcftools +missing2ref in.vcf
 
 # Replace missing genotypes with 0|0
-bcftools +missing2ref in.vcf -- -p

Plugins troubleshooting:

Things to check if your plugin does not show up in the bcftools plugin -l output:

  • +bcftools +missing2ref in.vcf -- -p

Plugins troubleshooting:

Things to check if your plugin does not show up in the bcftools plugin -l output:

  • Run with the -v option for verbose output: bcftools plugin -lv
  • Does the environment variable BCFTOOLS_PLUGINS include the correct path? -

Plugins API:

// Short description used by 'bcftools plugin -l'
+

Plugins API:

// Short description used by 'bcftools plugin -l'
 const char *about(void);
 
 // Longer description used by 'bcftools +name -h'
@@ -2351,10 +2365,10 @@
 bcf1_t *process(bcf1_t *rec);
 
 // Called after all lines have been processed to clean up
-void destroy(void);

bcftools polysomy [OPTIONS] file.vcf.gz

Detect number of chromosomal copies in VCFs annotates with the Illumina’s +void destroy(void);

bcftools polysomy [OPTIONS] file.vcf.gz

Detect number of chromosomal copies in VCFs annotated with Illumina’s B-allele frequency (BAF) values. Note that this command is not compiled in by default, see the section Optional Compilation with GSL in the INSTALL -file for help.

General options:

+file for help.

General options:

-o, --output-dir path
output directory @@ -2383,7 +2397,7 @@
verbose debugging output which gives hints about the thresholds and decisions made by the program. Note that the exact output can change between versions. -

Algorithm options:

+

Algorithm options:

-b, --peak-size float
the minimum peak size considered as a good match can be from the interval [0,1] @@ -2415,7 +2429,7 @@
a heuristic to filter failed fits where the expected peak symmetry is violated. The float is from the interval [0,1] and larger is stricter -

bcftools query [OPTIONS] file.vcf.gz [file.vcf.gz […]]

Extracts fields from VCF or BCF files and outputs them in user-defined format.

+

bcftools query [OPTIONS] file.vcf.gz [file.vcf.gz […]]

Extracts fields from VCF or BCF files and outputs them in user-defined format.

-e, --exclude EXPRESSION
exclude sites for which EXPRESSION is true. For valid expressions see @@ -2474,7 +2488,7 @@ -v, --vcf-list FILE
process multiple VCFs listed in the file -

Format:

%CHROM          The CHROM column (similarly also other columns: POS, ID, REF, ALT, QUAL, FILTER)
+

Format:

%CHROM          The CHROM column (similarly also other columns: POS, ID, REF, ALT, QUAL, FILTER)
 %END            End position of the REF allele
 %END0           End position of the REF allele in 0-based coordinates
 %FIRST_ALT      Alias for %ALT{0}
@@ -2495,7 +2509,7 @@
 %TYPE           Variant type (REF, SNP, MNP, INDEL, BND, OTHER)
 []              Format fields must be enclosed in brackets to loop over all samples
 \n              new line
-\t              tab character
Everything else is printed verbatim.

Examples:

# Print chromosome, position, ref allele and the first alternate allele
+\t              tab character
Everything else is printed verbatim.

Examples:

# Print chromosome, position, ref allele and the first alternate allele
 bcftools query -f '%CHROM  %POS  %REF  %ALT{0}\n' file.vcf.gz
# Similar to above, but use tabs instead of spaces, add sample name and genotype
 bcftools query -f '%CHROM\t%POS\t%REF\t%ALT[\t%SAMPLE=%GT]\n' file.vcf.gz
# Print FORMAT/GQ fields followed by FORMAT/GT fields
 bcftools query -f 'GQ:[ %GQ] \t GT:[ %GT]\n' file.vcf
# Make a BED file: chr, pos (0-based), end pos (1-based), id
@@ -2505,7 +2519,7 @@
 bcftools query -i'GT="het"' -f'[%CHROM:%POS %SAMPLE %GT %pbinom(AD)\n]' file.vcf
# Print the second value of AC field if bigger than 10. Note the (unfortunate) difference in
 # index subscript notation: formatting expressions (-f) uses "{}" while filtering expressions
 # (-i) use "[]". This is for historic reasons and backward-compatibility.
-bcftools query -f '%AC{1}\n' -i 'AC[1]>10' file.vcf.gz

bcftools reheader [OPTIONS] file.vcf.gz

Modify header of VCF/BCF files, change sample names.

+bcftools query -f '%AC{1}\n' -i 'AC[1]>10' file.vcf.gz

bcftools reheader [OPTIONS] file.vcf.gz

Modify header of VCF/BCF files, change sample names.

-f, --fai FILE
add to the header contig names and their lengths from the provided fasta index file (.fai). @@ -2532,8 +2546,8 @@ --threads INT
see Common Options -

bcftools roh [OPTIONS] file.vcf.gz

A program for detecting runs of homo/autozygosity. Only bi-allelic sites -are considered.

The HMM model:

Notation:
+

bcftools roh [OPTIONS] file.vcf.gz

A program for detecting runs of homo/autozygosity. Only bi-allelic sites +are considered.

The HMM model:

Notation:
   D  = Data, AZ = autozygosity, HW = Hardy-Weinberg (non-autozygosity),
   f  = non-ref allele frequency
 
@@ -2550,7 +2564,7 @@
   HWi = P_i(HW)
 
   P_{i+1}(AZ) = oAZ * max[(1 - tAZ * ci) * AZ{i-1} , tAZ * ci * (1-AZ{i-1})]
-  P_{i+1}(HW) = oHW * max[(1 - tHW * ci) * (1-AZ{i-1}) , tHW * ci * AZ{i-1}]

General Options:

+ P_{i+1}(HW) = oHW * max[(1 - tHW * ci) * (1-AZ{i-1}) , tHW * ci * AZ{i-1}]

General Options:

--AF-dflt FLOAT
in case allele frequency is not known, use the FLOAT. By default, sites where @@ -2569,7 +2583,7 @@ bgzip and indexed with tabix -s1 -b2 -e2. Sites which are not present in the FILE or have different reference or alternate allele will be skipped. Note that such a file can be easily created from a VCF using: -
    bcftools query -f'%CHROM\t%POS\t%REF,%ALT\t%INFO/TAG\n' file.vcf | bgzip -c > freqs.tab.gz
+
    bcftools query -f'%CHROM\t%POS\t%REF,%ALT\t%INFO/TAG\n' file.vcf | bgzip -c > freqs.tab.gz
-b, --buffer-size INT[,INT]
when the entire many-sample file cannot fit into memory, a sliding @@ -2658,7 +2672,7 @@ -T, --targets-file file
see Common Options -

HMM Options:

+

HMM Options:

-a, --hw-to-az FLOAT
P(AZ|HW) transition probability from HW (Hardy-Weinberg) to AZ (autozygous) state @@ -2671,7 +2685,7 @@
estimate HMM parameters using Baum-Welch algorithm, using the convergence threshold FLOAT, e.g. 1e-10 (experimental) -

bcftools sort [OPTIONS] file.bcf

+

bcftools sort [OPTIONS] file.bcf

-m, --max-mem FLOAT[kMG]
Maximum memory to use. Approximate, affects the number of temporary files written @@ -2689,7 +2703,7 @@ -T, --temp-dir DIR
Use this directory to store temporary files -

bcftools stats [OPTIONS] A.vcf.gz [B.vcf.gz]

Parses VCF or BCF and produces text file stats which is suitable for machine +

bcftools stats [OPTIONS] A.vcf.gz [B.vcf.gz]

Parses VCF or BCF and produces text file stats which is suitable for machine processing and can be plotted using plot-vcfstats. When two files are given, the program generates separate stats for intersection and the complements. By default only sites are compared, -s/-S must be given to also include sample @@ -2699,7 +2713,7 @@ etc. are printed. When two VCF files are given, then stats such as concordance (Genotype concordance by non-reference allele frequency, Genotype concordance by sample, Non-Reference Discordance) -and correlation are also printed. Per-site discordance (PSD) is also printed in --verbose mode.

+and correlation are also printed. Per-site discordance (PSD) is also printed in --verbose mode.

--af-bins LIST|FILE
comma separated list of allele frequency bins (e.g. 0.1,0.5,1) @@ -2736,7 +2750,7 @@ tab-delimited file with exons for indel frameshifts statistics. The columns of the file are CHR, FROM, TO, with 1-based, inclusive, positions. The file is BGZF-compressed and indexed with tabix -
    tabix -s1 -b2 -e3 file.gz
+
    tabix -s1 -b2 -e3 file.gz
-f, --apply-filters LIST
see Common Options @@ -2786,8 +2800,8 @@ -v, --verbose
produce verbose per-site and per-sample output -

bcftools view [OPTIONS] file.vcf.gz [REGION […]]

View, subset and filter VCF or BCF files by position and filtering expression. -Convert between VCF and BCF. Former bcftools subset.

Output options

+

bcftools view [OPTIONS] file.vcf.gz [REGION […]]

View, subset and filter VCF or BCF files by position and filtering expression. +Convert between VCF and BCF. Former bcftools subset.

Output options

-G, --drop-genotypes
drop individual genotype information (after subsetting if -s option is set) @@ -2813,7 +2827,7 @@
see Common Options

-o, --output FILE: - output file name. If not present, the default is to print to standard output (stdout).

+ output file name. If not present, the default is to print to standard output (stdout).

-r, --regions chr|chr:pos|chr:from-to|chr:from-[,…]
see Common Options @@ -2833,7 +2847,7 @@ --threads INT
see Common Options -

Subset options:

+

Subset options:

-a, --trim-alt-alleles
remove alleles not seen in the genotype fields from the ALT column. Note that if no alternate allele @@ -2858,7 +2872,7 @@
see Common Options. Note that it is possible to create multiple subsets simultaneously using the split plugin. -

Filter options:

Note that filter options below dealing with counting the number of alleles +

Filter options:

Note that filter options below dealing with counting the number of alleles will, for speed, first check for the values of AC and AN in the INFO column to avoid parsing all the genotype (FORMAT/GT) fields in the VCF. This means that a filter like --min-af 0.1 will be calculated from INFO/AC and INFO/AN @@ -2869,7 +2883,7 @@ and some are inherently ambiguous, for example allele counts can be taken from the INFO column when present but calculated on the fly when absent. Therefore it is strongly recommended to spell out the required order explicitly by separating such commands into two steps. (Make sure to use the -O u option -when piping!)

+when piping!)

-c, --min-ac INT[:nref|:alt1|:minor|:major|:'nonmajor']
minimum allele count (INFO/AC) of sites to be printed. @@ -2980,10 +2994,10 @@ -X, --exclude-private
exclude sites where only the subset samples carry a non-reference allele -

bcftools help [COMMAND] | bcftools --help [COMMAND]

Display a brief usage message listing the bcftools commands available. +

bcftools help [COMMAND] | bcftools --help [COMMAND]

Display a brief usage message listing the bcftools commands available. If the name of a command is also given, e.g., bcftools help view, the detailed -usage message for that particular command is displayed.

bcftools [--version|-v]

Display the version numbers and copyright information for bcftools and the -important libraries used by bcftools.

bcftools [--version-only]

Display the full bcftools version number in a machine-readable format.

EXPRESSIONS

These filtering expressions are accepted by most of the commands.

Valid expressions may contain:

  • +usage message for that particular command is displayed.

bcftools [--version|-v]

Display the version numbers and copyright information for bcftools and the +important libraries used by bcftools.

bcftools [--version-only]

Display the full bcftools version number in a machine-readable format.

EXPRESSIONS

These filtering expressions are accepted by most of the commands.

Valid expressions may contain:

  • numerical constants, string constants, file names (this is currently supported only to filter by the ID column)

    1, 1.0, 1e-4
    @@ -3093,7 +3107,7 @@
     the section Optional Compilation with Perl in the INSTALL file for help
     and misc/demo-flt.pl for a working example. The demo defined the perl subroutine
     "severity" which can be invoked from the command line as follows:
    -

    perl:path/to/script.pl; perl.severity(INFO/CSQ) > 3

Notes:

  • +

    perl:path/to/script.pl; perl.severity(INFO/CSQ) > 3

Notes:

  • String comparisons and regular expressions are case-insensitive
  • Comma in strings is interpreted as a separator and when multiple values are compared, the OR logic is used. @@ -3117,9 +3131,9 @@ -e 'TAG[0]!=1' .. true

Examples:

MIN(DV)>5       .. selects the whole site, evaluates min across all values and samples
SMPL_MIN(DV)>5  .. selects matching samples, evaluates within samples
MIN(DV/DP)>0.3
MIN(DP)>10 & MIN(DV)>3
FMT/DP>10  & FMT/GQ>10 .. both conditions must be satisfied within one sample
FMT/DP>10 && FMT/GQ>10 .. the conditions can be satisfied in different samples
QUAL>10 |  FMT/GQ>10   .. true for sites with QUAL>10 or a sample with GQ>10, but selects only samples with GQ>10
QUAL>10 || FMT/GQ>10   .. true for sites with QUAL>10 or a sample with GQ>10, plus selects all samples at such sites
TYPE="snp" && QUAL>=10 && (DP4[2]+DP4[3] > 2)
COUNT(GT="hom")=0      .. no homozygous genotypes at the site
AVG(GQ)>50             .. average (arithmetic mean) of genotype qualities bigger than 50
ID=@file       .. selects lines with ID present in the file
ID!=@~/file    .. skip lines with ID present in the ~/file
MAF[0]<0.05    .. select rare variants at 5% cutoff
POS>=100   .. restrict your range query, e.g. 20:100-200 to strictly sites with POS in that range.

Shell expansion:

Note that expressions must often be quoted because some characters have special meaning in the shell. An example of expression enclosed in single quotes which cause -that the whole expression is passed to the program as intended:

bcftools view -i '%ID!="." & MAF[0]<0.01'

Please refer to the documentation of your shell for details.

SCRIPTS AND OPTIONS

plot-vcfstats [OPTIONS] file.vchk […]

Script for processing output of bcftools stats. It can merge +that the whole expression is passed to the program as intended:

bcftools view -i '%ID!="." & MAF[0]<0.01'

Please refer to the documentation of your shell for details.

SCRIPTS AND OPTIONS

plot-vcfstats [OPTIONS] file.vchk […]

Script for processing output of bcftools stats. It can merge results from multiple outputs (useful when running the stats for each -chromosome separately), plots graphs and creates a PDF presentation.

+chromosome separately), plots graphs and creates a PDF presentation.

-m, --merge
Merge vcfstats files to STDOUT, skip plotting. @@ -3157,15 +3171,15 @@ bcftools stats -s - > file.vchk
# Plot the stats
 plot-vcfstats -p outdir file.vchk
# The final looks can be customized by editing the generated
 # 'outdir/plot.py' script and re-running manually
-cd outdir && python plot.py && pdflatex summary.tex

PERFORMANCE

HTSlib was designed with BCF format in mind. When parsing VCF files, all records +cd outdir && python plot.py && pdflatex summary.tex

PERFORMANCE

HTSlib was designed with BCF format in mind. When parsing VCF files, all records are internally converted into BCF representation. Simple operations, like removing a single column from a VCF file, can be therefore done much faster with standard UNIX commands, such as awk or cut. Therefore it is recommended to use BCF as input/output format whenever possible to avoid -large overhead of the VCF → BCF → VCF conversion.

BUGS

Please report any bugs you encounter on the github website: http://github.com/samtools/bcftools

AUTHORS

Heng Li from the Sanger Institute wrote the original C version of htslib, +large overhead of the VCF → BCF → VCF conversion.

BUGS

Please report any bugs you encounter on the github website: http://github.com/samtools/bcftools

AUTHORS

Heng Li from the Sanger Institute wrote the original C version of htslib, samtools and bcftools. Bob Handsaker from the Broad Institute implemented the BGZF library. Petr Danecek, Shane McCarthy and John Marshall are maintaining and further developing bcftools. Many other people contributed to the program and to the file format specifications, both directly and indirectly by -providing patches, testing and reporting bugs. We thank them all.

RESOURCES

BCFtools GitHub website: http://github.com/samtools/bcftools

Samtools GitHub website: http://github.com/samtools/samtools

HTSlib GitHub website: http://github.com/samtools/htslib

File format specifications: http://samtools.github.io/hts-specs

BCFtools documentation: http://samtools.github.io/bcftools

BCFtools wiki page: https://github.com/samtools/bcftools/wiki

COPYING

The MIT/Expat License or GPL License, see the LICENSE document for details. -Copyright (c) Genome Research Ltd.

\ No newline at end of file +providing patches, testing and reporting bugs. We thank them all.

RESOURCES

BCFtools GitHub website: http://github.com/samtools/bcftools

Samtools GitHub website: http://github.com/samtools/samtools

HTSlib GitHub website: http://github.com/samtools/htslib

File format specifications: http://samtools.github.io/hts-specs

BCFtools documentation: http://samtools.github.io/bcftools

BCFtools wiki page: https://github.com/samtools/bcftools/wiki

COPYING

The MIT/Expat License or GPL License, see the LICENSE document for details. +Copyright (c) Genome Research Ltd.

From 8c37921cba7214434dd51c311398b11647d5a130 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 25 Feb 2021 14:17:21 +0000 Subject: [PATCH 69/81] Fix a signed comparison as pointed by https://github.com/samtools/bcftools/issues/1400#issuecomment-785784842 --- consensus.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/consensus.c b/consensus.c index bd5535e56..a232174c8 100644 --- a/consensus.c +++ b/consensus.c @@ -555,7 +555,7 @@ static void apply_variant(args_t *args, bcf1_t *rec) for (i=0; in; i++) { if ( bcf_gt_is_missing(ptr[i]) ) { is_missing = 1; continue; } - if ( ptr[i]==bcf_int8_vector_end ) break; + if ( ptr[i]==(uint8_t)bcf_int8_vector_end ) break; int jalt = bcf_gt_allele(ptr[i]); if ( jalt >= rec->n_allele ) error("Invalid VCF, too few ALT alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); if ( fallback_alt <= 0 ) fallback_alt = jalt; From 71d744f86d305d8afc10313f2b0c4b358efeb67b Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 25 Feb 2021 16:00:18 +0000 Subject: [PATCH 70/81] Make warnings into errors for automated tests --- .appveyor.yml | 2 +- .cirrus.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 49aa617a2..f2cd428b4 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -43,7 +43,7 @@ build_script: - set MSYSTEM=MINGW64 - set PATH=C:/msys64/usr/bin;C:/msys64/mingw64/bin;%PATH% - "sh -lc \"(cd htslib; autoreconf -i)\"" - - "sh -lc \"autoreconf -i && ./configure && make -j2\"" + - "sh -lc \"autoreconf -i && ./configure --enable-werror && make -j2\"" test_script: - set HOME=. 
diff --git a/.cirrus.yml b/.cirrus.yml index 37763567e..a3e46bbe8 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -34,7 +34,7 @@ compile_template: &COMPILE if test "$USE_CONFIG" = "yes"; then (cd $HTSDIR && autoreconf -i) autoreconf -i - ./configure || (cat config.log; /bin/false) + ./configure --enable-werror || (cat config.log; /bin/false) make -j3 else make -j3 plugindir=$CIRRUS_WORKING_DIR/plugins -e From 2d6bef97854c013b95b6dd7afaf07ff24bb8e937 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Mon, 8 Mar 2021 15:49:46 +0000 Subject: [PATCH 71/81] Print the maximum number of CSQs per sample once finished. --- csq.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/csq.c b/csq.c index 7a1a826a7..d9e3bc4d9 100644 --- a/csq.c +++ b/csq.c @@ -1419,6 +1419,11 @@ void init_data(args_t *args) void destroy_data(args_t *args) { + if ( args->ncsq_small_warned ) + fprintf(stderr, + "Note: Some samples had too many consequences to be represented in %d bytes. If you need to record them all,\n" + " the limit can be increased by running with `--ncsq %d`.\n",args->ncsq_max/8,1+args->ncsq_small_warned/2); + regidx_destroy(args->idx_cds); regidx_destroy(args->idx_utr); regidx_destroy(args->idx_exon); @@ -3102,8 +3107,8 @@ static inline void hap_stage_vcf(args_t *args, tscript_t *tr, int ismpl, int iha args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,csq->idx); if ( !args->ncsq_small_warned ) fprintf(stderr," The limit can be increased by setting the --ncsq parameter. This warning is printed only once.\n"); - args->ncsq_small_warned = 1; } + if ( args->ncsq_small_warned < icsq ) args->ncsq_small_warned = icsq; break; } if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32; @@ -3749,6 +3754,7 @@ void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec) fprintf(stderr," The limit can be increased by setting the --ncsq parameter. 
This warning is printed only once.\n"); args->ncsq_small_warned = 1; } + if ( args->ncsq_small_warned < icsq ) args->ncsq_small_warned = icsq; break; } if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32; From 8c8ff136698e3310eae8f26cd7497a862fd485b1 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Mon, 8 Mar 2021 15:53:01 +0000 Subject: [PATCH 72/81] Fix minor typos in NEWS --- NEWS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/NEWS b/NEWS index de8d796ef..7b0b49cb9 100644 --- a/NEWS +++ b/NEWS @@ -11,7 +11,7 @@ Changes affecting the whole of bcftools, or multiple commands: * Fix N_PASS and F_PASS to behave according to expectation when reverse logic is used (#1397). This fix has the side effect of `query` (or programs like `+trio-stats`) behaving differently with these expressions, - operating now in site-oritented rather than sample-oriented mode. For + operating now in site-oriented rather than sample-oriented mode. For example, the new behavior could be: bcftools query -f'[%POS %SAMPLE %GT\n]' -i'N_PASS(GT="alt")==1' 11 A 0/0 @@ -86,7 +86,7 @@ Changes affecting specific commands: - Any sensible Number=R,Type=Integer annotation can be used with -G, such as AD or QS - - Don't trim QUAL; although usefuleness of this change is questionable for + - Don't trim QUAL; although usefulness of this change is questionable for true probabilistic interpretation (such high precision is unrealistic), using QUAL as a score rather than probability is helpful and permits more fine-grained filtering From 67b23db863953ca5d56dcc715e27da9e30e0c246 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 9 Mar 2021 12:00:40 +0000 Subject: [PATCH 73/81] Happy New Year! --- main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.c b/main.c index eafbeec90..f89271108 100644 --- a/main.c +++ b/main.c @@ -1,6 +1,6 @@ /* main.c -- main bcftools command front-end. - Copyright (C) 2012-2020 Genome Research Ltd. 
+ Copyright (C) 2012-2021 Genome Research Ltd. Author: Petr Danecek @@ -251,7 +251,7 @@ int main(int argc, char *argv[]) if (argc < 2) { usage(stderr); return 1; } if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) { - printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2020 Genome Research Ltd.\n", bcftools_version(), hts_version()); + printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2021 Genome Research Ltd.\n", bcftools_version(), hts_version()); #if USE_GPL printf("License GPLv3+: GNU GPL version 3 or later \n"); #else From 56f471e68640188c5e248ab8ee634ada12b64320 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 9 Mar 2021 14:33:49 +0000 Subject: [PATCH 74/81] Comment out unused function to prevent `./configure --enable-werror` failures on tests platforms --- plugins/trio-dnm2.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/plugins/trio-dnm2.c b/plugins/trio-dnm2.c index 4f0e0a8a9..bb6bd0254 100644 --- a/plugins/trio-dnm2.c +++ b/plugins/trio-dnm2.c @@ -722,10 +722,12 @@ static inline double phred2log(double phred) { return -phred/4.3429; } +#if 0 static inline double subtract_num_log(double a_num, double b_log) { return log(a_num - exp(b_log)); } +#endif static inline double subtract_log(double a_log, double b_log) { if ( b_log==-HUGE_VAL ) return a_log; From b2c18f7b95181697de5d5fb71f49f9dfa4858dfb Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 9 Mar 2021 16:54:16 +0000 Subject: [PATCH 75/81] Fix incorrect output when query combines -s/-S with -i/-e Incorrect fields were printed in the per-sample output when subset of samples was requested via -s/-S and the order of samples in the header was different from the requested -s/-S order. Fixes #1435. 
--- NEWS | 6 ++++++ convert.c | 6 +++--- test/query.82.out | 2 ++ test/query.83.out | 2 ++ test/query.filter.12.vcf | 7 +++++++ test/test.pl | 2 ++ 6 files changed, 22 insertions(+), 3 deletions(-) create mode 100644 test/query.82.out create mode 100644 test/query.83.out create mode 100644 test/query.filter.12.vcf diff --git a/NEWS b/NEWS index 7b0b49cb9..f6df7f381 100644 --- a/NEWS +++ b/NEWS @@ -134,6 +134,12 @@ Changes affecting specific commands: - New option `--old-rec-tag` to indicate the original variant +* bcftools query: + + - Incorrect fields were printed in the per-sample output when subset + of samples was requested via -s/-S and the order of samples in the + header was different from the requested -s/-S order (#1435) + * bcftools +prune: - New options --random-seed and --nsites-per-win-mode (#1050) diff --git a/convert.c b/convert.c index 9a0bc58b2..75f030657 100644 --- a/convert.c +++ b/convert.c @@ -1,6 +1,6 @@ /* convert.c -- functions for converting between VCF/BCF and related formats. - Copyright (C) 2013-2018 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: Petr Danecek @@ -1608,7 +1608,8 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str) for (js=0; jsnsamples; js++) { // Skip samples when filtering was requested - if ( convert->subset_samples && *convert->subset_samples && !(*convert->subset_samples)[js] ) continue; + int ks = convert->samples[js]; + if ( convert->subset_samples && *convert->subset_samples && !(*convert->subset_samples)[ks] ) continue; // Here comes a hack designed for TBCSQ. When running on large files, // such as 1000GP, there are too many empty fields in the output and @@ -1617,7 +1618,6 @@ int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str) // brackets here. This may be changed in future, time will show... 
size_t l_start = str->l; - int ks = convert->samples[js]; for (k=i; kfmt[k].type == T_MASK ) diff --git a/test/query.82.out b/test/query.82.out new file mode 100644 index 000000000..a778066fa --- /dev/null +++ b/test/query.82.out @@ -0,0 +1,2 @@ +1:1001 1=1/1 0=0/0 +1:1003 1=1/1 3=3/3 diff --git a/test/query.83.out b/test/query.83.out new file mode 100644 index 000000000..e4ce52764 --- /dev/null +++ b/test/query.83.out @@ -0,0 +1,2 @@ +1:1001 0=0/0 1=1/1 +1:1003 1=1/1 3=3/3 diff --git a/test/query.filter.12.vcf b/test/query.filter.12.vcf new file mode 100644 index 000000000..7ef820969 --- /dev/null +++ b/test/query.filter.12.vcf @@ -0,0 +1,7 @@ +##fileformat=VCFv4.2 +##FORMAT= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 0 1 2 3 +1 1001 . A C,G,T . . . GT 0/0 1/1 2/2 ./. +1 1002 . A C,G,T . . . GT ./0 ./1 2/2 ./. +1 1003 . A C,G,T . . . GT 0/. 1/1 ./2 3/3 diff --git a/test/test.pl b/test/test.pl index abafeb03d..0a81f064a 100755 --- a/test/test.pl +++ b/test/test.pl @@ -204,6 +204,8 @@ test_vcf_query($opts,in=>'query.filter.10',out=>'query.73.out',args=>q[-f'%POS %NUM_TAG\\n' -i'COUNT(INFO/NUM_TAG)=2']); test_vcf_query($opts,in=>'query.filter.10',out=>'query.74.out',args=>q[-f'%POS %STR_TAG\\n' -i'COUNT(INFO/STR_TAG)=2']); test_vcf_query($opts,in=>'query',out=>'query.75.out',args=>q[-f '%CHROM:%POS\\t%N_PASS(GT="alt" & GQ>110)\\t[\\t%GT]\\t[\\t%GQ]\n']); +test_vcf_query($opts,in=>'query.filter.12',out=>'query.82.out',args=>q[-f '%CHROM:%POS[\\t%SAMPLE=%GT]\\n' -e 'GT="mis"' -s 1,3,0]); +test_vcf_query($opts,in=>'query.filter.12',out=>'query.83.out',args=>q[-f '%CHROM:%POS[\\t%SAMPLE=%GT]\\n' -e 'GT="mis"' -s 0,1,3]); test_vcf_norm($opts,in=>'norm',out=>'norm.out',fai=>'norm',args=>'-cx'); test_vcf_norm($opts,in=>'norm.split',out=>'norm.split.out',args=>'-m-'); test_vcf_norm($opts,in=>'norm.split.2',out=>'norm.split.2.out',args=>'-m-'); From f1cb97fed6276dc1d6d6003416851626bd3c3c5d Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 9 Mar 2021 
17:01:14 +0000 Subject: [PATCH 76/81] Update documentation to mention GT="mis" as well. See also #1435 --- doc/bcftools.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/bcftools.txt b/doc/bcftools.txt index 4f6ac8260..f78339e14 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -3051,10 +3051,10 @@ about the distinction between "&&" vs "&" and "||" vs "|". DP=".", DP!=".", ALT="." -* missing genotypes can be matched regardless of phase and ploidy (".|.", "./.", ".") +* missing genotypes can be matched regardless of phase and ploidy (".|.", "./.", ".", "0|.") using these expressions - GT~"\.", GT!~"\." + GT="mis", GT~"\.", GT!~"\." * missing genotypes can be matched including the phase and ploidy (".|.", "./.", ".") using these expressions From 1f1e7667ffc1235f31a82e2093f037338acbb4e7 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 10 Mar 2021 16:48:14 +0000 Subject: [PATCH 77/81] The `csq --ncsq` option was handled incorrectly, not taking into account reserved BCF values. Truncated or incorrect output was produced by the %TBCSQ formatting expression in `bcftools query`. To account for the reserved values, the new default value is --ncsq 15. 
Resolves #1428 --- NEWS | 5 +++ convert.c | 6 +-- csq.c | 98 +++++++++++++++++++++++++------------------- doc/bcftools.txt | 7 ++-- test/csq.2.gff | 104 +++++++++++++++++++++++++++++++++++++++++++++++ test/csq.2.out | 60 +++++++++++++++++++++++++++ test/csq.2.vcf | 8 ++++ test/csq.3.out | 96 +++++++++++++++++++++++++++++++++++++++++++ test/test.pl | 11 ++++- 9 files changed, 346 insertions(+), 49 deletions(-) create mode 100644 test/csq.2.gff create mode 100644 test/csq.2.out create mode 100644 test/csq.2.vcf create mode 100644 test/csq.3.out diff --git a/NEWS b/NEWS index f6df7f381..e97844805 100644 --- a/NEWS +++ b/NEWS @@ -101,6 +101,11 @@ Changes affecting specific commands: - Fix a bug wich caused incorrect FORMAT/BCSQ formatting at sites with too many per-sample consequences + - Fix a bug which incorrectly handled the --ncsq parameter and could clash + with reserved BCF values, consequently producing truncated or even incorrect + output of the %TBCSQ formatting expression in `bcftools query`. 
To account + for the reserved values, the new default value is --ncsq 15 (#1428) + * bcftools +fill-tags: - MAF definition revised for multiallelic sites, the second most common diff --git a/convert.c b/convert.c index 75f030657..ea0cab111 100644 --- a/convert.c +++ b/convert.c @@ -506,7 +506,7 @@ static void process_tbcsq(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isam type_t val = x[j]; \ if ( !val ) continue; \ for (i=0; istr[(j*32+i)/2], &csq->hap1); kputc_(',', &csq->hap1); } \ + if ( val & (mask<str[(j*30+i)/2], &csq->hap1); kputc_(',', &csq->hap1); } \ } \ } \ if ( fmt->subscript<0 || fmt->subscript==2 ) \ @@ -516,7 +516,7 @@ static void process_tbcsq(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isam type_t val = x[j]; \ if ( !val ) continue; \ for (i=1; istr[(j*32+i)/2], &csq->hap2); kputc_(',', &csq->hap2); } \ + if ( val & (1<str[(j*30+i)/2], &csq->hap2); kputc_(',', &csq->hap2); } \ } \ } \ } @@ -524,7 +524,7 @@ static void process_tbcsq(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isam { case BCF_BT_INT8: BRANCH(uint8_t, 8); break; case BCF_BT_INT16: BRANCH(uint16_t,16); break; - case BCF_BT_INT32: BRANCH(uint32_t,32); break; + case BCF_BT_INT32: BRANCH(uint32_t,30); break; // 2 bytes unused to account for the reserved BCF values default: error("Unexpected type: %d\n", fmt->fmt->type); exit(1); break; } #undef BRANCH diff --git a/csq.c b/csq.c index d9e3bc4d9..8550a1199 100644 --- a/csq.c +++ b/csq.c @@ -590,8 +590,8 @@ typedef struct _args_t char *bcsq_tag; int argc, output_type; int phase, verbosity, local_csq, record_cmd_line; - int ncsq_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ - int ncsq_small_warned; + int ncsq2_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values) + int ncsq2_small_warned; int brief_predictions; int rid; // current chromosome @@ -1346,9 +1346,19 @@ void init_gff(args_t *args) 
khash_str2int_destroy_free(aux->ignored_biotypes); } +static inline int ncsq2_to_nfmt(int ncsq2) +{ + return 1 + (ncsq2 - 1) / 30; +} +static inline void icsq2_to_bit(int icsq2, int *ival, int *ibit) +{ + *ival = icsq2 / 30; + *ibit = icsq2 % 30; +} + void init_data(args_t *args) { - args->nfmt_bcsq = 1 + (args->ncsq_max - 1) / 32; + args->nfmt_bcsq = ncsq2_to_nfmt(args->ncsq2_max); if ( args->verbosity > 0 ) fprintf(stderr,"Parsing %s ...\n", args->gff_fname); init_gff(args); @@ -1419,10 +1429,10 @@ void init_data(args_t *args) void destroy_data(args_t *args) { - if ( args->ncsq_small_warned ) + if ( args->ncsq2_small_warned ) fprintf(stderr, "Note: Some samples had too many consequences to be represented in %d bytes. If you need to record them all,\n" - " the limit can be increased by running with `--ncsq %d`.\n",args->ncsq_max/8,1+args->ncsq_small_warned/2); + " the limit can be increased by running with `--ncsq %d`.\n",ncsq2_to_nfmt(args->ncsq2_max)/8,1+args->ncsq2_small_warned/2); regidx_destroy(args->idx_cds); regidx_destroy(args->idx_utr); @@ -3097,22 +3107,24 @@ static inline void hap_stage_vcf(args_t *args, tscript_t *tr, int ismpl, int iha { csq_t *csq = node->csq_list + i; vrec_t *vrec = csq->vrec; - int icsq = 2*csq->idx + ihap; - if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT + int icsq2 = 2*csq->idx + ihap; + if ( icsq2 >= args->ncsq2_max ) // more than ncsq2_max consequences, so can't fit it in FMT { - if ( args->verbosity && (!args->ncsq_small_warned || args->verbosity > 1) ) + if ( args->verbosity && (!args->ncsq2_small_warned || args->verbosity > 1) ) { fprintf(stderr, "Warning: Too many consequences for sample %s at %s:%"PRId64", keeping the first %d and skipping the rest.\n", args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,csq->idx); - if ( !args->ncsq_small_warned ) + if ( !args->ncsq2_small_warned ) fprintf(stderr," The limit can be increased by setting the 
--ncsq parameter. This warning is printed only once.\n"); } - if ( args->ncsq_small_warned < icsq ) args->ncsq_small_warned = icsq; + if ( args->ncsq2_small_warned < icsq2 ) args->ncsq2_small_warned = icsq2; break; } - if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32; - vrec->smpl[ismpl*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32); + int ival, ibit; + icsq2_to_bit(icsq2, &ival,&ibit); + if ( vrec->nfmt < 1 + ival ) vrec->nfmt = 1 + ival; + vrec->smpl[ismpl*args->nfmt_bcsq + ival] |= 1 << ibit; } } @@ -3741,24 +3753,26 @@ void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec) { if ( gt[j]==bcf_gt_missing || gt[j]==bcf_int32_vector_end || !bcf_gt_allele(gt[j]) ) continue; - int icsq = 2*csq->idx + j; - if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT + int icsq2 = 2*csq->idx + j; + if ( icsq2 >= args->ncsq2_max ) // more than ncsq_max consequences, so can't fit it in FMT { int ismpl = args->smpl->idx[i]; - if ( args->verbosity && (!args->ncsq_small_warned || args->verbosity > 1) ) + if ( args->verbosity && (!args->ncsq2_small_warned || args->verbosity > 1) ) { fprintf(stderr, "Warning: Too many consequences for sample %s at %s:%"PRId64", keeping the first %d and skipping the rest.\n", - args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,icsq+1); - if ( !args->ncsq_small_warned ) + args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),(int64_t) vrec->line->pos+1,icsq2+1); + if ( !args->ncsq2_small_warned ) fprintf(stderr," The limit can be increased by setting the --ncsq parameter. 
This warning is printed only once.\n"); - args->ncsq_small_warned = 1; + args->ncsq2_small_warned = 1; } - if ( args->ncsq_small_warned < icsq ) args->ncsq_small_warned = icsq; + if ( args->ncsq2_small_warned < icsq2 ) args->ncsq2_small_warned = icsq2; break; } - if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32; - vrec->smpl[i*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32); + int ival, ibit; + icsq2_to_bit(icsq2, &ival,&ibit); + if ( vrec->nfmt < 1 + ival ) vrec->nfmt = 1 + ival; + vrec->smpl[i*args->nfmt_bcsq + ival] |= 1 << ibit; } } } @@ -4057,39 +4071,39 @@ static const char *usage(void) return "\n" "About: Haplotype-aware consequence caller.\n" - "Usage: bcftools csq [options] in.vcf\n" + "Usage: bcftools csq [OPTIONS] in.vcf\n" "\n" "Required options:\n" - " -f, --fasta-ref reference file in fasta format\n" - " -g, --gff-annot gff3 annotation file\n" + " -f, --fasta-ref FILE reference file in fasta format\n" + " -g, --gff-annot FILE gff3 annotation file\n" "\n" "CSQ options:\n" " -b, --brief-predictions annotate with abbreviated protein-changing predictions\n" - " -c, --custom-tag use this tag instead of the default BCSQ\n" + " -c, --custom-tag STRING use this tag instead of the default BCSQ\n" " -l, --local-csq localized predictions, consider only one VCF record at a time\n" - " -n, --ncsq maximum number of per-haplotype consequences to consider for each site [16]\n" - " -p, --phase how to handle unphased heterozygous genotypes: [r]\n" + " -n, --ncsq INT maximum number of per-haplotype consequences to consider for each site [15]\n" + " -p, --phase a|m|r|R|s how to handle unphased heterozygous genotypes: [r]\n" " a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n" " m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n" " r: require phased GTs, throw an error on unphased het GTs\n" " R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n" " s: skip unphased hets\n" "Options:\n" - " -e, --exclude 
exclude sites for which the expression is true\n" + " -e, --exclude EXPR exclude sites for which the expression is true\n" " --force run even if some sanity checks fail\n" - " -i, --include select sites for which the expression is true\n" + " -i, --include EXPR select sites for which the expression is true\n" " --no-version do not append version and command line to the header\n" - " -o, --output write output to a file [standard output]\n" - " -O, --output-type b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n" + " -o, --output FILE write output to a file [standard output]\n" + " -O, --output-type b|u|z|v|t b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n" " v: uncompressed VCF, t: plain tab-delimited text output [v]\n" - " -r, --regions restrict to comma-separated list of regions\n" - " -R, --regions-file restrict to regions listed in a file\n" - " -s, --samples <-|list> samples to include or \"-\" to apply all variants and ignore samples\n" - " -S, --samples-file samples to include\n" - " -t, --targets similar to -r but streams rather than index-jumps\n" - " -T, --targets-file similar to -R but streams rather than index-jumps\n" - " --threads use multithreading with worker threads [0]\n" - " -v, --verbose verbosity level 0-2 [1]\n" + " -r, --regions REGION restrict to comma-separated list of regions\n" + " -R, --regions-file FILE restrict to regions listed in a file\n" + " -s, --samples -|LIST samples to include or \"-\" to apply all variants and ignore samples\n" + " -S, --samples-file FILE samples to include\n" + " -t, --targets REGION similar to -r but streams rather than index-jumps\n" + " -T, --targets-file FILE similar to -R but streams rather than index-jumps\n" + " --threads INT use multithreading with worker threads [0]\n" + " -v, --verbose INT verbosity level 0-2 [1]\n" "\n" "Example:\n" " bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n" @@ -4106,7 +4120,7 @@ int main_csq(int argc, char *argv[]) args->argc = 
argc; args->argv = argv; args->output_type = FT_VCF; args->bcsq_tag = "BCSQ"; - args->ncsq_max = 2*16; + args->ncsq2_max = 2*(16-1); // 1 bit is reserved for BCF missing values args->verbosity = 1; args->record_cmd_line = 1; @@ -4171,8 +4185,8 @@ int main_csq(int argc, char *argv[]) case 'f': args->fa_fname = optarg; break; case 'g': args->gff_fname = optarg; break; case 'n': - args->ncsq_max = 2 * atoi(optarg); - if ( args->ncsq_max <=0 ) error("Expected positive integer with -n, got %s\n", optarg); + args->ncsq2_max = 2 * atoi(optarg); + if ( args->ncsq2_max <= 0 ) error("Expected positive integer with -n, got %s\n", optarg); break; case 'o': args->output_fname = optarg; break; case 'O': diff --git a/doc/bcftools.txt b/doc/bcftools.txt index f78339e14..899eb5447 100644 --- a/doc/bcftools.txt +++ b/doc/bcftools.txt @@ -1119,7 +1119,7 @@ with the *-c* option). The latter is a bitmask of indexes to INFO/BCSQ, with interleaved haplotypes. See the usage examples below for using the %TBCSQ converter in *query* for extracting a more human readable form from this bitmask. The construction of the bitmask limits the number of consequences -that can be referenced per sample in the FORMAT/BCSQ tags. By default this is 16, but +that can be referenced per sample in the FORMAT/BCSQ tags. By default this is 15, but if more are required, see the *--ncsq* option. The program requires on input a VCF/BCF file, the reference genome in fasta @@ -1204,8 +1204,9 @@ output VCF and are ignored for the prediction analysis. *-n, --ncsq* 'INT':: maximum number of per-haplotype consequences to consider for each site. The INFO/BCSQ column includes all consequences, but only the first 'INT' will be referenced by the FORMAT/BCSQ fields. - The default value is 16 which corresponds to one integer per diploid - sample. Note that increasing the value leads to increased memory and is rarely necessary. 
+ The default value is 15 which corresponds to one 32-bit integer per diploid + sample, after accounting for values reserved by the BCF specification. + Note that increasing the value leads to increased size of the output BCF. *--no-version*:: see *<>* diff --git a/test/csq.2.gff b/test/csq.2.gff new file mode 100644 index 000000000..0b1811217 --- /dev/null +++ b/test/csq.2.gff @@ -0,0 +1,104 @@ +##gff-version 3 +#! This file shows which fields are used and required by `bcftools +csq`. It is a trimmed version +#! of the GFF3 format, see an example of the full format here +#! ftp://ftp.ensembl.org/pub/grch37/release-84/gff3/homo_sapiens/ +#! +### +1 . gene 90 110 . + . ID=gene:ENSG001;Name=XYZ;biotype=protein_coding +1 . transcript 99 110 . + . ID=transcript:ENST001;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST001 +1 . transcript 99 110 . + . ID=transcript:ENST002;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST002 +1 . transcript 99 110 . + . ID=transcript:ENST003;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST003 +1 . transcript 99 110 . + . ID=transcript:ENST004;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST004 +1 . transcript 99 110 . + . ID=transcript:ENST005;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST005 +1 . transcript 99 110 . + . ID=transcript:ENST006;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST006 +1 . transcript 99 110 . + . ID=transcript:ENST007;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST007 +1 . transcript 99 110 . + . ID=transcript:ENST008;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST008 +1 . transcript 99 110 . + . ID=transcript:ENST009;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . 
+ 1 Parent=transcript:ENST009 +1 . transcript 99 110 . + . ID=transcript:ENST010;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST010 +1 . transcript 99 110 . + . ID=transcript:ENST011;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST011 +1 . transcript 99 110 . + . ID=transcript:ENST012;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST012 +1 . transcript 99 110 . + . ID=transcript:ENST013;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST013 +1 . transcript 99 110 . + . ID=transcript:ENST014;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST014 +1 . transcript 99 110 . + . ID=transcript:ENST015;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST015 +1 . transcript 99 110 . + . ID=transcript:ENST016;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST016 +1 . transcript 99 110 . + . ID=transcript:ENST017;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST017 +1 . transcript 99 110 . + . ID=transcript:ENST018;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST018 +1 . transcript 99 110 . + . ID=transcript:ENST019;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST019 +1 . transcript 99 110 . + . ID=transcript:ENST020;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST020 +1 . transcript 99 110 . + . ID=transcript:ENST021;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST021 +1 . transcript 99 110 . + . ID=transcript:ENST022;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST022 +1 . transcript 99 110 . + . ID=transcript:ENST023;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . 
+ 1 Parent=transcript:ENST023 +1 . transcript 99 110 . + . ID=transcript:ENST024;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST024 +1 . transcript 99 110 . + . ID=transcript:ENST025;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST025 +1 . transcript 99 110 . + . ID=transcript:ENST026;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST026 +1 . transcript 99 110 . + . ID=transcript:ENST027;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST027 +1 . transcript 99 110 . + . ID=transcript:ENST028;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST028 +1 . transcript 99 110 . + . ID=transcript:ENST029;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST029 +1 . transcript 99 110 . + . ID=transcript:ENST030;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST030 +1 . transcript 99 110 . + . ID=transcript:ENST031;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST031 +1 . transcript 99 110 . + . ID=transcript:ENST032;Parent=gene:ENSG001;biotype=protein_coding +1 . CDS 99 110 . + 1 Parent=transcript:ENST032 +2 . gene 90 110 . + . ID=gene:ENSG101;Name=XYZ;biotype=protein_coding +2 . transcript 99 110 . + . ID=transcript:ENST101;Parent=gene:ENSG101;biotype=protein_coding +2 . CDS 99 110 . + 1 Parent=transcript:ENST101 +2 . transcript 99 110 . + . ID=transcript:ENST102;Parent=gene:ENSG101;biotype=protein_coding +2 . CDS 99 110 . + 1 Parent=transcript:ENST102 +2 . transcript 99 110 . + . ID=transcript:ENST103;Parent=gene:ENSG101;biotype=protein_coding +2 . CDS 99 110 . + 1 Parent=transcript:ENST103 +2 . transcript 99 110 . + . ID=transcript:ENST104;Parent=gene:ENSG101;biotype=protein_coding +2 . CDS 99 110 . + 1 Parent=transcript:ENST104 +2 . transcript 99 110 . + . 
ID=transcript:ENST105;Parent=gene:ENSG101;biotype=protein_coding +2 . CDS 99 110 . + 1 Parent=transcript:ENST105 +2 . transcript 99 110 . + . ID=transcript:ENST106;Parent=gene:ENSG101;biotype=protein_coding +2 . CDS 99 110 . + 1 Parent=transcript:ENST106 +2 . transcript 99 110 . + . ID=transcript:ENST107;Parent=gene:ENSG101;biotype=protein_coding +2 . CDS 99 110 . + 1 Parent=transcript:ENST107 +2 . transcript 99 110 . + . ID=transcript:ENST108;Parent=gene:ENSG101;biotype=protein_coding +2 . CDS 99 110 . + 1 Parent=transcript:ENST108 +2 . transcript 99 110 . + . ID=transcript:ENST109;Parent=gene:ENSG101;biotype=protein_coding +2 . CDS 99 110 . + 1 Parent=transcript:ENST109 +2 . transcript 99 110 . + . ID=transcript:ENST110;Parent=gene:ENSG101;biotype=protein_coding +2 . CDS 99 110 . + 1 Parent=transcript:ENST110 +2 . transcript 99 110 . + . ID=transcript:ENST111;Parent=gene:ENSG101;biotype=protein_coding +2 . CDS 99 110 . + 1 Parent=transcript:ENST111 +2 . transcript 99 110 . + . ID=transcript:ENST112;Parent=gene:ENSG101;biotype=protein_coding +2 . CDS 99 110 . + 1 Parent=transcript:ENST112 +2 . transcript 99 110 . + . ID=transcript:ENST113;Parent=gene:ENSG101;biotype=protein_coding +2 . CDS 99 110 . + 1 Parent=transcript:ENST113 +2 . transcript 99 110 . + . ID=transcript:ENST114;Parent=gene:ENSG101;biotype=protein_coding +2 . CDS 99 110 . + 1 Parent=transcript:ENST114 +2 . transcript 99 110 . + . ID=transcript:ENST115;Parent=gene:ENSG101;biotype=protein_coding +2 . CDS 99 110 . + 1 Parent=transcript:ENST115 +2 . transcript 99 110 . + . ID=transcript:ENST116;Parent=gene:ENSG101;biotype=protein_coding +2 . CDS 99 110 . 
+ 1 Parent=transcript:ENST116 diff --git a/test/csq.2.out b/test/csq.2.out new file mode 100644 index 000000000..1505d3c80 --- /dev/null +++ b/test/csq.2.out @@ -0,0 +1,60 @@ +missense|XYZ|ENST001|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST001|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST019|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST019|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST020|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST020|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST021|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST021|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST022|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST022|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST023|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST023|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST024|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST024|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST025|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST025|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST026|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST026|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST027|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST027|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST028|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST028|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST029|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST029|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST030|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST030|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST031|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST031|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST032|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST032|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST101|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST101|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST103|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST103|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST104|protein_coding|+|2V>2I|103G>A 
+missense|XYZ|ENST104|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST105|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST105|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST106|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST106|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST107|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST107|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST108|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST108|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST109|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST109|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST110|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST110|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST111|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST111|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST112|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST112|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST113|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST113|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST114|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST114|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST115|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST115|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST116|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST116|protein_coding|+|2V>2I|103G>A diff --git a/test/csq.2.vcf b/test/csq.2.vcf new file mode 100644 index 000000000..974f6810f --- /dev/null +++ b/test/csq.2.vcf @@ -0,0 +1,8 @@ +##fileformat=VCFv4.2 +##reference=file://some/path/human_g1k_v37.fasta +##contig= +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT smpl +2 103 . G A 1 . . GT 1|1 +1 103 . G A 1 . . 
GT 1|1 diff --git a/test/csq.3.out b/test/csq.3.out new file mode 100644 index 000000000..6f341dcd4 --- /dev/null +++ b/test/csq.3.out @@ -0,0 +1,96 @@ +missense|XYZ|ENST001|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST001|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST002|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST002|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST003|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST003|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST004|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST004|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST005|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST005|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST006|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST006|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST007|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST007|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST008|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST008|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST009|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST009|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST010|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST010|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST011|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST011|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST012|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST012|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST013|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST013|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST014|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST014|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST015|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST015|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST016|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST016|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST017|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST017|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST018|protein_coding|+|2V>2I|103G>A 
+missense|XYZ|ENST018|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST019|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST019|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST020|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST020|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST021|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST021|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST022|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST022|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST023|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST023|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST024|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST024|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST025|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST025|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST026|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST026|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST027|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST027|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST028|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST028|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST029|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST029|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST030|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST030|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST031|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST031|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST032|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST032|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST101|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST101|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST102|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST102|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST103|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST103|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST104|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST104|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST105|protein_coding|+|2V>2I|103G>A 
+missense|XYZ|ENST105|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST106|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST106|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST107|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST107|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST108|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST108|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST109|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST109|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST110|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST110|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST111|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST111|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST112|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST112|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST113|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST113|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST114|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST114|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST115|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST115|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST116|protein_coding|+|2V>2I|103G>A +missense|XYZ|ENST116|protein_coding|+|2V>2I|103G>A diff --git a/test/test.pl b/test/test.pl index 0a81f064a..40e8579b5 100755 --- a/test/test.pl +++ b/test/test.pl @@ -666,6 +666,8 @@ test_mpileup($opts,in=>[qw(indel-AD.1)],out=>'mpileup/indel-AD.1.out',ref=>'indel-AD.1.fa',args=>q[-a AD]); test_mpileup($opts,in=>[qw(mpileup-SCR)],out=>'mpileup/mpileup-SCR.out',ref=>'mpileup-SCR.fa',args=>q[-a INFO/SCR,FMT/SCR]); test_csq($opts,in=>'csq',out=>'csq.1.out',cmd=>'-f {PATH}/csq.fa -g {PATH}/csq.gff3'); +test_csq($opts,in=>'csq.2',out=>'csq.2.out',cmd=>'-f {PATH}/csq.fa -g {PATH}/csq.2.gff',tbcsq=>1); +test_csq($opts,in=>'csq.2',out=>'csq.3.out',cmd=>'-f {PATH}/csq.fa -g {PATH}/csq.2.gff --ncsq 64',tbcsq=>1); test_csq_real($opts,in=>'csq'); test_roh($opts,in=>'roh.1',out=>'roh.1.1.out',args=>q[-Or -G30 --AF-dflt 0.4]); 
test_roh($opts,in=>'roh.1',out=>'roh.1.1.out',args=>q[-Or -G30 --AF-file {PATH}/roh.1.tab.gz]); @@ -1599,7 +1601,14 @@ sub test_csq { my ($opts,%args) = @_; $args{cmd} =~ s/{PATH}/$$opts{path}/g; - test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools csq $args{cmd} $$opts{path}/$args{in}.vcf | $$opts{bin}/test/csq/sort-csq | $$opts{bin}/bcftools query -f'%POS\\t%REF\\t%ALT\\t%EXP\\n%POS\\t%REF\\t%ALT\\t%BCSQ\\n\\n'"); + if ( $args{tbcsq} ) + { + test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools csq $args{cmd} $$opts{path}/$args{in}.vcf | $$opts{bin}/bcftools query -f'[%TBCSQ\\n]' | sed 's/\\s\\s*/\\n/g; s/,/\\n/g' | sort"); + } + else + { + test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools csq $args{cmd} $$opts{path}/$args{in}.vcf | $$opts{bin}/test/csq/sort-csq | $$opts{bin}/bcftools query -f'%POS\\t%REF\\t%ALT\\t%EXP\\n%POS\\t%REF\\t%ALT\\t%BCSQ\\n\\n'"); + } } sub test_csq_real { From 6db932398b9a061a9ce3dd7d3c613baa2aa63b76 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 10 Mar 2021 19:58:06 +0000 Subject: [PATCH 78/81] Show full diff of failed tests to help diagnose problems on automated CI platforms --- test/test.pl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/test.pl b/test/test.pl index 40e8579b5..85945a894 100755 --- a/test/test.pl +++ b/test/test.pl @@ -909,8 +909,7 @@ sub test_cmd print $fh $exp; close($fh); } - my @diff = `diff $$opts{path}/$args{out} $$opts{path}/$args{out}.new | head -20`; - if ( @diff==20 ) { push @diff,"etc.\n"; } + my @diff = `diff $$opts{path}/$args{out} $$opts{path}/$args{out}.new`; for (my $i=0; $i<@diff; $i++) { $diff[$i] = "\t\t\t".$diff[$i]; } chomp($diff[-1]); failed($opts,$test,"The outputs differ:\n\t\t$$opts{path}/$args{out}\n\t\t$$opts{path}/$args{out}.new$err\n".join('',@diff)); From d10a8ffb9283ad94fcd411728245943e895413e2 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 10 Mar 2021 20:14:43 +0000 Subject: [PATCH 79/81] Replace sed one-liner with perl as substitution with newline works 
differently on sed implementation on MAC OS --- test/test.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test.pl b/test/test.pl index 85945a894..d50a38c9b 100755 --- a/test/test.pl +++ b/test/test.pl @@ -1602,7 +1602,7 @@ sub test_csq $args{cmd} =~ s/{PATH}/$$opts{path}/g; if ( $args{tbcsq} ) { - test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools csq $args{cmd} $$opts{path}/$args{in}.vcf | $$opts{bin}/bcftools query -f'[%TBCSQ\\n]' | sed 's/\\s\\s*/\\n/g; s/,/\\n/g' | sort"); + test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools csq $args{cmd} $$opts{path}/$args{in}.vcf | $$opts{bin}/bcftools query -f'[%TBCSQ\\n]' | perl -pe 's/[\\t,]/\\n/g' | sort"); } else { From 85d1bbedba4f38089eb1242bab6887cd3de02e00 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 11 Mar 2021 10:18:29 +0000 Subject: [PATCH 80/81] Tests for #1441, fail until https://github.com/samtools/htslib/pull/1254 is merged --- test/test.pl | 2 ++ test/view-t.1.out | 2 ++ test/view-t.2.out | 1 + test/view-t.vcf | 9 +++++++++ 4 files changed, 14 insertions(+) create mode 100644 test/view-t.1.out create mode 100644 test/view-t.2.out create mode 100644 test/view-t.vcf diff --git a/test/test.pl b/test/test.pl index d50a38c9b..7fdd2f96c 100755 --- a/test/test.pl +++ b/test/test.pl @@ -268,6 +268,8 @@ test_vcf_view($opts,in=>'idx.2',out=>'idx.2.out',args=>q[-H -R {PATH}/idx.2.bed]); test_vcf_view($opts,in=>'idx.3',out=>'idx.3.out',args=>q[-H -R {PATH}/idx.3.bed]); test_vcf_view($opts,in=>'idx.4',out=>'idx.4.out',args=>q[-H -R {PATH}/idx.4.bed]); +test_vcf_view($opts,in=>'view-t',out=>'view-t.1.out',args=>'-Ht 2',reg=>''); +test_vcf_view($opts,in=>'view-t',out=>'view-t.2.out',args=>'-Ht 3',reg=>''); test_vcf_64bit($opts,in=>'view64bit.1',out=>'view64bit.1.out',do_bcf=>1); test_vcf_64bit($opts,in=>'view64bit.2',out=>'view64bit.2.out',do_bcf=>1); test_vcf_64bit($opts,in=>'view64bit.3',out=>'view64bit.3.out'); # large coordinates don't work with BCF diff --git a/test/view-t.1.out 
b/test/view-t.1.out new file mode 100644 index 000000000..1159d5328 --- /dev/null +++ b/test/view-t.1.out @@ -0,0 +1,2 @@ +2 1 . A C . . . +2 2 . A C . . . diff --git a/test/view-t.2.out b/test/view-t.2.out new file mode 100644 index 000000000..1ae77206c --- /dev/null +++ b/test/view-t.2.out @@ -0,0 +1 @@ +3 2 . A C . . . diff --git a/test/view-t.vcf b/test/view-t.vcf new file mode 100644 index 000000000..25e46137e --- /dev/null +++ b/test/view-t.vcf @@ -0,0 +1,9 @@ +##fileformat=VCFv4.2 +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 1 . A C . . . +2 1 . A C . . . +2 2 . A C . . . +3 2 . A C . . . From 5105724d8b05a82094cbc0f461dba2539f9a4a50 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 11 Mar 2021 12:20:41 +0000 Subject: [PATCH 81/81] Allow to specify the default ploidy explicitly via `--ploidy 2`. Resolves #1436 --- vcfcall.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vcfcall.c b/vcfcall.c index 03cd918bd..e2aab3f95 100644 --- a/vcfcall.c +++ b/vcfcall.c @@ -190,6 +190,11 @@ static ploidy_predef_t ploidy_predefs[] = .ploidy = "* * * * 1\n" }, + { .alias = "2", + .about = "Treat all samples as diploid", + .ploidy = + "* * * * 2\n" + }, { .alias = NULL, .about = NULL, @@ -874,7 +879,7 @@ static void usage(args_t *args) fprintf(stderr, " --no-version Do not append version and command line to the header\n"); fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n"); fprintf(stderr, " -O, --output-type b|u|z|v Output type: 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v]\n"); - fprintf(stderr, " --ploidy ASSEMBLY[?] Predefined ploidy, 'list' to print available settings, append '?' for details\n"); + fprintf(stderr, " --ploidy ASSEMBLY[?] Predefined ploidy, 'list' to print available settings, append '?' 
for details [2]\n"); fprintf(stderr, " --ploidy-file FILE Space/tab-delimited list of CHROM,FROM,TO,SEX,PLOIDY\n"); fprintf(stderr, " -r, --regions REGION Restrict to comma-separated list of regions\n"); fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n");