From ed4c2815ee767698c187d8bf81736ce3e39e87b7 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 9 Dec 2019 14:05:23 +0000 Subject: [PATCH 1/3] Fixes BCF in-memory decoding with 64-bit integers. Any 64-bit INFO field that wasn't the last in the list would cause subsequent fields to be decoded incorrectly. This commit fixes that, plus updates the tests accordingly so the bug could be triggered. Fixes the first part of #999 (test1.vcf), but doesn't fix the second part (BCF output silently being broken). Fixes samtools/bcftools#1123 --- test/longrefs/index.expected2.vcf | 2 +- test/longrefs/index.vcf | 2 +- vcf.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/longrefs/index.expected2.vcf b/test/longrefs/index.expected2.vcf index 4898e2563..fed110989 100644 --- a/test/longrefs/index.expected2.vcf +++ b/test/longrefs/index.expected2.vcf @@ -1 +1 @@ -1 10010000110 . G 0 . SVTYPE=DEL;SVLEN=-890;END=10010001000 PL 0,1,45 +1 10010000110 . G 0 . SVTYPE=DEL;SVLEN=-890;END=10010001000;QS=1,0 PL 0,1,45 diff --git a/test/longrefs/index.vcf b/test/longrefs/index.vcf index 54c8e03d3..e861ed1a8 100644 --- a/test/longrefs/index.vcf +++ b/test/longrefs/index.vcf @@ -213,4 +213,4 @@ 1 10010000107 . G <*> 0 . DP=1;I16=0,1,0,0,33,1089,0,0,29,841,0,0,2,4,0,0;QS=1,0;MQ0F=0 PL 0,3,29 1 10010000108 . C <*> 0 . DP=1;I16=0,1,0,0,32,1024,0,0,29,841,0,0,1,1,0,0;QS=1,0;MQ0F=0 PL 0,3,29 1 10010000109 . A <*> 0 . DP=1;I16=0,1,0,0,35,1225,0,0,29,841,0,0,0,0,0,0;QS=1,0;MQ0F=0 PL 0,3,29 -1 10010000110 . G 0 . SVTYPE=DEL;SVLEN=-890;END=10010001000 PL 0,1,45 +1 10010000110 . G 0 . SVTYPE=DEL;SVLEN=-890;END=10010001000;QS=1,0 PL 0,1,45 diff --git a/vcf.c b/vcf.c index 059485948..b635c78c0 100644 --- a/vcf.c +++ b/vcf.c @@ -59,7 +59,7 @@ HTSLIB_EXPORT uint32_t bcf_float_vector_end = 0x7F800002; HTSLIB_EXPORT -uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; +uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 }; From 66c66f23caf404b8bb50295389abbe8a0dd9f682 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 11 Dec 2019 12:10:12 +0000 Subject: [PATCH 2/3] Additional checks to prevent writing invalid BCF containing 64-bit data --- vcf.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/vcf.c b/vcf.c index b635c78c0..254d4f5b0 100644 --- a/vcf.c +++ b/vcf.c @@ -47,6 +47,10 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/kstring.h" #include "htslib/sam.h" +// A flag for bcf1_t->unpacked to indicate this contains 64-bit data. +// Prevents us accidentally attempting to write invalid BCF records. +#define BCF_IS_64BIT (1<<30) + #include "htslib/khash.h" KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t) typedef khash_t(vdict) vdict_t; @@ -1728,6 +1732,11 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v) } bcf1_sync(v); // check if the BCF record was modified + if (v->unpacked & BCF_IS_64BIT) { + hts_log_error("Data contains 64-bit values not representable in BCF. Please use VCF instead"); + return -1; + } + BGZF *fp = hfp->fp.bgzf; union { uint32_t i; @@ -2526,6 +2535,8 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) } else { v->pos -= 1; } + if (v->pos >= INT32_MAX) + v->unpacked |= BCF_IS_64BIT; } else if (i == 2) { // ID if (strcmp(p, ".")) bcf_enc_vchar(str, q - p, p); else bcf_enc_size(str, 0, BCF_BT_CHAR); @@ -2691,6 +2702,8 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) for (t = te; *t && *t != ','; t++); } if (n_val == 1) { + if (val_a[0] != v64 && v64 != bcf_int64_missing) + v->unpacked |= BCF_IS_64BIT; bcf_enc_long1(str, v64); } else { bcf_enc_vint(str, n_val, val_a, -1); From 92297b28bec0e32f3b7d0c559d3187fdf73fef85 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 12 Dec 2019 10:08:23 +0000 Subject: [PATCH 3/3] Limit 64-bit info to END tag (as suggested by Rob Davies). Also adds check for rlen (END) field when trying to output to BCF. Note: this will emit a warning per out-of-range value, but it is expected that the frequency of these is very low. If not, consider this a good icentive to fix your pipelines! --- vcf.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/vcf.c b/vcf.c index 254d4f5b0..0efd99c47 100644 --- a/vcf.c +++ b/vcf.c @@ -2690,7 +2690,16 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) val_a[0] = bcf_int32_missing; v64 = bcf_int64_missing; } else { - val_a[0] = v64 >= BCF_MIN_BT_INT32 && v64 <= BCF_MAX_BT_INT32 ? v64 : bcf_int32_missing; + if (v64 >= BCF_MIN_BT_INT32 && v64 <= BCF_MAX_BT_INT32) { + val_a[0] = v64; + } else { + val_a[0] = bcf_int32_missing; + // Only permit 64-bit INFO values for END tag + if (strcmp(key, "END") != 0) { + hts_log_warning("INFO key %s has out of range value %"PRId64, key, v64); + v64 = bcf_int64_missing; + } + } } for (t = te; *t && *t != ','; t++); if (*t == ',') ++t; @@ -2708,8 +2717,11 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) } else { bcf_enc_vint(str, n_val, val_a, -1); } - if (strcmp(key, "END") == 0) + if (strcmp(key, "END") == 0) { v->rlen = v64 - v->pos; + if (v->rlen >= INT32_MAX) + v->unpacked |= BCF_IS_64BIT; + } } else if ((y>>4&0xf) == BCF_HT_REAL) { float *val_f = (float *)val_a; for (i = 0, t = val; i < n_val; ++i, ++t)