From ed4c2815ee767698c187d8bf81736ce3e39e87b7 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 9 Dec 2019 14:05:23 +0000 Subject: [PATCH 1/9] Fixes BCF in-memory decoding with 64-bit integers. Any 64-bit INFO field that wasn't the last in the list would cause subsequent fields to be decoded incorrectly. This commit fixes that, plus updates the tests accordingly so the bug could be triggered. Fixes the first part of #999 (test1.vcf), but doesn't fix the second part (BCF output silently being broken). Fixes samtools/bcftools#1123 --- test/longrefs/index.expected2.vcf | 2 +- test/longrefs/index.vcf | 2 +- vcf.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/longrefs/index.expected2.vcf b/test/longrefs/index.expected2.vcf index 4898e2563..fed110989 100644 --- a/test/longrefs/index.expected2.vcf +++ b/test/longrefs/index.expected2.vcf @@ -1 +1 @@ -1 10010000110 . G 0 . SVTYPE=DEL;SVLEN=-890;END=10010001000 PL 0,1,45 +1 10010000110 . G 0 . SVTYPE=DEL;SVLEN=-890;END=10010001000;QS=1,0 PL 0,1,45 diff --git a/test/longrefs/index.vcf b/test/longrefs/index.vcf index 54c8e03d3..e861ed1a8 100644 --- a/test/longrefs/index.vcf +++ b/test/longrefs/index.vcf @@ -213,4 +213,4 @@ 1 10010000107 . G <*> 0 . DP=1;I16=0,1,0,0,33,1089,0,0,29,841,0,0,2,4,0,0;QS=1,0;MQ0F=0 PL 0,3,29 1 10010000108 . C <*> 0 . DP=1;I16=0,1,0,0,32,1024,0,0,29,841,0,0,1,1,0,0;QS=1,0;MQ0F=0 PL 0,3,29 1 10010000109 . A <*> 0 . DP=1;I16=0,1,0,0,35,1225,0,0,29,841,0,0,0,0,0,0;QS=1,0;MQ0F=0 PL 0,3,29 -1 10010000110 . G 0 . SVTYPE=DEL;SVLEN=-890;END=10010001000 PL 0,1,45 +1 10010000110 . G 0 . SVTYPE=DEL;SVLEN=-890;END=10010001000;QS=1,0 PL 0,1,45 diff --git a/vcf.c b/vcf.c index 059485948..b635c78c0 100644 --- a/vcf.c +++ b/vcf.c @@ -59,7 +59,7 @@ HTSLIB_EXPORT uint32_t bcf_float_vector_end = 0x7F800002; HTSLIB_EXPORT -uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; +uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 }; From 4e1d32a2956653de9e3369f0507bda8daedd4866 Mon Sep 17 00:00:00 2001 From: pd3 Date: Tue, 10 Dec 2019 17:59:37 +0100 Subject: [PATCH 2/9] Make 64-bit optional and don't compile by default. This commit addresses several problems introduced by the recently added support of 64-bit coordinates: - 64-bit positions were silently modified on BCF output - any erroneous out-of-range INFO value resulted in unparseable BCF output (in contrast to the previous behavior of a silent under/overflow) - conversion between VCF and BCF is no longer possible Large genomes is a niche area and can be easily worked around by splitting the largest chromosome into two. This commit makes the compilation of 64-bit values optional, favoring the interroperability of BCF with VCF, a feature heavily relied on in many pipelines. The support of large genomes in this commit is limited to POS and INFO tags defined as Number=1 (motivated by INFO/END), all other extreme values are replaced with a missing value ("."). This changes the original ad-hoc approach which supported first field of any INFO tag array. The default 32-bit mode adds checks to prevent under/overflow of INFO and FORMAT integer values and prints a warning when extreme values are replaced with a missing value. An additional check is added to prevent overflow of 64-bit coordinates on BCF output. Note that the 64-bit support is not compiled in by default because: - it breaks VCF/BCF specification - it works only for VCF but cannot produce a functional BCF - there is currently no front-end capable of indexing VCFs with 64bit coordinates (n_lvls is not accessible via `bcftools index`) Note that a full 64-bit support for BCF would require significantly more work and large changes of the codebase, API, and the BCF specification, therefore it is not planned at the moment. The limited POS and INFO/END support for VCF can be compiled in by editing htslib/vcf.c and setting #define ALLOW_INT64 1 --- htslib/vcf.h | 2 +- vcf.c | 117 +++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 100 insertions(+), 19 deletions(-) diff --git a/htslib/vcf.h b/htslib/vcf.h index caedfebf4..15663d364 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -1411,7 +1411,7 @@ static inline int64_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q) *q = (uint8_t*)p + 4; return le_to_i32(p); } else if (type == BCF_BT_INT64) { - *q = (uint8_t*)p + 4; + *q = (uint8_t*)p + 8; return le_to_i64(p); } else { // Invalid type. return 0; diff --git a/vcf.c b/vcf.c index b635c78c0..6cb9d352c 100644 --- a/vcf.c +++ b/vcf.c @@ -63,6 +63,22 @@ uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 }; +/* + Partial support for 64-bit POS and Number=1 INFO tags. + Notes: + - the support for 64-bit values is motivated by POS and INFO/END for large genomes + - the use of 64-bit values does not conform to the specification + - cannot output 64-bit BCF and if it does, it is not compatible with anything + - experimental, use at your risk +*/ +#define VCF_ALLOW_INT64 0 + +#if VCF_ALLOW_INT64 + #define BCF_MAX_BT_INT64 (0x7fffffffffffffff) /* INT64_MAX, for internal use only */ + #define BCF_MIN_BT_INT64 -9223372036854775800 /* INT64_MIN + 8, for internal use only */ +#endif + + static const char *dump_char(char *buffer, char c) { switch (c) { @@ -1251,6 +1267,12 @@ static int bcf_dec_typed_int1_safe(uint8_t *p, uint8_t *end, uint8_t **q, if (end - p < 4) return -1; *q = p + 4; *val = le_to_i32(p); +#if VCF_ALLOW_INT64 + } else if (t == BCF_BT_INT64) { + if (end - p < 4) return -1; + *q = p + 4; + *val = le_to_i32(p); +#endif } else { return -1; } @@ -1290,6 +1312,9 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) { uint32_t i, reports; const uint32_t is_integer = ((1 << BCF_BT_INT8) | (1 << BCF_BT_INT16) | +#if VCF_ALLOW_INT64 + (1 << BCF_BT_INT64) | +#endif (1 << BCF_BT_INT32)); const uint32_t is_valid_type = (is_integer | (1 << BCF_BT_NULL) | @@ -1728,6 +1753,12 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v) } bcf1_sync(v); // check if the BCF record was modified + if ( v->pos > BCF_MAX_BT_INT32 ) + { + hts_log_error("64-bit coordinates are not supported with BCF output\n"); + return -1; + } + BGZF *fp = hfp->fp.bgzf; union { uint32_t i; @@ -2040,6 +2071,7 @@ int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) return 0; // FIXME: check for errs in this function } +#if VCF_ALLOW_INT64 static int bcf_enc_long1(kstring_t *s, int64_t x) { uint32_t e = 0; if (x <= BCF_MAX_BT_INT32 && x >= BCF_MIN_BT_INT32) @@ -2057,6 +2089,7 @@ static int bcf_enc_long1(kstring_t *s, int64_t x) { } return e == 0 ? 0 : -1; } +#endif static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) { uint8_t *p; @@ -2169,6 +2202,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p { if ( !bcf_hdr_nsamples(h) ) return 0; + static int extreme_int_warned = 0; char *r, *t; int j, l, m, g; khint_t k; @@ -2362,7 +2396,23 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p int32_t *x = (int32_t*)(z->buf + z->size * m); for (l = 0;; ++t) { if (*t == '.') x[l++] = bcf_int32_missing, ++t; // ++t to skip "." - else x[l++] = strtol(t, &t, 10); + else // x[l++] = strtol(t, &t, 10); + { + errno = 0; + char *te; + long int tmp_val = strtol(t, &te, 10); + if ( te==t || errno!=0 || tmp_valBCF_MAX_BT_INT32 ) + { + if ( !extreme_int_warned ) + { + hts_log_warning("Extreme FORMAT values encountered and set to missing."); + extreme_int_warned = 1; + } + tmp_val = bcf_int32_missing; + } + x[l++] = tmp_val; + t = te; + } if (*t != ',') break; } if ( !l ) x[l++] = bcf_int32_missing; @@ -2469,6 +2519,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) { + static int extreme_int_warned = 0; int i = 0; char *p, *q, *r, *t; kstring_t *str; @@ -2672,31 +2723,59 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) val_a = z; } if ((y>>4&0xf) == BCF_HT_INT) { - // Allow first value only to be 64 bit - // (for large END value) - int64_t v64 = strtoll(val, &te, 10); - if ( te==val ) { // conversion failed - val_a[0] = bcf_int32_missing; - v64 = bcf_int64_missing; - } else { - val_a[0] = v64 >= BCF_MIN_BT_INT32 && v64 <= BCF_MAX_BT_INT32 ? v64 : bcf_int32_missing; + i = 0, t = val; +#if VCF_ALLOW_INT64 + int64_t val1; + if ( n_val==1 ) + { + errno = 0; + long long int tmp_val = strtoll(val, &te, 10); + if ( te==val ) tmp_val = bcf_int64_missing; + else if ( te==val || errno!=0 || tmp_valBCF_MAX_BT_INT64 ) + { + if ( !extreme_int_warned ) + { + hts_log_warning("Extreme INFO value encountered and set to missing."); + extreme_int_warned = 1; + } + tmp_val = bcf_int64_missing; + } + val1 = tmp_val; + t = te; + i = 1; // this is just to avoid adding another nested block... } - for (t = te; *t && *t != ','; t++); - if (*t == ',') ++t; - for (i = 1; i < n_val; ++i, ++t) +#else + int32_t val1; +#endif + for (; i < n_val; ++i, ++t) { - val_a[i] = strtol(t, &te, 10); - if ( te==t ) // conversion failed - val_a[i] = bcf_int32_missing; + errno = 0; + long int tmp_val = strtol(t, &te, 10); + if ( te==t ) tmp_val = bcf_int32_missing; + else if ( errno!=0 || tmp_valBCF_MAX_BT_INT32 ) + { + if ( !extreme_int_warned ) + { + hts_log_warning("Extreme INFO values encountered and set to missing."); + extreme_int_warned = 1; + } + tmp_val = bcf_int32_missing; + } + val_a[i] = tmp_val; for (t = te; *t && *t != ','; t++); } if (n_val == 1) { - bcf_enc_long1(str, v64); +#if VCF_ALLOW_INT64 + bcf_enc_long1(str, val1); +#else + val1 = val_a[0]; + bcf_enc_int1(str, val1); +#endif } else { bcf_enc_vint(str, n_val, val_a, -1); } - if (strcmp(key, "END") == 0) - v->rlen = v64 - v->pos; + if (n_val==1 && strcmp(key, "END") == 0) + v->rlen = val1 - v->pos; } else if ((y>>4&0xf) == BCF_HT_REAL) { float *val_f = (float *)val_a; for (i = 0, t = val; i < n_val; ++i, ++t) @@ -3835,6 +3914,7 @@ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const v else bcf_enc_vchar(&str, strlen((char*)values), (char*)values); } +#if VCF_ALLOW_INT64 else if ( type==BCF_HT_LONG ) { if (n != 1) { @@ -3843,6 +3923,7 @@ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const v } bcf_enc_long1(&str, *(int64_t *) values); } +#endif else { hts_log_error("The type %d not implemented yet", type); From 448de2e81ff42873924084e6648661e83a5a0ae6 Mon Sep 17 00:00:00 2001 From: pd3 Date: Wed, 11 Dec 2019 09:41:35 +0100 Subject: [PATCH 3/9] Make the warning messages more informative --- vcf.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vcf.c b/vcf.c index 6cb9d352c..3a4d67d9a 100644 --- a/vcf.c +++ b/vcf.c @@ -1755,7 +1755,7 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v) if ( v->pos > BCF_MAX_BT_INT32 ) { - hts_log_error("64-bit coordinates are not supported with BCF output\n"); + hts_log_error("64-bit coordinates are not supported with BCF output"); return -1; } @@ -2405,7 +2405,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p { if ( !extreme_int_warned ) { - hts_log_warning("Extreme FORMAT values encountered and set to missing."); + hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos,h->id[BCF_DT_ID][fmt[j-1].key].key,bcf_seqname(h,v), v->pos+1); extreme_int_warned = 1; } tmp_val = bcf_int32_missing; @@ -2735,7 +2735,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) { if ( !extreme_int_warned ) { - hts_log_warning("Extreme INFO value encountered and set to missing."); + hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname(h,v), v->pos+1); extreme_int_warned = 1; } tmp_val = bcf_int64_missing; @@ -2756,7 +2756,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) { if ( !extreme_int_warned ) { - hts_log_warning("Extreme INFO values encountered and set to missing."); + hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname(h,v), v->pos+1); extreme_int_warned = 1; } tmp_val = bcf_int32_missing; From 31aa365c031e909f2d8fad403d39868c7c2b2af7 Mon Sep 17 00:00:00 2001 From: pd3 Date: Wed, 11 Dec 2019 17:13:28 +0100 Subject: [PATCH 4/9] Additional checks to prevent writing invalid BCF containing 64-bit data --- vcf.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/vcf.c b/vcf.c index 3a4d67d9a..64661d40f 100644 --- a/vcf.c +++ b/vcf.c @@ -78,6 +78,8 @@ static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, N #define BCF_MIN_BT_INT64 -9223372036854775800 /* INT64_MIN + 8, for internal use only */ #endif +#define BCF_IS_64BIT (1<<30) + static const char *dump_char(char *buffer, char c) { @@ -1753,9 +1755,9 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v) } bcf1_sync(v); // check if the BCF record was modified - if ( v->pos > BCF_MAX_BT_INT32 ) + if ( v->unpacked & BCF_IS_64BIT ) { - hts_log_error("64-bit coordinates are not supported with BCF output"); + hts_log_error("Data contains 64-bit values not representable in BCF. Please use VCF instead"); return -1; } @@ -2577,6 +2579,8 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) } else { v->pos -= 1; } + if (v->pos >= INT32_MAX) + v->unpacked |= BCF_IS_64BIT; } else if (i == 2) { // ID if (strcmp(p, ".")) bcf_enc_vchar(str, q - p, p); else bcf_enc_size(str, 0, BCF_BT_CHAR); @@ -2766,6 +2770,8 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) } if (n_val == 1) { #if VCF_ALLOW_INT64 + if ( val1BCF_MAX_BT_INT32 ) + v->unpacked |= BCF_IS_64BIT; bcf_enc_long1(str, val1); #else val1 = val_a[0]; From 29651965b067f479a5559ec10820ead475aac2e7 Mon Sep 17 00:00:00 2001 From: pd3 Date: Thu, 12 Dec 2019 16:25:37 +0100 Subject: [PATCH 5/9] Allow compilation w/o modifying vcf.c and temporarily comment out failing tests which assume 64 bit positions are compiled in by default. --- Makefile | 1 + README.large_positions.md | 4 +++- test/test.pl | 25 +++++++++++++------------ vcf.c | 16 +++++++--------- 4 files changed, 24 insertions(+), 22 deletions(-) diff --git a/Makefile b/Makefile index 72195c496..3a772d4e3 100644 --- a/Makefile +++ b/Makefile @@ -30,6 +30,7 @@ RANLIB = ranlib htslib_default_libs = -lz -lm -lbz2 -llzma -lcurl CPPFLAGS = +# TODO: make the 64-bit support for VCF optional via configure, for now add -DVCF_ALLOW_INT64=1 to CFLAGS manually # TODO: probably update cram code to make it compile cleanly with -Wc++-compat # For testing strict C99 support add -std=c99 -D_XOPEN_SOURCE=600 #CFLAGS = -g -Wall -O2 -pedantic -std=c99 -D_XOPEN_SOURCE=600 diff --git a/README.large_positions.md b/README.large_positions.md index b0ce7ae71..3e2b2c9b4 100644 --- a/README.large_positions.md +++ b/README.large_positions.md @@ -9,7 +9,9 @@ which have, or are expected to have, chromosomes longer than two gigabases. Currently 64 bit positions can only be stored in SAM and VCF format files. Binary BAM, CRAM and BCF cannot be used due to limitations in the formats themselves. As SAM and VCF are text formats, they have no limit on the -size of numeric values. +size of numeric values. Note that while 64 bit positions are supported by +default for SAM, for VCF they must be enabled explicitly at compile time +by editing Makefile and adding -DVCF_ALLOW_INT64=1 to CFLAGS. # Compatibility issues to check diff --git a/test/test.pl b/test/test.pl index 3e3081c5b..9c82331d7 100755 --- a/test/test.pl +++ b/test/test.pl @@ -660,18 +660,19 @@ sub test_view testv $opts, "./test_view $tv_args -M -p longrefs/longref_multi.tmp.sam longrefs/longref.tmp.sam.gz CHROMOSOME_I:10000000000-10000000003 CHROMOSOME_I:10000000100-10000000110"; testv $opts, "./compare_sam.pl longrefs/longref_multi.expected.sam longrefs/longref_multi.tmp.sam"; - # VCF round trip - unlink("longrefs/index.tmp.vcf.gz.csi"); # To stop vcf_hdr_read from reading a stale index - testv $opts, "./test_view $tv_args -z -p longrefs/index.tmp.vcf.gz -x longrefs/index.tmp.vcf.gz.csi.otf -m 14 longrefs/index.vcf"; - testv $opts, "./test_view $tv_args -p longrefs/index.tmp.vcf_ longrefs/index.tmp.vcf.gz"; - testv $opts, "cmp longrefs/index.vcf longrefs/index.tmp.vcf_"; - - # Build index and compare with on-the-fly one made earlier. - test_compare $opts, "$$opts{path}/test_index -c longrefs/index.tmp.vcf.gz", "longrefs/index.tmp.vcf.gz.csi.otf", "longrefs/index.tmp.vcf.gz.csi", gz=>1; - - # test_view can't do indexed look-ups on vcf, but we can use tabix - test_compare $opts, "$$opts{bin}/tabix longrefs/index.tmp.vcf.gz 1:10010000100-10010000105 > longrefs/index.tmp.tabix1.vcf", "longrefs/index.expected1.vcf", "longrefs/index.tmp.tabix1.vcf", fix_newlines => 1; - test_compare $opts, "$$opts{bin}/tabix longrefs/index.tmp.vcf.gz 1:10010000120-10010000130 > longrefs/index.tmp.tabix2.vcf", "longrefs/index.expected2.vcf", "longrefs/index.tmp.tabix2.vcf", fix_newlines => 1; + # 64-bit positions are currently not compiled in by default for VCF + # # VCF round trip + # unlink("longrefs/index.tmp.vcf.gz.csi"); # To stop vcf_hdr_read from reading a stale index + # testv $opts, "./test_view $tv_args -z -p longrefs/index.tmp.vcf.gz -x longrefs/index.tmp.vcf.gz.csi.otf -m 14 longrefs/index.vcf"; + # testv $opts, "./test_view $tv_args -p longrefs/index.tmp.vcf_ longrefs/index.tmp.vcf.gz"; + # testv $opts, "cmp longrefs/index.vcf longrefs/index.tmp.vcf_"; + # + # # Build index and compare with on-the-fly one made earlier. + # test_compare $opts, "$$opts{path}/test_index -c longrefs/index.tmp.vcf.gz", "longrefs/index.tmp.vcf.gz.csi.otf", "longrefs/index.tmp.vcf.gz.csi", gz=>1; + # + # # test_view can't do indexed look-ups on vcf, but we can use tabix + # test_compare $opts, "$$opts{bin}/tabix longrefs/index.tmp.vcf.gz 1:10010000100-10010000105 > longrefs/index.tmp.tabix1.vcf", "longrefs/index.expected1.vcf", "longrefs/index.tmp.tabix1.vcf", fix_newlines => 1; + # test_compare $opts, "$$opts{bin}/tabix longrefs/index.tmp.vcf.gz 1:10010000120-10010000130 > longrefs/index.tmp.tabix2.vcf", "longrefs/index.expected2.vcf", "longrefs/index.tmp.tabix2.vcf", fix_newlines => 1; if ($test_view_failures == 0) { passed($opts, "large position tests"); diff --git a/vcf.c b/vcf.c index 64661d40f..950d22375 100644 --- a/vcf.c +++ b/vcf.c @@ -71,9 +71,7 @@ static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, N - cannot output 64-bit BCF and if it does, it is not compatible with anything - experimental, use at your risk */ -#define VCF_ALLOW_INT64 0 - -#if VCF_ALLOW_INT64 +#ifdef VCF_ALLOW_INT64 #define BCF_MAX_BT_INT64 (0x7fffffffffffffff) /* INT64_MAX, for internal use only */ #define BCF_MIN_BT_INT64 -9223372036854775800 /* INT64_MIN + 8, for internal use only */ #endif @@ -1269,7 +1267,7 @@ static int bcf_dec_typed_int1_safe(uint8_t *p, uint8_t *end, uint8_t **q, if (end - p < 4) return -1; *q = p + 4; *val = le_to_i32(p); -#if VCF_ALLOW_INT64 +#ifdef VCF_ALLOW_INT64 } else if (t == BCF_BT_INT64) { if (end - p < 4) return -1; *q = p + 4; @@ -1314,7 +1312,7 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) { uint32_t i, reports; const uint32_t is_integer = ((1 << BCF_BT_INT8) | (1 << BCF_BT_INT16) | -#if VCF_ALLOW_INT64 +#ifdef VCF_ALLOW_INT64 (1 << BCF_BT_INT64) | #endif (1 << BCF_BT_INT32)); @@ -2073,7 +2071,7 @@ int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) return 0; // FIXME: check for errs in this function } -#if VCF_ALLOW_INT64 +#ifdef VCF_ALLOW_INT64 static int bcf_enc_long1(kstring_t *s, int64_t x) { uint32_t e = 0; if (x <= BCF_MAX_BT_INT32 && x >= BCF_MIN_BT_INT32) @@ -2728,7 +2726,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) } if ((y>>4&0xf) == BCF_HT_INT) { i = 0, t = val; -#if VCF_ALLOW_INT64 +#ifdef VCF_ALLOW_INT64 int64_t val1; if ( n_val==1 ) { @@ -2769,7 +2767,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) for (t = te; *t && *t != ','; t++); } if (n_val == 1) { -#if VCF_ALLOW_INT64 +#ifdef VCF_ALLOW_INT64 if ( val1BCF_MAX_BT_INT32 ) v->unpacked |= BCF_IS_64BIT; bcf_enc_long1(str, val1); @@ -3920,7 +3918,7 @@ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const v else bcf_enc_vchar(&str, strlen((char*)values), (char*)values); } -#if VCF_ALLOW_INT64 +#ifdef VCF_ALLOW_INT64 else if ( type==BCF_HT_LONG ) { if (n != 1) { From 5073db389cdea9e320e1d6d266a9b0ece96d07bc Mon Sep 17 00:00:00 2001 From: pd3 Date: Fri, 13 Dec 2019 14:08:26 +0100 Subject: [PATCH 6/9] Use int32 for missing values, otherwise BCF cannot be output --- vcf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vcf.c b/vcf.c index 950d22375..025fa0fa1 100644 --- a/vcf.c +++ b/vcf.c @@ -2396,7 +2396,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p int32_t *x = (int32_t*)(z->buf + z->size * m); for (l = 0;; ++t) { if (*t == '.') x[l++] = bcf_int32_missing, ++t; // ++t to skip "." - else // x[l++] = strtol(t, &t, 10); + else { errno = 0; char *te; @@ -2732,7 +2732,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) { errno = 0; long long int tmp_val = strtoll(val, &te, 10); - if ( te==val ) tmp_val = bcf_int64_missing; + if ( te==val ) tmp_val = bcf_int32_missing; else if ( te==val || errno!=0 || tmp_valBCF_MAX_BT_INT64 ) { if ( !extreme_int_warned ) @@ -2740,7 +2740,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname(h,v), v->pos+1); extreme_int_warned = 1; } - tmp_val = bcf_int64_missing; + tmp_val = bcf_int32_missing; } val1 = tmp_val; t = te; From aa0a090f9113e4b25c15fad2a51eedc71ddfa48b Mon Sep 17 00:00:00 2001 From: pd3 Date: Fri, 13 Dec 2019 15:18:42 +0100 Subject: [PATCH 7/9] Handle another case of forgotten 64-bit values that can pass unnoticed to BCF --- vcf.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vcf.c b/vcf.c index 025fa0fa1..119192063 100644 --- a/vcf.c +++ b/vcf.c @@ -2769,8 +2769,12 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) if (n_val == 1) { #ifdef VCF_ALLOW_INT64 if ( val1BCF_MAX_BT_INT32 ) + { v->unpacked |= BCF_IS_64BIT; - bcf_enc_long1(str, val1); + bcf_enc_long1(str, val1); + } + else + bcf_enc_int1(str, val1); #else val1 = val_a[0]; bcf_enc_int1(str, val1); From 4929c3d1ed7b66dbfe6a0238c24a6df21f4edf14 Mon Sep 17 00:00:00 2001 From: pd3 Date: Fri, 13 Dec 2019 15:30:01 +0100 Subject: [PATCH 8/9] Distinguish between genuine missing value and out-of-range replacement --- vcf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vcf.c b/vcf.c index 119192063..d41a788e8 100644 --- a/vcf.c +++ b/vcf.c @@ -2727,6 +2727,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) if ((y>>4&0xf) == BCF_HT_INT) { i = 0, t = val; #ifdef VCF_ALLOW_INT64 + int is_int64 = 0; int64_t val1; if ( n_val==1 ) { @@ -2742,6 +2743,8 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) } tmp_val = bcf_int32_missing; } + else + is_int64 = 1; val1 = tmp_val; t = te; i = 1; // this is just to avoid adding another nested block... @@ -2768,7 +2771,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) } if (n_val == 1) { #ifdef VCF_ALLOW_INT64 - if ( val1BCF_MAX_BT_INT32 ) + if ( is_int64 ) { v->unpacked |= BCF_IS_64BIT; bcf_enc_long1(str, val1); From ecc69cf62cb1d08129c0a3f158081a299884a08f Mon Sep 17 00:00:00 2001 From: pd3 Date: Mon, 16 Dec 2019 11:57:55 +0100 Subject: [PATCH 9/9] Polish a few minor details - bug fix in `bcf_dec_typed_int1_safe()`, 64-bit values could not have been read correctly by this function - cosmetic change to declare val1 always as int64_t and explicit cast to int32_t and add explicit LL to a 64-bit constant in case some compilers were fussy --- Makefile | 3 ++- vcf.c | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/Makefile b/Makefile index 3a772d4e3..822f41bdc 100644 --- a/Makefile +++ b/Makefile @@ -30,7 +30,8 @@ RANLIB = ranlib htslib_default_libs = -lz -lm -lbz2 -llzma -lcurl CPPFLAGS = -# TODO: make the 64-bit support for VCF optional via configure, for now add -DVCF_ALLOW_INT64=1 to CFLAGS manually +# TODO: make the 64-bit support for VCF optional via configure, for now add -DVCF_ALLOW_INT64=1 +# to CFLAGS manually, here or in config.mk if the latter exists. # TODO: probably update cram code to make it compile cleanly with -Wc++-compat # For testing strict C99 support add -std=c99 -D_XOPEN_SOURCE=600 #CFLAGS = -g -Wall -O2 -pedantic -std=c99 -D_XOPEN_SOURCE=600 diff --git a/vcf.c b/vcf.c index d41a788e8..5df668fe3 100644 --- a/vcf.c +++ b/vcf.c @@ -72,8 +72,8 @@ static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, N - experimental, use at your risk */ #ifdef VCF_ALLOW_INT64 - #define BCF_MAX_BT_INT64 (0x7fffffffffffffff) /* INT64_MAX, for internal use only */ - #define BCF_MIN_BT_INT64 -9223372036854775800 /* INT64_MIN + 8, for internal use only */ + #define BCF_MAX_BT_INT64 (0x7fffffffffffffff) /* INT64_MAX, for internal use only */ + #define BCF_MIN_BT_INT64 -9223372036854775800LL /* INT64_MIN + 8, for internal use only */ #endif #define BCF_IS_64BIT (1<<30) @@ -1269,9 +1269,11 @@ static int bcf_dec_typed_int1_safe(uint8_t *p, uint8_t *end, uint8_t **q, *val = le_to_i32(p); #ifdef VCF_ALLOW_INT64 } else if (t == BCF_BT_INT64) { - if (end - p < 4) return -1; - *q = p + 4; - *val = le_to_i32(p); + // This case should never happen because there should be no 64-bit BCFs + // at all, definitely not coming from htslib + if (end - p < 8) return -1; + *q = p + 8; + *val = le_to_i64(p); #endif } else { return -1; @@ -2726,9 +2728,9 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) } if ((y>>4&0xf) == BCF_HT_INT) { i = 0, t = val; + int64_t val1; #ifdef VCF_ALLOW_INT64 int is_int64 = 0; - int64_t val1; if ( n_val==1 ) { errno = 0; @@ -2749,8 +2751,6 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) t = te; i = 1; // this is just to avoid adding another nested block... } -#else - int32_t val1; #endif for (; i < n_val; ++i, ++t) { @@ -2777,10 +2777,10 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) bcf_enc_long1(str, val1); } else - bcf_enc_int1(str, val1); + bcf_enc_int1(str, (int32_t)val1); #else val1 = val_a[0]; - bcf_enc_int1(str, val1); + bcf_enc_int1(str, (int32_t)val1); #endif } else { bcf_enc_vint(str, n_val, val_a, -1);