diff --git a/Makefile b/Makefile index 72195c496..822f41bdc 100644 --- a/Makefile +++ b/Makefile @@ -30,6 +30,8 @@ RANLIB = ranlib htslib_default_libs = -lz -lm -lbz2 -llzma -lcurl CPPFLAGS = +# TODO: make the 64-bit support for VCF optional via configure, for now add -DVCF_ALLOW_INT64=1 +# to CFLAGS manually, here or in config.mk if the latter exists. # TODO: probably update cram code to make it compile cleanly with -Wc++-compat # For testing strict C99 support add -std=c99 -D_XOPEN_SOURCE=600 #CFLAGS = -g -Wall -O2 -pedantic -std=c99 -D_XOPEN_SOURCE=600 diff --git a/README.large_positions.md b/README.large_positions.md index b0ce7ae71..3e2b2c9b4 100644 --- a/README.large_positions.md +++ b/README.large_positions.md @@ -9,7 +9,9 @@ which have, or are expected to have, chromosomes longer than two gigabases. Currently 64 bit positions can only be stored in SAM and VCF format files. Binary BAM, CRAM and BCF cannot be used due to limitations in the formats themselves. As SAM and VCF are text formats, they have no limit on the -size of numeric values. +size of numeric values. Note that while 64 bit positions are supported by +default for SAM, for VCF they must be enabled explicitly at compile time +by editing Makefile and adding -DVCF_ALLOW_INT64=1 to CFLAGS. # Compatibility issues to check diff --git a/htslib/vcf.h b/htslib/vcf.h index caedfebf4..15663d364 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -1411,7 +1411,7 @@ static inline int64_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q) *q = (uint8_t*)p + 4; return le_to_i32(p); } else if (type == BCF_BT_INT64) { - *q = (uint8_t*)p + 4; + *q = (uint8_t*)p + 8; return le_to_i64(p); } else { // Invalid type. return 0; diff --git a/test/longrefs/index.expected2.vcf b/test/longrefs/index.expected2.vcf index 4898e2563..fed110989 100644 --- a/test/longrefs/index.expected2.vcf +++ b/test/longrefs/index.expected2.vcf @@ -1 +1 @@ -1 10010000110 . G 0 . SVTYPE=DEL;SVLEN=-890;END=10010001000 PL 0,1,45 +1 10010000110 . G 0 . SVTYPE=DEL;SVLEN=-890;END=10010001000;QS=1,0 PL 0,1,45 diff --git a/test/longrefs/index.vcf b/test/longrefs/index.vcf index 54c8e03d3..e861ed1a8 100644 --- a/test/longrefs/index.vcf +++ b/test/longrefs/index.vcf @@ -213,4 +213,4 @@ 1 10010000107 . G <*> 0 . DP=1;I16=0,1,0,0,33,1089,0,0,29,841,0,0,2,4,0,0;QS=1,0;MQ0F=0 PL 0,3,29 1 10010000108 . C <*> 0 . DP=1;I16=0,1,0,0,32,1024,0,0,29,841,0,0,1,1,0,0;QS=1,0;MQ0F=0 PL 0,3,29 1 10010000109 . A <*> 0 . DP=1;I16=0,1,0,0,35,1225,0,0,29,841,0,0,0,0,0,0;QS=1,0;MQ0F=0 PL 0,3,29 -1 10010000110 . G 0 . SVTYPE=DEL;SVLEN=-890;END=10010001000 PL 0,1,45 +1 10010000110 . G 0 . SVTYPE=DEL;SVLEN=-890;END=10010001000;QS=1,0 PL 0,1,45 diff --git a/test/test.pl b/test/test.pl index 3e3081c5b..9c82331d7 100755 --- a/test/test.pl +++ b/test/test.pl @@ -660,18 +660,19 @@ sub test_view testv $opts, "./test_view $tv_args -M -p longrefs/longref_multi.tmp.sam longrefs/longref.tmp.sam.gz CHROMOSOME_I:10000000000-10000000003 CHROMOSOME_I:10000000100-10000000110"; testv $opts, "./compare_sam.pl longrefs/longref_multi.expected.sam longrefs/longref_multi.tmp.sam"; - # VCF round trip - unlink("longrefs/index.tmp.vcf.gz.csi"); # To stop vcf_hdr_read from reading a stale index - testv $opts, "./test_view $tv_args -z -p longrefs/index.tmp.vcf.gz -x longrefs/index.tmp.vcf.gz.csi.otf -m 14 longrefs/index.vcf"; - testv $opts, "./test_view $tv_args -p longrefs/index.tmp.vcf_ longrefs/index.tmp.vcf.gz"; - testv $opts, "cmp longrefs/index.vcf longrefs/index.tmp.vcf_"; - - # Build index and compare with on-the-fly one made earlier. - test_compare $opts, "$$opts{path}/test_index -c longrefs/index.tmp.vcf.gz", "longrefs/index.tmp.vcf.gz.csi.otf", "longrefs/index.tmp.vcf.gz.csi", gz=>1; - - # test_view can't do indexed look-ups on vcf, but we can use tabix - test_compare $opts, "$$opts{bin}/tabix longrefs/index.tmp.vcf.gz 1:10010000100-10010000105 > longrefs/index.tmp.tabix1.vcf", "longrefs/index.expected1.vcf", "longrefs/index.tmp.tabix1.vcf", fix_newlines => 1; - test_compare $opts, "$$opts{bin}/tabix longrefs/index.tmp.vcf.gz 1:10010000120-10010000130 > longrefs/index.tmp.tabix2.vcf", "longrefs/index.expected2.vcf", "longrefs/index.tmp.tabix2.vcf", fix_newlines => 1; + # 64-bit positions are currently not compiled in by default for VCF + # # VCF round trip + # unlink("longrefs/index.tmp.vcf.gz.csi"); # To stop vcf_hdr_read from reading a stale index + # testv $opts, "./test_view $tv_args -z -p longrefs/index.tmp.vcf.gz -x longrefs/index.tmp.vcf.gz.csi.otf -m 14 longrefs/index.vcf"; + # testv $opts, "./test_view $tv_args -p longrefs/index.tmp.vcf_ longrefs/index.tmp.vcf.gz"; + # testv $opts, "cmp longrefs/index.vcf longrefs/index.tmp.vcf_"; + # + # # Build index and compare with on-the-fly one made earlier. + # test_compare $opts, "$$opts{path}/test_index -c longrefs/index.tmp.vcf.gz", "longrefs/index.tmp.vcf.gz.csi.otf", "longrefs/index.tmp.vcf.gz.csi", gz=>1; + # + # # test_view can't do indexed look-ups on vcf, but we can use tabix + # test_compare $opts, "$$opts{bin}/tabix longrefs/index.tmp.vcf.gz 1:10010000100-10010000105 > longrefs/index.tmp.tabix1.vcf", "longrefs/index.expected1.vcf", "longrefs/index.tmp.tabix1.vcf", fix_newlines => 1; + # test_compare $opts, "$$opts{bin}/tabix longrefs/index.tmp.vcf.gz 1:10010000120-10010000130 > longrefs/index.tmp.tabix2.vcf", "longrefs/index.expected2.vcf", "longrefs/index.tmp.tabix2.vcf", fix_newlines => 1; if ($test_view_failures == 0) { passed($opts, "large position tests"); diff --git a/vcf.c b/vcf.c index 059485948..5df668fe3 100644 --- a/vcf.c +++ b/vcf.c @@ -59,10 +59,26 @@ HTSLIB_EXPORT uint32_t bcf_float_vector_end = 0x7F800002; HTSLIB_EXPORT -uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; +uint8_t bcf_type_shift[] = { 0, 0, 1, 2, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, NULL, NULL}, .id = -1 }; +/* + Partial support for 64-bit POS and Number=1 INFO tags. + Notes: + - the support for 64-bit values is motivated by POS and INFO/END for large genomes + - the use of 64-bit values does not conform to the specification + - cannot output 64-bit BCF and if it does, it is not compatible with anything + - experimental, use at your risk +*/ +#ifdef VCF_ALLOW_INT64 + #define BCF_MAX_BT_INT64 (0x7fffffffffffffff) /* INT64_MAX, for internal use only */ + #define BCF_MIN_BT_INT64 -9223372036854775800LL /* INT64_MIN + 8, for internal use only */ +#endif + +#define BCF_IS_64BIT (1<<30) + + static const char *dump_char(char *buffer, char c) { switch (c) { @@ -1251,6 +1267,14 @@ static int bcf_dec_typed_int1_safe(uint8_t *p, uint8_t *end, uint8_t **q, if (end - p < 4) return -1; *q = p + 4; *val = le_to_i32(p); +#ifdef VCF_ALLOW_INT64 + } else if (t == BCF_BT_INT64) { + // This case should never happen because there should be no 64-bit BCFs + // at all, definitely not coming from htslib + if (end - p < 8) return -1; + *q = p + 8; + *val = le_to_i64(p); +#endif } else { return -1; } @@ -1290,6 +1314,9 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) { uint32_t i, reports; const uint32_t is_integer = ((1 << BCF_BT_INT8) | (1 << BCF_BT_INT16) | +#ifdef VCF_ALLOW_INT64 + (1 << BCF_BT_INT64) | +#endif (1 << BCF_BT_INT32)); const uint32_t is_valid_type = (is_integer | (1 << BCF_BT_NULL) | @@ -1728,6 +1755,12 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v) } bcf1_sync(v); // check if the BCF record was modified + if ( v->unpacked & BCF_IS_64BIT ) + { + hts_log_error("Data contains 64-bit values not representable in BCF. Please use VCF instead"); + return -1; + } + BGZF *fp = hfp->fp.bgzf; union { uint32_t i; @@ -2040,6 +2073,7 @@ int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) return 0; // FIXME: check for errs in this function } +#ifdef VCF_ALLOW_INT64 static int bcf_enc_long1(kstring_t *s, int64_t x) { uint32_t e = 0; if (x <= BCF_MAX_BT_INT32 && x >= BCF_MIN_BT_INT32) @@ -2057,6 +2091,7 @@ static int bcf_enc_long1(kstring_t *s, int64_t x) { } return e == 0 ? 0 : -1; } +#endif static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) { uint8_t *p; @@ -2169,6 +2204,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p { if ( !bcf_hdr_nsamples(h) ) return 0; + static int extreme_int_warned = 0; char *r, *t; int j, l, m, g; khint_t k; @@ -2362,7 +2398,23 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p int32_t *x = (int32_t*)(z->buf + z->size * m); for (l = 0;; ++t) { if (*t == '.') x[l++] = bcf_int32_missing, ++t; // ++t to skip "." - else x[l++] = strtol(t, &t, 10); + else + { + errno = 0; + char *te; + long int tmp_val = strtol(t, &te, 10); + if ( te==t || errno!=0 || tmp_valBCF_MAX_BT_INT32 ) + { + if ( !extreme_int_warned ) + { + hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos,h->id[BCF_DT_ID][fmt[j-1].key].key,bcf_seqname(h,v), v->pos+1); + extreme_int_warned = 1; + } + tmp_val = bcf_int32_missing; + } + x[l++] = tmp_val; + t = te; + } if (*t != ',') break; } if ( !l ) x[l++] = bcf_int32_missing; @@ -2469,6 +2521,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) { + static int extreme_int_warned = 0; int i = 0; char *p, *q, *r, *t; kstring_t *str; @@ -2526,6 +2579,8 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) } else { v->pos -= 1; } + if (v->pos >= INT32_MAX) + v->unpacked |= BCF_IS_64BIT; } else if (i == 2) { // ID if (strcmp(p, ".")) bcf_enc_vchar(str, q - p, p); else bcf_enc_size(str, 0, BCF_BT_CHAR); @@ -2672,31 +2727,66 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) val_a = z; } if ((y>>4&0xf) == BCF_HT_INT) { - // Allow first value only to be 64 bit - // (for large END value) - int64_t v64 = strtoll(val, &te, 10); - if ( te==val ) { // conversion failed - val_a[0] = bcf_int32_missing; - v64 = bcf_int64_missing; - } else { - val_a[0] = v64 >= BCF_MIN_BT_INT32 && v64 <= BCF_MAX_BT_INT32 ? v64 : bcf_int32_missing; + i = 0, t = val; + int64_t val1; +#ifdef VCF_ALLOW_INT64 + int is_int64 = 0; + if ( n_val==1 ) + { + errno = 0; + long long int tmp_val = strtoll(val, &te, 10); + if ( te==val ) tmp_val = bcf_int32_missing; + else if ( te==val || errno!=0 || tmp_valBCF_MAX_BT_INT64 ) + { + if ( !extreme_int_warned ) + { + hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname(h,v), v->pos+1); + extreme_int_warned = 1; + } + tmp_val = bcf_int32_missing; + } + else + is_int64 = 1; + val1 = tmp_val; + t = te; + i = 1; // this is just to avoid adding another nested block... } - for (t = te; *t && *t != ','; t++); - if (*t == ',') ++t; - for (i = 1; i < n_val; ++i, ++t) +#endif + for (; i < n_val; ++i, ++t) { - val_a[i] = strtol(t, &te, 10); - if ( te==t ) // conversion failed - val_a[i] = bcf_int32_missing; + errno = 0; + long int tmp_val = strtol(t, &te, 10); + if ( te==t ) tmp_val = bcf_int32_missing; + else if ( errno!=0 || tmp_valBCF_MAX_BT_INT32 ) + { + if ( !extreme_int_warned ) + { + hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname(h,v), v->pos+1); + extreme_int_warned = 1; + } + tmp_val = bcf_int32_missing; + } + val_a[i] = tmp_val; for (t = te; *t && *t != ','; t++); } if (n_val == 1) { - bcf_enc_long1(str, v64); +#ifdef VCF_ALLOW_INT64 + if ( is_int64 ) + { + v->unpacked |= BCF_IS_64BIT; + bcf_enc_long1(str, val1); + } + else + bcf_enc_int1(str, (int32_t)val1); +#else + val1 = val_a[0]; + bcf_enc_int1(str, (int32_t)val1); +#endif } else { bcf_enc_vint(str, n_val, val_a, -1); } - if (strcmp(key, "END") == 0) - v->rlen = v64 - v->pos; + if (n_val==1 && strcmp(key, "END") == 0) + v->rlen = val1 - v->pos; } else if ((y>>4&0xf) == BCF_HT_REAL) { float *val_f = (float *)val_a; for (i = 0, t = val; i < n_val; ++i, ++t) @@ -3835,6 +3925,7 @@ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const v else bcf_enc_vchar(&str, strlen((char*)values), (char*)values); } +#ifdef VCF_ALLOW_INT64 else if ( type==BCF_HT_LONG ) { if (n != 1) { @@ -3843,6 +3934,7 @@ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const v } bcf_enc_long1(&str, *(int64_t *) values); } +#endif else { hts_log_error("The type %d not implemented yet", type);