From 186d21bf50e6ce6d78e4ef429bca5ffe5c016744 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 9 Apr 2024 17:13:46 +0100 Subject: [PATCH 1/2] Increase the input block size for bgzip. Commit e495718 changed bgzip from unix raw POSIX read() calls to hread(). Unfortunately hread gets its buffer size from stat of the input file descriptor, which can be 4kb for a pipe. We're reading 0xff00 bytes, so this ends up being split over two reads mostly, with one or both involving additional memcpys. This makes the buffered I/O worse performing than non-buffered. In the most extreme cases (cat data | bgzip -l0 > /dev/null) this is a two fold slow down. The easy solution is just to increase the buffer size to something sensible. Currently we play it cautiously and only do this on pipes and fifos. Fixes #1767 --- hfile.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/hfile.c b/hfile.c index 552b71774..20245db43 100644 --- a/hfile.c +++ b/hfile.c @@ -107,9 +107,11 @@ hFILE *hfile_init(size_t struct_size, const char *mode, size_t capacity) hFILE *fp = (hFILE *) malloc(struct_size); if (fp == NULL) goto error; - if (capacity == 0) capacity = 32768; + const int maxcap = 128*1024; + + if (capacity == 0) capacity = maxcap; // FIXME For now, clamp input buffer sizes so mpileup doesn't eat memory - if (strchr(mode, 'r') && capacity > 32768) capacity = 32768; + if (strchr(mode, 'r') && capacity > maxcap) capacity = maxcap; fp->buffer = (char *) malloc(capacity); if (fp->buffer == NULL) goto error; @@ -629,7 +631,12 @@ static size_t blksize(int fd) #ifdef HAVE_STRUCT_STAT_ST_BLKSIZE struct stat sbuf; if (fstat(fd, &sbuf) != 0) return 0; - return sbuf.st_blksize; + + // Pipes/FIFOs on linux return 4Kb here often, but it's much too small + // for performant I/O. + return S_ISFIFO(sbuf.st_mode) + ? 128*1024 + : sbuf.st_blksize; #else return 0; #endif From d1be3c2459864b64533257111e8dbaa62a6c427c Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 14 Nov 2024 12:29:11 +0000 Subject: [PATCH 2/2] Make use of posix_memalign for hfile buffer. On AMD EPYC 7713 aligning to cache size boundaries makes a very significant difference to fp->backend->read performance in the kernel. A modern Intel CPU did not demonstrate this difference. x86 often have cache line size of 64 bytes, and apple Arm chips 128 bytes. I haven't tested if Arm benefits from alignment during read calls, but we can check size with sysconf(_SC_LEVEL1_DCACHE_LINESIZE). However to avoid additional autoconfery I just picked 256 as it gives us headroom and is simple. Speed ups on the AMD EPYC: time bash -c 'for i in `seq 1 30`;do cat < ~/lustre/enwik9| ./bgzip -l5 -@32 > /dev/null;done' Unaligned real 0m45.012s user 10m7.661s sys 0m58.770s Aligned real 0m30.717s user 11m14.004s sys 0m32.921s It is likely this could improve other bits of code too. --- configure.ac | 2 +- hfile.c | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 73f7a2c2f..d860a74fe 100644 --- a/configure.ac +++ b/configure.ac @@ -326,7 +326,7 @@ HTS_HIDE_DYNAMIC_SYMBOLS dnl FIXME This pulls in dozens of standard header checks AC_FUNC_MMAP -AC_CHECK_FUNCS([gmtime_r fsync drand48 srand48_deterministic getauxval elf_aux_info]) +AC_CHECK_FUNCS([gmtime_r fsync drand48 srand48_deterministic getauxval elf_aux_info posix_memalign]) # Darwin has a dubious fdatasync() symbol, but no declaration in AC_CHECK_DECL([fdatasync(int)], [AC_CHECK_FUNCS(fdatasync)]) diff --git a/hfile.c b/hfile.c index 20245db43..3b60bedda 100644 --- a/hfile.c +++ b/hfile.c @@ -113,8 +113,14 @@ hFILE *hfile_init(size_t struct_size, const char *mode, size_t capacity) // FIXME For now, clamp input buffer sizes so mpileup doesn't eat memory if (strchr(mode, 'r') && capacity > maxcap) capacity = maxcap; +#ifdef HAVE_POSIX_MEMALIGN + fp->buffer = NULL; + if (posix_memalign((void **)&fp->buffer, 256, capacity) < 0) + goto error; +#else fp->buffer = (char *) malloc(capacity); if (fp->buffer == NULL) goto error; +#endif fp->begin = fp->end = fp->buffer; fp->limit = &fp->buffer[capacity];