Skip to content

Commit

Permalink
Work around issues with Mingw64 having a tiny stack.
Browse files Browse the repository at this point in the history
The updates to hist1_4 and hist8 use large local buffers.  This fails on
mingw64 unless e.g. -Wl,--stack,4000000 is used.
The alternative is the same strategy employed in rANS elsewhere of
using pthread_once to allocate the local memory as a global pointer,
once per thread in use.

If threads aren't available it reverts to a traditional malloc/free
cycle.  (This was benchmarked and it's generally still better for large
blocks than the older 8-bit frequency counting methods.)
  • Loading branch information
jkbonfield committed Feb 14, 2022
1 parent c5f50a1 commit 7b398b4
Showing 1 changed file with 66 additions and 5 deletions.
71 changes: 66 additions & 5 deletions htscodecs/utils.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019,2021 Genome Research Ltd.
* Copyright (c) 2019-2022 Genome Research Ltd.
* Author(s): James Bonfield
*
* Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -32,6 +32,28 @@
*/

#include <string.h>
#include <stdlib.h>

// pthread_once is used to allocate large local memory blocks in hist8
// and hist1_4. This avoids issues with systems having small stacks, and
// removes overheads of repeated malloc/free cycles.
//
// If NO_THREADS is defined it reverts back to the more traditional malloc
// and free instead.
#ifndef NO_THREADS
#include <pthread.h>

static pthread_once_t hist_once = PTHREAD_ONCE_INIT;
static pthread_key_t hist_key;

static void hist_tb_free(void *vp) {
free(vp);
}

static void hist_tls_init(void) {
pthread_key_create(&hist_key, hist_tb_free);
}
#endif

/*
* Data transpose by N. Common to rANS4x16 and arith_dynamic decoders.
Expand Down Expand Up @@ -107,9 +129,24 @@ static inline void unstripe(unsigned char *out, unsigned char *outN,
static inline
void hist8(unsigned char *in, unsigned int in_size, uint32_t F0[256]) {
if (in_size > 500000) {
uint32_t f0[65536+37] = {0};
uint32_t f1[65536+37] = {0};
uint32_t f2[65536+37] = {0};
// Note this is a static inline so we allocate more blocks than
// we really need, so we could consider moving this initialisation
// code to a utils.c instead.
// However it's relatively bounded and not a leak.
#ifndef NO_THREADS
pthread_once(&hist_once, hist_tls_init);
uint32_t *f0 = pthread_getspecific(hist_key);
if (!f0) {
f0 = calloc((65536+37)*3, sizeof(uint32_t));
pthread_setspecific(hist_key, f0);
} else {
memset(f0, 0, (65536+37)*3*sizeof(*f0));
}
#else
uint32_t *f0 = calloc((65536+37)*3, sizeof(*f0));
#endif
uint32_t *f1 = f0 + 65536+37;
uint32_t *f2 = f1 + 65536+37;

uint32_t i, i8 = in_size & ~15;

Expand All @@ -135,6 +172,9 @@ void hist8(unsigned char *in, unsigned int in_size, uint32_t F0[256]) {
F0[i & 0xff] += f0[i] + f1[i] + f2[i];
F0[i >> 8 ] += f0[i] + f1[i] + f2[i];
}
#ifdef NO_THREADS
free(f0);
#endif
} else {
uint32_t F1[256+MAGIC] = {0}, F2[256+MAGIC] = {0}, F3[256+MAGIC] = {0};
uint32_t i, i8 = in_size & ~7;
Expand Down Expand Up @@ -199,7 +239,25 @@ void hist1_4(unsigned char *in, unsigned int in_size,

unsigned char cc[5] = {0};
if (in_size > 500000) {
uint32_t F1[256][259] = {{0}};
#ifndef NO_THREADS
pthread_once(&hist_once, hist_tls_init);
// NB (*F1)[259] works better on old Opterons or other systems
// with low N-way associative caches. However it's slightly poorer
// on more modern CPUs. Mileage may vary by CPU and compiler.
//
// Note our pthread_once key is shared by both this and hist8, so
// the buffer allocated here is reused by both functions and has to
// be allocated accordingly (hence to the larger hist8 size).
uint32_t (*F1)[259] = pthread_getspecific(hist_key);
if (!F1) {
F1 = calloc((65536+37)*3, sizeof(uint32_t));
pthread_setspecific(hist_key, F1);
} else {
memset(F1, 0, 256*sizeof(*F1));
}
#else
uint32_t (*F1)[259] = calloc(256, sizeof(*F1));
#endif
while (in < in_end-8) {
memcpy(cc, in, 4); in += 4;
F0[cc[4]][cc[0]]++;
Expand Down Expand Up @@ -232,6 +290,9 @@ void hist1_4(unsigned char *in, unsigned int in_size,
}
T0[i]+=tt;
}
#ifdef NO_THREADS
free(F1);
#endif
} else {
while (in < in_end-8) {
memcpy(cc, in, 4); in += 4;
Expand Down

0 comments on commit 7b398b4

Please sign in to comment.