Skip to content

Commit

Permalink
Pulling latest from release branch and not dev
Browse files Browse the repository at this point in the history
  • Loading branch information
SirSniper committed Sep 11, 2021
1 parent 95346f4 commit 950fb01
Show file tree
Hide file tree
Showing 23 changed files with 595 additions and 1,158 deletions.
9 changes: 0 additions & 9 deletions bitstream.h
Original file line number Diff line number Diff line change
Expand Up @@ -332,16 +332,7 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 c
U32 const regMask = sizeof(bitContainer)*8 - 1;
/* if start > regMask, bitstream is corrupted, and result is undefined */
assert(nbBits < BIT_MASK_SIZE);
/* x86 transform & ((1 << nbBits) - 1) to bzhi instruction, it is better
* than accessing memory. When bmi2 instruction is not present, we consider
* such cpus old (pre-Haswell, 2013) and their performance is not of that
* importance.
*/
#if defined(__x86_64__) || defined(_M_X86)
return (bitContainer >> (start & regMask)) & ((((U64)1) << nbBits) - 1);
#else
return (bitContainer >> (start & regMask)) & BIT_mask[nbBits];
#endif
}

MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
Expand Down
21 changes: 2 additions & 19 deletions compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -150,9 +150,8 @@
}

/* vectorization
* older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax,
* and some compilers, like Intel ICC and MCST LCC, do not support it at all. */
#if !defined(__INTEL_COMPILER) && !defined(__clang__) && defined(__GNUC__) && !defined(__LCC__)
* older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax */
#if !defined(__INTEL_COMPILER) && !defined(__clang__) && defined(__GNUC__)
# if (__GNUC__ == 4 && __GNUC_MINOR__ > 3) || (__GNUC__ >= 5)
# define DONT_VECTORIZE __attribute__((optimize("no-tree-vectorize")))
# else
Expand Down Expand Up @@ -198,22 +197,6 @@
#define STATIC_BMI2 0
#endif

/* compile time determination of SIMD support */
#if !defined(ZSTD_NO_INTRINSICS)
# if defined(__SSE2__) || defined(_M_AMD64) || (defined (_M_IX86) && defined(_M_IX86_FP) && (_M_IX86_FP >= 2))
# define ZSTD_ARCH_X86_SSE2
# endif
# if defined(__ARM_NEON) || defined(_M_ARM64)
# define ZSTD_ARCH_ARM_NEON
# endif
#
# if defined(ZSTD_ARCH_X86_SSE2)
# include <emmintrin.h>
# elif defined(ZSTD_ARCH_ARM_NEON)
# include <arm_neon.h>
# endif
#endif

/* compat. with non-clang compilers */
#ifndef __has_builtin
# define __has_builtin(x) 0
Expand Down
2 changes: 1 addition & 1 deletion entropy_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats,
ZSTD_memset(rankStats, 0, (HUF_TABLELOG_MAX + 1) * sizeof(U32));
weightTotal = 0;
{ U32 n; for (n=0; n<oSize; n++) {
if (huffWeight[n] > HUF_TABLELOG_MAX) return ERROR(corruption_detected);
if (huffWeight[n] >= HUF_TABLELOG_MAX) return ERROR(corruption_detected);
rankStats[huffWeight[n]]++;
weightTotal += (1 << huffWeight[n]) >> 1;
} }
Expand Down
3 changes: 1 addition & 2 deletions fse.h
Original file line number Diff line number Diff line change
Expand Up @@ -336,9 +336,8 @@ size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue);
/* FSE_buildCTable_wksp() :
* Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`).
* `wkspSize` must be >= `FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog)` of `unsigned`.
* See FSE_buildCTable_wksp() for breakdown of workspace usage.
*/
#define FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog) (((maxSymbolValue + 2) + (1ull << (tableLog)))/2 + sizeof(U64)/sizeof(U32) /* additional 8 bytes for potential table overwrite */)
#define FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog) (maxSymbolValue + 2 + (1ull << (tableLog - 2)))
#define FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog) (sizeof(unsigned) * FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog))
size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);

Expand Down
84 changes: 26 additions & 58 deletions fse_compress.c
Original file line number Diff line number Diff line change
Expand Up @@ -75,14 +75,13 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct,
void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableLog ? tableSize>>1 : 1) ;
FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT);
U32 const step = FSE_TABLESTEP(tableSize);
U32 const maxSV1 = maxSymbolValue+1;

U16* cumul = (U16*)workSpace; /* size = maxSV1 */
FSE_FUNCTION_TYPE* const tableSymbol = (FSE_FUNCTION_TYPE*)(cumul + (maxSV1+1)); /* size = tableSize */
U32* cumul = (U32*)workSpace;
FSE_FUNCTION_TYPE* tableSymbol = (FSE_FUNCTION_TYPE*)(cumul + (maxSymbolValue + 2));

U32 highThreshold = tableSize-1;

assert(((size_t)workSpace & 1) == 0); /* Must be 2 bytes-aligned */
if ((size_t)workSpace & 3) return ERROR(GENERIC); /* Must be 4 byte aligned */
if (FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog) > wkspSize) return ERROR(tableLog_tooLarge);
/* CTable header */
tableU16[-2] = (U16) tableLog;
Expand All @@ -99,61 +98,20 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct,
/* symbol start positions */
{ U32 u;
cumul[0] = 0;
for (u=1; u <= maxSV1; u++) {
for (u=1; u <= maxSymbolValue+1; u++) {
if (normalizedCounter[u-1]==-1) { /* Low proba symbol */
cumul[u] = cumul[u-1] + 1;
tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(u-1);
} else {
assert(normalizedCounter[u-1] >= 0);
cumul[u] = cumul[u-1] + (U16)normalizedCounter[u-1];
assert(cumul[u] >= cumul[u-1]); /* no overflow */
cumul[u] = cumul[u-1] + normalizedCounter[u-1];
} }
cumul[maxSV1] = (U16)(tableSize+1);
cumul[maxSymbolValue+1] = tableSize+1;
}

/* Spread symbols */
if (highThreshold == tableSize - 1) {
/* Case for no low prob count symbols. Lay down 8 bytes at a time
* to reduce branch misses since we are operating on a small block
*/
BYTE* const spread = tableSymbol + tableSize; /* size = tableSize + 8 (may write beyond tableSize) */
{ U64 const add = 0x0101010101010101ull;
size_t pos = 0;
U64 sv = 0;
U32 s;
for (s=0; s<maxSV1; ++s, sv += add) {
int i;
int const n = normalizedCounter[s];
MEM_write64(spread + pos, sv);
for (i = 8; i < n; i += 8) {
MEM_write64(spread + pos + i, sv);
}
assert(n>=0);
pos += (size_t)n;
}
}
/* Spread symbols across the table. Lack of lowprob symbols means that
* we don't need variable sized inner loop, so we can unroll the loop and
* reduce branch misses.
*/
{ size_t position = 0;
size_t s;
size_t const unroll = 2; /* Experimentally determined optimal unroll */
assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */
for (s = 0; s < (size_t)tableSize; s += unroll) {
size_t u;
for (u = 0; u < unroll; ++u) {
size_t const uPosition = (position + (u * step)) & tableMask;
tableSymbol[uPosition] = spread[s + u];
}
position = (position + (unroll * step)) & tableMask;
}
assert(position == 0); /* Must have initialized all positions */
}
} else {
U32 position = 0;
{ U32 position = 0;
U32 symbol;
for (symbol=0; symbol<maxSV1; symbol++) {
for (symbol=0; symbol<=maxSymbolValue; symbol++) {
int nbOccurrences;
int const freq = normalizedCounter[symbol];
for (nbOccurrences=0; nbOccurrences<freq; nbOccurrences++) {
Expand All @@ -162,6 +120,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct,
while (position > highThreshold)
position = (position + step) & tableMask; /* Low proba area */
} }

assert(position==0); /* Must have initialized all positions */
}

Expand All @@ -185,17 +144,16 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct,
case -1:
case 1:
symbolTT[s].deltaNbBits = (tableLog << 16) - (1<<tableLog);
assert(total <= INT_MAX);
symbolTT[s].deltaFindState = (int)(total - 1);
symbolTT[s].deltaFindState = total - 1;
total ++;
break;
default :
assert(normalizedCounter[s] > 1);
{ U32 const maxBitsOut = tableLog - BIT_highbit32 ((U32)normalizedCounter[s]-1);
U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut;
{
U32 const maxBitsOut = tableLog - BIT_highbit32 (normalizedCounter[s]-1);
U32 const minStatePlus = normalizedCounter[s] << maxBitsOut;
symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus;
symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]);
total += (unsigned)normalizedCounter[s];
symbolTT[s].deltaFindState = total - normalizedCounter[s];
total += normalizedCounter[s];
} } } }

#if 0 /* debug : symbol costs */
Expand All @@ -206,16 +164,26 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct,
symbol, normalizedCounter[symbol],
FSE_getMaxNbBits(symbolTT, symbol),
(double)FSE_bitCost(symbolTT, tableLog, symbol, 8) / 256);
} }
}
}
#endif

return 0;
}

#ifndef ZSTD_NO_UNUSED_FUNCTIONS
size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
{
FSE_FUNCTION_TYPE tableSymbol[FSE_MAX_TABLESIZE]; /* memset() is not necessary, even if static analyzer complain about it */
return FSE_buildCTable_wksp(ct, normalizedCounter, maxSymbolValue, tableLog, tableSymbol, sizeof(tableSymbol));
}
#endif



#ifndef FSE_COMMONDEFS_ONLY


/*-**************************************************************
* FSE NCount encoding
****************************************************************/
Expand Down
39 changes: 20 additions & 19 deletions huf.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,9 @@ HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity,

/** HUF_compress4X_wksp() :
* Same as HUF_compress2(), but uses externally allocated `workSpace`.
* `workspace` must be at least as large as HUF_WORKSPACE_SIZE */
#define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */)
#define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64))
* `workspace` must have minimum alignment of 4, and be at least as large as HUF_WORKSPACE_SIZE */
#define HUF_WORKSPACE_SIZE ((6 << 10) + 256)
#define HUF_WORKSPACE_SIZE_U32 (HUF_WORKSPACE_SIZE / sizeof(U32))
HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity,
const void* src, size_t srcSize,
unsigned maxSymbolValue, unsigned tableLog,
Expand Down Expand Up @@ -136,11 +136,15 @@ HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity,

/* static allocation of HUF's Compression Table */
/* this is a private definition, just exposed for allocation and strict aliasing purpose. never EVER access its members directly */
typedef size_t HUF_CElt; /* consider it an incomplete type */
#define HUF_CTABLE_SIZE_ST(maxSymbolValue) ((maxSymbolValue)+2) /* Use tables of size_t, for proper alignment */
#define HUF_CTABLE_SIZE(maxSymbolValue) (HUF_CTABLE_SIZE_ST(maxSymbolValue) * sizeof(size_t))
struct HUF_CElt_s {
U16 val;
BYTE nbBits;
}; /* typedef'd to HUF_CElt */
typedef struct HUF_CElt_s HUF_CElt; /* consider it an incomplete type */
#define HUF_CTABLE_SIZE_U32(maxSymbolValue) ((maxSymbolValue)+1) /* Use tables of U32, for proper alignment */
#define HUF_CTABLE_SIZE(maxSymbolValue) (HUF_CTABLE_SIZE_U32(maxSymbolValue) * sizeof(U32))
#define HUF_CREATE_STATIC_CTABLE(name, maxSymbolValue) \
HUF_CElt name[HUF_CTABLE_SIZE_ST(maxSymbolValue)] /* no final ; */
HUF_CElt name[HUF_CTABLE_SIZE_U32(maxSymbolValue)] /* no final ; */

/* static allocation of HUF's DTable */
typedef U32 HUF_DTable;
Expand Down Expand Up @@ -190,7 +194,6 @@ size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSym
size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog);
size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize);
size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2);
size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue);
int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue);

Expand All @@ -203,13 +206,12 @@ typedef enum {
* Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
* If it uses hufTable it does not modify hufTable or repeat.
* If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
* If preferRepeat then the old table will always be used if valid.
* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */
* If preferRepeat then the old table will always be used if valid. */
size_t HUF_compress4X_repeat(void* dst, size_t dstSize,
const void* src, size_t srcSize,
unsigned maxSymbolValue, unsigned tableLog,
void* workSpace, size_t wkspSize, /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible);
HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2);

/** HUF_buildCTable_wksp() :
* Same as HUF_buildCTable(), but using externally allocated scratch buffer.
Expand Down Expand Up @@ -247,10 +249,11 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize,
* Loading a CTable saved with HUF_writeCTable() */
size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned *hasZeroWeights);

/** HUF_getNbBitsFromCTable() :
/** HUF_getNbBits() :
* Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX
* Note 1 : is not inlined, as HUF_CElt definition is private */
U32 HUF_getNbBitsFromCTable(const HUF_CElt* symbolTable, U32 symbolValue);
* Note 1 : is not inlined, as HUF_CElt definition is private
* Note 2 : const void* used, so that it can provide a statically allocated table as argument (which uses type U32) */
U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue);

/*
* HUF_decompress() does the following:
Expand Down Expand Up @@ -302,20 +305,18 @@ size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* c
/* ====================== */

size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U64 U64 */
size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2);
/** HUF_compress1X_repeat() :
* Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
* If it uses hufTable it does not modify hufTable or repeat.
* If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
* If preferRepeat then the old table will always be used if valid.
* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */
* If preferRepeat then the old table will always be used if valid. */
size_t HUF_compress1X_repeat(void* dst, size_t dstSize,
const void* src, size_t srcSize,
unsigned maxSymbolValue, unsigned tableLog,
void* workSpace, size_t wkspSize, /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible);
HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2);

size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */
#ifndef HUF_FORCE_DECOMPRESS_X1
Expand Down
Loading

0 comments on commit 950fb01

Please sign in to comment.