diff --git a/README.md b/README.md index f3b215a..f357409 100644 --- a/README.md +++ b/README.md @@ -6,8 +6,8 @@ [C Zstd Homepage](https://github.com/facebook/zstd) -The current headers and C files are from *v1.4.4* (Commit -[10f0e699](https://github.com/facebook/zstd/releases/tag/v1.4.4)). +The current headers and C files are from *v1.5.0* (Commit +[10f0e699](https://github.com/facebook/zstd/releases/tag/v1.5.0)). ## Usage diff --git a/bitstream.h b/bitstream.h index d9a2730..48aad7f 100644 --- a/bitstream.h +++ b/bitstream.h @@ -1,7 +1,7 @@ /* ****************************************************************** * bitstream * Part of FSE library - * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * * You can contact the author at : * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy @@ -332,7 +332,16 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 c U32 const regMask = sizeof(bitContainer)*8 - 1; /* if start > regMask, bitstream is corrupted, and result is undefined */ assert(nbBits < BIT_MASK_SIZE); + /* x86 transform & ((1 << nbBits) - 1) to bzhi instruction, it is better + * than accessing memory. When bmi2 instruction is not present, we consider + * such cpus old (pre-Haswell, 2013) and their performance is not of that + * importance. + */ +#if defined(__x86_64__) || defined(_M_X86) + return (bitContainer >> (start & regMask)) & ((((U64)1) << nbBits) - 1); +#else return (bitContainer >> (start & regMask)) & BIT_mask[nbBits]; +#endif } MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) diff --git a/compiler.h b/compiler.h index 3e454f3..012ff02 100644 --- a/compiler.h +++ b/compiler.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -90,6 +90,7 @@ # endif #endif + /* target attribute */ #ifndef __has_attribute #define __has_attribute(x) 0 /* Compatibility with non-clang compilers. */ @@ -149,8 +150,9 @@ } /* vectorization - * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax */ -#if !defined(__INTEL_COMPILER) && !defined(__clang__) && defined(__GNUC__) + * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax, + * and some compilers, like Intel ICC and MCST LCC, do not support it at all. */ +#if !defined(__INTEL_COMPILER) && !defined(__clang__) && defined(__GNUC__) && !defined(__LCC__) # if (__GNUC__ == 4 && __GNUC_MINOR__ > 3) || (__GNUC__ >= 5) # define DONT_VECTORIZE __attribute__((optimize("no-tree-vectorize"))) # else @@ -196,6 +198,22 @@ #define STATIC_BMI2 0 #endif +/* compile time determination of SIMD support */ +#if !defined(ZSTD_NO_INTRINSICS) +# if defined(__SSE2__) || defined(_M_AMD64) || (defined (_M_IX86) && defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) +# define ZSTD_ARCH_X86_SSE2 +# endif +# if defined(__ARM_NEON) || defined(_M_ARM64) +# define ZSTD_ARCH_ARM_NEON +# endif +# +# if defined(ZSTD_ARCH_X86_SSE2) +# include +# elif defined(ZSTD_ARCH_ARM_NEON) +# include +# endif +#endif + /* compat. with non-clang compilers */ #ifndef __has_builtin # define __has_builtin(x) 0 diff --git a/cover.c b/cover.c index 697f483..49d7d5f 100644 --- a/cover.c +++ b/cover.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -26,15 +26,16 @@ #include /* memset */ #include /* clock */ +#ifndef ZDICT_STATIC_LINKING_ONLY +# define ZDICT_STATIC_LINKING_ONLY +#endif + #include "mem.h" /* read */ #include "pool.h" #include "threading.h" -#include "cover.h" #include "zstd_internal.h" /* includes zstd.h */ -#ifndef ZDICT_STATIC_LINKING_ONLY -#define ZDICT_STATIC_LINKING_ONLY -#endif #include "zdict.h" +#include "cover.h" /*-************************************* * Constants @@ -1062,18 +1063,19 @@ typedef struct COVER_tryParameters_data_s { * This function is thread safe if zstd is compiled with multithreaded support. * It takes its parameters as an *OWNING* opaque pointer to support threading. */ -static void COVER_tryParameters(void *opaque) { +static void COVER_tryParameters(void *opaque) +{ /* Save parameters as local variables */ - COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t *)opaque; + COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t*)opaque; const COVER_ctx_t *const ctx = data->ctx; const ZDICT_cover_params_t parameters = data->parameters; size_t dictBufferCapacity = data->dictBufferCapacity; size_t totalCompressedSize = ERROR(GENERIC); /* Allocate space for hash table, dict, and freqs */ COVER_map_t activeDmers; - BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity); + BYTE* const dict = (BYTE*)malloc(dictBufferCapacity); COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC)); - U32 *freqs = (U32 *)malloc(ctx->suffixSize * sizeof(U32)); + U32* const freqs = (U32*)malloc(ctx->suffixSize * sizeof(U32)); if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) { DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n"); goto _cleanup; @@ -1103,15 +1105,14 @@ static void COVER_tryParameters(void *opaque) { free(data); COVER_map_destroy(&activeDmers); COVER_dictSelectionFree(selection); - if (freqs) { - free(freqs); - } + free(freqs); } ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( - void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, - const size_t *samplesSizes, unsigned nbSamples, - ZDICT_cover_params_t *parameters) { + void* dictBuffer, size_t dictBufferCapacity, const void* samplesBuffer, + const size_t* samplesSizes, unsigned nbSamples, + ZDICT_cover_params_t* parameters) +{ /* constants */ const unsigned nbThreads = parameters->nbThreads; const double splitPoint = diff --git a/cover.h b/cover.h index f2fbc53..c3df990 100644 --- a/cover.h +++ b/cover.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020, Facebook, Inc. + * Copyright (c) Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -8,6 +8,10 @@ * You may select, at your option, one of the above-listed licenses. */ +#ifndef ZDICT_STATIC_LINKING_ONLY +# define ZDICT_STATIC_LINKING_ONLY +#endif + #include /* fprintf */ #include /* malloc, free, qsort */ #include /* memset */ @@ -16,9 +20,6 @@ #include "pool.h" #include "threading.h" #include "zstd_internal.h" /* includes zstd.h */ -#ifndef ZDICT_STATIC_LINKING_ONLY -#define ZDICT_STATIC_LINKING_ONLY -#endif #include "zdict.h" /** diff --git a/cpu.h b/cpu.h index cb21059..8acd33b 100644 --- a/cpu.h +++ b/cpu.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, Facebook, Inc. + * Copyright (c) Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/debug.c b/debug.c index f303f4a..bb863c9 100644 --- a/debug.c +++ b/debug.c @@ -1,7 +1,7 @@ /* ****************************************************************** * debug * Part of FSE library - * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * * You can contact the author at : * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy diff --git a/debug.h b/debug.h index 8b57343..3b2a320 100644 --- a/debug.h +++ b/debug.h @@ -1,7 +1,7 @@ /* ****************************************************************** * debug * Part of FSE library - * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * * You can contact the author at : * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy diff --git a/divsufsort.c b/divsufsort.c index ead9220..a2870fb 100644 --- a/divsufsort.c +++ b/divsufsort.c @@ -1576,7 +1576,7 @@ sort_typeBstar(const unsigned char *T, int *SA, /* Construct the inverse suffix array of type B* suffixes using trsort. */ trsort(ISAb, SA, m, 1); - /* Set the sorted order of tyoe B* suffixes. */ + /* Set the sorted order of type B* suffixes. */ for(i = n - 1, j = m, c0 = T[n - 1]; 0 <= i;) { for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) >= c1); --i, c1 = c0) { } if(0 <= i) { diff --git a/entropy_common.c b/entropy_common.c index f9fcb1a..6ba1f22 100644 --- a/entropy_common.c +++ b/entropy_common.c @@ -1,6 +1,6 @@ /* ****************************************************************** * Common functions of New Generation Entropy library - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * * You can contact the author at : * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy @@ -299,7 +299,7 @@ HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats, ZSTD_memset(rankStats, 0, (HUF_TABLELOG_MAX + 1) * sizeof(U32)); weightTotal = 0; { U32 n; for (n=0; n= HUF_TABLELOG_MAX) return ERROR(corruption_detected); + if (huffWeight[n] > HUF_TABLELOG_MAX) return ERROR(corruption_detected); rankStats[huffWeight[n]]++; weightTotal += (1 << huffWeight[n]) >> 1; } } diff --git a/error_private.c b/error_private.c index 45bba53..6d1135f 100644 --- a/error_private.c +++ b/error_private.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/error_private.h b/error_private.h index 71b37b8..a8d33c8 100644 --- a/error_private.h +++ b/error_private.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -21,8 +21,8 @@ extern "C" { /* **************************************** * Dependencies ******************************************/ -#include "zstd_deps.h" /* size_t */ #include "zstd_errors.h" /* enum list */ +#include "zstd_deps.h" /* size_t */ /* **************************************** diff --git a/fastcover.c b/fastcover.c index dad521c..13406cd 100644 --- a/fastcover.c +++ b/fastcover.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, Facebook, Inc. + * Copyright (c) Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -16,16 +16,17 @@ #include /* memset */ #include /* clock */ +#ifndef ZDICT_STATIC_LINKING_ONLY +# define ZDICT_STATIC_LINKING_ONLY +#endif + #include "mem.h" /* read */ #include "pool.h" #include "threading.h" -#include "cover.h" #include "zstd_internal.h" /* includes zstd.h */ #include "zstd_compress_internal.h" /* ZSTD_hash*() */ -#ifndef ZDICT_STATIC_LINKING_ONLY -#define ZDICT_STATIC_LINKING_ONLY -#endif #include "zdict.h" +#include "cover.h" /*-************************************* @@ -462,20 +463,20 @@ typedef struct FASTCOVER_tryParameters_data_s { * This function is thread safe if zstd is compiled with multithreaded support. * It takes its parameters as an *OWNING* opaque pointer to support threading. */ -static void FASTCOVER_tryParameters(void *opaque) +static void FASTCOVER_tryParameters(void* opaque) { /* Save parameters as local variables */ - FASTCOVER_tryParameters_data_t *const data = (FASTCOVER_tryParameters_data_t *)opaque; + FASTCOVER_tryParameters_data_t *const data = (FASTCOVER_tryParameters_data_t*)opaque; const FASTCOVER_ctx_t *const ctx = data->ctx; const ZDICT_cover_params_t parameters = data->parameters; size_t dictBufferCapacity = data->dictBufferCapacity; size_t totalCompressedSize = ERROR(GENERIC); /* Initialize array to keep track of frequency of dmer within activeSegment */ - U16* segmentFreqs = (U16 *)calloc(((U64)1 << ctx->f), sizeof(U16)); + U16* segmentFreqs = (U16*)calloc(((U64)1 << ctx->f), sizeof(U16)); /* Allocate space for hash table, dict, and freqs */ - BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity); + BYTE *const dict = (BYTE*)malloc(dictBufferCapacity); COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC)); - U32 *freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32)); + U32* freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32)); if (!segmentFreqs || !dict || !freqs) { DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n"); goto _cleanup; diff --git a/fse.h b/fse.h index dd5fc44..714bfd3 100644 --- a/fse.h +++ b/fse.h @@ -1,7 +1,7 @@ /* ****************************************************************** * FSE : Finite State Entropy codec * Public Prototypes declaration - * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * * You can contact the author at : * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy @@ -336,8 +336,9 @@ size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue); /* FSE_buildCTable_wksp() : * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`). * `wkspSize` must be >= `FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog)` of `unsigned`. + * See FSE_buildCTable_wksp() for breakdown of workspace usage. */ -#define FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog) (maxSymbolValue + 2 + (1ull << (tableLog - 2))) +#define FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog) (((maxSymbolValue + 2) + (1ull << (tableLog)))/2 + sizeof(U64)/sizeof(U32) /* additional 8 bytes for potential table overwrite */) #define FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog) (sizeof(unsigned) * FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog)) size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); @@ -352,7 +353,7 @@ size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits); size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue); /**< build a fake FSE_DTable, designed to always generate the same symbolValue */ -#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue)) +#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) #define FSE_DECOMPRESS_WKSP_SIZE(maxTableLog, maxSymbolValue) (FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(unsigned)) size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize); /**< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)` */ diff --git a/fse_compress.c b/fse_compress.c index 89cb9df..4b40311 100644 --- a/fse_compress.c +++ b/fse_compress.c @@ -1,6 +1,6 @@ /* ****************************************************************** * FSE : Finite State Entropy encoder - * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * * You can contact the author at : * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy @@ -75,13 +75,14 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableLog ? tableSize>>1 : 1) ; FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT); U32 const step = FSE_TABLESTEP(tableSize); + U32 const maxSV1 = maxSymbolValue+1; - U32* cumul = (U32*)workSpace; - FSE_FUNCTION_TYPE* tableSymbol = (FSE_FUNCTION_TYPE*)(cumul + (maxSymbolValue + 2)); + U16* cumul = (U16*)workSpace; /* size = maxSV1 */ + FSE_FUNCTION_TYPE* const tableSymbol = (FSE_FUNCTION_TYPE*)(cumul + (maxSV1+1)); /* size = tableSize */ U32 highThreshold = tableSize-1; - if ((size_t)workSpace & 3) return ERROR(GENERIC); /* Must be 4 byte aligned */ + assert(((size_t)workSpace & 1) == 0); /* Must be 2 bytes-aligned */ if (FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog) > wkspSize) return ERROR(tableLog_tooLarge); /* CTable header */ tableU16[-2] = (U16) tableLog; @@ -98,20 +99,61 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, /* symbol start positions */ { U32 u; cumul[0] = 0; - for (u=1; u <= maxSymbolValue+1; u++) { + for (u=1; u <= maxSV1; u++) { if (normalizedCounter[u-1]==-1) { /* Low proba symbol */ cumul[u] = cumul[u-1] + 1; tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(u-1); } else { - cumul[u] = cumul[u-1] + normalizedCounter[u-1]; + assert(normalizedCounter[u-1] >= 0); + cumul[u] = cumul[u-1] + (U16)normalizedCounter[u-1]; + assert(cumul[u] >= cumul[u-1]); /* no overflow */ } } - cumul[maxSymbolValue+1] = tableSize+1; + cumul[maxSV1] = (U16)(tableSize+1); } /* Spread symbols */ - { U32 position = 0; + if (highThreshold == tableSize - 1) { + /* Case for no low prob count symbols. Lay down 8 bytes at a time + * to reduce branch misses since we are operating on a small block + */ + BYTE* const spread = tableSymbol + tableSize; /* size = tableSize + 8 (may write beyond tableSize) */ + { U64 const add = 0x0101010101010101ull; + size_t pos = 0; + U64 sv = 0; + U32 s; + for (s=0; s=0); + pos += (size_t)n; + } + } + /* Spread symbols across the table. Lack of lowprob symbols means that + * we don't need variable sized inner loop, so we can unroll the loop and + * reduce branch misses. + */ + { size_t position = 0; + size_t s; + size_t const unroll = 2; /* Experimentally determined optimal unroll */ + assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */ + for (s = 0; s < (size_t)tableSize; s += unroll) { + size_t u; + for (u = 0; u < unroll; ++u) { + size_t const uPosition = (position + (u * step)) & tableMask; + tableSymbol[uPosition] = spread[s + u]; + } + position = (position + (unroll * step)) & tableMask; + } + assert(position == 0); /* Must have initialized all positions */ + } + } else { + U32 position = 0; U32 symbol; - for (symbol=0; symbol<=maxSymbolValue; symbol++) { + for (symbol=0; symbol highThreshold) position = (position + step) & tableMask; /* Low proba area */ } } - assert(position==0); /* Must have initialized all positions */ } @@ -144,16 +185,17 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, case -1: case 1: symbolTT[s].deltaNbBits = (tableLog << 16) - (1< 1); + { U32 const maxBitsOut = tableLog - BIT_highbit32 ((U32)normalizedCounter[s]-1); + U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut; symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus; - symbolTT[s].deltaFindState = total - normalizedCounter[s]; - total += normalizedCounter[s]; + symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]); + total += (unsigned)normalizedCounter[s]; } } } } #if 0 /* debug : symbol costs */ @@ -164,26 +206,16 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, symbol, normalizedCounter[symbol], FSE_getMaxNbBits(symbolTT, symbol), (double)FSE_bitCost(symbolTT, tableLog, symbol, 8) / 256); - } - } + } } #endif return 0; } -#ifndef ZSTD_NO_UNUSED_FUNCTIONS -size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) -{ - FSE_FUNCTION_TYPE tableSymbol[FSE_MAX_TABLESIZE]; /* memset() is not necessary, even if static analyzer complain about it */ - return FSE_buildCTable_wksp(ct, normalizedCounter, maxSymbolValue, tableLog, tableSymbol, sizeof(tableSymbol)); -} -#endif - #ifndef FSE_COMMONDEFS_ONLY - /*-************************************************************** * FSE NCount encoding ****************************************************************/ diff --git a/fse_decompress.c b/fse_decompress.c index c164430..f4ff58f 100644 --- a/fse_decompress.c +++ b/fse_decompress.c @@ -1,6 +1,6 @@ /* ****************************************************************** * FSE : Finite State Entropy decoder - * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * * You can contact the author at : * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy @@ -310,6 +310,12 @@ size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size return FSE_decompress_wksp_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, /* bmi2 */ 0); } +typedef struct { + short ncount[FSE_MAX_SYMBOL_VALUE + 1]; + FSE_DTable dtable[1]; /* Dynamically sized */ +} FSE_DecompressWksp; + + FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body( void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, @@ -318,33 +324,37 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body( { const BYTE* const istart = (const BYTE*)cSrc; const BYTE* ip = istart; - short counting[FSE_MAX_SYMBOL_VALUE+1]; unsigned tableLog; unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE; - FSE_DTable* const dtable = (FSE_DTable*)workSpace; + FSE_DecompressWksp* const wksp = (FSE_DecompressWksp*)workSpace; + + DEBUG_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0); + if (wkspSize < sizeof(*wksp)) return ERROR(GENERIC); /* normal FSE decoding mode */ - size_t const NCountLength = FSE_readNCount_bmi2(counting, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2); - if (FSE_isError(NCountLength)) return NCountLength; - if (tableLog > maxLog) return ERROR(tableLog_tooLarge); - assert(NCountLength <= cSrcSize); - ip += NCountLength; - cSrcSize -= NCountLength; + { + size_t const NCountLength = FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2); + if (FSE_isError(NCountLength)) return NCountLength; + if (tableLog > maxLog) return ERROR(tableLog_tooLarge); + assert(NCountLength <= cSrcSize); + ip += NCountLength; + cSrcSize -= NCountLength; + } if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge); - workSpace = dtable + FSE_DTABLE_SIZE_U32(tableLog); - wkspSize -= FSE_DTABLE_SIZE(tableLog); + workSpace = wksp->dtable + FSE_DTABLE_SIZE_U32(tableLog); + wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); - CHECK_F( FSE_buildDTable_internal(dtable, counting, maxSymbolValue, tableLog, workSpace, wkspSize) ); + CHECK_F( FSE_buildDTable_internal(wksp->dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); { - const void* ptr = dtable; + const void* ptr = wksp->dtable; const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr; const U32 fastMode = DTableH->fastMode; /* select fast mode (static) */ - if (fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 1); - return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 0); + if (fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 1); + return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 0); } } diff --git a/hist.c b/hist.c index 8be9a30..4934aac 100644 --- a/hist.c +++ b/hist.c @@ -1,7 +1,7 @@ /* ****************************************************************** * hist : Histogram functions * part of Finite State Entropy project - * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * * You can contact the author at : * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy diff --git a/hist.h b/hist.h index 245be35..f68ff17 100644 --- a/hist.h +++ b/hist.h @@ -1,7 +1,7 @@ /* ****************************************************************** * hist : Histogram functions * part of Finite State Entropy project - * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * * You can contact the author at : * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy diff --git a/huf.h b/huf.h index 1afef90..a3ae06e 100644 --- a/huf.h +++ b/huf.h @@ -1,7 +1,7 @@ /* ****************************************************************** * huff0 huffman codec, * part of Finite State Entropy library - * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * * You can contact the author at : * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy @@ -89,9 +89,9 @@ HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity, /** HUF_compress4X_wksp() : * Same as HUF_compress2(), but uses externally allocated `workSpace`. - * `workspace` must have minimum alignment of 4, and be at least as large as HUF_WORKSPACE_SIZE */ -#define HUF_WORKSPACE_SIZE ((6 << 10) + 256) -#define HUF_WORKSPACE_SIZE_U32 (HUF_WORKSPACE_SIZE / sizeof(U32)) + * `workspace` must be at least as large as HUF_WORKSPACE_SIZE */ +#define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */) +#define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64)) HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, @@ -136,15 +136,11 @@ HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, /* static allocation of HUF's Compression Table */ /* this is a private definition, just exposed for allocation and strict aliasing purpose. never EVER access its members directly */ -struct HUF_CElt_s { - U16 val; - BYTE nbBits; -}; /* typedef'd to HUF_CElt */ -typedef struct HUF_CElt_s HUF_CElt; /* consider it an incomplete type */ -#define HUF_CTABLE_SIZE_U32(maxSymbolValue) ((maxSymbolValue)+1) /* Use tables of U32, for proper alignment */ -#define HUF_CTABLE_SIZE(maxSymbolValue) (HUF_CTABLE_SIZE_U32(maxSymbolValue) * sizeof(U32)) +typedef size_t HUF_CElt; /* consider it an incomplete type */ +#define HUF_CTABLE_SIZE_ST(maxSymbolValue) ((maxSymbolValue)+2) /* Use tables of size_t, for proper alignment */ +#define HUF_CTABLE_SIZE(maxSymbolValue) (HUF_CTABLE_SIZE_ST(maxSymbolValue) * sizeof(size_t)) #define HUF_CREATE_STATIC_CTABLE(name, maxSymbolValue) \ - HUF_CElt name[HUF_CTABLE_SIZE_U32(maxSymbolValue)] /* no final ; */ + HUF_CElt name[HUF_CTABLE_SIZE_ST(maxSymbolValue)] /* no final ; */ /* static allocation of HUF's DTable */ typedef U32 HUF_DTable; @@ -192,7 +188,9 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */ size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog); +size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize); size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); @@ -205,12 +203,13 @@ typedef enum { * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. * If it uses hufTable it does not modify hufTable or repeat. * If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used. - * If preferRepeat then the old table will always be used if valid. */ + * If preferRepeat then the old table will always be used if valid. + * If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */ size_t HUF_compress4X_repeat(void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize, /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ - HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2); + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); /** HUF_buildCTable_wksp() : * Same as HUF_buildCTable(), but using externally allocated scratch buffer. @@ -248,11 +247,10 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, * Loading a CTable saved with HUF_writeCTable() */ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned *hasZeroWeights); -/** HUF_getNbBits() : +/** HUF_getNbBitsFromCTable() : * Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX - * Note 1 : is not inlined, as HUF_CElt definition is private - * Note 2 : const void* used, so that it can provide a statically allocated table as argument (which uses type U32) */ -U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue); + * Note 1 : is not inlined, as HUF_CElt definition is private */ +U32 HUF_getNbBitsFromCTable(const HUF_CElt* symbolTable, U32 symbolValue); /* * HUF_decompress() does the following: @@ -278,7 +276,7 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize); * a required workspace size greater than that specified in the following * macro. */ -#define HUF_DECOMPRESS_WORKSPACE_SIZE (2 << 10) +#define HUF_DECOMPRESS_WORKSPACE_SIZE ((2 << 10) + (1 << 9)) #define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32)) #ifndef HUF_FORCE_DECOMPRESS_X2 @@ -304,18 +302,20 @@ size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* c /* ====================== */ size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); -size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */ +size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U64 U64 */ size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); /** HUF_compress1X_repeat() : * Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. * If it uses hufTable it does not modify hufTable or repeat. * If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used. - * If preferRepeat then the old table will always be used if valid. */ + * If preferRepeat then the old table will always be used if valid. + * If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */ size_t HUF_compress1X_repeat(void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize, /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ - HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2); + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */ #ifndef HUF_FORCE_DECOMPRESS_X1 diff --git a/huf_compress.c b/huf_compress.c index fed76ca..5614e21 100644 --- a/huf_compress.c +++ b/huf_compress.c @@ -1,6 +1,6 @@ /* ****************************************************************** * Huffman encoder, part of New Generation Entropy library - * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * * You can contact the author at : * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy @@ -53,13 +53,43 @@ unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxS /* ******************************************************* * HUF : Huffman block compression *********************************************************/ +#define HUF_WORKSPACE_MAX_ALIGNMENT 8 + +static void* HUF_alignUpWorkspace(void* workspace, size_t* workspaceSizePtr, size_t align) +{ + size_t const mask = align - 1; + size_t const rem = (size_t)workspace & mask; + size_t const add = (align - rem) & mask; + BYTE* const aligned = (BYTE*)workspace + add; + assert((align & (align - 1)) == 0); /* pow 2 */ + assert(align <= HUF_WORKSPACE_MAX_ALIGNMENT); + if (*workspaceSizePtr >= add) { + assert(add < align); + assert(((size_t)aligned & mask) == 0); + *workspaceSizePtr -= add; + return aligned; + } else { + *workspaceSizePtr = 0; + return NULL; + } +} + + /* HUF_compressWeights() : * Same as FSE_compress(), but dedicated to huff0's weights compression. * The use case needs much less stack memory. * Note : all elements within weightTable are supposed to be <= HUF_TABLELOG_MAX. */ #define MAX_FSE_TABLELOG_FOR_HUFF_HEADER 6 -static size_t HUF_compressWeights (void* dst, size_t dstSize, const void* weightTable, size_t wtSize) + +typedef struct { + FSE_CTable CTable[FSE_CTABLE_SIZE_U32(MAX_FSE_TABLELOG_FOR_HUFF_HEADER, HUF_TABLELOG_MAX)]; + U32 scratchBuffer[FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(HUF_TABLELOG_MAX, MAX_FSE_TABLELOG_FOR_HUFF_HEADER)]; + unsigned count[HUF_TABLELOG_MAX+1]; + S16 norm[HUF_TABLELOG_MAX+1]; +} HUF_CompressWeightsWksp; + +static size_t HUF_compressWeights(void* dst, size_t dstSize, const void* weightTable, size_t wtSize, void* workspace, size_t workspaceSize) { BYTE* const ostart = (BYTE*) dst; BYTE* op = ostart; @@ -67,33 +97,30 @@ static size_t HUF_compressWeights (void* dst, size_t dstSize, const void* weight unsigned maxSymbolValue = HUF_TABLELOG_MAX; U32 tableLog = MAX_FSE_TABLELOG_FOR_HUFF_HEADER; + HUF_CompressWeightsWksp* wksp = (HUF_CompressWeightsWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, sizeof(U32)); - FSE_CTable CTable[FSE_CTABLE_SIZE_U32(MAX_FSE_TABLELOG_FOR_HUFF_HEADER, HUF_TABLELOG_MAX)]; - U32 scratchBuffer[FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(HUF_TABLELOG_MAX, MAX_FSE_TABLELOG_FOR_HUFF_HEADER)]; - - unsigned count[HUF_TABLELOG_MAX+1]; - S16 norm[HUF_TABLELOG_MAX+1]; + if (workspaceSize < sizeof(HUF_CompressWeightsWksp)) return ERROR(GENERIC); /* init conditions */ if (wtSize <= 1) return 0; /* Not compressible */ /* Scan input and build symbol stats */ - { unsigned const maxCount = HIST_count_simple(count, &maxSymbolValue, weightTable, wtSize); /* never fails */ + { unsigned const maxCount = HIST_count_simple(wksp->count, &maxSymbolValue, weightTable, wtSize); /* never fails */ if (maxCount == wtSize) return 1; /* only a single symbol in src : rle */ if (maxCount == 1) return 0; /* each symbol present maximum once => not compressible */ } tableLog = FSE_optimalTableLog(tableLog, wtSize, maxSymbolValue); - CHECK_F( FSE_normalizeCount(norm, tableLog, count, wtSize, maxSymbolValue, /* useLowProbCount */ 0) ); + CHECK_F( FSE_normalizeCount(wksp->norm, tableLog, wksp->count, wtSize, maxSymbolValue, /* useLowProbCount */ 0) ); /* Write table description header */ - { CHECK_V_F(hSize, FSE_writeNCount(op, (size_t)(oend-op), norm, maxSymbolValue, tableLog) ); + { CHECK_V_F(hSize, FSE_writeNCount(op, (size_t)(oend-op), wksp->norm, maxSymbolValue, tableLog) ); op += hSize; } /* Compress */ - CHECK_F( FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, scratchBuffer, sizeof(scratchBuffer)) ); - { CHECK_V_F(cSize, FSE_compress_usingCTable(op, (size_t)(oend - op), weightTable, wtSize, CTable) ); + CHECK_F( FSE_buildCTable_wksp(wksp->CTable, wksp->norm, maxSymbolValue, tableLog, wksp->scratchBuffer, sizeof(wksp->scratchBuffer)) ); + { CHECK_V_F(cSize, FSE_compress_usingCTable(op, (size_t)(oend - op), weightTable, wtSize, wksp->CTable) ); if (cSize == 0) return 0; /* not enough space for compressed data */ op += cSize; } @@ -101,30 +128,70 @@ static size_t HUF_compressWeights (void* dst, size_t dstSize, const void* weight return (size_t)(op-ostart); } +static size_t HUF_getNbBits(HUF_CElt elt) +{ + return elt & 0xFF; +} -/*! HUF_writeCTable() : - `CTable` : Huffman tree to save, using huf representation. - @return : size of saved CTable */ -size_t HUF_writeCTable (void* dst, size_t maxDstSize, - const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog) +static size_t HUF_getNbBitsFast(HUF_CElt elt) { + return elt; +} + +static size_t HUF_getValue(HUF_CElt elt) +{ + return elt & ~0xFF; +} + +static size_t HUF_getValueFast(HUF_CElt elt) +{ + return elt; +} + +static void HUF_setNbBits(HUF_CElt* elt, size_t nbBits) +{ + assert(nbBits <= HUF_TABLELOG_ABSOLUTEMAX); + *elt = nbBits; +} + +static void HUF_setValue(HUF_CElt* elt, size_t value) +{ + size_t const nbBits = HUF_getNbBits(*elt); + if (nbBits > 0) { + assert((value >> nbBits) == 0); + *elt |= value << (sizeof(HUF_CElt) * 8 - nbBits); + } +} + +typedef struct { + HUF_CompressWeightsWksp wksp; BYTE bitsToWeight[HUF_TABLELOG_MAX + 1]; /* precomputed conversion table */ BYTE huffWeight[HUF_SYMBOLVALUE_MAX]; +} HUF_WriteCTableWksp; + +size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, + const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, + void* workspace, size_t workspaceSize) +{ + HUF_CElt const* const ct = CTable + 1; BYTE* op = (BYTE*)dst; U32 n; + HUF_WriteCTableWksp* wksp = (HUF_WriteCTableWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, sizeof(U32)); - /* check conditions */ + /* check conditions */ + if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC); if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge); /* convert to weight */ - bitsToWeight[0] = 0; + wksp->bitsToWeight[0] = 0; for (n=1; nbitsToWeight[n] = (BYTE)(huffLog + 1 - n); for (n=0; nhuffWeight[n] = wksp->bitsToWeight[HUF_getNbBits(ct[n])]; /* attempt weights compression by FSE */ - { CHECK_V_F(hSize, HUF_compressWeights(op+1, maxDstSize-1, huffWeight, maxSymbolValue) ); + if (maxDstSize < 1) return ERROR(dstSize_tooSmall); + { CHECK_V_F(hSize, HUF_compressWeights(op+1, maxDstSize-1, wksp->huffWeight, maxSymbolValue, &wksp->wksp, sizeof(wksp->wksp)) ); if ((hSize>1) & (hSize < maxSymbolValue/2)) { /* FSE compressed */ op[0] = (BYTE)hSize; return hSize+1; @@ -134,12 +201,22 @@ size_t HUF_writeCTable (void* dst, size_t maxDstSize, if (maxSymbolValue > (256-128)) return ERROR(GENERIC); /* should not happen : likely means source cannot be compressed */ if (((maxSymbolValue+1)/2) + 1 > maxDstSize) return ERROR(dstSize_tooSmall); /* not enough space within dst buffer */ op[0] = (BYTE)(128 /*special case*/ + (maxSymbolValue-1)); - huffWeight[maxSymbolValue] = 0; /* to be sure it doesn't cause msan issue in final combination */ + wksp->huffWeight[maxSymbolValue] = 0; /* to be sure it doesn't cause msan issue in final combination */ for (n=0; nhuffWeight[n] << 4) + wksp->huffWeight[n+1]); return ((maxSymbolValue+1)/2) + 1; } +/*! HUF_writeCTable() : + `CTable` : Huffman tree to save, using huf representation. + @return : size of saved CTable */ +size_t HUF_writeCTable (void* dst, size_t maxDstSize, + const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog) +{ + HUF_WriteCTableWksp wksp; + return HUF_writeCTable_wksp(dst, maxDstSize, CTable, maxSymbolValue, huffLog, &wksp, sizeof(wksp)); +} + size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights) { @@ -147,6 +224,7 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; /* large enough for values from 0 to 16 */ U32 tableLog = 0; U32 nbSymbols = 0; + HUF_CElt* const ct = CTable + 1; /* get symbol weights */ CHECK_V_F(readSize, HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX+1, rankVal, &nbSymbols, &tableLog, src, srcSize)); @@ -156,6 +234,8 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall); + CTable[0] = tableLog; + /* Prepare base value per rank */ { U32 n, nextRankStart = 0; for (n=1; n<=tableLog; n++) { @@ -167,13 +247,13 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void /* fill nbBits */ { U32 n; for (n=0; nn=tableLog+1 */ U16 valPerRank[HUF_TABLELOG_MAX+2] = {0}; - { U32 n; for (n=0; n>= 1; } } /* assign value within rank, symbol order */ - { U32 n; for (n=0; n huffNode[i-1].count) { + return 0; + } + } + return 1; +} + +/* Insertion sort by descending order */ +HINT_INLINE void HUF_insertionSort(nodeElt huffNode[], int const low, int const high) { + int i; + int const size = high-low+1; + huffNode += low; + for (i = 1; i < size; ++i) { + nodeElt const key = huffNode[i]; + int j = i - 1; + while (j >= 0 && huffNode[j].count < key.count) { + huffNode[j + 1] = huffNode[j]; + j--; + } + huffNode[j + 1] = key; + } +} + +/* Pivot helper function for quicksort. */ +static int HUF_quickSortPartition(nodeElt arr[], int const low, int const high) { + /* Simply select rightmost element as pivot. "Better" selectors like + * median-of-three don't experimentally appear to have any benefit. + */ + U32 const pivot = arr[high].count; + int i = low - 1; + int j = low; + for ( ; j < high; j++) { + if (arr[j].count > pivot) { + i++; + HUF_swapNodes(&arr[i], &arr[j]); + } + } + HUF_swapNodes(&arr[i + 1], &arr[high]); + return i + 1; +} + +/* Classic quicksort by descending with partially iterative calls + * to reduce worst case callstack size. + */ +static void HUF_simpleQuickSort(nodeElt arr[], int low, int high) { + int const kInsertionSortThreshold = 8; + if (high - low < kInsertionSortThreshold) { + HUF_insertionSort(arr, low, high); + return; + } + while (low < high) { + int const idx = HUF_quickSortPartition(arr, low, high); + if (idx - low < high - idx) { + HUF_simpleQuickSort(arr, low, idx - 1); + low = idx + 1; + } else { + HUF_simpleQuickSort(arr, idx + 1, high); + high = idx - 1; + } + } +} + /** * HUF_sort(): * Sorts the symbols [0, maxSymbolValue] by count[symbol] in decreasing order. + * This is a typical bucket sorting strategy that uses either quicksort or insertion sort to sort each bucket. * * @param[out] huffNode Sorted symbols by decreasing count. Only members `.count` and `.byte` are filled. * Must have (maxSymbolValue + 1) entries. @@ -371,44 +551,52 @@ typedef struct { * @param[in] maxSymbolValue Maximum symbol value. * @param rankPosition This is a scratch workspace. Must have RANK_POSITION_TABLE_SIZE entries. */ -static void HUF_sort(nodeElt* huffNode, const unsigned* count, U32 maxSymbolValue, rankPos* rankPosition) -{ - int n; - int const maxSymbolValue1 = (int)maxSymbolValue + 1; +static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSymbolValue, rankPos rankPosition[]) { + U32 n; + U32 const maxSymbolValue1 = maxSymbolValue+1; /* Compute base and set curr to base. - * For symbol s let lowerRank = BIT_highbit32(count[n]+1) and rank = lowerRank + 1. - * Then 2^lowerRank <= count[n]+1 <= 2^rank. + * For symbol s let lowerRank = HUF_getIndex(count[n]) and rank = lowerRank + 1. + * See HUF_getIndex to see bucketing strategy. * We attribute each symbol to lowerRank's base value, because we want to know where * each rank begins in the output, so for rank R we want to count ranks R+1 and above. */ ZSTD_memset(rankPosition, 0, sizeof(*rankPosition) * RANK_POSITION_TABLE_SIZE); for (n = 0; n < maxSymbolValue1; ++n) { - U32 lowerRank = BIT_highbit32(count[n] + 1); + U32 lowerRank = HUF_getIndex(count[n]); + assert(lowerRank < RANK_POSITION_TABLE_SIZE - 1); rankPosition[lowerRank].base++; } + assert(rankPosition[RANK_POSITION_TABLE_SIZE - 1].base == 0); + /* Set up the rankPosition table */ for (n = RANK_POSITION_TABLE_SIZE - 1; n > 0; --n) { rankPosition[n-1].base += rankPosition[n].base; rankPosition[n-1].curr = rankPosition[n-1].base; } - /* Sort */ + + /* Insert each symbol into their appropriate bucket, setting up rankPosition table. */ for (n = 0; n < maxSymbolValue1; ++n) { U32 const c = count[n]; - U32 const r = BIT_highbit32(c+1) + 1; - U32 pos = rankPosition[r].curr++; - /* Insert into the correct position in the rank. - * We have at most 256 symbols, so this insertion should be fine. - */ - while ((pos > rankPosition[r].base) && (c > huffNode[pos-1].count)) { - huffNode[pos] = huffNode[pos-1]; - pos--; - } + U32 const r = HUF_getIndex(c) + 1; + U32 const pos = rankPosition[r].curr++; + assert(pos < maxSymbolValue1); huffNode[pos].count = c; huffNode[pos].byte = (BYTE)n; } -} + /* Sort each bucket. */ + for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) { + U32 const bucketSize = rankPosition[n].curr-rankPosition[n].base; + U32 const bucketStartIdx = rankPosition[n].base; + if (bucketSize > 1) { + assert(bucketStartIdx < maxSymbolValue1); + HUF_simpleQuickSort(huffNode + bucketStartIdx, 0, bucketSize-1); + } + } + + assert(HUF_isSorted(huffNode, maxSymbolValue1)); +} /** HUF_buildCTable_wksp() : * Same as HUF_buildCTable(), but using externally allocated scratch buffer. @@ -471,6 +659,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) */ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, int nonNullRank, U32 maxSymbolValue, U32 maxNbBits) { + HUF_CElt* const ct = CTable + 1; /* fill result into ctable (val, nbBits) */ int n; U16 nbPerRank[HUF_TABLELOG_MAX+1] = {0}; @@ -486,20 +675,20 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i min >>= 1; } } for (n=0; nhuffNodeTbl; nodeElt* const huffNode = huffNode0+1; int nonNullRank; /* safety checks */ - if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */ if (wkspSize < sizeof(HUF_buildCTable_wksp_tables)) return ERROR(workSpace_tooSmall); if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT; @@ -517,91 +706,327 @@ size_t HUF_buildCTable_wksp (HUF_CElt* tree, const unsigned* count, U32 maxSymbo maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits); if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */ - HUF_buildCTableFromTree(tree, huffNode, nonNullRank, maxSymbolValue, maxNbBits); + HUF_buildCTableFromTree(CTable, huffNode, nonNullRank, maxSymbolValue, maxNbBits); return maxNbBits; } size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) { + HUF_CElt const* ct = CTable + 1; size_t nbBits = 0; int s; for (s = 0; s <= (int)maxSymbolValue; ++s) { - nbBits += CTable[s].nbBits * count[s]; + nbBits += HUF_getNbBits(ct[s]) * count[s]; } return nbBits >> 3; } int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) { + HUF_CElt const* ct = CTable + 1; int bad = 0; int s; for (s = 0; s <= (int)maxSymbolValue; ++s) { - bad |= (count[s] != 0) & (CTable[s].nbBits == 0); + bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); } return !bad; } size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); } +/** HUF_CStream_t: + * Huffman uses its own BIT_CStream_t implementation. + * There are three major differences from BIT_CStream_t: + * 1. HUF_addBits() takes a HUF_CElt (size_t) which is + * the pair (nbBits, value) in the format: + * format: + * - Bits [0, 4) = nbBits + * - Bits [4, 64 - nbBits) = 0 + * - Bits [64 - nbBits, 64) = value + * 2. The bitContainer is built from the upper bits and + * right shifted. E.g. to add a new value of N bits + * you right shift the bitContainer by N, then or in + * the new value into the N upper bits. + * 3. The bitstream has two bit containers. You can add + * bits to the second container and merge them into + * the first container. + */ + +#define HUF_BITS_IN_CONTAINER (sizeof(size_t) * 8) + +typedef struct { + size_t bitContainer[2]; + size_t bitPos[2]; + + BYTE* startPtr; + BYTE* ptr; + BYTE* endPtr; +} HUF_CStream_t; + +/**! HUF_initCStream(): + * Initializes the bistream. + * @returns 0 or an error code. + */ +static size_t HUF_initCStream(HUF_CStream_t* bitC, + void* startPtr, size_t dstCapacity) +{ + ZSTD_memset(bitC, 0, sizeof(*bitC)); + bitC->startPtr = (BYTE*)startPtr; + bitC->ptr = bitC->startPtr; + bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer[0]); + if (dstCapacity <= sizeof(bitC->bitContainer[0])) return ERROR(dstSize_tooSmall); + return 0; +} + +/*! HUF_addBits(): + * Adds the symbol stored in HUF_CElt elt to the bitstream. + * + * @param elt The element we're adding. This is a (nbBits, value) pair. + * See the HUF_CStream_t docs for the format. + * @param idx Insert into the bistream at this idx. + * @param kFast This is a template parameter. If the bitstream is guaranteed + * to have at least 4 unused bits after this call it may be 1, + * otherwise it must be 0. HUF_addBits() is faster when fast is set. + */ +FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int idx, int kFast) +{ + assert(idx <= 1); + assert(HUF_getNbBits(elt) <= HUF_TABLELOG_ABSOLUTEMAX); + /* This is efficient on x86-64 with BMI2 because shrx + * only reads the low 6 bits of the register. The compiler + * knows this and elides the mask. When fast is set, + * every operation can use the same value loaded from elt. + */ + bitC->bitContainer[idx] >>= HUF_getNbBits(elt); + bitC->bitContainer[idx] |= kFast ? HUF_getValueFast(elt) : HUF_getValue(elt); + /* We only read the low 8 bits of bitC->bitPos[idx] so it + * doesn't matter that the high bits have noise from the value. + */ + bitC->bitPos[idx] += HUF_getNbBitsFast(elt); + assert((bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER); + /* The last 4-bits of elt are dirty if fast is set, + * so we must not be overwriting bits that have already been + * inserted into the bit container. + */ +#if DEBUGLEVEL >= 1 + { + size_t const nbBits = HUF_getNbBits(elt); + size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1; + /* Middle bits are 0. */ + assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0); + /* We didn't overwrite any bits in the bit container. */ + assert(!kFast || (bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER); + } +#endif +} + +FORCE_INLINE_TEMPLATE void HUF_zeroIndex1(HUF_CStream_t* bitC) +{ + bitC->bitContainer[1] = 0; + bitC->bitPos[1] = 0; +} + +/*! HUF_mergeIndex1() : + * Merges the bit container @ index 1 into the bit container @ index 0 + * and zeros the bit container @ index 1. + */ +FORCE_INLINE_TEMPLATE void HUF_mergeIndex1(HUF_CStream_t* bitC) +{ + assert((bitC->bitPos[1] & 0xFF) < HUF_BITS_IN_CONTAINER); + bitC->bitContainer[0] >>= (bitC->bitPos[1] & 0xFF); + bitC->bitContainer[0] |= bitC->bitContainer[1]; + bitC->bitPos[0] += bitC->bitPos[1]; + assert((bitC->bitPos[0] & 0xFF) <= HUF_BITS_IN_CONTAINER); +} + +/*! HUF_flushBits() : +* Flushes the bits in the bit container @ index 0. +* +* @post bitPos will be < 8. +* @param kFast If kFast is set then we must know a-priori that +* the bit container will not overflow. +*/ +FORCE_INLINE_TEMPLATE void HUF_flushBits(HUF_CStream_t* bitC, int kFast) +{ + /* The upper bits of bitPos are noisy, so we must mask by 0xFF. */ + size_t const nbBits = bitC->bitPos[0] & 0xFF; + size_t const nbBytes = nbBits >> 3; + /* The top nbBits bits of bitContainer are the ones we need. */ + size_t const bitContainer = bitC->bitContainer[0] >> (HUF_BITS_IN_CONTAINER - nbBits); + /* Mask bitPos to account for the bytes we consumed. */ + bitC->bitPos[0] &= 7; + assert(nbBits > 0); + assert(nbBits <= sizeof(bitC->bitContainer[0]) * 8); + assert(bitC->ptr <= bitC->endPtr); + MEM_writeLEST(bitC->ptr, bitContainer); + bitC->ptr += nbBytes; + assert(!kFast || bitC->ptr <= bitC->endPtr); + if (!kFast && bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr; + /* bitContainer doesn't need to be modified because the leftover + * bits are already the top bitPos bits. And we don't care about + * noise in the lower values. + */ +} + +/*! HUF_endMark() + * @returns The Huffman stream end mark: A 1-bit value = 1. + */ +static HUF_CElt HUF_endMark(void) +{ + HUF_CElt endMark; + HUF_setNbBits(&endMark, 1); + HUF_setValue(&endMark, 1); + return endMark; +} + +/*! HUF_closeCStream() : + * @return Size of CStream, in bytes, + * or 0 if it could not fit into dstBuffer */ +static size_t HUF_closeCStream(HUF_CStream_t* bitC) +{ + HUF_addBits(bitC, HUF_endMark(), /* idx */ 0, /* kFast */ 0); + HUF_flushBits(bitC, /* kFast */ 0); + { + size_t const nbBits = bitC->bitPos[0] & 0xFF; + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ + return (bitC->ptr - bitC->startPtr) + (nbBits > 0); + } +} + FORCE_INLINE_TEMPLATE void -HUF_encodeSymbol(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable) +HUF_encodeSymbol(HUF_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable, int idx, int fast) { - BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits); + HUF_addBits(bitCPtr, CTable[symbol], idx, fast); } -#define HUF_FLUSHBITS(s) BIT_flushBits(s) +FORCE_INLINE_TEMPLATE void +HUF_compress1X_usingCTable_internal_body_loop(HUF_CStream_t* bitC, + const BYTE* ip, size_t srcSize, + const HUF_CElt* ct, + int kUnroll, int kFastFlush, int kLastFast) +{ + /* Join to kUnroll */ + int n = (int)srcSize; + int rem = n % kUnroll; + if (rem > 0) { + for (; rem > 0; --rem) { + HUF_encodeSymbol(bitC, ip[--n], ct, 0, /* fast */ 0); + } + HUF_flushBits(bitC, kFastFlush); + } + assert(n % kUnroll == 0); -#define HUF_FLUSHBITS_1(stream) \ - if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*2+7) HUF_FLUSHBITS(stream) + /* Join to 2 * kUnroll */ + if (n % (2 * kUnroll)) { + int u; + for (u = 1; u < kUnroll; ++u) { + HUF_encodeSymbol(bitC, ip[n - u], ct, 0, 1); + } + HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, 0, kLastFast); + HUF_flushBits(bitC, kFastFlush); + n -= kUnroll; + } + assert(n % (2 * kUnroll) == 0); + + for (; n>0; n-= 2 * kUnroll) { + /* Encode kUnroll symbols into the bitstream @ index 0. */ + int u; + for (u = 1; u < kUnroll; ++u) { + HUF_encodeSymbol(bitC, ip[n - u], ct, /* idx */ 0, /* fast */ 1); + } + HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, /* idx */ 0, /* fast */ kLastFast); + HUF_flushBits(bitC, kFastFlush); + /* Encode kUnroll symbols into the bitstream @ index 1. + * This allows us to start filling the bit container + * without any data dependencies. + */ + HUF_zeroIndex1(bitC); + for (u = 1; u < kUnroll; ++u) { + HUF_encodeSymbol(bitC, ip[n - kUnroll - u], ct, /* idx */ 1, /* fast */ 1); + } + HUF_encodeSymbol(bitC, ip[n - kUnroll - kUnroll], ct, /* idx */ 1, /* fast */ kLastFast); + /* Merge bitstream @ index 1 into the bitstream @ index 0 */ + HUF_mergeIndex1(bitC); + HUF_flushBits(bitC, kFastFlush); + } + assert(n == 0); + +} + +/** + * Returns a tight upper bound on the output space needed by Huffman + * with 8 bytes buffer to handle over-writes. If the output is at least + * this large we don't need to do bounds checks during Huffman encoding. + */ +static size_t HUF_tightCompressBound(size_t srcSize, size_t tableLog) +{ + return ((srcSize * tableLog) >> 3) + 8; +} -#define HUF_FLUSHBITS_2(stream) \ - if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*4+7) HUF_FLUSHBITS(stream) FORCE_INLINE_TEMPLATE size_t HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) { + U32 const tableLog = (U32)CTable[0]; + HUF_CElt const* ct = CTable + 1; const BYTE* ip = (const BYTE*) src; BYTE* const ostart = (BYTE*)dst; BYTE* const oend = ostart + dstSize; BYTE* op = ostart; - size_t n; - BIT_CStream_t bitC; + HUF_CStream_t bitC; /* init */ if (dstSize < 8) return 0; /* not enough space to compress */ - { size_t const initErr = BIT_initCStream(&bitC, op, (size_t)(oend-op)); + { size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); if (HUF_isError(initErr)) return 0; } - n = srcSize & ~3; /* join to mod 4 */ - switch (srcSize & 3) - { - case 3 : HUF_encodeSymbol(&bitC, ip[n+ 2], CTable); - HUF_FLUSHBITS_2(&bitC); - /* fall-through */ - case 2 : HUF_encodeSymbol(&bitC, ip[n+ 1], CTable); - HUF_FLUSHBITS_1(&bitC); - /* fall-through */ - case 1 : HUF_encodeSymbol(&bitC, ip[n+ 0], CTable); - HUF_FLUSHBITS(&bitC); - /* fall-through */ - case 0 : /* fall-through */ - default: break; - } - - for (; n>0; n-=4) { /* note : n&3==0 at this stage */ - HUF_encodeSymbol(&bitC, ip[n- 1], CTable); - HUF_FLUSHBITS_1(&bitC); - HUF_encodeSymbol(&bitC, ip[n- 2], CTable); - HUF_FLUSHBITS_2(&bitC); - HUF_encodeSymbol(&bitC, ip[n- 3], CTable); - HUF_FLUSHBITS_1(&bitC); - HUF_encodeSymbol(&bitC, ip[n- 4], CTable); - HUF_FLUSHBITS(&bitC); + if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tableLog > 11) + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ MEM_32bits() ? 2 : 4, /* kFast */ 0, /* kLastFast */ 0); + else { + if (MEM_32bits()) { + switch (tableLog) { + case 11: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 10: + case 9: + case 8: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + case 7: + default: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 3, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + } + } else { + switch (tableLog) { + case 11: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 10: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + case 9: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 6, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 8: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 7, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 7: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 8, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 6: + default: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 9, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + } + } } + assert(bitC.ptr <= bitC.endPtr); - return BIT_closeCStream(&bitC); + return HUF_closeCStream(&bitC); } #if DYNAMIC_BMI2 @@ -648,9 +1073,13 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) { - return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); + return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); } +size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) +{ + return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); +} static size_t HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, @@ -670,8 +1099,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, assert(op <= oend); { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); - if (cSize==0) return 0; - assert(cSize <= 65535); + if (cSize == 0 || cSize > 65535) return 0; MEM_writeLE16(ostart, (U16)cSize); op += cSize; } @@ -679,8 +1107,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, ip += segmentSize; assert(op <= oend); { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); - if (cSize==0) return 0; - assert(cSize <= 65535); + if (cSize == 0 || cSize > 65535) return 0; MEM_writeLE16(ostart+2, (U16)cSize); op += cSize; } @@ -688,8 +1115,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, ip += segmentSize; assert(op <= oend); { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); - if (cSize==0) return 0; - assert(cSize <= 65535); + if (cSize == 0 || cSize > 65535) return 0; MEM_writeLE16(ostart+4, (U16)cSize); op += cSize; } @@ -698,7 +1124,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, assert(op <= oend); assert(ip <= iend); { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) ); - if (cSize==0) return 0; + if (cSize == 0 || cSize > 65535) return 0; op += cSize; } @@ -707,7 +1133,12 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) { - return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); + return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +} + +size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) +{ + return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); } typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; @@ -731,32 +1162,38 @@ static size_t HUF_compressCTable_internal( typedef struct { unsigned count[HUF_SYMBOLVALUE_MAX + 1]; - HUF_CElt CTable[HUF_SYMBOLVALUE_MAX + 1]; - HUF_buildCTable_wksp_tables buildCTable_wksp; + HUF_CElt CTable[HUF_CTABLE_SIZE_ST(HUF_SYMBOLVALUE_MAX)]; + union { + HUF_buildCTable_wksp_tables buildCTable_wksp; + HUF_WriteCTableWksp writeCTable_wksp; + U32 hist_wksp[HIST_WKSP_SIZE_U32]; + } wksps; } HUF_compress_tables_t; +#define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096 +#define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */ + /* HUF_compress_internal() : * `workSpace_align4` must be aligned on 4-bytes boundaries, - * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U32 unsigned */ + * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */ static size_t HUF_compress_internal (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, HUF_nbStreams_e nbStreams, - void* workSpace_align4, size_t wkspSize, + void* workSpace, size_t wkspSize, HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat, - const int bmi2) + const int bmi2, unsigned suspectUncompressible) { - HUF_compress_tables_t* const table = (HUF_compress_tables_t*)workSpace_align4; + HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, sizeof(size_t)); BYTE* const ostart = (BYTE*)dst; BYTE* const oend = ostart + dstSize; BYTE* op = ostart; - HUF_STATIC_ASSERT(sizeof(*table) <= HUF_WORKSPACE_SIZE); - assert(((size_t)workSpace_align4 & 3) == 0); /* must be aligned on 4-bytes boundaries */ + HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE); /* checks & inits */ - if (wkspSize < HUF_WORKSPACE_SIZE) return ERROR(workSpace_tooSmall); + if (wkspSize < sizeof(*table)) return ERROR(workSpace_tooSmall); if (!srcSize) return 0; /* Uncompressed */ if (!dstSize) return 0; /* cannot fit anything within dst budget */ if (srcSize > HUF_BLOCKSIZE_MAX) return ERROR(srcSize_wrong); /* current block size limit */ @@ -772,8 +1209,23 @@ HUF_compress_internal (void* dst, size_t dstSize, nbStreams, oldHufTable, bmi2); } + /* If uncompressible data is suspected, do a smaller sampling first */ + DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2); + if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { + size_t largestTotal = 0; + { unsigned maxSymbolValueBegin = maxSymbolValue; + CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal += largestBegin; + } + { unsigned maxSymbolValueEnd = maxSymbolValue; + CHECK_V_F(largestEnd, HIST_count_simple (table->count, &maxSymbolValueEnd, (const BYTE*)src + srcSize - SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal += largestEnd; + } + if (largestTotal <= ((2 * SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) >> 7)+4) return 0; /* heuristic : probably not compressible enough */ + } + /* Scan input and build symbol stats */ - { CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, workSpace_align4, wkspSize) ); + { CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, table->wksps.hist_wksp, sizeof(table->wksps.hist_wksp)) ); if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */ if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */ } @@ -795,16 +1247,20 @@ HUF_compress_internal (void* dst, size_t dstSize, huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); { size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count, maxSymbolValue, huffLog, - &table->buildCTable_wksp, sizeof(table->buildCTable_wksp)); + &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp)); CHECK_F(maxBits); huffLog = (U32)maxBits; - /* Zero unused symbols in CTable, so we can check it for validity */ - ZSTD_memset(table->CTable + (maxSymbolValue + 1), 0, - sizeof(table->CTable) - ((maxSymbolValue + 1) * sizeof(HUF_CElt))); + } + /* Zero unused symbols in CTable, so we can check it for validity */ + { + size_t const ctableSize = HUF_CTABLE_SIZE_ST(maxSymbolValue); + size_t const unusedSize = sizeof(table->CTable) - ctableSize * sizeof(HUF_CElt); + ZSTD_memset(table->CTable + ctableSize, 0, unusedSize); } /* Write table description header */ - { CHECK_V_F(hSize, HUF_writeCTable (op, dstSize, table->CTable, maxSymbolValue, huffLog) ); + { CHECK_V_F(hSize, HUF_writeCTable_wksp(op, dstSize, table->CTable, maxSymbolValue, huffLog, + &table->wksps.writeCTable_wksp, sizeof(table->wksps.writeCTable_wksp)) ); /* Check if using previous huffman table is beneficial */ if (repeat && *repeat != HUF_repeat_none) { size_t const oldSize = HUF_estimateCompressedSize(oldHufTable, table->count, maxSymbolValue); @@ -836,19 +1292,20 @@ size_t HUF_compress1X_wksp (void* dst, size_t dstSize, return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, HUF_singleStream, workSpace, wkspSize, - NULL, NULL, 0, 0 /*bmi2*/); + NULL, NULL, 0, 0 /*bmi2*/, 0); } size_t HUF_compress1X_repeat (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, void* workSpace, size_t wkspSize, - HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2) + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, + int bmi2, unsigned suspectUncompressible) { return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, HUF_singleStream, workSpace, wkspSize, hufTable, - repeat, preferRepeat, bmi2); + repeat, preferRepeat, bmi2, suspectUncompressible); } /* HUF_compress4X_repeat(): @@ -862,22 +1319,23 @@ size_t HUF_compress4X_wksp (void* dst, size_t dstSize, return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, HUF_fourStreams, workSpace, wkspSize, - NULL, NULL, 0, 0 /*bmi2*/); + NULL, NULL, 0, 0 /*bmi2*/, 0); } /* HUF_compress4X_repeat(): * compress input using 4 streams. + * consider skipping quickly * re-use an existing huffman compression table */ size_t HUF_compress4X_repeat (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, void* workSpace, size_t wkspSize, - HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2) + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible) { return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, HUF_fourStreams, workSpace, wkspSize, - hufTable, repeat, preferRepeat, bmi2); + hufTable, repeat, preferRepeat, bmi2, suspectUncompressible); } #ifndef ZSTD_NO_UNUSED_FUNCTIONS @@ -895,7 +1353,7 @@ size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog) { - unsigned workSpace[HUF_WORKSPACE_SIZE_U32]; + U64 workSpace[HUF_WORKSPACE_SIZE_U64]; return HUF_compress1X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace)); } @@ -903,7 +1361,7 @@ size_t HUF_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog) { - unsigned workSpace[HUF_WORKSPACE_SIZE_U32]; + U64 workSpace[HUF_WORKSPACE_SIZE_U64]; return HUF_compress4X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace)); } diff --git a/huf_decompress.c b/huf_decompress.c index 65e157e..eb292e0 100644 --- a/huf_decompress.c +++ b/huf_decompress.c @@ -1,7 +1,7 @@ /* ****************************************************************** * huff0 huffman decoder, * part of Finite State Entropy library - * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * * You can contact the author at : * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy @@ -528,13 +528,15 @@ typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX]; static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 sizeLog, const U32 consumed, const U32* rankValOrigin, const int minWeight, const sortedSymbol_t* sortedSymbols, const U32 sortedListSize, - U32 nbBitsBaseline, U16 baseSeq) + U32 nbBitsBaseline, U16 baseSeq, U32* wksp, size_t wkspSize) { HUF_DEltX2 DElt; - U32 rankVal[HUF_TABLELOG_MAX + 1]; + U32* rankVal = wksp; + assert(wkspSize >= HUF_TABLELOG_MAX + 1); + (void)wkspSize; /* get pre-calculated rankVal */ - ZSTD_memcpy(rankVal, rankValOrigin, sizeof(rankVal)); + ZSTD_memcpy(rankVal, rankValOrigin, sizeof(U32) * (HUF_TABLELOG_MAX + 1)); /* fill skipped values */ if (minWeight>1) { @@ -569,14 +571,18 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 sizeLog, const U32 co static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, const sortedSymbol_t* sortedList, const U32 sortedListSize, const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight, - const U32 nbBitsBaseline) + const U32 nbBitsBaseline, U32* wksp, size_t wkspSize) { - U32 rankVal[HUF_TABLELOG_MAX + 1]; + U32* rankVal = wksp; const int scaleLog = nbBitsBaseline - targetLog; /* note : targetLog >= srcLog, hence scaleLog <= 1 */ const U32 minBits = nbBitsBaseline - maxWeight; U32 s; - ZSTD_memcpy(rankVal, rankValOrigin, sizeof(rankVal)); + assert(wkspSize >= HUF_TABLELOG_MAX + 1); + wksp += HUF_TABLELOG_MAX + 1; + wkspSize -= HUF_TABLELOG_MAX + 1; + + ZSTD_memcpy(rankVal, rankValOrigin, sizeof(U32) * (HUF_TABLELOG_MAX + 1)); /* fill DTable */ for (s=0; s> 2; - rankStats = (U32 *)workSpace + spaceUsed32; - spaceUsed32 += HUF_TABLELOG_MAX + 1; - rankStart0 = (U32 *)workSpace + spaceUsed32; - spaceUsed32 += HUF_TABLELOG_MAX + 2; - sortedSymbol = (sortedSymbol_t *)workSpace + (spaceUsed32 * sizeof(U32)) / sizeof(sortedSymbol_t); - spaceUsed32 += HUF_ALIGN(sizeof(sortedSymbol_t) * (HUF_SYMBOLVALUE_MAX + 1), sizeof(U32)) >> 2; - weightList = (BYTE *)((U32 *)workSpace + spaceUsed32); - spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2; - - if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge); - - rankStart = rankStart0 + 1; - ZSTD_memset(rankStats, 0, sizeof(U32) * (2 * HUF_TABLELOG_MAX + 2 + 1)); + HUF_ReadDTableX2_Workspace* const wksp = (HUF_ReadDTableX2_Workspace*)workSpace; + + if (sizeof(*wksp) > wkspSize) return ERROR(GENERIC); + + rankStart = wksp->rankStart0 + 1; + ZSTD_memset(wksp->rankStats, 0, sizeof(wksp->rankStats)); + ZSTD_memset(wksp->rankStart0, 0, sizeof(wksp->rankStart0)); DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(HUF_DTable)); /* if compiler fails here, assertion is wrong */ if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... */ - iSize = HUF_readStats(weightList, HUF_SYMBOLVALUE_MAX + 1, rankStats, &nbSymbols, &tableLog, src, srcSize); + iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), /* bmi2 */ 0); if (HUF_isError(iSize)) return iSize; /* check result */ if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge); /* DTable can't fit code depth */ /* find maxWeight */ - for (maxW = tableLog; rankStats[maxW]==0; maxW--) {} /* necessarily finds a solution before 0 */ + for (maxW = tableLog; wksp->rankStats[maxW]==0; maxW--) {} /* necessarily finds a solution before 0 */ /* Get start index of each weight */ { U32 w, nextRankStart = 0; for (w=1; wrankStats[w]; rankStart[w] = curr; } rankStart[0] = nextRankStart; /* put all 0w symbols at the end of sorted list*/ @@ -670,37 +670,38 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, /* sort symbols by weight */ { U32 s; for (s=0; sweightList[s]; U32 const r = rankStart[w]++; - sortedSymbol[r].symbol = (BYTE)s; - sortedSymbol[r].weight = (BYTE)w; + wksp->sortedSymbol[r].symbol = (BYTE)s; + wksp->sortedSymbol[r].weight = (BYTE)w; } rankStart[0] = 0; /* forget 0w symbols; this is beginning of weight(1) */ } /* Build rankVal */ - { U32* const rankVal0 = rankVal[0]; + { U32* const rankVal0 = wksp->rankVal[0]; { int const rescale = (maxTableLog-tableLog) - 1; /* tableLog <= maxTableLog */ U32 nextRankVal = 0; U32 w; for (w=1; wrankStats[w] << (w+rescale); rankVal0[w] = curr; } } { U32 const minBits = tableLog+1 - maxW; U32 consumed; for (consumed = minBits; consumed < maxTableLog - minBits + 1; consumed++) { - U32* const rankValPtr = rankVal[consumed]; + U32* const rankValPtr = wksp->rankVal[consumed]; U32 w; for (w = 1; w < maxW+1; w++) { rankValPtr[w] = rankVal0[w] >> consumed; } } } } HUF_fillDTableX2(dt, maxTableLog, - sortedSymbol, sizeOfSort, - rankStart0, rankVal, maxW, - tableLog+1); + wksp->sortedSymbol, sizeOfSort, + wksp->rankStart0, wksp->rankVal, maxW, + tableLog+1, + wksp->calleeWksp, sizeof(wksp->calleeWksp) / sizeof(U32)); dtd.tableLog = (BYTE)maxTableLog; dtd.tableType = 1; diff --git a/mem.h b/mem.h index 4728ef7..1c61b7e 100644 --- a/mem.h +++ b/mem.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -143,9 +143,7 @@ MEM_STATIC size_t MEM_swapST(size_t in); * Prefer these methods in priority order (0 > 1 > 2) */ #ifndef MEM_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ -# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) -# define MEM_FORCE_MEMORY_ACCESS 2 -# elif defined(__INTEL_COMPILER) || defined(__GNUC__) || defined(__ICCARM__) +# if defined(__INTEL_COMPILER) || defined(__GNUC__) || defined(__ICCARM__) # define MEM_FORCE_MEMORY_ACCESS 1 # endif #endif @@ -155,8 +153,22 @@ MEM_STATIC unsigned MEM_64bits(void) { return sizeof(size_t)==8; } MEM_STATIC unsigned MEM_isLittleEndian(void) { +#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + return 1; +#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + return 0; +#elif defined(__clang__) && __LITTLE_ENDIAN__ + return 1; +#elif defined(__clang__) && __BIG_ENDIAN__ + return 0; +#elif defined(_MSC_VER) && (_M_AMD64 || _M_IX86) + return 1; +#elif defined(__DMC__) && defined(_M_IX86) + return 1; +#else const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ return one.c[0]; +#endif } #if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2) @@ -308,7 +320,7 @@ MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val) MEM_STATIC U32 MEM_readLE24(const void* memPtr) { - return MEM_readLE16(memPtr) + (((const BYTE*)memPtr)[2] << 16); + return (U32)MEM_readLE16(memPtr) + ((U32)(((const BYTE*)memPtr)[2]) << 16); } MEM_STATIC void MEM_writeLE24(void* memPtr, U32 val) diff --git a/pool.c b/pool.c index 4c1b833..ea70b8b 100644 --- a/pool.c +++ b/pool.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/pool.h b/pool.h index 816e25f..411e318 100644 --- a/pool.h +++ b/pool.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/xxhash.c b/xxhash.c index e708df3..926b336 100644 --- a/xxhash.c +++ b/xxhash.c @@ -1,6 +1,6 @@ /* * xxHash - Fast Hash algorithm - * Copyright (c) 2012-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * * You can contact the author at : * - xxHash homepage: http://www.xxhash.com @@ -30,9 +30,7 @@ * Prefer these methods in priority order (0 > 1 > 2) */ #ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ -# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) -# define XXH_FORCE_MEMORY_ACCESS 2 -# elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \ +# if (defined(__INTEL_COMPILER) && !defined(WIN32)) || \ (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) )) || \ defined(__ICCARM__) # define XXH_FORCE_MEMORY_ACCESS 1 diff --git a/xxhash.h b/xxhash.h index eceb55d..16c1f16 100644 --- a/xxhash.h +++ b/xxhash.h @@ -1,7 +1,7 @@ /* * xxHash - Extremely Fast Hash algorithm * Header File - * Copyright (c) 2012-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * * You can contact the author at : * - xxHash source repository : https://github.com/Cyan4973/xxHash diff --git a/zbuff.h b/zbuff.h deleted file mode 100644 index 7686f76..0000000 --- a/zbuff.h +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the - * LICENSE file in the root directory of this source tree) and the GPLv2 (found - * in the COPYING file in the root directory of this source tree). - * You may select, at your option, one of the above-listed licenses. - */ - -/* *************************************************************** -* NOTES/WARNINGS -******************************************************************/ -/* The streaming API defined here is deprecated. - * Consider migrating towards ZSTD_compressStream() API in `zstd.h` - * See 'lib/README.md'. - *****************************************************************/ - - -#if defined (__cplusplus) -extern "C" { -#endif - -#ifndef ZSTD_BUFFERED_H_23987 -#define ZSTD_BUFFERED_H_23987 - -/* ************************************* -* Dependencies -***************************************/ -#include /* size_t */ -#include "zstd.h" /* ZSTD_CStream, ZSTD_DStream, ZSTDLIB_API */ - - -/* *************************************************************** -* Compiler specifics -*****************************************************************/ -/* Deprecation warnings */ -/* Should these warnings be a problem, - * it is generally possible to disable them, - * typically with -Wno-deprecated-declarations for gcc - * or _CRT_SECURE_NO_WARNINGS in Visual. - * Otherwise, it's also possible to define ZBUFF_DISABLE_DEPRECATE_WARNINGS - */ -#ifdef ZBUFF_DISABLE_DEPRECATE_WARNINGS -# define ZBUFF_DEPRECATED(message) ZSTDLIB_API /* disable deprecation warnings */ -#else -# if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */ -# define ZBUFF_DEPRECATED(message) [[deprecated(message)]] ZSTDLIB_API -# elif (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) -# define ZBUFF_DEPRECATED(message) ZSTDLIB_API __attribute__((deprecated(message))) -# elif defined(__GNUC__) && (__GNUC__ >= 3) -# define ZBUFF_DEPRECATED(message) ZSTDLIB_API __attribute__((deprecated)) -# elif defined(_MSC_VER) -# define ZBUFF_DEPRECATED(message) ZSTDLIB_API __declspec(deprecated(message)) -# else -# pragma message("WARNING: You need to implement ZBUFF_DEPRECATED for this compiler") -# define ZBUFF_DEPRECATED(message) ZSTDLIB_API -# endif -#endif /* ZBUFF_DISABLE_DEPRECATE_WARNINGS */ - - -/* ************************************* -* Streaming functions -***************************************/ -/* This is the easier "buffered" streaming API, -* using an internal buffer to lift all restrictions on user-provided buffers -* which can be any size, any place, for both input and output. -* ZBUFF and ZSTD are 100% interoperable, -* frames created by one can be decoded by the other one */ - -typedef ZSTD_CStream ZBUFF_CCtx; -ZBUFF_DEPRECATED("use ZSTD_createCStream") ZBUFF_CCtx* ZBUFF_createCCtx(void); -ZBUFF_DEPRECATED("use ZSTD_freeCStream") size_t ZBUFF_freeCCtx(ZBUFF_CCtx* cctx); - -ZBUFF_DEPRECATED("use ZSTD_initCStream") size_t ZBUFF_compressInit(ZBUFF_CCtx* cctx, int compressionLevel); -ZBUFF_DEPRECATED("use ZSTD_initCStream_usingDict") size_t ZBUFF_compressInitDictionary(ZBUFF_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); - -ZBUFF_DEPRECATED("use ZSTD_compressStream") size_t ZBUFF_compressContinue(ZBUFF_CCtx* cctx, void* dst, size_t* dstCapacityPtr, const void* src, size_t* srcSizePtr); -ZBUFF_DEPRECATED("use ZSTD_flushStream") size_t ZBUFF_compressFlush(ZBUFF_CCtx* cctx, void* dst, size_t* dstCapacityPtr); -ZBUFF_DEPRECATED("use ZSTD_endStream") size_t ZBUFF_compressEnd(ZBUFF_CCtx* cctx, void* dst, size_t* dstCapacityPtr); - -/*-************************************************* -* Streaming compression - howto -* -* A ZBUFF_CCtx object is required to track streaming operation. -* Use ZBUFF_createCCtx() and ZBUFF_freeCCtx() to create/release resources. -* ZBUFF_CCtx objects can be reused multiple times. -* -* Start by initializing ZBUF_CCtx. -* Use ZBUFF_compressInit() to start a new compression operation. -* Use ZBUFF_compressInitDictionary() for a compression which requires a dictionary. -* -* Use ZBUFF_compressContinue() repetitively to consume input stream. -* *srcSizePtr and *dstCapacityPtr can be any size. -* The function will report how many bytes were read or written within *srcSizePtr and *dstCapacityPtr. -* Note that it may not consume the entire input, in which case it's up to the caller to present again remaining data. -* The content of `dst` will be overwritten (up to *dstCapacityPtr) at each call, so save its content if it matters or change @dst . -* @return : a hint to preferred nb of bytes to use as input for next function call (it's just a hint, to improve latency) -* or an error code, which can be tested using ZBUFF_isError(). -* -* At any moment, it's possible to flush whatever data remains within buffer, using ZBUFF_compressFlush(). -* The nb of bytes written into `dst` will be reported into *dstCapacityPtr. -* Note that the function cannot output more than *dstCapacityPtr, -* therefore, some content might still be left into internal buffer if *dstCapacityPtr is too small. -* @return : nb of bytes still present into internal buffer (0 if it's empty) -* or an error code, which can be tested using ZBUFF_isError(). -* -* ZBUFF_compressEnd() instructs to finish a frame. -* It will perform a flush and write frame epilogue. -* The epilogue is required for decoders to consider a frame completed. -* Similar to ZBUFF_compressFlush(), it may not be able to output the entire internal buffer content if *dstCapacityPtr is too small. -* In which case, call again ZBUFF_compressFlush() to complete the flush. -* @return : nb of bytes still present into internal buffer (0 if it's empty) -* or an error code, which can be tested using ZBUFF_isError(). -* -* Hint : _recommended buffer_ sizes (not compulsory) : ZBUFF_recommendedCInSize() / ZBUFF_recommendedCOutSize() -* input : ZBUFF_recommendedCInSize==128 KB block size is the internal unit, use this value to reduce intermediate stages (better latency) -* output : ZBUFF_recommendedCOutSize==ZSTD_compressBound(128 KB) + 3 + 3 : ensures it's always possible to write/flush/end a full block. Skip some buffering. -* By using both, it ensures that input will be entirely consumed, and output will always contain the result, reducing intermediate buffering. -* **************************************************/ - - -typedef ZSTD_DStream ZBUFF_DCtx; -ZBUFF_DEPRECATED("use ZSTD_createDStream") ZBUFF_DCtx* ZBUFF_createDCtx(void); -ZBUFF_DEPRECATED("use ZSTD_freeDStream") size_t ZBUFF_freeDCtx(ZBUFF_DCtx* dctx); - -ZBUFF_DEPRECATED("use ZSTD_initDStream") size_t ZBUFF_decompressInit(ZBUFF_DCtx* dctx); -ZBUFF_DEPRECATED("use ZSTD_initDStream_usingDict") size_t ZBUFF_decompressInitDictionary(ZBUFF_DCtx* dctx, const void* dict, size_t dictSize); - -ZBUFF_DEPRECATED("use ZSTD_decompressStream") size_t ZBUFF_decompressContinue(ZBUFF_DCtx* dctx, - void* dst, size_t* dstCapacityPtr, - const void* src, size_t* srcSizePtr); - -/*-*************************************************************************** -* Streaming decompression howto -* -* A ZBUFF_DCtx object is required to track streaming operations. -* Use ZBUFF_createDCtx() and ZBUFF_freeDCtx() to create/release resources. -* Use ZBUFF_decompressInit() to start a new decompression operation, -* or ZBUFF_decompressInitDictionary() if decompression requires a dictionary. -* Note that ZBUFF_DCtx objects can be re-init multiple times. -* -* Use ZBUFF_decompressContinue() repetitively to consume your input. -* *srcSizePtr and *dstCapacityPtr can be any size. -* The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr. -* Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again. -* The content of `dst` will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters, or change `dst`. -* @return : 0 when a frame is completely decoded and fully flushed, -* 1 when there is still some data left within internal buffer to flush, -* >1 when more data is expected, with value being a suggested next input size (it's just a hint, which helps latency), -* or an error code, which can be tested using ZBUFF_isError(). -* -* Hint : recommended buffer sizes (not compulsory) : ZBUFF_recommendedDInSize() and ZBUFF_recommendedDOutSize() -* output : ZBUFF_recommendedDOutSize== 128 KB block size is the internal unit, it ensures it's always possible to write a full block when decoded. -* input : ZBUFF_recommendedDInSize == 128KB + 3; -* just follow indications from ZBUFF_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 . -* *******************************************************************************/ - - -/* ************************************* -* Tool functions -***************************************/ -ZBUFF_DEPRECATED("use ZSTD_isError") unsigned ZBUFF_isError(size_t errorCode); -ZBUFF_DEPRECATED("use ZSTD_getErrorName") const char* ZBUFF_getErrorName(size_t errorCode); - -/** Functions below provide recommended buffer sizes for Compression or Decompression operations. -* These sizes are just hints, they tend to offer better latency */ -ZBUFF_DEPRECATED("use ZSTD_CStreamInSize") size_t ZBUFF_recommendedCInSize(void); -ZBUFF_DEPRECATED("use ZSTD_CStreamOutSize") size_t ZBUFF_recommendedCOutSize(void); -ZBUFF_DEPRECATED("use ZSTD_DStreamInSize") size_t ZBUFF_recommendedDInSize(void); -ZBUFF_DEPRECATED("use ZSTD_DStreamOutSize") size_t ZBUFF_recommendedDOutSize(void); - -#endif /* ZSTD_BUFFERED_H_23987 */ - - -#ifdef ZBUFF_STATIC_LINKING_ONLY -#ifndef ZBUFF_STATIC_H_30298098432 -#define ZBUFF_STATIC_H_30298098432 - -/* ==================================================================================== - * The definitions in this section are considered experimental. - * They should never be used in association with a dynamic library, as they may change in the future. - * They are provided for advanced usages. - * Use them only in association with static linking. - * ==================================================================================== */ - -/*--- Dependency ---*/ -#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_parameters, ZSTD_customMem */ -#include "zstd.h" - - -/*--- Custom memory allocator ---*/ -/*! ZBUFF_createCCtx_advanced() : - * Create a ZBUFF compression context using external alloc and free functions */ -ZBUFF_DEPRECATED("use ZSTD_createCStream_advanced") ZBUFF_CCtx* ZBUFF_createCCtx_advanced(ZSTD_customMem customMem); - -/*! ZBUFF_createDCtx_advanced() : - * Create a ZBUFF decompression context using external alloc and free functions */ -ZBUFF_DEPRECATED("use ZSTD_createDStream_advanced") ZBUFF_DCtx* ZBUFF_createDCtx_advanced(ZSTD_customMem customMem); - - -/*--- Advanced Streaming Initialization ---*/ -ZBUFF_DEPRECATED("use ZSTD_initDStream_usingDict") size_t ZBUFF_compressInit_advanced(ZBUFF_CCtx* zbc, - const void* dict, size_t dictSize, - ZSTD_parameters params, unsigned long long pledgedSrcSize); - - -#endif /* ZBUFF_STATIC_H_30298098432 */ -#endif /* ZBUFF_STATIC_LINKING_ONLY */ - - -#if defined (__cplusplus) -} -#endif diff --git a/zbuff_common.c b/zbuff_common.c deleted file mode 100644 index e2b9613..0000000 --- a/zbuff_common.c +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the - * LICENSE file in the root directory of this source tree) and the GPLv2 (found - * in the COPYING file in the root directory of this source tree). - * You may select, at your option, one of the above-listed licenses. - */ - -/*-************************************* -* Dependencies -***************************************/ -#include "error_private.h" -#include "zbuff.h" - -/*-**************************************** -* ZBUFF Error Management (deprecated) -******************************************/ - -/*! ZBUFF_isError() : -* tells if a return value is an error code */ -unsigned ZBUFF_isError(size_t errorCode) { return ERR_isError(errorCode); } -/*! ZBUFF_getErrorName() : -* provides error code string from function result (useful for debugging) */ -const char* ZBUFF_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); } diff --git a/zbuff_compress.c b/zbuff_compress.c deleted file mode 100644 index 2d20b13..0000000 --- a/zbuff_compress.c +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the - * LICENSE file in the root directory of this source tree) and the GPLv2 (found - * in the COPYING file in the root directory of this source tree). - * You may select, at your option, one of the above-listed licenses. - */ - - - -/* ************************************* -* Dependencies -***************************************/ -#define ZBUFF_STATIC_LINKING_ONLY -#include "zbuff.h" - - -/*-*********************************************************** -* Streaming compression -* -* A ZBUFF_CCtx object is required to track streaming operation. -* Use ZBUFF_createCCtx() and ZBUFF_freeCCtx() to create/release resources. -* Use ZBUFF_compressInit() to start a new compression operation. -* ZBUFF_CCtx objects can be reused multiple times. -* -* Use ZBUFF_compressContinue() repetitively to consume your input. -* *srcSizePtr and *dstCapacityPtr can be any size. -* The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr. -* Note that it may not consume the entire input, in which case it's up to the caller to call again the function with remaining input. -* The content of dst will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters or change dst . -* @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to improve latency) -* or an error code, which can be tested using ZBUFF_isError(). -* -* ZBUFF_compressFlush() can be used to instruct ZBUFF to compress and output whatever remains within its buffer. -* Note that it will not output more than *dstCapacityPtr. -* Therefore, some content might still be left into its internal buffer if dst buffer is too small. -* @return : nb of bytes still present into internal buffer (0 if it's empty) -* or an error code, which can be tested using ZBUFF_isError(). -* -* ZBUFF_compressEnd() instructs to finish a frame. -* It will perform a flush and write frame epilogue. -* Similar to ZBUFF_compressFlush(), it may not be able to output the entire internal buffer content if *dstCapacityPtr is too small. -* @return : nb of bytes still present into internal buffer (0 if it's empty) -* or an error code, which can be tested using ZBUFF_isError(). -* -* Hint : recommended buffer sizes (not compulsory) -* input : ZSTD_BLOCKSIZE_MAX (128 KB), internal unit size, it improves latency to use this value. -* output : ZSTD_compressBound(ZSTD_BLOCKSIZE_MAX) + ZSTD_blockHeaderSize + ZBUFF_endFrameSize : ensures it's always possible to write/flush/end a full block at best speed. -* ***********************************************************/ - -ZBUFF_CCtx* ZBUFF_createCCtx(void) -{ - return ZSTD_createCStream(); -} - -ZBUFF_CCtx* ZBUFF_createCCtx_advanced(ZSTD_customMem customMem) -{ - return ZSTD_createCStream_advanced(customMem); -} - -size_t ZBUFF_freeCCtx(ZBUFF_CCtx* zbc) -{ - return ZSTD_freeCStream(zbc); -} - - -/* ====== Initialization ====== */ - -size_t ZBUFF_compressInit_advanced(ZBUFF_CCtx* zbc, - const void* dict, size_t dictSize, - ZSTD_parameters params, unsigned long long pledgedSrcSize) -{ - if (pledgedSrcSize==0) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN; /* preserve "0 == unknown" behavior */ - return ZSTD_initCStream_advanced(zbc, dict, dictSize, params, pledgedSrcSize); -} - - -size_t ZBUFF_compressInitDictionary(ZBUFF_CCtx* zbc, const void* dict, size_t dictSize, int compressionLevel) -{ - return ZSTD_initCStream_usingDict(zbc, dict, dictSize, compressionLevel); -} - -size_t ZBUFF_compressInit(ZBUFF_CCtx* zbc, int compressionLevel) -{ - return ZSTD_initCStream(zbc, compressionLevel); -} - -/* ====== Compression ====== */ - - -size_t ZBUFF_compressContinue(ZBUFF_CCtx* zbc, - void* dst, size_t* dstCapacityPtr, - const void* src, size_t* srcSizePtr) -{ - size_t result; - ZSTD_outBuffer outBuff; - ZSTD_inBuffer inBuff; - outBuff.dst = dst; - outBuff.pos = 0; - outBuff.size = *dstCapacityPtr; - inBuff.src = src; - inBuff.pos = 0; - inBuff.size = *srcSizePtr; - result = ZSTD_compressStream(zbc, &outBuff, &inBuff); - *dstCapacityPtr = outBuff.pos; - *srcSizePtr = inBuff.pos; - return result; -} - - - -/* ====== Finalize ====== */ - -size_t ZBUFF_compressFlush(ZBUFF_CCtx* zbc, void* dst, size_t* dstCapacityPtr) -{ - size_t result; - ZSTD_outBuffer outBuff; - outBuff.dst = dst; - outBuff.pos = 0; - outBuff.size = *dstCapacityPtr; - result = ZSTD_flushStream(zbc, &outBuff); - *dstCapacityPtr = outBuff.pos; - return result; -} - - -size_t ZBUFF_compressEnd(ZBUFF_CCtx* zbc, void* dst, size_t* dstCapacityPtr) -{ - size_t result; - ZSTD_outBuffer outBuff; - outBuff.dst = dst; - outBuff.pos = 0; - outBuff.size = *dstCapacityPtr; - result = ZSTD_endStream(zbc, &outBuff); - *dstCapacityPtr = outBuff.pos; - return result; -} - - - -/* ************************************* -* Tool functions -***************************************/ -size_t ZBUFF_recommendedCInSize(void) { return ZSTD_CStreamInSize(); } -size_t ZBUFF_recommendedCOutSize(void) { return ZSTD_CStreamOutSize(); } diff --git a/zbuff_decompress.c b/zbuff_decompress.c deleted file mode 100644 index d3c49e8..0000000 --- a/zbuff_decompress.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the - * LICENSE file in the root directory of this source tree) and the GPLv2 (found - * in the COPYING file in the root directory of this source tree). - * You may select, at your option, one of the above-listed licenses. - */ - - - -/* ************************************* -* Dependencies -***************************************/ -#define ZBUFF_STATIC_LINKING_ONLY -#include "zbuff.h" - - -ZBUFF_DCtx* ZBUFF_createDCtx(void) -{ - return ZSTD_createDStream(); -} - -ZBUFF_DCtx* ZBUFF_createDCtx_advanced(ZSTD_customMem customMem) -{ - return ZSTD_createDStream_advanced(customMem); -} - -size_t ZBUFF_freeDCtx(ZBUFF_DCtx* zbd) -{ - return ZSTD_freeDStream(zbd); -} - - -/* *** Initialization *** */ - -size_t ZBUFF_decompressInitDictionary(ZBUFF_DCtx* zbd, const void* dict, size_t dictSize) -{ - return ZSTD_initDStream_usingDict(zbd, dict, dictSize); -} - -size_t ZBUFF_decompressInit(ZBUFF_DCtx* zbd) -{ - return ZSTD_initDStream(zbd); -} - - -/* *** Decompression *** */ - -size_t ZBUFF_decompressContinue(ZBUFF_DCtx* zbd, - void* dst, size_t* dstCapacityPtr, - const void* src, size_t* srcSizePtr) -{ - ZSTD_outBuffer outBuff; - ZSTD_inBuffer inBuff; - size_t result; - outBuff.dst = dst; - outBuff.pos = 0; - outBuff.size = *dstCapacityPtr; - inBuff.src = src; - inBuff.pos = 0; - inBuff.size = *srcSizePtr; - result = ZSTD_decompressStream(zbd, &outBuff, &inBuff); - *dstCapacityPtr = outBuff.pos; - *srcSizePtr = inBuff.pos; - return result; -} - - -/* ************************************* -* Tool functions -***************************************/ -size_t ZBUFF_recommendedDInSize(void) { return ZSTD_DStreamInSize(); } -size_t ZBUFF_recommendedDOutSize(void) { return ZSTD_DStreamOutSize(); } diff --git a/zdict.c b/zdict.c index 7ac7c60..4fd6369 100644 --- a/zdict.c +++ b/zdict.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -23,9 +23,13 @@ /* Unix Large Files support (>4GB) */ #define _FILE_OFFSET_BITS 64 #if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */ +# ifndef _LARGEFILE_SOURCE # define _LARGEFILE_SOURCE +# endif #elif ! defined(__LP64__) /* No point defining Large file for 64 bit */ +# ifndef _LARGEFILE64_SOURCE # define _LARGEFILE64_SOURCE +# endif #endif @@ -37,18 +41,19 @@ #include /* fprintf, fopen, ftello64 */ #include /* clock */ +#ifndef ZDICT_STATIC_LINKING_ONLY +# define ZDICT_STATIC_LINKING_ONLY +#endif +#define HUF_STATIC_LINKING_ONLY + #include "mem.h" /* read */ #include "fse.h" /* FSE_normalizeCount, FSE_writeNCount */ -#define HUF_STATIC_LINKING_ONLY #include "huf.h" /* HUF_buildCTable, HUF_writeCTable */ #include "zstd_internal.h" /* includes zstd.h */ #include "xxhash.h" /* XXH64 */ -#include "divsufsort.h" -#ifndef ZDICT_STATIC_LINKING_ONLY -# define ZDICT_STATIC_LINKING_ONLY -#endif -#include "zdict.h" #include "zstd_compress_internal.h" /* ZSTD_loadCEntropy() */ +#include "zdict.h" +#include "divsufsort.h" /*-************************************* @@ -967,16 +972,11 @@ static size_t ZDICT_addEntropyTablesFromBuffer_advanced( return MIN(dictBufferCapacity, hSize+dictContentSize); } -/* Hidden declaration for dbio.c */ -size_t ZDICT_trainFromBuffer_unsafe_legacy( - void* dictBuffer, size_t maxDictSize, - const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, - ZDICT_legacy_params_t params); /*! ZDICT_trainFromBuffer_unsafe_legacy() : -* Warning : `samplesBuffer` must be followed by noisy guard band. +* Warning : `samplesBuffer` must be followed by noisy guard band !!! * @return : size of dictionary, or an error code which can be tested with ZDICT_isError() */ -size_t ZDICT_trainFromBuffer_unsafe_legacy( +static size_t ZDICT_trainFromBuffer_unsafe_legacy( void* dictBuffer, size_t maxDictSize, const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, ZDICT_legacy_params_t params) diff --git a/zdict.h b/zdict.h index b782993..75b05db 100644 --- a/zdict.h +++ b/zdict.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -36,6 +36,145 @@ extern "C" { # define ZDICTLIB_API ZDICTLIB_VISIBILITY #endif +/******************************************************************************* + * Zstd dictionary builder + * + * FAQ + * === + * Why should I use a dictionary? + * ------------------------------ + * + * Zstd can use dictionaries to improve compression ratio of small data. + * Traditionally small files don't compress well because there is very little + * repetion in a single sample, since it is small. But, if you are compressing + * many similar files, like a bunch of JSON records that share the same + * structure, you can train a dictionary on ahead of time on some samples of + * these files. Then, zstd can use the dictionary to find repetitions that are + * present across samples. This can vastly improve compression ratio. + * + * When is a dictionary useful? + * ---------------------------- + * + * Dictionaries are useful when compressing many small files that are similar. + * The larger a file is, the less benefit a dictionary will have. Generally, + * we don't expect dictionary compression to be effective past 100KB. And the + * smaller a file is, the more we would expect the dictionary to help. + * + * How do I use a dictionary? + * -------------------------- + * + * Simply pass the dictionary to the zstd compressor with + * `ZSTD_CCtx_loadDictionary()`. The same dictionary must then be passed to + * the decompressor, using `ZSTD_DCtx_loadDictionary()`. There are other + * more advanced functions that allow selecting some options, see zstd.h for + * complete documentation. + * + * What is a zstd dictionary? + * -------------------------- + * + * A zstd dictionary has two pieces: Its header, and its content. The header + * contains a magic number, the dictionary ID, and entropy tables. These + * entropy tables allow zstd to save on header costs in the compressed file, + * which really matters for small data. The content is just bytes, which are + * repeated content that is common across many samples. + * + * What is a raw content dictionary? + * --------------------------------- + * + * A raw content dictionary is just bytes. It doesn't have a zstd dictionary + * header, a dictionary ID, or entropy tables. Any buffer is a valid raw + * content dictionary. + * + * How do I train a dictionary? + * ---------------------------- + * + * Gather samples from your use case. These samples should be similar to each + * other. If you have several use cases, you could try to train one dictionary + * per use case. + * + * Pass those samples to `ZDICT_trainFromBuffer()` and that will train your + * dictionary. There are a few advanced versions of this function, but this + * is a great starting point. If you want to further tune your dictionary + * you could try `ZDICT_optimizeTrainFromBuffer_cover()`. If that is too slow + * you can try `ZDICT_optimizeTrainFromBuffer_fastCover()`. + * + * If the dictionary training function fails, that is likely because you + * either passed too few samples, or a dictionary would not be effective + * for your data. Look at the messages that the dictionary trainer printed, + * if it doesn't say too few samples, then a dictionary would not be effective. + * + * How large should my dictionary be? + * ---------------------------------- + * + * A reasonable dictionary size, the `dictBufferCapacity`, is about 100KB. + * The zstd CLI defaults to a 110KB dictionary. You likely don't need a + * dictionary larger than that. But, most use cases can get away with a + * smaller dictionary. The advanced dictionary builders can automatically + * shrink the dictionary for you, and select a the smallest size that + * doesn't hurt compression ratio too much. See the `shrinkDict` parameter. + * A smaller dictionary can save memory, and potentially speed up + * compression. + * + * How many samples should I provide to the dictionary builder? + * ------------------------------------------------------------ + * + * We generally recommend passing ~100x the size of the dictionary + * in samples. A few thousand should suffice. Having too few samples + * can hurt the dictionaries effectiveness. Having more samples will + * only improve the dictionaries effectiveness. But having too many + * samples can slow down the dictionary builder. + * + * How do I determine if a dictionary will be effective? + * ----------------------------------------------------- + * + * Simply train a dictionary and try it out. You can use zstd's built in + * benchmarking tool to test the dictionary effectiveness. + * + * # Benchmark levels 1-3 without a dictionary + * zstd -b1e3 -r /path/to/my/files + * # Benchmark levels 1-3 with a dictioanry + * zstd -b1e3 -r /path/to/my/files -D /path/to/my/dictionary + * + * When should I retrain a dictionary? + * ----------------------------------- + * + * You should retrain a dictionary when its effectiveness drops. Dictionary + * effectiveness drops as the data you are compressing changes. Generally, we do + * expect dictionaries to "decay" over time, as your data changes, but the rate + * at which they decay depends on your use case. Internally, we regularly + * retrain dictionaries, and if the new dictionary performs significantly + * better than the old dictionary, we will ship the new dictionary. + * + * I have a raw content dictionary, how do I turn it into a zstd dictionary? + * ------------------------------------------------------------------------- + * + * If you have a raw content dictionary, e.g. by manually constructing it, or + * using a third-party dictionary builder, you can turn it into a zstd + * dictionary by using `ZDICT_finalizeDictionary()`. You'll also have to + * provide some samples of the data. It will add the zstd header to the + * raw content, which contains a dictionary ID and entropy tables, which + * will improve compression ratio, and allow zstd to write the dictionary ID + * into the frame, if you so choose. + * + * Do I have to use zstd's dictionary builder? + * ------------------------------------------- + * + * No! You can construct dictionary content however you please, it is just + * bytes. It will always be valid as a raw content dictionary. If you want + * a zstd dictionary, which can improve compression ratio, use + * `ZDICT_finalizeDictionary()`. + * + * What is the attack surface of a zstd dictionary? + * ------------------------------------------------ + * + * Zstd is heavily fuzz tested, including loading fuzzed dictionaries, so + * zstd should never crash, or access out-of-bounds memory no matter what + * the dictionary is. However, if an attacker can control the dictionary + * during decompression, they can cause zstd to generate arbitrary bytes, + * just like if they controlled the compressed data. + * + ******************************************************************************/ + /*! ZDICT_trainFromBuffer(): * Train a dictionary from an array of samples. @@ -64,7 +203,14 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCap typedef struct { int compressionLevel; /*< optimize for a specific zstd compression level; 0 means default */ unsigned notificationLevel; /*< Write log to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */ - unsigned dictID; /*< force dictID value; 0 means auto mode (32-bits random value) */ + unsigned dictID; /*< force dictID value; 0 means auto mode (32-bits random value) + * NOTE: The zstd format reserves some dictionary IDs for future use. + * You may use them in private settings, but be warned that they + * may be used by zstd in a public dictionary registry in the future. + * These dictionary IDs are: + * - low range : <= 32767 + * - high range : >= (2^31) + */ } ZDICT_params_t; /*! ZDICT_finalizeDictionary(): @@ -264,10 +410,11 @@ typedef struct { * Note: ZDICT_trainFromBuffer_legacy() will send notifications into stderr if instructed to, using notificationLevel>0. */ ZDICTLIB_API size_t ZDICT_trainFromBuffer_legacy( - void *dictBuffer, size_t dictBufferCapacity, - const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, + void* dictBuffer, size_t dictBufferCapacity, + const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, ZDICT_legacy_params_t parameters); + /* Deprecation warnings */ /* It is generally possible to disable deprecation warnings from compiler, for example with -Wno-deprecated-declarations for gcc diff --git a/zstd.h b/zstd.h index b0ecdf5..ebe2e04 100644 --- a/zstd.h +++ b/zstd.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -71,8 +71,8 @@ extern "C" { /*------ Version ------*/ #define ZSTD_VERSION_MAJOR 1 -#define ZSTD_VERSION_MINOR 4 -#define ZSTD_VERSION_RELEASE 8 +#define ZSTD_VERSION_MINOR 5 +#define ZSTD_VERSION_RELEASE 0 #define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) /*! ZSTD_versionNumber() : @@ -109,7 +109,6 @@ ZSTDLIB_API const char* ZSTD_versionString(void); #define ZSTD_BLOCKSIZE_MAX (1<= first frame size * @return : the compressed size of the first frame starting at `src`, @@ -180,8 +179,9 @@ ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize) ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ -ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed */ +ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */ +ZSTDLIB_API int ZSTD_defaultCLevel(void); /*!< default compression level, specified by ZSTD_CLEVEL_DEFAULT, requires v1.5.0+ */ /*************************************** @@ -199,7 +199,7 @@ ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compres */ typedef struct ZSTD_CCtx_s ZSTD_CCtx; ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx(void); -ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); +ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); /* accept NULL pointer */ /*! ZSTD_compressCCtx() : * Same as ZSTD_compress(), using an explicit ZSTD_CCtx. @@ -222,7 +222,7 @@ ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, * Use one context per thread for parallel execution. */ typedef struct ZSTD_DCtx_s ZSTD_DCtx; ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx(void); -ZSTDLIB_API size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx); +ZSTDLIB_API size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx); /* accept NULL pointer */ /*! ZSTD_decompressDCtx() : * Same as ZSTD_decompress(), @@ -234,9 +234,9 @@ ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, const void* src, size_t srcSize); -/*************************************** -* Advanced compression API -***************************************/ +/********************************************* +* Advanced compression API (Requires v1.4.0+) +**********************************************/ /* API design : * Parameters are pushed one by one into an existing context, @@ -266,7 +266,6 @@ typedef enum { ZSTD_fast=1, Only the order (from fast to strong) is guaranteed */ } ZSTD_strategy; - typedef enum { /* compression parameters @@ -332,7 +331,6 @@ typedef enum { * The higher the value of selected strategy, the more complex it is, * resulting in stronger and slower compression. * Special: value 0 means "use default strategy". */ - /* LDM mode parameters */ ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching. * This parameter is designed to improve compression ratio @@ -389,7 +387,7 @@ typedef enum { ZSTD_c_jobSize=401, /* Size of a compression job. This value is enforced only when nbWorkers >= 1. * Each compression job is completed in parallel, so this value can indirectly impact the nb of active threads. * 0 means default, which is dynamically determined based on compression parameters. - * Job size must be a minimum of overlap size, or 1 MB, whichever is largest. + * Job size must be a minimum of overlap size, or ZSTDMT_JOBSIZE_MIN (= 512 KB), whichever is largest. * The minimum size is automatically and transparently enforced. */ ZSTD_c_overlapLog=402, /* Control the overlap size, as a fraction of window size. * The overlap size is an amount of data reloaded from previous job at the beginning of a new job. @@ -419,6 +417,8 @@ typedef enum { * ZSTD_c_stableOutBuffer * ZSTD_c_blockDelimiters * ZSTD_c_validateSequences + * ZSTD_c_splitBlocks + * ZSTD_c_useRowMatchFinder * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. * note : never ever use experimentalParam? names directly; * also, the enums values themselves are unstable and can still change. @@ -434,7 +434,10 @@ typedef enum { ZSTD_c_experimentalParam9=1006, ZSTD_c_experimentalParam10=1007, ZSTD_c_experimentalParam11=1008, - ZSTD_c_experimentalParam12=1009 + ZSTD_c_experimentalParam12=1009, + ZSTD_c_experimentalParam13=1010, + ZSTD_c_experimentalParam14=1011, + ZSTD_c_experimentalParam15=1012 } ZSTD_cParameter; typedef struct { @@ -519,9 +522,9 @@ ZSTDLIB_API size_t ZSTD_compress2( ZSTD_CCtx* cctx, const void* src, size_t srcSize); -/*************************************** -* Advanced decompression API -***************************************/ +/*********************************************** +* Advanced decompression API (Requires v1.4.0+) +************************************************/ /* The advanced API pushes parameters one by one into an existing DCtx context. * Parameters are sticky, and remain valid for all following frames @@ -546,12 +549,14 @@ typedef enum { * ZSTD_d_format * ZSTD_d_stableOutBuffer * ZSTD_d_forceIgnoreChecksum + * ZSTD_d_refMultipleDDicts * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. * note : never ever use experimentalParam? names directly */ ZSTD_d_experimentalParam1=1000, ZSTD_d_experimentalParam2=1001, - ZSTD_d_experimentalParam3=1002 + ZSTD_d_experimentalParam3=1002, + ZSTD_d_experimentalParam4=1003 } ZSTD_dParameter; @@ -665,7 +670,7 @@ typedef ZSTD_CCtx ZSTD_CStream; /**< CCtx and CStream are now effectively same /* Continue to distinguish them for compatibility with older versions <= v1.2.0 */ /*===== ZSTD_CStream management functions =====*/ ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream(void); -ZSTDLIB_API size_t ZSTD_freeCStream(ZSTD_CStream* zcs); +ZSTDLIB_API size_t ZSTD_freeCStream(ZSTD_CStream* zcs); /* accept NULL pointer */ /*===== Streaming compression functions =====*/ typedef enum { @@ -681,7 +686,7 @@ typedef enum { : note : multithreaded compression will block to flush as much output as possible. */ } ZSTD_EndDirective; -/*! ZSTD_compressStream2() : +/*! ZSTD_compressStream2() : Requires v1.4.0+ * Behaves about the same as ZSTD_compressStream, with additional control on end directive. * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() * - Compression parameters cannot be changed once compression is started (save a list of exceptions in multi-threading mode) @@ -727,11 +732,11 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /**< recommended size for output /* ***************************************************************************** - * This following is a legacy streaming API. + * This following is a legacy streaming API, available since v1.0+ . * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). * It is redundant, but remains fully supported. - * Advanced parameters and dictionary compression can only be used through the - * new API. + * Streaming in combination with advanced parameters and dictionary compression + * can only be used through the new API. ******************************************************************************/ /*! @@ -786,7 +791,7 @@ typedef ZSTD_DCtx ZSTD_DStream; /**< DCtx and DStream are now effectively same /* For compatibility with versions <= v1.2.0, prefer differentiating them. */ /*===== ZSTD_DStream management functions =====*/ ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream(void); -ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); +ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); /* accept NULL pointer */ /*===== Streaming decompression functions =====*/ @@ -809,7 +814,7 @@ ZSTDLIB_API size_t ZSTD_DStreamOutSize(void); /*!< recommended size for output /*! ZSTD_compress_usingDict() : * Compression at an explicit compression level using a Dictionary. * A dictionary can be any arbitrary data segment (also called a prefix), - * or a buffer with specified information (see dictBuilder/zdict.h). + * or a buffer with specified information (see zdict.h). * Note : This function loads the dictionary, resulting in significant startup delay. * It's intended for a dictionary used only once. * Note 2 : When `dict == NULL || dictSize < 8` no dictionary is used. */ @@ -852,7 +857,8 @@ ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict(const void* dictBuffer, size_t dictSize int compressionLevel); /*! ZSTD_freeCDict() : - * Function frees memory allocated by ZSTD_createCDict(). */ + * Function frees memory allocated by ZSTD_createCDict(). + * If a NULL pointer is passed, no operation is performed. */ ZSTDLIB_API size_t ZSTD_freeCDict(ZSTD_CDict* CDict); /*! ZSTD_compress_usingCDict() : @@ -874,7 +880,8 @@ typedef struct ZSTD_DDict_s ZSTD_DDict; ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict(const void* dictBuffer, size_t dictSize); /*! ZSTD_freeDDict() : - * Function frees memory allocated with ZSTD_createDDict() */ + * Function frees memory allocated with ZSTD_createDDict() + * If a NULL pointer is passed, no operation is performed. */ ZSTDLIB_API size_t ZSTD_freeDDict(ZSTD_DDict* ddict); /*! ZSTD_decompress_usingDDict() : @@ -890,19 +897,25 @@ ZSTDLIB_API size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx, * Dictionary helper functions *******************************/ -/*! ZSTD_getDictID_fromDict() : +/*! ZSTD_getDictID_fromDict() : Requires v1.4.0+ * Provides the dictID stored within dictionary. * if @return == 0, the dictionary is not conformant with Zstandard specification. * It can still be loaded, but as a content-only dictionary. */ ZSTDLIB_API unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize); -/*! ZSTD_getDictID_fromDDict() : +/*! ZSTD_getDictID_fromCDict() : Requires v1.5.0+ + * Provides the dictID of the dictionary loaded into `cdict`. + * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty. + * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */ +ZSTDLIB_API unsigned ZSTD_getDictID_fromCDict(const ZSTD_CDict* cdict); + +/*! ZSTD_getDictID_fromDDict() : Requires v1.4.0+ * Provides the dictID of the dictionary loaded into `ddict`. * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty. * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict); -/*! ZSTD_getDictID_fromFrame() : +/*! ZSTD_getDictID_fromFrame() : Requires v1.4.0+ * Provides the dictID required to decompressed the frame stored within `src`. * If @return == 0, the dictID could not be decoded. * This could for one of the following reasons : @@ -916,7 +929,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); /******************************************************************************* - * Advanced dictionary and prefix API + * Advanced dictionary and prefix API (Requires v1.4.0+) * * This API allows dictionaries to be used with ZSTD_compress2(), * ZSTD_compressStream2(), and ZSTD_decompress(). Dictionaries are sticky, and @@ -925,7 +938,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); ******************************************************************************/ -/*! ZSTD_CCtx_loadDictionary() : +/*! ZSTD_CCtx_loadDictionary() : Requires v1.4.0+ * Create an internal CDict from `dict` buffer. * Decompression will have to use same dictionary. * @result : 0, or an error code (which can be tested with ZSTD_isError()). @@ -944,11 +957,11 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); * to precisely select how dictionary content must be interpreted. */ ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); -/*! ZSTD_CCtx_refCDict() : +/*! ZSTD_CCtx_refCDict() : Requires v1.4.0+ * Reference a prepared dictionary, to be used for all next compressed frames. * Note that compression parameters are enforced from within CDict, * and supersede any compression parameter previously set within CCtx. - * The parameters ignored are labled as "superseded-by-cdict" in the ZSTD_cParameter enum docs. + * The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs. * The ignored parameters will be used again if the CCtx is returned to no-dictionary mode. * The dictionary will remain valid for future compressed frames using same CCtx. * @result : 0, or an error code (which can be tested with ZSTD_isError()). @@ -958,7 +971,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, s * Note 2 : CDict is just referenced, its lifetime must outlive its usage within CCtx. */ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); -/*! ZSTD_CCtx_refPrefix() : +/*! ZSTD_CCtx_refPrefix() : Requires v1.4.0+ * Reference a prefix (single-usage dictionary) for next compressed frame. * A prefix is **only used once**. Tables are discarded at end of frame (ZSTD_e_end). * Decompression will need same prefix to properly regenerate data. @@ -979,7 +992,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize); -/*! ZSTD_DCtx_loadDictionary() : +/*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+ * Create an internal DDict from dict buffer, * to be used to decompress next frames. * The dictionary remains valid for all future frames, until explicitly invalidated. @@ -996,9 +1009,16 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, */ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); -/*! ZSTD_DCtx_refDDict() : +/*! ZSTD_DCtx_refDDict() : Requires v1.4.0+ * Reference a prepared dictionary, to be used to decompress next frames. * The dictionary remains active for decompression of future frames using same DCtx. + * + * If called with ZSTD_d_refMultipleDDicts enabled, repeated calls of this function + * will store the DDict references in a table, and the DDict used for decompression + * will be determined at decompression time, as per the dict ID in the frame. + * The memory for the table is allocated on the first call to refDDict, and can be + * freed with ZSTD_freeDCtx(). + * * @result : 0, or an error code (which can be tested with ZSTD_isError()). * Note 1 : Currently, only one dictionary can be managed. * Referencing a new dictionary effectively "discards" any previous one. @@ -1007,7 +1027,7 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, s */ ZSTDLIB_API size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict); -/*! ZSTD_DCtx_refPrefix() : +/*! ZSTD_DCtx_refPrefix() : Requires v1.4.0+ * Reference a prefix (single-usage dictionary) to decompress next frame. * This is the reverse operation of ZSTD_CCtx_refPrefix(), * and must use the same prefix as the one used during compression. @@ -1028,7 +1048,7 @@ ZSTDLIB_API size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx, /* === Memory management === */ -/*! ZSTD_sizeof_*() : +/*! ZSTD_sizeof_*() : Requires v1.4.0+ * These functions give the _current_ memory usage of selected object. * Note that object memory usage can evolve (increase or decrease) over time. */ ZSTDLIB_API size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx); @@ -1053,6 +1073,28 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); #if defined(ZSTD_STATIC_LINKING_ONLY) && !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY) #define ZSTD_H_ZSTD_STATIC_LINKING_ONLY +/* Deprecation warnings : + * Should these warnings be a problem, it is generally possible to disable them, + * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. + * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. + */ +#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS +# define ZSTD_DEPRECATED(message) ZSTDLIB_API /* disable deprecation warnings */ +#else +# if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */ +# define ZSTD_DEPRECATED(message) [[deprecated(message)]] ZSTDLIB_API +# elif (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) +# define ZSTD_DEPRECATED(message) ZSTDLIB_API __attribute__((deprecated(message))) +# elif defined(__GNUC__) && (__GNUC__ >= 3) +# define ZSTD_DEPRECATED(message) ZSTDLIB_API __attribute__((deprecated)) +# elif defined(_MSC_VER) +# define ZSTD_DEPRECATED(message) ZSTDLIB_API __declspec(deprecated(message)) +# else +# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") +# define ZSTD_DEPRECATED(message) ZSTDLIB_API +# endif +#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ + /**************************************************************************************** * experimental API (static linking only) **************************************************************************************** @@ -1115,9 +1157,6 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); #define ZSTD_SRCSIZEHINT_MIN 0 #define ZSTD_SRCSIZEHINT_MAX INT_MAX -/* internal */ -#define ZSTD_HASHLOG3_MAX 17 - /* --- Advanced types --- */ @@ -1205,6 +1244,12 @@ typedef enum { ZSTD_d_ignoreChecksum = 1 } ZSTD_forceIgnoreChecksum_e; +typedef enum { + /* Note: this enum controls ZSTD_d_refMultipleDDicts */ + ZSTD_rmd_refSingleDDict = 0, + ZSTD_rmd_refMultipleDDicts = 1 +} ZSTD_refMultipleDDicts_e; + typedef enum { /* Note: this enum and the behavior it controls are effectively internal * implementation details of the compressor. They are expected to continue @@ -1253,6 +1298,11 @@ typedef enum { ZSTD_lcm_uncompressed = 2 /**< Always emit uncompressed literals. */ } ZSTD_literalCompressionMode_e; +typedef enum { + ZSTD_urm_auto = 0, /* Automatically determine whether or not we use row matchfinder */ + ZSTD_urm_disableRowMatchFinder = 1, /* Never use row matchfinder */ + ZSTD_urm_enableRowMatchFinder = 2 /* Always use row matchfinder when applicable */ +} ZSTD_useRowMatchFinderMode_e; /*************************************** * Frame size functions @@ -1286,7 +1336,7 @@ ZSTDLIB_API unsigned long long ZSTD_findDecompressedSize(const void* src, size_t * `srcSize` must be the _exact_ size of this series * (i.e. there should be a frame boundary at `src + srcSize`) * @return : - upper-bound for the decompressed size of all data in all successive frames - * - if an error occured: ZSTD_CONTENTSIZE_ERROR + * - if an error occurred: ZSTD_CONTENTSIZE_ERROR * * note 1 : an error can occur if `src` contains an invalid or incorrectly formatted frame. * note 2 : the upper-bound is exact when the decompressed size field is available in every ZSTD encoded frame of `src`. @@ -1372,6 +1422,43 @@ ZSTDLIB_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size const void* src, size_t srcSize); +/*! ZSTD_writeSkippableFrame() : + * Generates a zstd skippable frame containing data given by src, and writes it to dst buffer. + * + * Skippable frames begin with a a 4-byte magic number. There are 16 possible choices of magic number, + * ranging from ZSTD_MAGIC_SKIPPABLE_START to ZSTD_MAGIC_SKIPPABLE_START+15. + * As such, the parameter magicVariant controls the exact skippable frame magic number variant used, so + * the magic number used will be ZSTD_MAGIC_SKIPPABLE_START + magicVariant. + * + * Returns an error if destination buffer is not large enough, if the source size is not representable + * with a 4-byte unsigned int, or if the parameter magicVariant is greater than 15 (and therefore invalid). + * + * @return : number of bytes written or a ZSTD error. + */ +ZSTDLIB_API size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity, + const void* src, size_t srcSize, unsigned magicVariant); + +/*! ZSTD_readSkippableFrame() : + * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. + * + * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested + * in the magicVariant. + * + * Returns an error if destination buffer is not large enough, or if the frame is not skippable. + * + * @return : number of bytes written or a ZSTD error. + */ +ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, + const void* src, size_t srcSize); + +/*! ZSTD_isSkippableFrame() : + * Tells if the content of `buffer` starts with a valid Frame Identifier for a skippable frame. + */ +ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); + + + /*************************************** * Memory management ***************************************/ @@ -1506,13 +1593,14 @@ ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictS * Note that the lifetime of such pool must exist while being used. * ZSTD_CCtx_refThreadPool assigns a thread pool to a context (use NULL argument value * to use an internal thread pool). - * ZSTD_freeThreadPool frees a thread pool. + * ZSTD_freeThreadPool frees a thread pool, accepts NULL pointer. */ typedef struct POOL_ctx_s ZSTD_threadPool; ZSTDLIB_API ZSTD_threadPool* ZSTD_createThreadPool(size_t numThreads); -ZSTDLIB_API void ZSTD_freeThreadPool (ZSTD_threadPool* pool); +ZSTDLIB_API void ZSTD_freeThreadPool (ZSTD_threadPool* pool); /* accept NULL pointer */ ZSTDLIB_API size_t ZSTD_CCtx_refThreadPool(ZSTD_CCtx* cctx, ZSTD_threadPool* pool); + /* * This API is temporary and is expected to change or disappear in the future! */ @@ -1523,10 +1611,12 @@ ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced2( const ZSTD_CCtx_params* cctxParams, ZSTD_customMem customMem); -ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize, - ZSTD_dictLoadMethod_e dictLoadMethod, - ZSTD_dictContentType_e dictContentType, - ZSTD_customMem customMem); +ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced( + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_customMem customMem); + /*************************************** * Advanced compression functions @@ -1540,12 +1630,6 @@ ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictS * note: equivalent to ZSTD_createCDict_advanced(), with dictLoadMethod==ZSTD_dlm_byRef */ ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, size_t dictSize, int compressionLevel); -/*! ZSTD_getDictID_fromCDict() : - * Provides the dictID of the dictionary loaded into `cdict`. - * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty. - * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */ -ZSTDLIB_API unsigned ZSTD_getDictID_fromCDict(const ZSTD_CDict* cdict); - /*! ZSTD_getCParams() : * @return ZSTD_compressionParameters structure for a selected compression level and estimated srcSize. * `estimatedSrcSize` value is optional, select 0 if not known */ @@ -1572,18 +1656,20 @@ ZSTDLIB_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParame /*! ZSTD_compress_advanced() : * Note : this function is now DEPRECATED. * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters. - * This prototype will be marked as deprecated and generate compilation warning on reaching v1.5.x */ -ZSTDLIB_API size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, + * This prototype will generate compilation warnings. */ +ZSTD_DEPRECATED("use ZSTD_compress2") +size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const void* dict,size_t dictSize, ZSTD_parameters params); /*! ZSTD_compress_usingCDict_advanced() : - * Note : this function is now REDUNDANT. + * Note : this function is now DEPRECATED. * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters. - * This prototype will be marked as deprecated and generate compilation warning in some future version */ -ZSTDLIB_API size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + * This prototype will generate compilation warnings. */ +ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary") +size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const ZSTD_CDict* cdict, @@ -1645,7 +1731,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* pre /* Controls how the literals are compressed (default is auto). * The value must be of type ZSTD_literalCompressionMode_e. - * See ZSTD_literalCompressionMode_t enum definition for details. + * See ZSTD_literalCompressionMode_e enum definition for details. */ #define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5 @@ -1797,12 +1883,52 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* pre */ #define ZSTD_c_validateSequences ZSTD_c_experimentalParam12 +/* ZSTD_c_splitBlocks + * Default is 0 == disabled. Set to 1 to enable block splitting. + * + * Will attempt to split blocks in order to improve compression ratio at the cost of speed. + */ +#define ZSTD_c_splitBlocks ZSTD_c_experimentalParam13 + +/* ZSTD_c_useRowMatchFinder + * Default is ZSTD_urm_auto. + * Controlled with ZSTD_useRowMatchFinderMode_e enum. + * + * By default, in ZSTD_urm_auto, when finalizing the compression parameters, the library + * will decide at runtime whether to use the row-based matchfinder based on support for SIMD + * instructions as well as the windowLog. + * + * Set to ZSTD_urm_disableRowMatchFinder to never use row-based matchfinder. + * Set to ZSTD_urm_enableRowMatchFinder to force usage of row-based matchfinder. + */ +#define ZSTD_c_useRowMatchFinder ZSTD_c_experimentalParam14 + +/* ZSTD_c_deterministicRefPrefix + * Default is 0 == disabled. Set to 1 to enable. + * + * Zstd produces different results for prefix compression when the prefix is + * directly adjacent to the data about to be compressed vs. when it isn't. + * This is because zstd detects that the two buffers are contiguous and it can + * use a more efficient match finding algorithm. However, this produces different + * results than when the two buffers are non-contiguous. This flag forces zstd + * to always load the prefix in non-contiguous mode, even if it happens to be + * adjacent to the data, to guarantee determinism. + * + * If you really care about determinism when using a dictionary or prefix, + * like when doing delta compression, you should select this option. It comes + * at a speed penalty of about ~2.5% if the dictionary and data happened to be + * contiguous, and is free if they weren't contiguous. We don't expect that + * intentionally making the dictionary and data contiguous will be worth the + * cost to memcpy() the data. + */ +#define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15 + /*! ZSTD_CCtx_getParameter() : * Get the requested compression parameter value, selected by enum ZSTD_cParameter, * and store it into int* value. * @return : 0, or an error code (which can be tested with ZSTD_isError()). */ -ZSTDLIB_API size_t ZSTD_CCtx_getParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int* value); +ZSTDLIB_API size_t ZSTD_CCtx_getParameter(const ZSTD_CCtx* cctx, ZSTD_cParameter param, int* value); /*! ZSTD_CCtx_params : @@ -1817,13 +1943,13 @@ ZSTDLIB_API size_t ZSTD_CCtx_getParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param * These parameters will be applied to * all subsequent frames. * - ZSTD_compressStream2() : Do compression using the CCtx. - * - ZSTD_freeCCtxParams() : Free the memory. + * - ZSTD_freeCCtxParams() : Free the memory, accept NULL pointer. * * This can be used with ZSTD_estimateCCtxSize_advanced_usingCCtxParams() * for static allocation of CCtx for single-threaded compression. */ ZSTDLIB_API ZSTD_CCtx_params* ZSTD_createCCtxParams(void); -ZSTDLIB_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params); +ZSTDLIB_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params); /* accept NULL pointer */ /*! ZSTD_CCtxParams_reset() : * Reset params to default values. @@ -1842,7 +1968,7 @@ ZSTDLIB_API size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compre */ ZSTDLIB_API size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params); -/*! ZSTD_CCtxParams_setParameter() : +/*! ZSTD_CCtxParams_setParameter() : Requires v1.4.0+ * Similar to ZSTD_CCtx_setParameter. * Set one compression parameter, selected by enum ZSTD_cParameter. * Parameters must be applied to a ZSTD_CCtx using @@ -1857,7 +1983,7 @@ ZSTDLIB_API size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* params, ZSTD_c * Get the requested value of one compression parameter, selected by enum ZSTD_cParameter. * @result : 0, or an error code (which can be tested with ZSTD_isError()). */ -ZSTDLIB_API size_t ZSTD_CCtxParams_getParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, int* value); +ZSTDLIB_API size_t ZSTD_CCtxParams_getParameter(const ZSTD_CCtx_params* params, ZSTD_cParameter param, int* value); /*! ZSTD_CCtx_setParametersUsingCCtxParams() : * Apply a set of ZSTD_CCtx_params to the compression context. @@ -1983,12 +2109,38 @@ ZSTDLIB_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param */ #define ZSTD_d_forceIgnoreChecksum ZSTD_d_experimentalParam3 +/* ZSTD_d_refMultipleDDicts + * Experimental parameter. + * Default is 0 == disabled. Set to 1 to enable + * + * If enabled and dctx is allocated on the heap, then additional memory will be allocated + * to store references to multiple ZSTD_DDict. That is, multiple calls of ZSTD_refDDict() + * using a given ZSTD_DCtx, rather than overwriting the previous DDict reference, will instead + * store all references. At decompression time, the appropriate dictID is selected + * from the set of DDicts based on the dictID in the frame. + * + * Usage is simply calling ZSTD_refDDict() on multiple dict buffers. + * + * Param has values of byte ZSTD_refMultipleDDicts_e + * + * WARNING: Enabling this parameter and calling ZSTD_DCtx_refDDict(), will trigger memory + * allocation for the hash table. ZSTD_freeDCtx() also frees this memory. + * Memory is allocated as per ZSTD_DCtx::customMem. + * + * Although this function allocates memory for the table, the user is still responsible for + * memory management of the underlying ZSTD_DDict* themselves. + */ +#define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4 + + /*! ZSTD_DCtx_setFormat() : + * This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter(). * Instruct the decoder context about what kind of data to decode next. * This instruction is mandatory to decode data without a fully-formed header, * such ZSTD_f_zstd1_magicless for example. * @return : 0, or an error code (which can be tested using ZSTD_isError()). */ -ZSTDLIB_API size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); +ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead") +size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); /*! ZSTD_decompressStream_simpleArgs() : * Same as ZSTD_decompressStream(), @@ -2012,7 +2164,7 @@ ZSTDLIB_API size_t ZSTD_decompressStream_simpleArgs ( /*===== Advanced Streaming compression functions =====*/ /*! ZSTD_initCStream_srcSize() : - * This function is deprecated, and equivalent to: + * This function is DEPRECATED, and equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); @@ -2021,15 +2173,15 @@ ZSTDLIB_API size_t ZSTD_decompressStream_simpleArgs ( * pledgedSrcSize must be correct. If it is not known at init time, use * ZSTD_CONTENTSIZE_UNKNOWN. Note that, for compatibility with older programs, * "0" also disables frame content size field. It may be enabled in the future. - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + * This prototype will generate compilation warnings. */ -ZSTDLIB_API size_t -ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, +ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") +size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pledgedSrcSize); /*! ZSTD_initCStream_usingDict() : - * This function is deprecated, and is equivalent to: + * This function is DEPRECATED, and is equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); @@ -2038,15 +2190,15 @@ ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, * dict == NULL or dictSize < 8, in which case no dict is used. * Note: dict is loaded with ZSTD_dct_auto (treated as a full zstd dictionary if * it begins with ZSTD_MAGIC_DICTIONARY, else as raw content) and ZSTD_dlm_byCopy. - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + * This prototype will generate compilation warnings. */ -ZSTDLIB_API size_t -ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, +ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") +size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel); /*! ZSTD_initCStream_advanced() : - * This function is deprecated, and is approximately equivalent to: + * This function is DEPRECATED, and is approximately equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * // Pseudocode: Set each zstd parameter and leave the rest as-is. * for ((param, value) : params) { @@ -2058,23 +2210,24 @@ ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, * dict is loaded with ZSTD_dct_auto and ZSTD_dlm_byCopy. * pledgedSrcSize must be correct. * If srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN. - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + * This prototype will generate compilation warnings. */ -ZSTDLIB_API size_t -ZSTD_initCStream_advanced(ZSTD_CStream* zcs, +ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") +size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*! ZSTD_initCStream_usingCDict() : - * This function is deprecated, and equivalent to: + * This function is DEPRECATED, and equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * ZSTD_CCtx_refCDict(zcs, cdict); * * note : cdict will just be referenced, and must outlive compression session - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + * This prototype will generate compilation warnings. */ -ZSTDLIB_API size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); +ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") +size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); /*! ZSTD_initCStream_usingCDict_advanced() : * This function is DEPRECATED, and is approximately equivalent to: @@ -2089,18 +2242,21 @@ ZSTDLIB_API size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDi * same as ZSTD_initCStream_usingCDict(), with control over frame parameters. * pledgedSrcSize must be correct. If srcSize is not known at init time, use * value ZSTD_CONTENTSIZE_UNKNOWN. - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + * This prototype will generate compilation warnings. */ -ZSTDLIB_API size_t -ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, +ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") +size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, const ZSTD_CDict* cdict, ZSTD_frameParameters fParams, unsigned long long pledgedSrcSize); /*! ZSTD_resetCStream() : - * This function is deprecated, and is equivalent to: + * This function is DEPRECATED, and is equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * Note: ZSTD_resetCStream() interprets pledgedSrcSize == 0 as ZSTD_CONTENTSIZE_UNKNOWN, but + * ZSTD_CCtx_setPledgedSrcSize() does not do the same, so ZSTD_CONTENTSIZE_UNKNOWN must be + * explicitly specified. * * start a new frame, using same parameters from previous frame. * This is typically useful to skip dictionary loading stage, since it will re-use it in-place. @@ -2110,9 +2266,10 @@ ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, * For the time being, pledgedSrcSize==0 is interpreted as "srcSize unknown" for compatibility with older programs, * but it will change to mean "empty" in future version, so use macro ZSTD_CONTENTSIZE_UNKNOWN instead. * @return : 0, or an error code (which can be tested using ZSTD_isError()) - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + * This prototype will generate compilation warnings. */ -ZSTDLIB_API size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); +ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") +size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); typedef struct { @@ -2199,8 +2356,7 @@ ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); ZSTD_CCtx object can be re-used multiple times within successive compression operations. Start by initializing a context. - Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression, - or ZSTD_compressBegin_advanced(), for finer parameter control. + Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression. It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx() Then, consume your input using ZSTD_compressContinue(). @@ -2225,15 +2381,17 @@ ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); /*===== Buffer-less streaming compression functions =====*/ ZSTDLIB_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); ZSTDLIB_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); -ZSTDLIB_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /**< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /**< note: fails if cdict==NULL */ -ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ ZSTDLIB_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /**< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ ZSTDLIB_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); - +/* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */ +ZSTD_DEPRECATED("use advanced API to access custom parameters") +size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /**< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ +ZSTD_DEPRECATED("use advanced API to access custom parameters") +size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ /** Buffer-less streaming decompression (synchronous mode) diff --git a/zstd_common.c b/zstd_common.c index 939e9f0..3d7e35b 100644 --- a/zstd_common.c +++ b/zstd_common.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/zstd_compress.c b/zstd_compress.c index dd6f68d..fc2d314 100644 --- a/zstd_compress.c +++ b/zstd_compress.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -42,6 +42,18 @@ # define ZSTD_COMPRESS_HEAPMODE 0 #endif +/*! + * ZSTD_HASHLOG3_MAX : + * Maximum size of the hash table dedicated to find 3-bytes matches, + * in log format, aka 17 => 1 << 17 == 128Ki positions. + * This structure is only used in zstd_opt. + * Since allocation is centralized for all strategies, it has to be known here. + * The actual (selected) size of the hash table is then stored in ZSTD_matchState_t.hashLog3, + * so that zstd_opt.c doesn't need to know about this constant. + */ +#ifndef ZSTD_HASHLOG3_MAX +# define ZSTD_HASHLOG3_MAX 17 +#endif /*-************************************* * Helper functions @@ -72,6 +84,10 @@ struct ZSTD_CDict_s { ZSTD_customMem customMem; U32 dictID; int compressionLevel; /* 0 indicates that advanced API was used to select CDict params */ + ZSTD_useRowMatchFinderMode_e useRowMatchFinder; /* Indicates whether the CDict was created with params that would use + * row-based matchfinder. Unless the cdict is reloaded, we will use + * the same greedy/lazy matchfinder at compression time. + */ }; /* typedef'd to ZSTD_CDict within "zstd.h" */ ZSTD_CCtx* ZSTD_createCCtx(void) @@ -202,6 +218,49 @@ size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs) /* private API call, for dictBuilder only */ const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->seqStore); } +/* Returns true if the strategy supports using a row based matchfinder */ +static int ZSTD_rowMatchFinderSupported(const ZSTD_strategy strategy) { + return (strategy >= ZSTD_greedy && strategy <= ZSTD_lazy2); +} + +/* Returns true if the strategy and useRowMatchFinder mode indicate that we will use the row based matchfinder + * for this compression. + */ +static int ZSTD_rowMatchFinderUsed(const ZSTD_strategy strategy, const ZSTD_useRowMatchFinderMode_e mode) { + assert(mode != ZSTD_urm_auto); + return ZSTD_rowMatchFinderSupported(strategy) && (mode == ZSTD_urm_enableRowMatchFinder); +} + +/* Returns row matchfinder usage enum given an initial mode and cParams */ +static ZSTD_useRowMatchFinderMode_e ZSTD_resolveRowMatchFinderMode(ZSTD_useRowMatchFinderMode_e mode, + const ZSTD_compressionParameters* const cParams) { +#if defined(ZSTD_ARCH_X86_SSE2) || defined(ZSTD_ARCH_ARM_NEON) + int const kHasSIMD128 = 1; +#else + int const kHasSIMD128 = 0; +#endif + if (mode != ZSTD_urm_auto) return mode; /* if requested enabled, but no SIMD, we still will use row matchfinder */ + mode = ZSTD_urm_disableRowMatchFinder; + if (!ZSTD_rowMatchFinderSupported(cParams->strategy)) return mode; + if (kHasSIMD128) { + if (cParams->windowLog > 14) mode = ZSTD_urm_enableRowMatchFinder; + } else { + if (cParams->windowLog > 17) mode = ZSTD_urm_enableRowMatchFinder; + } + return mode; +} + +/* Returns 1 if the arguments indicate that we should allocate a chainTable, 0 otherwise */ +static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, + const ZSTD_useRowMatchFinderMode_e useRowMatchFinder, + const U32 forDDSDict) { + assert(useRowMatchFinder != ZSTD_urm_auto); + /* We always should allocate a chaintable if we are allocating a matchstate for a DDS dictionary matchstate. + * We do not allocate a chaintable if we are using ZSTD_fast, or are using the row-based matchfinder. + */ + return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder)); +} + /* Returns 1 if compression parameters are such that we should * enable long distance matching (wlog >= 27, strategy >= btopt). * Returns 0 otherwise. @@ -210,6 +269,14 @@ static U32 ZSTD_CParams_shouldEnableLdm(const ZSTD_compressionParameters* const return cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27; } +/* Returns 1 if compression parameters are such that we should + * enable blockSplitter (wlog >= 17, strategy >= btopt). + * Returns 0 otherwise. + */ +static U32 ZSTD_CParams_useBlockSplitter(const ZSTD_compressionParameters* const cParams) { + return cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 17; +} + static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( ZSTD_compressionParameters cParams) { @@ -218,6 +285,7 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( ZSTD_CCtxParams_init(&cctxParams, ZSTD_CLEVEL_DEFAULT); cctxParams.cParams = cParams; + /* Adjust advanced params according to cParams */ if (ZSTD_CParams_shouldEnableLdm(&cParams)) { DEBUGLOG(4, "ZSTD_makeCCtxParamsFromCParams(): Including LDM into cctx params"); cctxParams.ldmParams.enableLdm = 1; @@ -227,6 +295,12 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( assert(cctxParams.ldmParams.hashRateLog < 32); } + if (ZSTD_CParams_useBlockSplitter(&cParams)) { + DEBUGLOG(4, "ZSTD_makeCCtxParamsFromCParams(): Including block splitting into cctx params"); + cctxParams.splitBlocks = 1; + } + + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); assert(!ZSTD_checkCParams(cParams)); return cctxParams; } @@ -269,29 +343,59 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) return 0; } +#define ZSTD_NO_CLEVEL 0 + +/** + * Initializes the cctxParams from params and compressionLevel. + * @param compressionLevel If params are derived from a compression level then that compression level, otherwise ZSTD_NO_CLEVEL. + */ +static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_parameters const* params, int compressionLevel) +{ + assert(!ZSTD_checkCParams(params->cParams)); + ZSTD_memset(cctxParams, 0, sizeof(*cctxParams)); + cctxParams->cParams = params->cParams; + cctxParams->fParams = params->fParams; + /* Should not matter, as all cParams are presumed properly defined. + * But, set it for tracing anyway. + */ + cctxParams->compressionLevel = compressionLevel; + cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, ¶ms->cParams); + DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d", cctxParams->useRowMatchFinder); + + if (ZSTD_CParams_shouldEnableLdm(¶ms->cParams)) { + /* Enable LDM by default for optimal parser and window size >= 128MB */ + DEBUGLOG(4, "LDM enabled by default (window size >= 128MB, strategy >= btopt)"); + cctxParams->ldmParams.enableLdm = 1; + } + + if (ZSTD_CParams_useBlockSplitter(¶ms->cParams)) { + DEBUGLOG(4, "Block splitter enabled by default (window size >= 128K, strategy >= btopt)"); + cctxParams->splitBlocks = 1; + } +} + size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params) { RETURN_ERROR_IF(!cctxParams, GENERIC, "NULL pointer!"); FORWARD_IF_ERROR( ZSTD_checkCParams(params.cParams) , ""); - ZSTD_memset(cctxParams, 0, sizeof(*cctxParams)); - assert(!ZSTD_checkCParams(params.cParams)); - cctxParams->cParams = params.cParams; - cctxParams->fParams = params.fParams; - cctxParams->compressionLevel = ZSTD_CLEVEL_DEFAULT; /* should not matter, as all cParams are presumed properly defined */ + ZSTD_CCtxParams_init_internal(cctxParams, ¶ms, ZSTD_NO_CLEVEL); return 0; } -/* ZSTD_assignParamsToCCtxParams() : - * params is presumed valid at this stage */ -static ZSTD_CCtx_params ZSTD_assignParamsToCCtxParams( - const ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params) +/** + * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone. + * @param param Validated zstd parameters. + */ +static void ZSTD_CCtxParams_setZstdParams( + ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params) { - ZSTD_CCtx_params ret = *cctxParams; assert(!ZSTD_checkCParams(params->cParams)); - ret.cParams = params->cParams; - ret.fParams = params->fParams; - ret.compressionLevel = ZSTD_CLEVEL_DEFAULT; /* should not matter, as all cParams are presumed properly defined */ - return ret; + cctxParams->cParams = params->cParams; + cctxParams->fParams = params->fParams; + /* Should not matter, as all cParams are presumed properly defined. + * But, set it for tracing anyway. + */ + cctxParams->compressionLevel = ZSTD_NO_CLEVEL; } ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) @@ -468,6 +572,21 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) bounds.upperBound = 1; return bounds; + case ZSTD_c_splitBlocks: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_useRowMatchFinder: + bounds.lowerBound = (int)ZSTD_urm_auto; + bounds.upperBound = (int)ZSTD_urm_enableRowMatchFinder; + return bounds; + + case ZSTD_c_deterministicRefPrefix: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + default: bounds.error = ERROR(parameter_unsupported); return bounds; @@ -529,6 +648,9 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) case ZSTD_c_stableOutBuffer: case ZSTD_c_blockDelimiters: case ZSTD_c_validateSequences: + case ZSTD_c_splitBlocks: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: default: return 0; } @@ -581,6 +703,9 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) case ZSTD_c_stableOutBuffer: case ZSTD_c_blockDelimiters: case ZSTD_c_validateSequences: + case ZSTD_c_splitBlocks: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: break; default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); @@ -792,17 +917,32 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, CCtxParams->validateSequences = value; return CCtxParams->validateSequences; + case ZSTD_c_splitBlocks: + BOUNDCHECK(ZSTD_c_splitBlocks, value); + CCtxParams->splitBlocks = value; + return CCtxParams->splitBlocks; + + case ZSTD_c_useRowMatchFinder: + BOUNDCHECK(ZSTD_c_useRowMatchFinder, value); + CCtxParams->useRowMatchFinder = (ZSTD_useRowMatchFinderMode_e)value; + return CCtxParams->useRowMatchFinder; + + case ZSTD_c_deterministicRefPrefix: + BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value); + CCtxParams->deterministicRefPrefix = !!value; + return CCtxParams->deterministicRefPrefix; + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } } -size_t ZSTD_CCtx_getParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int* value) +size_t ZSTD_CCtx_getParameter(ZSTD_CCtx const* cctx, ZSTD_cParameter param, int* value) { return ZSTD_CCtxParams_getParameter(&cctx->requestedParams, param, value); } size_t ZSTD_CCtxParams_getParameter( - ZSTD_CCtx_params* CCtxParams, ZSTD_cParameter param, int* value) + ZSTD_CCtx_params const* CCtxParams, ZSTD_cParameter param, int* value) { switch(param) { @@ -915,6 +1055,15 @@ size_t ZSTD_CCtxParams_getParameter( case ZSTD_c_validateSequences : *value = (int)CCtxParams->validateSequences; break; + case ZSTD_c_splitBlocks : + *value = (int)CCtxParams->splitBlocks; + break; + case ZSTD_c_useRowMatchFinder : + *value = (int)CCtxParams->useRowMatchFinder; + break; + case ZSTD_c_deterministicRefPrefix: + *value = (int)CCtxParams->deterministicRefPrefix; + break; default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } return 0; @@ -1188,15 +1337,26 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); assert(ZSTD_checkCParams(cPar)==0); - if (dictSize && srcSize == ZSTD_CONTENTSIZE_UNKNOWN) - srcSize = minSrcSize; - switch (mode) { - case ZSTD_cpm_noAttachDict: case ZSTD_cpm_unknown: + case ZSTD_cpm_noAttachDict: + /* If we don't know the source size, don't make any + * assumptions about it. We will already have selected + * smaller parameters if a dictionary is in use. + */ + break; case ZSTD_cpm_createCDict: + /* Assume a small source size when creating a dictionary + * with an unkown source size. + */ + if (dictSize && srcSize == ZSTD_CONTENTSIZE_UNKNOWN) + srcSize = minSrcSize; break; case ZSTD_cpm_attachDict: + /* Dictionary has its own dedicated parameters which have + * already been selected. We are selecting parameters + * for only the source. + */ dictSize = 0; break; default: @@ -1213,7 +1373,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, ZSTD_highbit32(tSize-1) + 1; if (cPar.windowLog > srcLog) cPar.windowLog = srcLog; } - { U32 const dictAndWindowLog = ZSTD_dictAndWindowLog(cPar.windowLog, (U64)srcSize, (U64)dictSize); + if (srcSize != ZSTD_CONTENTSIZE_UNKNOWN) { + U32 const dictAndWindowLog = ZSTD_dictAndWindowLog(cPar.windowLog, (U64)srcSize, (U64)dictSize); U32 const cycleLog = ZSTD_cycleLog(cPar.chainLog, cPar.strategy); if (cPar.hashLog > dictAndWindowLog+1) cPar.hashLog = dictAndWindowLog+1; if (cycleLog > dictAndWindowLog) @@ -1269,9 +1430,14 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( static size_t ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + const ZSTD_useRowMatchFinderMode_e useRowMatchFinder, + const U32 enableDedicatedDictSearch, const U32 forCCtx) { - size_t const chainSize = (cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cParams->chainLog); + /* chain table size should be 0 for fast or row-hash strategies */ + size_t const chainSize = ZSTD_allocateChainTable(cParams->strategy, useRowMatchFinder, enableDedicatedDictSearch && !forCCtx) + ? ((size_t)1 << cParams->chainLog) + : 0; size_t const hSize = ((size_t)1) << cParams->hashLog; U32 const hashLog3 = (forCCtx && cParams->minMatch==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0; size_t const h3Size = hashLog3 ? ((size_t)1) << hashLog3 : 0; @@ -1281,24 +1447,34 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + hSize * sizeof(U32) + h3Size * sizeof(U32); size_t const optPotentialSpace = - ZSTD_cwksp_alloc_size((MaxML+1) * sizeof(U32)) - + ZSTD_cwksp_alloc_size((MaxLL+1) * sizeof(U32)) - + ZSTD_cwksp_alloc_size((MaxOff+1) * sizeof(U32)) - + ZSTD_cwksp_alloc_size((1<strategy, useRowMatchFinder) + ? ZSTD_cwksp_aligned_alloc_size(hSize*sizeof(U16)) + : 0; size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt)) ? optPotentialSpace : 0; + size_t const slackSpace = ZSTD_cwksp_slack_space_required(); + + /* tables are guaranteed to be sized in multiples of 64 bytes (or 16 uint32_t) */ + ZSTD_STATIC_ASSERT(ZSTD_HASHLOG_MIN >= 4 && ZSTD_WINDOWLOG_MIN >= 4 && ZSTD_CHAINLOG_MIN >= 4); + assert(useRowMatchFinder != ZSTD_urm_auto); + DEBUGLOG(4, "chainSize: %u - hSize: %u - h3Size: %u", (U32)chainSize, (U32)hSize, (U32)h3Size); - return tableSpace + optSpace; + return tableSpace + optSpace + slackSpace + lazyAdditionalSpace; } static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( const ZSTD_compressionParameters* cParams, const ldmParams_t* ldmParams, const int isStatic, + const ZSTD_useRowMatchFinderMode_e useRowMatchFinder, const size_t buffInSize, const size_t buffOutSize, const U64 pledgedSrcSize) @@ -1308,16 +1484,16 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( U32 const divider = (cParams->minMatch==3) ? 3 : 4; size_t const maxNbSeq = blockSize / divider; size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) - + ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(seqDef)) + + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef)) + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); size_t const entropySpace = ZSTD_cwksp_alloc_size(ENTROPY_WORKSPACE_SIZE); size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t)); - size_t const matchStateSize = ZSTD_sizeof_matchState(cParams, /* forCCtx */ 1); + size_t const matchStateSize = ZSTD_sizeof_matchState(cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 0, /* forCCtx */ 1); size_t const ldmSpace = ZSTD_ldm_getTableSize(*ldmParams); size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(*ldmParams, blockSize); size_t const ldmSeqSpace = ldmParams->enableLdm ? - ZSTD_cwksp_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0; + ZSTD_cwksp_aligned_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0; size_t const bufferSpace = ZSTD_cwksp_alloc_size(buffInSize) @@ -1343,25 +1519,45 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) { ZSTD_compressionParameters const cParams = ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); + ZSTD_useRowMatchFinderMode_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, + &cParams); RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); /* estimateCCtxSize is for one-shot compression. So no buffers should * be needed. However, we still allocate two 0-sized buffers, which can * take space under ASAN. */ return ZSTD_estimateCCtxSize_usingCCtxParams_internal( - &cParams, ¶ms->ldmParams, 1, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN); + &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN); } size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) { - ZSTD_CCtx_params const params = ZSTD_makeCCtxParamsFromCParams(cParams); - return ZSTD_estimateCCtxSize_usingCCtxParams(¶ms); + ZSTD_CCtx_params initialParams = ZSTD_makeCCtxParamsFromCParams(cParams); + if (ZSTD_rowMatchFinderSupported(cParams.strategy)) { + /* Pick bigger of not using and using row-based matchfinder for greedy and lazy strategies */ + size_t noRowCCtxSize; + size_t rowCCtxSize; + initialParams.useRowMatchFinder = ZSTD_urm_disableRowMatchFinder; + noRowCCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams); + initialParams.useRowMatchFinder = ZSTD_urm_enableRowMatchFinder; + rowCCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams); + return MAX(noRowCCtxSize, rowCCtxSize); + } else { + return ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams); + } } static size_t ZSTD_estimateCCtxSize_internal(int compressionLevel) { - ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); - return ZSTD_estimateCCtxSize_usingCParams(cParams); + int tier = 0; + size_t largestSize = 0; + static const unsigned long long srcSizeTiers[4] = {16 KB, 128 KB, 256 KB, ZSTD_CONTENTSIZE_UNKNOWN}; + for (; tier < 4; ++tier) { + /* Choose the set of cParams for a given level across all srcSizes that give the largest cctxSize */ + ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, srcSizeTiers[tier], 0, ZSTD_cpm_noAttachDict); + largestSize = MAX(ZSTD_estimateCCtxSize_usingCParams(cParams), largestSize); + } + return largestSize; } size_t ZSTD_estimateCCtxSize(int compressionLevel) @@ -1369,6 +1565,7 @@ size_t ZSTD_estimateCCtxSize(int compressionLevel) int level; size_t memBudget = 0; for (level=MIN(compressionLevel, 1); level<=compressionLevel; level++) { + /* Ensure monotonically increasing memory usage as compression level increases */ size_t const newMB = ZSTD_estimateCCtxSize_internal(level); if (newMB > memBudget) memBudget = newMB; } @@ -1387,17 +1584,29 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) size_t const outBuffSize = (params->outBufferMode == ZSTD_bm_buffered) ? ZSTD_compressBound(blockSize) + 1 : 0; + ZSTD_useRowMatchFinderMode_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, ¶ms->cParams); return ZSTD_estimateCCtxSize_usingCCtxParams_internal( - &cParams, ¶ms->ldmParams, 1, inBuffSize, outBuffSize, + &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize, ZSTD_CONTENTSIZE_UNKNOWN); } } size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams) { - ZSTD_CCtx_params const params = ZSTD_makeCCtxParamsFromCParams(cParams); - return ZSTD_estimateCStreamSize_usingCCtxParams(¶ms); + ZSTD_CCtx_params initialParams = ZSTD_makeCCtxParamsFromCParams(cParams); + if (ZSTD_rowMatchFinderSupported(cParams.strategy)) { + /* Pick bigger of not using and using row-based matchfinder for greedy and lazy strategies */ + size_t noRowCCtxSize; + size_t rowCCtxSize; + initialParams.useRowMatchFinder = ZSTD_urm_disableRowMatchFinder; + noRowCCtxSize = ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams); + initialParams.useRowMatchFinder = ZSTD_urm_enableRowMatchFinder; + rowCCtxSize = ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams); + return MAX(noRowCCtxSize, rowCCtxSize); + } else { + return ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams); + } } static size_t ZSTD_estimateCStreamSize_internal(int compressionLevel) @@ -1522,20 +1731,27 @@ typedef enum { ZSTD_resetTarget_CCtx } ZSTD_resetTarget_e; + static size_t ZSTD_reset_matchState(ZSTD_matchState_t* ms, ZSTD_cwksp* ws, const ZSTD_compressionParameters* cParams, + const ZSTD_useRowMatchFinderMode_e useRowMatchFinder, const ZSTD_compResetPolicy_e crp, const ZSTD_indexResetPolicy_e forceResetIndex, const ZSTD_resetTarget_e forWho) { - size_t const chainSize = (cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cParams->chainLog); + /* disable chain table allocation for fast or row-based strategies */ + size_t const chainSize = ZSTD_allocateChainTable(cParams->strategy, useRowMatchFinder, + ms->dedicatedDictSearch && (forWho == ZSTD_resetTarget_CDict)) + ? ((size_t)1 << cParams->chainLog) + : 0; size_t const hSize = ((size_t)1) << cParams->hashLog; U32 const hashLog3 = ((forWho == ZSTD_resetTarget_CCtx) && cParams->minMatch==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0; size_t const h3Size = hashLog3 ? ((size_t)1) << hashLog3 : 0; DEBUGLOG(4, "reset indices : %u", forceResetIndex == ZSTDirp_reset); + assert(useRowMatchFinder != ZSTD_urm_auto); if (forceResetIndex == ZSTDirp_reset) { ZSTD_window_init(&ms->window); ZSTD_cwksp_mark_tables_dirty(ws); @@ -1574,11 +1790,23 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); } + if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { + { /* Row match finder needs an additional table of hashes ("tags") */ + size_t const tagTableSize = hSize*sizeof(U16); + ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize); + if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize); + } + { /* Switch to 32-entry rows if searchLog is 5 (or more) */ + U32 const rowLog = MAX(MIN(cParams->searchLog, 6), 4); + assert(cParams->hashLog >= rowLog); + ms->rowHashLog = cParams->hashLog - rowLog; + } + } + ms->cParams = *cParams; RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, "failed a workspace allocation in ZSTD_reset_matchState"); - return 0; } @@ -1595,62 +1823,85 @@ static int ZSTD_indexTooCloseToMax(ZSTD_window_t w) return (size_t)(w.nextSrc - w.base) > (ZSTD_CURRENT_MAX - ZSTD_INDEXOVERFLOW_MARGIN); } +/** ZSTD_dictTooBig(): + * When dictionaries are larger than ZSTD_CHUNKSIZE_MAX they can't be loaded in + * one go generically. So we ensure that in that case we reset the tables to zero, + * so that we can load as much of the dictionary as possible. + */ +static int ZSTD_dictTooBig(size_t const loadedDictSize) +{ + return loadedDictSize > ZSTD_CHUNKSIZE_MAX; +} + /*! ZSTD_resetCCtx_internal() : - note : `params` are assumed fully validated at this stage */ + * @param loadedDictSize The size of the dictionary to be loaded + * into the context, if any. If no dictionary is used, or the + * dictionary is being attached / copied, then pass 0. + * note : `params` are assumed fully validated at this stage. + */ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, - ZSTD_CCtx_params params, + ZSTD_CCtx_params const* params, U64 const pledgedSrcSize, + size_t const loadedDictSize, ZSTD_compResetPolicy_e const crp, ZSTD_buffered_policy_e const zbuff) { ZSTD_cwksp* const ws = &zc->workspace; - DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u", - (U32)pledgedSrcSize, params.cParams.windowLog); - assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); + DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u, useRowMatchFinder=%d", + (U32)pledgedSrcSize, params->cParams.windowLog, (int)params->useRowMatchFinder); + assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams))); zc->isFirstBlock = 1; - if (params.ldmParams.enableLdm) { + /* Set applied params early so we can modify them for LDM, + * and point params at the applied params. + */ + zc->appliedParams = *params; + params = &zc->appliedParams; + + assert(params->useRowMatchFinder != ZSTD_urm_auto); + if (params->ldmParams.enableLdm) { /* Adjust long distance matching parameters */ - ZSTD_ldm_adjustParameters(¶ms.ldmParams, ¶ms.cParams); - assert(params.ldmParams.hashLog >= params.ldmParams.bucketSizeLog); - assert(params.ldmParams.hashRateLog < 32); - zc->ldmState.hashPower = ZSTD_rollingHash_primePower(params.ldmParams.minMatchLength); + ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, ¶ms->cParams); + assert(params->ldmParams.hashLog >= params->ldmParams.bucketSizeLog); + assert(params->ldmParams.hashRateLog < 32); } - { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params.cParams.windowLog), pledgedSrcSize)); + { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize)); size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); - U32 const divider = (params.cParams.minMatch==3) ? 3 : 4; + U32 const divider = (params->cParams.minMatch==3) ? 3 : 4; size_t const maxNbSeq = blockSize / divider; - size_t const buffOutSize = (zbuff == ZSTDb_buffered && params.outBufferMode == ZSTD_bm_buffered) + size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered) ? ZSTD_compressBound(blockSize) + 1 : 0; - size_t const buffInSize = (zbuff == ZSTDb_buffered && params.inBufferMode == ZSTD_bm_buffered) + size_t const buffInSize = (zbuff == ZSTDb_buffered && params->inBufferMode == ZSTD_bm_buffered) ? windowSize + blockSize : 0; - size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(params.ldmParams, blockSize); + size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(params->ldmParams, blockSize); int const indexTooClose = ZSTD_indexTooCloseToMax(zc->blockState.matchState.window); + int const dictTooBig = ZSTD_dictTooBig(loadedDictSize); ZSTD_indexResetPolicy_e needsIndexReset = - (!indexTooClose && zc->initialized) ? ZSTDirp_continue : ZSTDirp_reset; + (indexTooClose || dictTooBig || !zc->initialized) ? ZSTDirp_reset : ZSTDirp_continue; size_t const neededSpace = ZSTD_estimateCCtxSize_usingCCtxParams_internal( - ¶ms.cParams, ¶ms.ldmParams, zc->staticSize != 0, + ¶ms->cParams, ¶ms->ldmParams, zc->staticSize != 0, params->useRowMatchFinder, buffInSize, buffOutSize, pledgedSrcSize); + int resizeWorkspace; + FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!"); if (!zc->staticSize) ZSTD_cwksp_bump_oversized_duration(ws, 0); - /* Check if workspace is large enough, alloc a new one if needed */ - { + { /* Check if workspace is large enough, alloc a new one if needed */ int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace; int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace); - + resizeWorkspace = workspaceTooSmall || workspaceWasteful; DEBUGLOG(4, "Need %zu B workspace", neededSpace); DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize); - if (workspaceTooSmall || workspaceWasteful) { + if (resizeWorkspace) { DEBUGLOG(4, "Resize workspaceSize from %zuKB to %zuKB", ZSTD_cwksp_sizeof(ws) >> 10, neededSpace >> 10); @@ -1678,8 +1929,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, ZSTD_cwksp_clear(ws); /* init params */ - zc->appliedParams = params; - zc->blockState.matchState.cParams = params.cParams; + zc->blockState.matchState.cParams = params->cParams; zc->pledgedSrcSizePlusOne = pledgedSrcSize+1; zc->consumedSrcSize = 0; zc->producedCSize = 0; @@ -1692,6 +1942,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, XXH64_reset(&zc->xxhState, 0); zc->stage = ZSTDcs_init; zc->dictID = 0; + zc->dictContentSize = 0; ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock); @@ -1709,13 +1960,13 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, zc->outBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffOutSize); /* ldm bucketOffsets table */ - if (params.ldmParams.enableLdm) { + if (params->ldmParams.enableLdm) { /* TODO: avoid memset? */ - size_t const ldmBucketSize = - ((size_t)1) << (params.ldmParams.hashLog - - params.ldmParams.bucketSizeLog); - zc->ldmState.bucketOffsets = ZSTD_cwksp_reserve_buffer(ws, ldmBucketSize); - ZSTD_memset(zc->ldmState.bucketOffsets, 0, ldmBucketSize); + size_t const numBuckets = + ((size_t)1) << (params->ldmParams.hashLog - + params->ldmParams.bucketSizeLog); + zc->ldmState.bucketOffsets = ZSTD_cwksp_reserve_buffer(ws, numBuckets); + ZSTD_memset(zc->ldmState.bucketOffsets, 0, numBuckets); } /* sequences storage */ @@ -1729,32 +1980,28 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, FORWARD_IF_ERROR(ZSTD_reset_matchState( &zc->blockState.matchState, ws, - ¶ms.cParams, + ¶ms->cParams, + params->useRowMatchFinder, crp, needsIndexReset, ZSTD_resetTarget_CCtx), ""); /* ldm hash table */ - if (params.ldmParams.enableLdm) { + if (params->ldmParams.enableLdm) { /* TODO: avoid memset? */ - size_t const ldmHSize = ((size_t)1) << params.ldmParams.hashLog; + size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); zc->maxNbLdmSequences = maxNbLdmSeq; ZSTD_window_init(&zc->ldmState.window); - ZSTD_window_clear(&zc->ldmState.window); zc->ldmState.loadedDictEnd = 0; } - /* Due to alignment, when reusing a workspace, we can actually consume - * up to 3 extra bytes for alignment. See the comments in zstd_cwksp.h - */ - assert(ZSTD_cwksp_used(ws) >= neededSpace && - ZSTD_cwksp_used(ws) <= neededSpace + 3); - + assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace)); DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); + zc->initialized = 1; return 0; @@ -1810,6 +2057,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, U64 pledgedSrcSize, ZSTD_buffered_policy_e zbuff) { + DEBUGLOG(4, "ZSTD_resetCCtx_byAttachingCDict() pledgedSrcSize=%llu", + (unsigned long long)pledgedSrcSize); { ZSTD_compressionParameters adjusted_cdict_cParams = cdict->matchState.cParams; unsigned const windowLog = params.cParams.windowLog; @@ -1825,7 +2074,9 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize, cdict->dictContentSize, ZSTD_cpm_attachDict); params.cParams.windowLog = windowLog; - FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize, + params.useRowMatchFinder = cdict->useRowMatchFinder; /* cdict overrides */ + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, ¶ms, pledgedSrcSize, + /* loadedDictSize */ 0, ZSTDcrp_makeClean, zbuff), ""); assert(cctx->appliedParams.cParams.strategy == adjusted_cdict_cParams.strategy); } @@ -1852,6 +2103,7 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, } } cctx->dictID = cdict->dictID; + cctx->dictContentSize = cdict->dictContentSize; /* copy block state */ ZSTD_memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState)); @@ -1868,15 +2120,17 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, const ZSTD_compressionParameters *cdict_cParams = &cdict->matchState.cParams; assert(!cdict->matchState.dedicatedDictSearch); - - DEBUGLOG(4, "copying dictionary into context"); + DEBUGLOG(4, "ZSTD_resetCCtx_byCopyingCDict() pledgedSrcSize=%llu", + (unsigned long long)pledgedSrcSize); { unsigned const windowLog = params.cParams.windowLog; assert(windowLog != 0); /* Copy only compression parameters related to tables. */ params.cParams = *cdict_cParams; params.cParams.windowLog = windowLog; - FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize, + params.useRowMatchFinder = cdict->useRowMatchFinder; + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, ¶ms, pledgedSrcSize, + /* loadedDictSize */ 0, ZSTDcrp_leaveDirty, zbuff), ""); assert(cctx->appliedParams.cParams.strategy == cdict_cParams->strategy); assert(cctx->appliedParams.cParams.hashLog == cdict_cParams->hashLog); @@ -1884,17 +2138,30 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, } ZSTD_cwksp_mark_tables_dirty(&cctx->workspace); + assert(params.useRowMatchFinder != ZSTD_urm_auto); /* copy tables */ - { size_t const chainSize = (cdict_cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cdict_cParams->chainLog); + { size_t const chainSize = ZSTD_allocateChainTable(cdict_cParams->strategy, cdict->useRowMatchFinder, 0 /* DDS guaranteed disabled */) + ? ((size_t)1 << cdict_cParams->chainLog) + : 0; size_t const hSize = (size_t)1 << cdict_cParams->hashLog; ZSTD_memcpy(cctx->blockState.matchState.hashTable, cdict->matchState.hashTable, hSize * sizeof(U32)); - ZSTD_memcpy(cctx->blockState.matchState.chainTable, + /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */ + if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) { + ZSTD_memcpy(cctx->blockState.matchState.chainTable, cdict->matchState.chainTable, chainSize * sizeof(U32)); + } + /* copy tag table */ + if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) { + size_t const tagTableSize = hSize*sizeof(U16); + ZSTD_memcpy(cctx->blockState.matchState.tagTable, + cdict->matchState.tagTable, + tagTableSize); + } } /* Zero the hashTable3, since the cdict never fills it */ @@ -1915,6 +2182,7 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, } cctx->dictID = cdict->dictID; + cctx->dictContentSize = cdict->dictContentSize; /* copy block state */ ZSTD_memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState)); @@ -1957,16 +2225,18 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, U64 pledgedSrcSize, ZSTD_buffered_policy_e zbuff) { - DEBUGLOG(5, "ZSTD_copyCCtx_internal"); RETURN_ERROR_IF(srcCCtx->stage!=ZSTDcs_init, stage_wrong, "Can't copy a ctx that's not in init stage."); - + DEBUGLOG(5, "ZSTD_copyCCtx_internal"); ZSTD_memcpy(&dstCCtx->customMem, &srcCCtx->customMem, sizeof(ZSTD_customMem)); { ZSTD_CCtx_params params = dstCCtx->requestedParams; /* Copy only compression parameters related to tables. */ params.cParams = srcCCtx->appliedParams.cParams; + assert(srcCCtx->appliedParams.useRowMatchFinder != ZSTD_urm_auto); + params.useRowMatchFinder = srcCCtx->appliedParams.useRowMatchFinder; params.fParams = fParams; - ZSTD_resetCCtx_internal(dstCCtx, params, pledgedSrcSize, + ZSTD_resetCCtx_internal(dstCCtx, ¶ms, pledgedSrcSize, + /* loadedDictSize */ 0, ZSTDcrp_leaveDirty, zbuff); assert(dstCCtx->appliedParams.cParams.windowLog == srcCCtx->appliedParams.cParams.windowLog); assert(dstCCtx->appliedParams.cParams.strategy == srcCCtx->appliedParams.cParams.strategy); @@ -1978,7 +2248,11 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, ZSTD_cwksp_mark_tables_dirty(&dstCCtx->workspace); /* copy tables */ - { size_t const chainSize = (srcCCtx->appliedParams.cParams.strategy == ZSTD_fast) ? 0 : ((size_t)1 << srcCCtx->appliedParams.cParams.chainLog); + { size_t const chainSize = ZSTD_allocateChainTable(srcCCtx->appliedParams.cParams.strategy, + srcCCtx->appliedParams.useRowMatchFinder, + 0 /* forDDSDict */) + ? ((size_t)1 << srcCCtx->appliedParams.cParams.chainLog) + : 0; size_t const hSize = (size_t)1 << srcCCtx->appliedParams.cParams.hashLog; int const h3log = srcCCtx->blockState.matchState.hashLog3; size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0; @@ -2005,6 +2279,7 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd; } dstCCtx->dictID = srcCCtx->dictID; + dstCCtx->dictContentSize = srcCCtx->dictContentSize; /* copy block state */ ZSTD_memcpy(dstCCtx->blockState.prevCBlock, srcCCtx->blockState.prevCBlock, sizeof(*srcCCtx->blockState.prevCBlock)); @@ -2091,7 +2366,7 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par ZSTD_reduceTable(ms->hashTable, hSize, reducerValue); } - if (params->cParams.strategy != ZSTD_fast) { + if (ZSTD_allocateChainTable(params->cParams.strategy, params->useRowMatchFinder, (U32)ms->dedicatedDictSearch)) { U32 const chainSize = (U32)1 << params->cParams.chainLog; if (params->cParams.strategy == ZSTD_btlazy2) ZSTD_reduceTable_btlazy2(ms->chainTable, chainSize, reducerValue); @@ -2128,9 +2403,9 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) ofCodeTable[u] = (BYTE)ZSTD_highbit32(sequences[u].offset); mlCodeTable[u] = (BYTE)ZSTD_MLcode(mlv); } - if (seqStorePtr->longLengthID==1) + if (seqStorePtr->longLengthType==ZSTD_llt_literalLength) llCodeTable[seqStorePtr->longLengthPos] = MaxLL; - if (seqStorePtr->longLengthID==2) + if (seqStorePtr->longLengthType==ZSTD_llt_matchLength) mlCodeTable[seqStorePtr->longLengthPos] = MaxML; } @@ -2144,10 +2419,159 @@ static int ZSTD_useTargetCBlockSize(const ZSTD_CCtx_params* cctxParams) return (cctxParams->targetCBlockSize != 0); } -/* ZSTD_entropyCompressSequences_internal(): - * actually compresses both literals and sequences */ +/* ZSTD_blockSplitterEnabled(): + * Returns if block splitting param is being used + * If used, compression will do best effort to split a block in order to improve compression ratio. + * Returns 1 if true, 0 otherwise. */ +static int ZSTD_blockSplitterEnabled(ZSTD_CCtx_params* cctxParams) +{ + DEBUGLOG(5, "ZSTD_blockSplitterEnabled(splitBlocks=%d)", cctxParams->splitBlocks); + return (cctxParams->splitBlocks != 0); +} + +/* Type returned by ZSTD_buildSequencesStatistics containing finalized symbol encoding types + * and size of the sequences statistics + */ +typedef struct { + U32 LLtype; + U32 Offtype; + U32 MLtype; + size_t size; + size_t lastCountSize; /* Accounts for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */ +} ZSTD_symbolEncodingTypeStats_t; + +/* ZSTD_buildSequencesStatistics(): + * Returns a ZSTD_symbolEncodingTypeStats_t, or a zstd error code in the `size` field. + * Modifies `nextEntropy` to have the appropriate values as a side effect. + * nbSeq must be greater than 0. + * + * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32) + */ +static ZSTD_symbolEncodingTypeStats_t +ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, + BYTE* dst, const BYTE* const dstEnd, + ZSTD_strategy strategy, unsigned* countWorkspace, + void* entropyWorkspace, size_t entropyWkspSize) { + BYTE* const ostart = dst; + const BYTE* const oend = dstEnd; + BYTE* op = ostart; + FSE_CTable* CTable_LitLength = nextEntropy->litlengthCTable; + FSE_CTable* CTable_OffsetBits = nextEntropy->offcodeCTable; + FSE_CTable* CTable_MatchLength = nextEntropy->matchlengthCTable; + const BYTE* const ofCodeTable = seqStorePtr->ofCode; + const BYTE* const llCodeTable = seqStorePtr->llCode; + const BYTE* const mlCodeTable = seqStorePtr->mlCode; + ZSTD_symbolEncodingTypeStats_t stats; + + stats.lastCountSize = 0; + /* convert length/distances into codes */ + ZSTD_seqToCodes(seqStorePtr); + assert(op <= oend); + assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */ + /* build CTable for Literal Lengths */ + { unsigned max = MaxLL; + size_t const mostFrequent = HIST_countFast_wksp(countWorkspace, &max, llCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ + DEBUGLOG(5, "Building LL table"); + nextEntropy->litlength_repeatMode = prevEntropy->litlength_repeatMode; + stats.LLtype = ZSTD_selectEncodingType(&nextEntropy->litlength_repeatMode, + countWorkspace, max, mostFrequent, nbSeq, + LLFSELog, prevEntropy->litlengthCTable, + LL_defaultNorm, LL_defaultNormLog, + ZSTD_defaultAllowed, strategy); + assert(set_basic < set_compressed && set_rle < set_compressed); + assert(!(stats.LLtype < set_compressed && nextEntropy->litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), + CTable_LitLength, LLFSELog, (symbolEncodingType_e)stats.LLtype, + countWorkspace, max, llCodeTable, nbSeq, + LL_defaultNorm, LL_defaultNormLog, MaxLL, + prevEntropy->litlengthCTable, + sizeof(prevEntropy->litlengthCTable), + entropyWorkspace, entropyWkspSize); + if (ZSTD_isError(countSize)) { + DEBUGLOG(3, "ZSTD_buildCTable for LitLens failed"); + stats.size = countSize; + return stats; + } + if (stats.LLtype == set_compressed) + stats.lastCountSize = countSize; + op += countSize; + assert(op <= oend); + } } + /* build CTable for Offsets */ + { unsigned max = MaxOff; + size_t const mostFrequent = HIST_countFast_wksp( + countWorkspace, &max, ofCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ + /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */ + ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed; + DEBUGLOG(5, "Building OF table"); + nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode; + stats.Offtype = ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMode, + countWorkspace, max, mostFrequent, nbSeq, + OffFSELog, prevEntropy->offcodeCTable, + OF_defaultNorm, OF_defaultNormLog, + defaultPolicy, strategy); + assert(!(stats.Offtype < set_compressed && nextEntropy->offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), + CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)stats.Offtype, + countWorkspace, max, ofCodeTable, nbSeq, + OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, + prevEntropy->offcodeCTable, + sizeof(prevEntropy->offcodeCTable), + entropyWorkspace, entropyWkspSize); + if (ZSTD_isError(countSize)) { + DEBUGLOG(3, "ZSTD_buildCTable for Offsets failed"); + stats.size = countSize; + return stats; + } + if (stats.Offtype == set_compressed) + stats.lastCountSize = countSize; + op += countSize; + assert(op <= oend); + } } + /* build CTable for MatchLengths */ + { unsigned max = MaxML; + size_t const mostFrequent = HIST_countFast_wksp( + countWorkspace, &max, mlCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ + DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend-op)); + nextEntropy->matchlength_repeatMode = prevEntropy->matchlength_repeatMode; + stats.MLtype = ZSTD_selectEncodingType(&nextEntropy->matchlength_repeatMode, + countWorkspace, max, mostFrequent, nbSeq, + MLFSELog, prevEntropy->matchlengthCTable, + ML_defaultNorm, ML_defaultNormLog, + ZSTD_defaultAllowed, strategy); + assert(!(stats.MLtype < set_compressed && nextEntropy->matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), + CTable_MatchLength, MLFSELog, (symbolEncodingType_e)stats.MLtype, + countWorkspace, max, mlCodeTable, nbSeq, + ML_defaultNorm, ML_defaultNormLog, MaxML, + prevEntropy->matchlengthCTable, + sizeof(prevEntropy->matchlengthCTable), + entropyWorkspace, entropyWkspSize); + if (ZSTD_isError(countSize)) { + DEBUGLOG(3, "ZSTD_buildCTable for MatchLengths failed"); + stats.size = countSize; + return stats; + } + if (stats.MLtype == set_compressed) + stats.lastCountSize = countSize; + op += countSize; + assert(op <= oend); + } } + stats.size = (size_t)(op-ostart); + return stats; +} + +/* ZSTD_entropyCompressSeqStore_internal(): + * compresses both literals and sequences + * Returns compressed size of block, or a zstd error. + */ +#define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20 MEM_STATIC size_t -ZSTD_entropyCompressSequences_internal(seqStore_t* seqStorePtr, +ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, const ZSTD_entropyCTables_t* prevEntropy, ZSTD_entropyCTables_t* nextEntropy, const ZSTD_CCtx_params* cctxParams, @@ -2161,27 +2585,29 @@ ZSTD_entropyCompressSequences_internal(seqStore_t* seqStorePtr, FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable; FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable; FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable; - U32 LLtype, Offtype, MLtype; /* compressed, raw or rle */ const seqDef* const sequences = seqStorePtr->sequencesStart; + const size_t nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; const BYTE* const ofCodeTable = seqStorePtr->ofCode; const BYTE* const llCodeTable = seqStorePtr->llCode; const BYTE* const mlCodeTable = seqStorePtr->mlCode; BYTE* const ostart = (BYTE*)dst; BYTE* const oend = ostart + dstCapacity; BYTE* op = ostart; - size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); - BYTE* seqHead; - BYTE* lastNCount = NULL; + size_t lastCountSize; entropyWorkspace = count + (MaxSeq + 1); entropyWkspSize -= (MaxSeq + 1) * sizeof(*count); - DEBUGLOG(4, "ZSTD_entropyCompressSequences_internal (nbSeq=%zu)", nbSeq); + DEBUGLOG(4, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu)", nbSeq); ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<= HUF_WORKSPACE_SIZE); /* Compress literals */ { const BYTE* const literals = seqStorePtr->litStart; + size_t const numSequences = seqStorePtr->sequences - seqStorePtr->sequencesStart; + size_t const numLiterals = seqStorePtr->lit - seqStorePtr->litStart; + /* Base suspicion of uncompressibility on ratio of literals to sequences */ + unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); size_t const litSize = (size_t)(seqStorePtr->lit - literals); size_t const cSize = ZSTD_compressLiterals( &prevEntropy->huf, &nextEntropy->huf, @@ -2190,7 +2616,7 @@ ZSTD_entropyCompressSequences_internal(seqStore_t* seqStorePtr, op, dstCapacity, literals, litSize, entropyWorkspace, entropyWkspSize, - bmi2); + bmi2, suspectUncompressible); FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed"); assert(cSize <= dstCapacity); op += cSize; @@ -2216,95 +2642,20 @@ ZSTD_entropyCompressSequences_internal(seqStore_t* seqStorePtr, ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse)); return (size_t)(op - ostart); } - - /* seqHead : flags for FSE encoding type */ - seqHead = op++; - assert(op <= oend); - - /* convert length/distances into codes */ - ZSTD_seqToCodes(seqStorePtr); - /* build CTable for Literal Lengths */ - { unsigned max = MaxLL; - size_t const mostFrequent = HIST_countFast_wksp(count, &max, llCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ - DEBUGLOG(5, "Building LL table"); - nextEntropy->fse.litlength_repeatMode = prevEntropy->fse.litlength_repeatMode; - LLtype = ZSTD_selectEncodingType(&nextEntropy->fse.litlength_repeatMode, - count, max, mostFrequent, nbSeq, - LLFSELog, prevEntropy->fse.litlengthCTable, - LL_defaultNorm, LL_defaultNormLog, - ZSTD_defaultAllowed, strategy); - assert(set_basic < set_compressed && set_rle < set_compressed); - assert(!(LLtype < set_compressed && nextEntropy->fse.litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ - { size_t const countSize = ZSTD_buildCTable( - op, (size_t)(oend - op), - CTable_LitLength, LLFSELog, (symbolEncodingType_e)LLtype, - count, max, llCodeTable, nbSeq, - LL_defaultNorm, LL_defaultNormLog, MaxLL, - prevEntropy->fse.litlengthCTable, - sizeof(prevEntropy->fse.litlengthCTable), - entropyWorkspace, entropyWkspSize); - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for LitLens failed"); - if (LLtype == set_compressed) - lastNCount = op; - op += countSize; - assert(op <= oend); - } } - /* build CTable for Offsets */ - { unsigned max = MaxOff; - size_t const mostFrequent = HIST_countFast_wksp( - count, &max, ofCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ - /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */ - ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed; - DEBUGLOG(5, "Building OF table"); - nextEntropy->fse.offcode_repeatMode = prevEntropy->fse.offcode_repeatMode; - Offtype = ZSTD_selectEncodingType(&nextEntropy->fse.offcode_repeatMode, - count, max, mostFrequent, nbSeq, - OffFSELog, prevEntropy->fse.offcodeCTable, - OF_defaultNorm, OF_defaultNormLog, - defaultPolicy, strategy); - assert(!(Offtype < set_compressed && nextEntropy->fse.offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */ - { size_t const countSize = ZSTD_buildCTable( - op, (size_t)(oend - op), - CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)Offtype, - count, max, ofCodeTable, nbSeq, - OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, - prevEntropy->fse.offcodeCTable, - sizeof(prevEntropy->fse.offcodeCTable), - entropyWorkspace, entropyWkspSize); - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for Offsets failed"); - if (Offtype == set_compressed) - lastNCount = op; - op += countSize; - assert(op <= oend); - } } - /* build CTable for MatchLengths */ - { unsigned max = MaxML; - size_t const mostFrequent = HIST_countFast_wksp( - count, &max, mlCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ - DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend-op)); - nextEntropy->fse.matchlength_repeatMode = prevEntropy->fse.matchlength_repeatMode; - MLtype = ZSTD_selectEncodingType(&nextEntropy->fse.matchlength_repeatMode, - count, max, mostFrequent, nbSeq, - MLFSELog, prevEntropy->fse.matchlengthCTable, - ML_defaultNorm, ML_defaultNormLog, - ZSTD_defaultAllowed, strategy); - assert(!(MLtype < set_compressed && nextEntropy->fse.matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ - { size_t const countSize = ZSTD_buildCTable( - op, (size_t)(oend - op), - CTable_MatchLength, MLFSELog, (symbolEncodingType_e)MLtype, - count, max, mlCodeTable, nbSeq, - ML_defaultNorm, ML_defaultNormLog, MaxML, - prevEntropy->fse.matchlengthCTable, - sizeof(prevEntropy->fse.matchlengthCTable), - entropyWorkspace, entropyWkspSize); - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for MatchLengths failed"); - if (MLtype == set_compressed) - lastNCount = op; - op += countSize; - assert(op <= oend); - } } - - *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2)); + { + ZSTD_symbolEncodingTypeStats_t stats; + BYTE* seqHead = op++; + /* build stats for sequences */ + stats = ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, + &prevEntropy->fse, &nextEntropy->fse, + op, oend, + strategy, count, + entropyWorkspace, entropyWkspSize); + FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed!"); + *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2)); + lastCountSize = stats.lastCountSize; + op += stats.size; + } { size_t const bitstreamSize = ZSTD_encodeSequences( op, (size_t)(oend - op), @@ -2324,9 +2675,9 @@ ZSTD_entropyCompressSequences_internal(seqStore_t* seqStorePtr, * In this exceedingly rare case, we will simply emit an uncompressed * block, since it isn't worth optimizing. */ - if (lastNCount && (op - lastNCount) < 4) { - /* NCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */ - assert(op - lastNCount == 3); + if (lastCountSize && (lastCountSize + bitstreamSize) < 4) { + /* lastCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */ + assert(lastCountSize + bitstreamSize == 3); DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.3.4 by " "emitting an uncompressed block."); return 0; @@ -2338,7 +2689,7 @@ ZSTD_entropyCompressSequences_internal(seqStore_t* seqStorePtr, } MEM_STATIC size_t -ZSTD_entropyCompressSequences(seqStore_t* seqStorePtr, +ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, const ZSTD_entropyCTables_t* prevEntropy, ZSTD_entropyCTables_t* nextEntropy, const ZSTD_CCtx_params* cctxParams, @@ -2347,7 +2698,7 @@ ZSTD_entropyCompressSequences(seqStore_t* seqStorePtr, void* entropyWorkspace, size_t entropyWkspSize, int bmi2) { - size_t const cSize = ZSTD_entropyCompressSequences_internal( + size_t const cSize = ZSTD_entropyCompressSeqStore_internal( seqStorePtr, prevEntropy, nextEntropy, cctxParams, dst, dstCapacity, entropyWorkspace, entropyWkspSize, bmi2); @@ -2357,20 +2708,20 @@ ZSTD_entropyCompressSequences(seqStore_t* seqStorePtr, */ if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) return 0; /* block not compressed */ - FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSequences_internal failed"); + FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed"); /* Check compressibility */ { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy); if (cSize >= maxCSize) return 0; /* block not compressed */ } - DEBUGLOG(4, "ZSTD_entropyCompressSequences() cSize: %zu\n", cSize); + DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); return cSize; } /* ZSTD_selectBlockCompressor() : * Not static, but internal use only (used by long distance matcher) * assumption : strat is a valid strategy */ -ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_dictMode_e dictMode) +ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_useRowMatchFinderMode_e useRowMatchFinder, ZSTD_dictMode_e dictMode) { static const ZSTD_blockCompressor blockCompressor[4][ZSTD_STRATEGY_MAX+1] = { { ZSTD_compressBlock_fast /* default for 0 */, @@ -2418,7 +2769,28 @@ ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_dictMo ZSTD_STATIC_ASSERT((unsigned)ZSTD_fast == 1); assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); - selectedCompressor = blockCompressor[(int)dictMode][(int)strat]; + DEBUGLOG(4, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder); + if (ZSTD_rowMatchFinderUsed(strat, useRowMatchFinder)) { + static const ZSTD_blockCompressor rowBasedBlockCompressors[4][3] = { + { ZSTD_compressBlock_greedy_row, + ZSTD_compressBlock_lazy_row, + ZSTD_compressBlock_lazy2_row }, + { ZSTD_compressBlock_greedy_extDict_row, + ZSTD_compressBlock_lazy_extDict_row, + ZSTD_compressBlock_lazy2_extDict_row }, + { ZSTD_compressBlock_greedy_dictMatchState_row, + ZSTD_compressBlock_lazy_dictMatchState_row, + ZSTD_compressBlock_lazy2_dictMatchState_row }, + { ZSTD_compressBlock_greedy_dedicatedDictSearch_row, + ZSTD_compressBlock_lazy_dedicatedDictSearch_row, + ZSTD_compressBlock_lazy2_dedicatedDictSearch_row } + }; + DEBUGLOG(4, "Selecting a row-based matchfinder"); + assert(useRowMatchFinder != ZSTD_urm_auto); + selectedCompressor = rowBasedBlockCompressors[(int)dictMode][(int)strat - (int)ZSTD_greedy]; + } else { + selectedCompressor = blockCompressor[(int)dictMode][(int)strat]; + } assert(selectedCompressor != NULL); return selectedCompressor; } @@ -2434,7 +2806,7 @@ void ZSTD_resetSeqStore(seqStore_t* ssPtr) { ssPtr->lit = ssPtr->litStart; ssPtr->sequences = ssPtr->sequencesStart; - ssPtr->longLengthID = 0; + ssPtr->longLengthType = ZSTD_llt_none; } typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; @@ -2487,6 +2859,7 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) ZSTD_ldm_blockCompress(&zc->externSeqStore, ms, &zc->seqStore, zc->blockState.nextCBlock->rep, + zc->appliedParams.useRowMatchFinder, src, srcSize); assert(zc->externSeqStore.pos <= zc->externSeqStore.size); } else if (zc->appliedParams.ldmParams.enableLdm) { @@ -2503,10 +2876,13 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) ZSTD_ldm_blockCompress(&ldmSeqStore, ms, &zc->seqStore, zc->blockState.nextCBlock->rep, + zc->appliedParams.useRowMatchFinder, src, srcSize); assert(ldmSeqStore.pos == ldmSeqStore.size); } else { /* not long range mode */ - ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, dictMode); + ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, + zc->appliedParams.useRowMatchFinder, + dictMode); ms->ldmSeqStore = NULL; lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); } @@ -2540,9 +2916,9 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) outSeqs[i].rep = 0; if (i == seqStore->longLengthPos) { - if (seqStore->longLengthID == 1) { + if (seqStore->longLengthType == ZSTD_llt_literalLength) { outSeqs[i].litLength += 0x10000; - } else if (seqStore->longLengthID == 2) { + } else if (seqStore->longLengthType == ZSTD_llt_matchLength) { outSeqs[i].matchLength += 0x10000; } } @@ -2653,13 +3029,730 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore) return nbSeqs < 4 && nbLits < 10; } -static void ZSTD_confirmRepcodesAndEntropyTables(ZSTD_CCtx* zc) +static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) +{ + ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock; + bs->prevCBlock = bs->nextCBlock; + bs->nextCBlock = tmp; +} + +/* Writes the block header */ +static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) { + U32 const cBlockHeader = cSize == 1 ? + lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : + lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); + MEM_writeLE24(op, cBlockHeader); + DEBUGLOG(3, "writeBlockHeader: cSize: %zu blockSize: %zu lastBlock: %u", cSize, blockSize, lastBlock); +} + +/** ZSTD_buildBlockEntropyStats_literals() : + * Builds entropy for the literals. + * Stores literals block type (raw, rle, compressed, repeat) and + * huffman description table to hufMetadata. + * Requires ENTROPY_WORKSPACE_SIZE workspace + * @return : size of huffman description table or error code */ +static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, + const ZSTD_hufCTables_t* prevHuf, + ZSTD_hufCTables_t* nextHuf, + ZSTD_hufCTablesMetadata_t* hufMetadata, + const int disableLiteralsCompression, + void* workspace, size_t wkspSize) +{ + BYTE* const wkspStart = (BYTE*)workspace; + BYTE* const wkspEnd = wkspStart + wkspSize; + BYTE* const countWkspStart = wkspStart; + unsigned* const countWksp = (unsigned*)workspace; + const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); + BYTE* const nodeWksp = countWkspStart + countWkspSize; + const size_t nodeWkspSize = wkspEnd-nodeWksp; + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; + unsigned huffLog = HUF_TABLELOG_DEFAULT; + HUF_repeat repeat = prevHuf->repeatMode; + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize); + + /* Prepare nextEntropy assuming reusing the existing table */ + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + + if (disableLiteralsCompression) { + DEBUGLOG(5, "set_basic - disabled"); + hufMetadata->hType = set_basic; + return 0; + } + + /* small ? don't even attempt compression (speed opt) */ +#ifndef COMPRESS_LITERALS_SIZE_MIN +#define COMPRESS_LITERALS_SIZE_MIN 63 +#endif + { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; + if (srcSize <= minLitSize) { + DEBUGLOG(5, "set_basic - too small"); + hufMetadata->hType = set_basic; + return 0; + } + } + + /* Scan input and build symbol stats */ + { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize); + FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); + if (largest == srcSize) { + DEBUGLOG(5, "set_rle"); + hufMetadata->hType = set_rle; + return 0; + } + if (largest <= (srcSize >> 7)+4) { + DEBUGLOG(5, "set_basic - no gain"); + hufMetadata->hType = set_basic; + return 0; + } + } + + /* Validate the previous Huffman table */ + if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { + repeat = HUF_repeat_none; + } + + /* Build Huffman Tree */ + ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); + huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); + { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, + maxSymbolValue, huffLog, + nodeWksp, nodeWkspSize); + FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); + huffLog = (U32)maxBits; + { /* Build and write the CTable */ + size_t const newCSize = HUF_estimateCompressedSize( + (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); + size_t const hSize = HUF_writeCTable_wksp( + hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), + (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, + nodeWksp, nodeWkspSize); + /* Check against repeating the previous CTable */ + if (repeat != HUF_repeat_none) { + size_t const oldCSize = HUF_estimateCompressedSize( + (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); + if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { + DEBUGLOG(5, "set_repeat - smaller"); + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + hufMetadata->hType = set_repeat; + return 0; + } + } + if (newCSize + hSize >= srcSize) { + DEBUGLOG(5, "set_basic - no gains"); + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + hufMetadata->hType = set_basic; + return 0; + } + DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); + hufMetadata->hType = set_compressed; + nextHuf->repeatMode = HUF_repeat_check; + return hSize; + } + } +} + + +/* ZSTD_buildDummySequencesStatistics(): + * Returns a ZSTD_symbolEncodingTypeStats_t with all encoding types as set_basic, + * and updates nextEntropy to the appropriate repeatMode. + */ +static ZSTD_symbolEncodingTypeStats_t +ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { + ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0}; + nextEntropy->litlength_repeatMode = FSE_repeat_none; + nextEntropy->offcode_repeatMode = FSE_repeat_none; + nextEntropy->matchlength_repeatMode = FSE_repeat_none; + return stats; +} + +/** ZSTD_buildBlockEntropyStats_sequences() : + * Builds entropy for the sequences. + * Stores symbol compression modes and fse table to fseMetadata. + * Requires ENTROPY_WORKSPACE_SIZE wksp. + * @return : size of fse tables or error code */ +static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, + const ZSTD_fseCTables_t* prevEntropy, + ZSTD_fseCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + ZSTD_fseCTablesMetadata_t* fseMetadata, + void* workspace, size_t wkspSize) { - ZSTD_compressedBlockState_t* const tmp = zc->blockState.prevCBlock; - zc->blockState.prevCBlock = zc->blockState.nextCBlock; - zc->blockState.nextCBlock = tmp; + ZSTD_strategy const strategy = cctxParams->cParams.strategy; + size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; + BYTE* const ostart = fseMetadata->fseTablesBuffer; + BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); + BYTE* op = ostart; + unsigned* countWorkspace = (unsigned*)workspace; + unsigned* entropyWorkspace = countWorkspace + (MaxSeq + 1); + size_t entropyWorkspaceSize = wkspSize - (MaxSeq + 1) * sizeof(*countWorkspace); + ZSTD_symbolEncodingTypeStats_t stats; + + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_sequences (nbSeq=%zu)", nbSeq); + stats = nbSeq != 0 ? ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, + prevEntropy, nextEntropy, op, oend, + strategy, countWorkspace, + entropyWorkspace, entropyWorkspaceSize) + : ZSTD_buildDummySequencesStatistics(nextEntropy); + FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed!"); + fseMetadata->llType = (symbolEncodingType_e) stats.LLtype; + fseMetadata->ofType = (symbolEncodingType_e) stats.Offtype; + fseMetadata->mlType = (symbolEncodingType_e) stats.MLtype; + fseMetadata->lastCountSize = stats.lastCountSize; + return stats.size; +} + + +/** ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. + * Requires workspace size ENTROPY_WORKSPACE_SIZE + * + * @return : 0 on success or error code + */ +size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize) +{ + size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart; + entropyMetadata->hufMetadata.hufDesSize = + ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize, + &prevEntropy->huf, &nextEntropy->huf, + &entropyMetadata->hufMetadata, + ZSTD_disableLiteralsCompression(cctxParams), + workspace, wkspSize); + FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed"); + entropyMetadata->fseMetadata.fseTablesSize = + ZSTD_buildBlockEntropyStats_sequences(seqStorePtr, + &prevEntropy->fse, &nextEntropy->fse, + cctxParams, + &entropyMetadata->fseMetadata, + workspace, wkspSize); + FORWARD_IF_ERROR(entropyMetadata->fseMetadata.fseTablesSize, "ZSTD_buildBlockEntropyStats_sequences failed"); + return 0; +} + +/* Returns the size estimate for the literals section (header + content) of a block */ +static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, + const ZSTD_hufCTables_t* huf, + const ZSTD_hufCTablesMetadata_t* hufMetadata, + void* workspace, size_t wkspSize, + int writeEntropy) +{ + unsigned* const countWksp = (unsigned*)workspace; + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; + size_t literalSectionHeaderSize = 3 + (litSize >= 1 KB) + (litSize >= 16 KB); + U32 singleStream = litSize < 256; + + if (hufMetadata->hType == set_basic) return litSize; + else if (hufMetadata->hType == set_rle) return 1; + else if (hufMetadata->hType == set_compressed || hufMetadata->hType == set_repeat) { + size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)literals, litSize, workspace, wkspSize); + if (ZSTD_isError(largest)) return litSize; + { size_t cLitSizeEstimate = HUF_estimateCompressedSize((const HUF_CElt*)huf->CTable, countWksp, maxSymbolValue); + if (writeEntropy) cLitSizeEstimate += hufMetadata->hufDesSize; + if (!singleStream) cLitSizeEstimate += 6; /* multi-stream huffman uses 6-byte jump table */ + return cLitSizeEstimate + literalSectionHeaderSize; + } } + assert(0); /* impossible */ + return 0; +} + +/* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */ +static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, + const BYTE* codeTable, size_t nbSeq, unsigned maxCode, + const FSE_CTable* fseCTable, + const U32* additionalBits, + short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, + void* workspace, size_t wkspSize) +{ + unsigned* const countWksp = (unsigned*)workspace; + const BYTE* ctp = codeTable; + const BYTE* const ctStart = ctp; + const BYTE* const ctEnd = ctStart + nbSeq; + size_t cSymbolTypeSizeEstimateInBits = 0; + unsigned max = maxCode; + + HIST_countFast_wksp(countWksp, &max, codeTable, nbSeq, workspace, wkspSize); /* can't fail */ + if (type == set_basic) { + /* We selected this encoding type, so it must be valid. */ + assert(max <= defaultMax); + (void)defaultMax; + cSymbolTypeSizeEstimateInBits = ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, countWksp, max); + } else if (type == set_rle) { + cSymbolTypeSizeEstimateInBits = 0; + } else if (type == set_compressed || type == set_repeat) { + cSymbolTypeSizeEstimateInBits = ZSTD_fseBitCost(fseCTable, countWksp, max); + } + if (ZSTD_isError(cSymbolTypeSizeEstimateInBits)) { + return nbSeq * 10; + } + while (ctp < ctEnd) { + if (additionalBits) cSymbolTypeSizeEstimateInBits += additionalBits[*ctp]; + else cSymbolTypeSizeEstimateInBits += *ctp; /* for offset, offset code is also the number of additional bits */ + ctp++; + } + return cSymbolTypeSizeEstimateInBits >> 3; +} + +/* Returns the size estimate for the sequences section (header + content) of a block */ +static size_t ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, + size_t nbSeq, + const ZSTD_fseCTables_t* fseTables, + const ZSTD_fseCTablesMetadata_t* fseMetadata, + void* workspace, size_t wkspSize, + int writeEntropy) +{ + size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ); + size_t cSeqSizeEstimate = 0; + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff, + fseTables->offcodeCTable, NULL, + OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, + workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL, + fseTables->litlengthCTable, LL_bits, + LL_defaultNorm, LL_defaultNormLog, MaxLL, + workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML, + fseTables->matchlengthCTable, ML_bits, + ML_defaultNorm, ML_defaultNormLog, MaxML, + workspace, wkspSize); + if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize; + return cSeqSizeEstimate + sequencesSectionHeaderSize; +} + +/* Returns the size estimate for a given stream of literals, of, ll, ml */ +static size_t ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, + const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, + size_t nbSeq, + const ZSTD_entropyCTables_t* entropy, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize, + int writeLitEntropy, int writeSeqEntropy) { + size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize, + &entropy->huf, &entropyMetadata->hufMetadata, + workspace, wkspSize, writeLitEntropy); + size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, + nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, + workspace, wkspSize, writeSeqEntropy); + return seqSize + literalsSize + ZSTD_blockHeaderSize; +} + +/* Builds entropy statistics and uses them for blocksize estimation. + * + * Returns the estimated compressed size of the seqStore, or a zstd error. + */ +static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, const ZSTD_CCtx* zc) { + ZSTD_entropyCTablesMetadata_t entropyMetadata; + DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()"); + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore, + &zc->blockState.prevCBlock->entropy, + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + &entropyMetadata, + zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); + return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), + seqStore->ofCode, seqStore->llCode, seqStore->mlCode, + (size_t)(seqStore->sequences - seqStore->sequencesStart), + &zc->blockState.nextCBlock->entropy, &entropyMetadata, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, + (int)(entropyMetadata.hufMetadata.hType == set_compressed), 1); +} + +/* Returns literals bytes represented in a seqStore */ +static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) { + size_t literalsBytes = 0; + size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; + size_t i; + for (i = 0; i < nbSeqs; ++i) { + seqDef seq = seqStore->sequencesStart[i]; + literalsBytes += seq.litLength; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) { + literalsBytes += 0x10000; + } + } + return literalsBytes; +} + +/* Returns match bytes represented in a seqStore */ +static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { + size_t matchBytes = 0; + size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; + size_t i; + for (i = 0; i < nbSeqs; ++i) { + seqDef seq = seqStore->sequencesStart[i]; + matchBytes += seq.matchLength + MINMATCH; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) { + matchBytes += 0x10000; + } + } + return matchBytes; +} + +/* Derives the seqStore that is a chunk of the originalSeqStore from [startIdx, endIdx). + * Stores the result in resultSeqStore. + */ +static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + const seqStore_t* originalSeqStore, + size_t startIdx, size_t endIdx) { + BYTE* const litEnd = originalSeqStore->lit; + size_t literalsBytes; + size_t literalsBytesPreceding = 0; + + *resultSeqStore = *originalSeqStore; + if (startIdx > 0) { + resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx; + literalsBytesPreceding = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); + } + + /* Move longLengthPos into the correct position if necessary */ + if (originalSeqStore->longLengthType != ZSTD_llt_none) { + if (originalSeqStore->longLengthPos < startIdx || originalSeqStore->longLengthPos > endIdx) { + resultSeqStore->longLengthType = ZSTD_llt_none; + } else { + resultSeqStore->longLengthPos -= (U32)startIdx; + } + } + resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx; + resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx; + literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); + resultSeqStore->litStart += literalsBytesPreceding; + if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) { + /* This accounts for possible last literals if the derived chunk reaches the end of the block */ + resultSeqStore->lit = litEnd; + } else { + resultSeqStore->lit = resultSeqStore->litStart+literalsBytes; + } + resultSeqStore->llCode += startIdx; + resultSeqStore->mlCode += startIdx; + resultSeqStore->ofCode += startIdx; +} + +/** + * Returns the raw offset represented by the combination of offCode, ll0, and repcode history. + * offCode must be an offCode representing a repcode, therefore in the range of [0, 2]. + */ +static U32 ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, const U32 ll0) { + U32 const adjustedOffCode = offCode + ll0; + assert(offCode < ZSTD_REP_NUM); + if (adjustedOffCode == ZSTD_REP_NUM) { + /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 */ + assert(rep[0] > 0); + return rep[0] - 1; + } + return rep[adjustedOffCode]; +} + +/** + * ZSTD_seqStore_resolveOffCodes() reconciles any possible divergences in offset history that may arise + * due to emission of RLE/raw blocks that disturb the offset history, and replaces any repcodes within + * the seqStore that may be invalid. + * + * dRepcodes are updated as would be on the decompression side. cRepcodes are updated exactly in + * accordance with the seqStore. + */ +static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, + seqStore_t* const seqStore, U32 const nbSeq) { + U32 idx = 0; + for (; idx < nbSeq; ++idx) { + seqDef* const seq = seqStore->sequencesStart + idx; + U32 const ll0 = (seq->litLength == 0); + U32 offCode = seq->offset - 1; + assert(seq->offset > 0); + if (offCode <= ZSTD_REP_MOVE) { + U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0); + U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0); + /* Adjust simulated decompression repcode history if we come across a mismatch. Replace + * the repcode with the offset it actually references, determined by the compression + * repcode history. + */ + if (dRawOffset != cRawOffset) { + seq->offset = cRawOffset + ZSTD_REP_NUM; + } + } + /* Compression repcode history is always updated with values directly from the unmodified seqStore. + * Decompression repcode history may use modified seq->offset value taken from compression repcode history. + */ + *dRepcodes = ZSTD_updateRep(dRepcodes->rep, seq->offset - 1, ll0); + *cRepcodes = ZSTD_updateRep(cRepcodes->rep, offCode, ll0); + } +} + +/* ZSTD_compressSeqStore_singleBlock(): + * Compresses a seqStore into a block with a block header, into the buffer dst. + * + * Returns the total size of that block (including header) or a ZSTD error code. + */ +static size_t ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, + repcodes_t* const dRep, repcodes_t* const cRep, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + U32 lastBlock, U32 isPartition) { + const U32 rleMaxLength = 25; + BYTE* op = (BYTE*)dst; + const BYTE* ip = (const BYTE*)src; + size_t cSize; + size_t cSeqsSize; + + /* In case of an RLE or raw block, the simulated decompression repcode history must be reset */ + repcodes_t const dRepOriginal = *dRep; + if (isPartition) + ZSTD_seqStore_resolveOffCodes(dRep, cRep, seqStore, (U32)(seqStore->sequences - seqStore->sequencesStart)); + + RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "Block header doesn't fit"); + cSeqsSize = ZSTD_entropyCompressSeqStore(seqStore, + &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + op + ZSTD_blockHeaderSize, dstCapacity - ZSTD_blockHeaderSize, + srcSize, + zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, + zc->bmi2); + FORWARD_IF_ERROR(cSeqsSize, "ZSTD_entropyCompressSeqStore failed!"); + + if (!zc->isFirstBlock && + cSeqsSize < rleMaxLength && + ZSTD_isRLE((BYTE const*)src, srcSize)) { + /* We don't want to emit our first block as a RLE even if it qualifies because + * doing so will cause the decoder (cli only) to throw a "should consume all input error." + * This is only an issue for zstd <= v1.4.3 + */ + cSeqsSize = 1; + } + + if (zc->seqCollector.collectSequences) { + ZSTD_copyBlockSequences(zc); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } + + if (cSeqsSize == 0) { + cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "Nocompress block failed"); + DEBUGLOG(4, "Writing out nocompress block, size: %zu", cSize); + *dRep = dRepOriginal; /* reset simulated decompression repcode history */ + } else if (cSeqsSize == 1) { + cSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "RLE compress block failed"); + DEBUGLOG(4, "Writing out RLE block, size: %zu", cSize); + *dRep = dRepOriginal; /* reset simulated decompression repcode history */ + } else { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + writeBlockHeader(op, cSeqsSize, srcSize, lastBlock); + cSize = ZSTD_blockHeaderSize + cSeqsSize; + DEBUGLOG(4, "Writing out compressed block, size: %zu", cSize); + } + + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; + + return cSize; +} + +/* Struct to keep track of where we are in our recursive calls. */ +typedef struct { + U32* splitLocations; /* Array of split indices */ + size_t idx; /* The current index within splitLocations being worked on */ +} seqStoreSplits; + +#define MIN_SEQUENCES_BLOCK_SPLITTING 300 +#define MAX_NB_SPLITS 196 + +/* Helper function to perform the recursive search for block splits. + * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half. + * If advantageous to split, then we recurse down the two sub-blocks. If not, or if an error occurred in estimation, then + * we do not recurse. + * + * Note: The recursion depth is capped by a heuristic minimum number of sequences, defined by MIN_SEQUENCES_BLOCK_SPLITTING. + * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING). + * In practice, recursion depth usually doesn't go beyond 4. + * + * Furthermore, the number of splits is capped by MAX_NB_SPLITS. At MAX_NB_SPLITS == 196 with the current existing blockSize + * maximum of 128 KB, this value is actually impossible to reach. + */ +static void ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx, + const ZSTD_CCtx* zc, const seqStore_t* origSeqStore) { + seqStore_t fullSeqStoreChunk; + seqStore_t firstHalfSeqStore; + seqStore_t secondHalfSeqStore; + size_t estimatedOriginalSize; + size_t estimatedFirstHalfSize; + size_t estimatedSecondHalfSize; + size_t midIdx = (startIdx + endIdx)/2; + + if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= MAX_NB_SPLITS) { + DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences"); + return; + } + ZSTD_deriveSeqStoreChunk(&fullSeqStoreChunk, origSeqStore, startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(&firstHalfSeqStore, origSeqStore, startIdx, midIdx); + ZSTD_deriveSeqStoreChunk(&secondHalfSeqStore, origSeqStore, midIdx, endIdx); + estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(&fullSeqStoreChunk, zc); + estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(&firstHalfSeqStore, zc); + estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(&secondHalfSeqStore, zc); + DEBUGLOG(5, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", + estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize); + if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) { + return; + } + if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) { + ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore); + splits->splitLocations[splits->idx] = (U32)midIdx; + splits->idx++; + ZSTD_deriveBlockSplitsHelper(splits, midIdx, endIdx, zc, origSeqStore); + } +} + +/* Base recursive function. Populates a table with intra-block partition indices that can improve compression ratio. + * + * Returns the number of splits made (which equals the size of the partition table - 1). + */ +static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) { + seqStoreSplits splits = {partitions, 0}; + if (nbSeq <= 4) { + DEBUGLOG(4, "ZSTD_deriveBlockSplits: Too few sequences to split"); + /* Refuse to try and split anything with less than 4 sequences */ + return 0; + } + ZSTD_deriveBlockSplitsHelper(&splits, 0, nbSeq, zc, &zc->seqStore); + splits.splitLocations[splits.idx] = nbSeq; + DEBUGLOG(5, "ZSTD_deriveBlockSplits: final nb partitions: %zu", splits.idx+1); + return splits.idx; +} + +/* ZSTD_compressBlock_splitBlock(): + * Attempts to split a given block into multiple blocks to improve compression ratio. + * + * Returns combined size of all blocks (which includes headers), or a ZSTD error code. + */ +static size_t ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, + const void* src, size_t blockSize, U32 lastBlock, U32 nbSeq) { + size_t cSize = 0; + const BYTE* ip = (const BYTE*)src; + BYTE* op = (BYTE*)dst; + U32 partitions[MAX_NB_SPLITS]; + size_t i = 0; + size_t srcBytesTotal = 0; + size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); + seqStore_t nextSeqStore; + seqStore_t currSeqStore; + + /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history + * may become invalid. In order to reconcile potentially invalid repcodes, we keep track of two + * separate repcode histories that simulate repcode history on compression and decompression side, + * and use the histories to determine whether we must replace a particular repcode with its raw offset. + * + * 1) cRep gets updated for each partition, regardless of whether the block was emitted as uncompressed + * or RLE. This allows us to retrieve the offset value that an invalid repcode references within + * a nocompress/RLE block. + * 2) dRep gets updated only for compressed partitions, and when a repcode gets replaced, will use + * the replacement offset value rather than the original repcode to update the repcode history. + * dRep also will be the final repcode history sent to the next block. + * + * See ZSTD_seqStore_resolveOffCodes() for more details. + */ + repcodes_t dRep; + repcodes_t cRep; + ZSTD_memcpy(dRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); + ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); + ZSTD_memset(&nextSeqStore, 0, sizeof(seqStore_t)); + + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, + (unsigned)zc->blockState.matchState.nextToUpdate); + + if (numSplits == 0) { + size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, + &dRep, &cRep, + op, dstCapacity, + ip, blockSize, + lastBlock, 0 /* isPartition */); + FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!"); + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits"); + assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); + return cSizeSingleBlock; + } + + ZSTD_deriveSeqStoreChunk(&currSeqStore, &zc->seqStore, 0, partitions[0]); + for (i = 0; i <= numSplits; ++i) { + size_t srcBytes; + size_t cSizeChunk; + U32 const lastPartition = (i == numSplits); + U32 lastBlockEntireSrc = 0; + + srcBytes = ZSTD_countSeqStoreLiteralsBytes(&currSeqStore) + ZSTD_countSeqStoreMatchBytes(&currSeqStore); + srcBytesTotal += srcBytes; + if (lastPartition) { + /* This is the final partition, need to account for possible last literals */ + srcBytes += blockSize - srcBytesTotal; + lastBlockEntireSrc = lastBlock; + } else { + ZSTD_deriveSeqStoreChunk(&nextSeqStore, &zc->seqStore, partitions[i], partitions[i+1]); + } + + cSizeChunk = ZSTD_compressSeqStore_singleBlock(zc, &currSeqStore, + &dRep, &cRep, + op, dstCapacity, + ip, srcBytes, + lastBlockEntireSrc, 1 /* isPartition */); + DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(&currSeqStore, zc), cSizeChunk); + FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!"); + + ip += srcBytes; + op += cSizeChunk; + dstCapacity -= cSizeChunk; + cSize += cSizeChunk; + currSeqStore = nextSeqStore; + assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); + } + /* cRep and dRep may have diverged during the compression. If so, we use the dRep repcodes + * for the next block. + */ + ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t)); + return cSize; } +static size_t ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 lastBlock) { + const BYTE* ip = (const BYTE*)src; + BYTE* op = (BYTE*)dst; + U32 nbSeq; + size_t cSize; + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock"); + + { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); + if (bss == ZSTDbss_noCompress) { + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; + cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block"); + return cSize; + } + nbSeq = (U32)(zc->seqStore.sequences - zc->seqStore.sequencesStart); + } + + assert(zc->appliedParams.splitBlocks == 1); + cSize = ZSTD_compressBlock_splitBlock_internal(zc, dst, dstCapacity, src, srcSize, lastBlock, nbSeq); + FORWARD_IF_ERROR(cSize, "Splitting blocks failed!"); + return cSize; +} + +/* ZSTD_convertBlockSequencesToSeqStore() + * Converts an array of ZSTD_Sequence* with the corresponding original src buffer into + * the seqStore of a cctx. + * + * Returns 0 on success, ZSTD_error on failure. + */ +static UNUSED_ATTR size_t ZSTD_convertBlockSequencesToSeqStore(ZSTD_CCtx* cctx, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* src, size_t srcSize); + static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 frame) @@ -2683,12 +3776,12 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc, if (zc->seqCollector.collectSequences) { ZSTD_copyBlockSequences(zc); - ZSTD_confirmRepcodesAndEntropyTables(zc); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); return 0; } /* encode sequences and literals */ - cSize = ZSTD_entropyCompressSequences(&zc->seqStore, + cSize = ZSTD_entropyCompressSeqStore(&zc->seqStore, &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy, &zc->appliedParams, dst, dstCapacity, @@ -2696,12 +3789,6 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, zc->bmi2); - if (zc->seqCollector.collectSequences) { - ZSTD_copyBlockSequences(zc); - return 0; - } - - if (frame && /* We don't want to emit our first block as a RLE even if it qualifies because * doing so will cause the decoder (cli only) to throw a "should consume all input error." @@ -2717,7 +3804,7 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc, out: if (!ZSTD_isError(cSize) && cSize > 1) { - ZSTD_confirmRepcodesAndEntropyTables(zc); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); } /* We check that dictionaries have offset codes available for the first * block. After the first block, the offcode table might not have large @@ -2770,7 +3857,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { - ZSTD_confirmRepcodesAndEntropyTables(zc); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); return cSize; } } @@ -2810,9 +3897,9 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, void const* ip, void const* iend) { - if (ZSTD_window_needOverflowCorrection(ms->window, iend)) { - U32 const maxDist = (U32)1 << params->cParams.windowLog; - U32 const cycleLog = ZSTD_cycleLog(params->cParams.chainLog, params->cParams.strategy); + U32 const cycleLog = ZSTD_cycleLog(params->cParams.chainLog, params->cParams.strategy); + U32 const maxDist = (U32)1 << params->cParams.windowLog; + if (ZSTD_window_needOverflowCorrection(ms->window, cycleLog, maxDist, ms->loadedDictEnd, ip, iend)) { U32 const correction = ZSTD_window_correctOverflow(&ms->window, cycleLog, maxDist, ip); ZSTD_STATIC_ASSERT(ZSTD_CHAINLOG_MAX <= 30); ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_32 <= 30); @@ -2835,7 +3922,7 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, * Frame is supposed already started (header already produced) * @return : compressed size, or an error code */ -static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* cctx, +static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastFrameChunk) @@ -2865,6 +3952,7 @@ static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* cctx, ZSTD_overflowCorrectIfNeeded( ms, &cctx->workspace, &cctx->appliedParams, ip, ip + blockSize); ZSTD_checkDictValidity(&ms->window, ip + blockSize, maxDist, &ms->loadedDictEnd, &ms->dictMatchState); + ZSTD_window_enforceMaxDist(&ms->window, ip, maxDist, &ms->loadedDictEnd, &ms->dictMatchState); /* Ensure hash/chain table insertion resumes no sooner than lowlimit */ if (ms->nextToUpdate < ms->window.lowLimit) ms->nextToUpdate = ms->window.lowLimit; @@ -2875,6 +3963,10 @@ static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* cctx, FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_targetCBlockSize failed"); assert(cSize > 0); assert(cSize <= blockSize + ZSTD_blockHeaderSize); + } else if (ZSTD_blockSplitterEnabled(&cctx->appliedParams)) { + cSize = ZSTD_compressBlock_splitBlock(cctx, op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_splitBlock failed"); + assert(cSize > 0 || cctx->seqCollector.collectSequences == 1); } else { cSize = ZSTD_compressBlock_internal(cctx, op+ZSTD_blockHeaderSize, dstCapacity-ZSTD_blockHeaderSize, @@ -2954,6 +4046,26 @@ static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity, return pos; } +/* ZSTD_writeSkippableFrame_advanced() : + * Writes out a skippable frame with the specified magic number variant (16 are supported), + * from ZSTD_MAGIC_SKIPPABLE_START to ZSTD_MAGIC_SKIPPABLE_START+15, and the desired source data. + * + * Returns the total number of bytes written, or a ZSTD error code. + */ +size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity, + const void* src, size_t srcSize, unsigned magicVariant) { + BYTE* op = (BYTE*)dst; + RETURN_ERROR_IF(dstCapacity < srcSize + ZSTD_SKIPPABLEHEADERSIZE /* Skippable frame overhead */, + dstSize_tooSmall, "Not enough room for skippable frame"); + RETURN_ERROR_IF(srcSize > (unsigned)0xFFFFFFFF, srcSize_wrong, "Src size too large for skippable frame"); + RETURN_ERROR_IF(magicVariant > 15, parameter_outOfBound, "Skippable frame magic number variant not supported"); + + MEM_writeLE32(op, (U32)(ZSTD_MAGIC_SKIPPABLE_START + magicVariant)); + MEM_writeLE32(op+4, (U32)srcSize); + ZSTD_memcpy(op+8, src, srcSize); + return srcSize + ZSTD_SKIPPABLEHEADERSIZE; +} + /* ZSTD_writeLastEmptyBlock() : * output an empty Block with end-of-frame mark to complete a frame * @return : size of data written into `dst` (== ZSTD_blockHeaderSize (defined in zstd_internal.h)) @@ -3010,11 +4122,12 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, if (!srcSize) return fhSize; /* do not generate an empty block if no input */ - if (!ZSTD_window_update(&ms->window, src, srcSize)) { + if (!ZSTD_window_update(&ms->window, src, srcSize, ms->forceNonContiguous)) { + ms->forceNonContiguous = 0; ms->nextToUpdate = ms->window.dictLimit; } if (cctx->appliedParams.ldmParams.enableLdm) { - ZSTD_window_update(&cctx->ldmState.window, src, srcSize); + ZSTD_window_update(&cctx->ldmState.window, src, srcSize, /* forceNonContiguous */ 0); } if (!frame) { @@ -3082,63 +4195,86 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, { const BYTE* ip = (const BYTE*) src; const BYTE* const iend = ip + srcSize; + int const loadLdmDict = params->ldmParams.enableLdm && ls != NULL; + + /* Assert that we the ms params match the params we're being given */ + ZSTD_assertEqualCParams(params->cParams, ms->cParams); + + if (srcSize > ZSTD_CHUNKSIZE_MAX) { + /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX. + * Dictionaries right at the edge will immediately trigger overflow + * correction, but I don't want to insert extra constraints here. + */ + U32 const maxDictSize = ZSTD_CURRENT_MAX - 1; + /* We must have cleared our windows when our source is this large. */ + assert(ZSTD_window_isEmpty(ms->window)); + if (loadLdmDict) + assert(ZSTD_window_isEmpty(ls->window)); + /* If the dictionary is too large, only load the suffix of the dictionary. */ + if (srcSize > maxDictSize) { + ip = iend - maxDictSize; + src = ip; + srcSize = maxDictSize; + } + } - ZSTD_window_update(&ms->window, src, srcSize); + DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); + ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0); ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); + ms->forceNonContiguous = params->deterministicRefPrefix; - if (params->ldmParams.enableLdm && ls != NULL) { - ZSTD_window_update(&ls->window, src, srcSize); + if (loadLdmDict) { + ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0); ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base); } - /* Assert that we the ms params match the params we're being given */ - ZSTD_assertEqualCParams(params->cParams, ms->cParams); - if (srcSize <= HASH_READ_SIZE) return 0; - while (iend - ip > HASH_READ_SIZE) { - size_t const remaining = (size_t)(iend - ip); - size_t const chunk = MIN(remaining, ZSTD_CHUNKSIZE_MAX); - const BYTE* const ichunk = ip + chunk; + ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend); - ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, ichunk); + if (loadLdmDict) + ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); - if (params->ldmParams.enableLdm && ls != NULL) - ZSTD_ldm_fillHashTable(ls, (const BYTE*)src, (const BYTE*)src + srcSize, ¶ms->ldmParams); - - switch(params->cParams.strategy) - { - case ZSTD_fast: - ZSTD_fillHashTable(ms, ichunk, dtlm); - break; - case ZSTD_dfast: - ZSTD_fillDoubleHashTable(ms, ichunk, dtlm); - break; + switch(params->cParams.strategy) + { + case ZSTD_fast: + ZSTD_fillHashTable(ms, iend, dtlm); + break; + case ZSTD_dfast: + ZSTD_fillDoubleHashTable(ms, iend, dtlm); + break; - case ZSTD_greedy: - case ZSTD_lazy: - case ZSTD_lazy2: - if (chunk >= HASH_READ_SIZE && ms->dedicatedDictSearch) { - assert(chunk == remaining); /* must load everything in one go */ - ZSTD_dedicatedDictSearch_lazy_loadDictionary(ms, ichunk-HASH_READ_SIZE); - } else if (chunk >= HASH_READ_SIZE) { - ZSTD_insertAndFindFirstIndex(ms, ichunk-HASH_READ_SIZE); + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: + assert(srcSize >= HASH_READ_SIZE); + if (ms->dedicatedDictSearch) { + assert(ms->chainTable != NULL); + ZSTD_dedicatedDictSearch_lazy_loadDictionary(ms, iend-HASH_READ_SIZE); + } else { + assert(params->useRowMatchFinder != ZSTD_urm_auto); + if (params->useRowMatchFinder == ZSTD_urm_enableRowMatchFinder) { + size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog) * sizeof(U16); + ZSTD_memset(ms->tagTable, 0, tagTableSize); + ZSTD_row_update(ms, iend-HASH_READ_SIZE); + DEBUGLOG(4, "Using row-based hash table for lazy dict"); + } else { + ZSTD_insertAndFindFirstIndex(ms, iend-HASH_READ_SIZE); + DEBUGLOG(4, "Using chain-based hash table for lazy dict"); } - break; - - case ZSTD_btlazy2: /* we want the dictionary table fully sorted */ - case ZSTD_btopt: - case ZSTD_btultra: - case ZSTD_btultra2: - if (chunk >= HASH_READ_SIZE) - ZSTD_updateTree(ms, ichunk-HASH_READ_SIZE, ichunk); - break; - - default: - assert(0); /* not possible : not a valid strategy id */ } + break; - ip = ichunk; + case ZSTD_btlazy2: /* we want the dictionary table fully sorted */ + case ZSTD_btopt: + case ZSTD_btultra: + case ZSTD_btultra2: + assert(srcSize >= HASH_READ_SIZE); + ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend); + break; + + default: + assert(0); /* not possible : not a valid strategy id */ } ms->nextToUpdate = (U32)(iend - ms->window.base); @@ -3277,7 +4413,6 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, const BYTE* const dictEnd = dictPtr + dictSize; size_t dictID; size_t eSize; - ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<= 8); assert(MEM_readLE32(dictPtr) == ZSTD_MAGIC_DICTIONARY); @@ -3348,6 +4483,10 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params, U64 pledgedSrcSize, ZSTD_buffered_policy_e zbuff) { + size_t const dictContentSize = cdict ? cdict->dictContentSize : dictSize; +#if ZSTD_TRACE + cctx->traceCtx = (ZSTD_trace_compress_begin != NULL) ? ZSTD_trace_compress_begin(cctx) : 0; +#endif DEBUGLOG(4, "ZSTD_compressBegin_internal: wlog=%u", params->cParams.windowLog); /* params are supposed to be fully validated at this point */ assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams))); @@ -3362,7 +4501,8 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, return ZSTD_resetCCtx_usingCDict(cctx, cdict, params, pledgedSrcSize, zbuff); } - FORWARD_IF_ERROR( ZSTD_resetCCtx_internal(cctx, *params, pledgedSrcSize, + FORWARD_IF_ERROR( ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize, + dictContentSize, ZSTDcrp_makeClean, zbuff) , ""); { size_t const dictID = cdict ? ZSTD_compress_insertDictionary( @@ -3377,6 +4517,7 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); assert(dictID <= UINT_MAX); cctx->dictID = (U32)dictID; + cctx->dictContentSize = dictContentSize; } return 0; } @@ -3405,8 +4546,8 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize) { - ZSTD_CCtx_params const cctxParams = - ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, ¶ms); + ZSTD_CCtx_params cctxParams; + ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, ZSTD_NO_CLEVEL); return ZSTD_compressBegin_advanced_internal(cctx, dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL /*cdict*/, @@ -3415,9 +4556,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) { - ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); - ZSTD_CCtx_params const cctxParams = - ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, ¶ms); + ZSTD_CCtx_params cctxParams; + { + ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); + ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel); + } DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize); return ZSTD_compressBegin_internal(cctx, dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered); @@ -3471,6 +4614,30 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) return op-ostart; } +void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize) +{ +#if ZSTD_TRACE + if (cctx->traceCtx && ZSTD_trace_compress_end != NULL) { + int const streaming = cctx->inBuffSize > 0 || cctx->outBuffSize > 0 || cctx->appliedParams.nbWorkers > 0; + ZSTD_Trace trace; + ZSTD_memset(&trace, 0, sizeof(trace)); + trace.version = ZSTD_VERSION_NUMBER; + trace.streaming = streaming; + trace.dictionaryID = cctx->dictID; + trace.dictionarySize = cctx->dictContentSize; + trace.uncompressedSize = cctx->consumedSrcSize; + trace.compressedSize = cctx->producedCSize + extraCSize; + trace.params = &cctx->appliedParams; + trace.cctx = cctx; + ZSTD_trace_compress_end(cctx->traceCtx, &trace); + } + cctx->traceCtx = 0; +#else + (void)cctx; + (void)extraCSize; +#endif +} + size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) @@ -3493,25 +4660,10 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, (unsigned)cctx->pledgedSrcSizePlusOne-1, (unsigned)cctx->consumedSrcSize); } + ZSTD_CCtx_trace(cctx, endResult); return cSize + endResult; } -static size_t ZSTD_compress_internal (ZSTD_CCtx* cctx, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize, - const void* dict,size_t dictSize, - const ZSTD_parameters* params) -{ - ZSTD_CCtx_params const cctxParams = - ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, params); - DEBUGLOG(4, "ZSTD_compress_internal"); - return ZSTD_compress_advanced_internal(cctx, - dst, dstCapacity, - src, srcSize, - dict, dictSize, - &cctxParams); -} - size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, @@ -3520,11 +4672,12 @@ size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, { DEBUGLOG(4, "ZSTD_compress_advanced"); FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); - return ZSTD_compress_internal(cctx, - dst, dstCapacity, - src, srcSize, - dict, dictSize, - ¶ms); + ZSTD_CCtxParams_init_internal(&cctx->simpleApiParams, ¶ms, ZSTD_NO_CLEVEL); + return ZSTD_compress_advanced_internal(cctx, + dst, dstCapacity, + src, srcSize, + dict, dictSize, + &cctx->simpleApiParams); } /* Internal */ @@ -3548,11 +4701,13 @@ size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) { - ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, srcSize, dict ? dictSize : 0, ZSTD_cpm_noAttachDict); - ZSTD_CCtx_params cctxParams = ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, ¶ms); + { + ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, srcSize, dict ? dictSize : 0, ZSTD_cpm_noAttachDict); + assert(params.fParams.contentSizeFlag == 1); + ZSTD_CCtxParams_init_internal(&cctx->simpleApiParams, ¶ms, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT: compressionLevel); + } DEBUGLOG(4, "ZSTD_compress_usingDict (srcSize=%u)", (unsigned)srcSize); - assert(params.fParams.contentSizeFlag == 1); - return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, dict, dictSize, &cctxParams); + return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, dict, dictSize, &cctx->simpleApiParams); } size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, @@ -3596,7 +4751,10 @@ size_t ZSTD_estimateCDictSize_advanced( DEBUGLOG(5, "sizeof(ZSTD_CDict) : %u", (unsigned)sizeof(ZSTD_CDict)); return ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) - + ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0) + /* enableDedicatedDictSearch == 1 ensures that CDict estimation will not be too small + * in case we are using DDS with row-hash. */ + + ZSTD_sizeof_matchState(&cParams, ZSTD_resolveRowMatchFinderMode(ZSTD_urm_auto, &cParams), + /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void *)))); } @@ -3627,9 +4785,6 @@ static size_t ZSTD_initCDict_internal( assert(!ZSTD_checkCParams(params.cParams)); cdict->matchState.cParams = params.cParams; cdict->matchState.dedicatedDictSearch = params.enableDedicatedDictSearch; - if (cdict->matchState.dedicatedDictSearch && dictSize > ZSTD_CHUNKSIZE_MAX) { - cdict->matchState.dedicatedDictSearch = 0; - } if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dictBuffer) || (!dictSize)) { cdict->dictContent = dictBuffer; } else { @@ -3650,6 +4805,7 @@ static size_t ZSTD_initCDict_internal( &cdict->matchState, &cdict->workspace, ¶ms.cParams, + params.useRowMatchFinder, ZSTDcrp_makeClean, ZSTDirp_reset, ZSTD_resetTarget_CDict), ""); @@ -3673,14 +4829,17 @@ static size_t ZSTD_initCDict_internal( static ZSTD_CDict* ZSTD_createCDict_advanced_internal(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, - ZSTD_compressionParameters cParams, ZSTD_customMem customMem) + ZSTD_compressionParameters cParams, + ZSTD_useRowMatchFinderMode_e useRowMatchFinder, + U32 enableDedicatedDictSearch, + ZSTD_customMem customMem) { if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; { size_t const workspaceSize = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) + - ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0) + + ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, enableDedicatedDictSearch, /* forCCtx */ 0) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*)))); void* const workspace = ZSTD_customMalloc(workspaceSize, customMem); @@ -3698,8 +4857,8 @@ static ZSTD_CDict* ZSTD_createCDict_advanced_internal(size_t dictSize, assert(cdict != NULL); ZSTD_cwksp_move(&cdict->workspace, &ws); cdict->customMem = customMem; - cdict->compressionLevel = 0; /* signals advanced API usage */ - + cdict->compressionLevel = ZSTD_NO_CLEVEL; /* signals advanced API usage */ + cdict->useRowMatchFinder = useRowMatchFinder; return cdict; } } @@ -3751,10 +4910,13 @@ ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced2( &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); } + DEBUGLOG(3, "ZSTD_createCDict_advanced2: DDS: %u", cctxParams.enableDedicatedDictSearch); cctxParams.cParams = cParams; + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); cdict = ZSTD_createCDict_advanced_internal(dictSize, dictLoadMethod, cctxParams.cParams, + cctxParams.useRowMatchFinder, cctxParams.enableDedicatedDictSearch, customMem); if (ZSTD_isError( ZSTD_initCDict_internal(cdict, @@ -3823,7 +4985,9 @@ const ZSTD_CDict* ZSTD_initStaticCDict( ZSTD_dictContentType_e dictContentType, ZSTD_compressionParameters cParams) { - size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0); + ZSTD_useRowMatchFinderMode_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(ZSTD_urm_auto, &cParams); + /* enableDedicatedDictSearch == 1 ensures matchstate is not too small in case this CDict will be used for DDS + row hash */ + size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0); size_t const neededSize = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*)))) @@ -3848,6 +5012,8 @@ const ZSTD_CDict* ZSTD_initStaticCDict( ZSTD_CCtxParams_init(¶ms, 0); params.cParams = cParams; + params.useRowMatchFinder = useRowMatchFinder; + cdict->useRowMatchFinder = useRowMatchFinder; if (ZSTD_isError( ZSTD_initCDict_internal(cdict, dict, dictSize, @@ -3874,62 +5040,88 @@ unsigned ZSTD_getDictID_fromCDict(const ZSTD_CDict* cdict) return cdict->dictID; } - -/* ZSTD_compressBegin_usingCDict_advanced() : - * cdict must be != NULL */ -size_t ZSTD_compressBegin_usingCDict_advanced( +/* ZSTD_compressBegin_usingCDict_internal() : + * Implementation of various ZSTD_compressBegin_usingCDict* functions. + */ +static size_t ZSTD_compressBegin_usingCDict_internal( ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize) { - DEBUGLOG(4, "ZSTD_compressBegin_usingCDict_advanced"); + ZSTD_CCtx_params cctxParams; + DEBUGLOG(4, "ZSTD_compressBegin_usingCDict_internal"); RETURN_ERROR_IF(cdict==NULL, dictionary_wrong, "NULL pointer!"); - { ZSTD_CCtx_params params = cctx->requestedParams; + /* Initialize the cctxParams from the cdict */ + { + ZSTD_parameters params; + params.fParams = fParams; params.cParams = ( pledgedSrcSize < ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF || pledgedSrcSize < cdict->dictContentSize * ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN - || cdict->compressionLevel == 0 ) - && (params.attachDictPref != ZSTD_dictForceLoad) ? + || cdict->compressionLevel == 0 ) ? ZSTD_getCParamsFromCDict(cdict) : ZSTD_getCParams(cdict->compressionLevel, pledgedSrcSize, cdict->dictContentSize); - /* Increase window log to fit the entire dictionary and source if the - * source size is known. Limit the increase to 19, which is the - * window log for compression level 1 with the largest source size. - */ - if (pledgedSrcSize != ZSTD_CONTENTSIZE_UNKNOWN) { - U32 const limitedSrcSize = (U32)MIN(pledgedSrcSize, 1U << 19); - U32 const limitedSrcLog = limitedSrcSize > 1 ? ZSTD_highbit32(limitedSrcSize - 1) + 1 : 1; - params.cParams.windowLog = MAX(params.cParams.windowLog, limitedSrcLog); - } - params.fParams = fParams; - return ZSTD_compressBegin_internal(cctx, - NULL, 0, ZSTD_dct_auto, ZSTD_dtlm_fast, - cdict, - ¶ms, pledgedSrcSize, - ZSTDb_not_buffered); + ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, cdict->compressionLevel); + } + /* Increase window log to fit the entire dictionary and source if the + * source size is known. Limit the increase to 19, which is the + * window log for compression level 1 with the largest source size. + */ + if (pledgedSrcSize != ZSTD_CONTENTSIZE_UNKNOWN) { + U32 const limitedSrcSize = (U32)MIN(pledgedSrcSize, 1U << 19); + U32 const limitedSrcLog = limitedSrcSize > 1 ? ZSTD_highbit32(limitedSrcSize - 1) + 1 : 1; + cctxParams.cParams.windowLog = MAX(cctxParams.cParams.windowLog, limitedSrcLog); } + return ZSTD_compressBegin_internal(cctx, + NULL, 0, ZSTD_dct_auto, ZSTD_dtlm_fast, + cdict, + &cctxParams, pledgedSrcSize, + ZSTDb_not_buffered); +} + + +/* ZSTD_compressBegin_usingCDict_advanced() : + * This function is DEPRECATED. + * cdict must be != NULL */ +size_t ZSTD_compressBegin_usingCDict_advanced( + ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, + ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize) +{ + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, pledgedSrcSize); } /* ZSTD_compressBegin_usingCDict() : - * pledgedSrcSize=0 means "unknown" - * if pledgedSrcSize>0, it will enable contentSizeFlag */ + * cdict must be != NULL */ size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) { ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; - DEBUGLOG(4, "ZSTD_compressBegin_usingCDict : dictIDFlag == %u", !fParams.noDictIDFlag); - return ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); } -size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, +/*! ZSTD_compress_usingCDict_internal(): + * Implementation of various ZSTD_compress_usingCDict* functions. + */ +static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) { - FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ + FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); } +/*! ZSTD_compress_usingCDict_advanced(): + * This function is DEPRECATED. + */ +size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) +{ + return ZSTD_compress_usingCDict_internal(cctx, dst, dstCapacity, src, srcSize, cdict, fParams); +} + /*! ZSTD_compress_usingCDict() : * Compression using a digested Dictionary. * Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times. @@ -3941,7 +5133,7 @@ size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) { ZSTD_frameParameters const fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; - return ZSTD_compress_usingCDict_advanced(cctx, dst, dstCapacity, src, srcSize, cdict, fParams); + return ZSTD_compress_usingCDict_internal(cctx, dst, dstCapacity, src, srcSize, cdict, fParams); } @@ -4071,7 +5263,7 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); FORWARD_IF_ERROR( ZSTD_checkCParams(params.cParams) , ""); - zcs->requestedParams = ZSTD_assignParamsToCCtxParams(&zcs->requestedParams, ¶ms); + ZSTD_CCtxParams_setZstdParams(&zcs->requestedParams, ¶ms); FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) , ""); return 0; } @@ -4351,8 +5543,13 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. */ ZSTD_memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict)); /* single usage */ assert(prefixDict.dict==NULL || cctx->cdict==NULL); /* only one can be set */ - if (cctx->cdict) - params.compressionLevel = cctx->cdict->compressionLevel; /* let cdict take priority in terms of compression level */ + if (cctx->cdict && !cctx->localDict.cdict) { + /* Let the cdict's compression level take priority over the requested params. + * But do not take the cdict's compression level if the "cdict" is actually a localDict + * generated from ZSTD_initLocalDict(). + */ + params.compressionLevel = cctx->cdict->compressionLevel; + } DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-fix pledgedSrcSize */ { @@ -4371,11 +5568,21 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, params.ldmParams.enableLdm = 1; } + if (ZSTD_CParams_useBlockSplitter(¶ms.cParams)) { + DEBUGLOG(4, "Block splitter enabled by default (window size >= 128K, strategy >= btopt)"); + params.splitBlocks = 1; + } + + params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, ¶ms.cParams); + #ifdef ZSTD_MULTITHREAD if ((cctx->pledgedSrcSizePlusOne-1) <= ZSTDMT_JOBSIZE_MIN) { params.nbWorkers = 0; /* do not invoke multi-threading when src size is too small */ } if (params.nbWorkers > 0) { +#if ZSTD_TRACE + cctx->traceCtx = (ZSTD_trace_compress_begin != NULL) ? ZSTD_trace_compress_begin(cctx) : 0; +#endif /* mt context creation */ if (cctx->mtctx == NULL) { DEBUGLOG(4, "ZSTD_compressStream2: creating new mtctx for nbWorkers=%u", @@ -4389,6 +5596,10 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, cctx->mtctx, prefixDict.dict, prefixDict.dictSize, prefixDict.dictContentType, cctx->cdict, params, cctx->pledgedSrcSizePlusOne-1) , ""); + cctx->dictID = cctx->cdict ? cctx->cdict->dictID : 0; + cctx->dictContentSize = cctx->cdict ? cctx->cdict->dictContentSize : prefixDict.dictSize; + cctx->consumedSrcSize = 0; + cctx->producedCSize = 0; cctx->streamStage = zcss_load; cctx->appliedParams = params; } else @@ -4450,8 +5661,12 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, size_t const ipos = input->pos; size_t const opos = output->pos; flushMin = ZSTDMT_compressStream_generic(cctx->mtctx, output, input, endOp); + cctx->consumedSrcSize += (U64)(input->pos - ipos); + cctx->producedCSize += (U64)(output->pos - opos); if ( ZSTD_isError(flushMin) || (endOp == ZSTD_e_end && flushMin == 0) ) { /* compression completed */ + if (flushMin == 0) + ZSTD_CCtx_trace(cctx, 0); ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only); } FORWARD_IF_ERROR(flushMin, "ZSTDMT_compressStream_generic failed"); @@ -4632,6 +5847,13 @@ static size_t ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, ZS return 0; } +static size_t ZSTD_convertBlockSequencesToSeqStore(ZSTD_CCtx* cctx, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* src, size_t srcSize) { + ZSTD_sequencePosition dummySeqPos = {0, 0, 0}; + return ZSTD_copySequencesToSeqStoreExplicitBlockDelim(cctx, &dummySeqPos, inSeqs, inSeqsSize, src, srcSize); +} + /* Returns the number of bytes to move the current read position back by. Only non-zero * if we ended up splitting a sequence. Otherwise, it may return a ZSTD error if something * went wrong. @@ -4834,7 +6056,7 @@ static size_t ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, continue; } - compressedSeqsSize = ZSTD_entropyCompressSequences(&cctx->seqStore, + compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore, &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, &cctx->appliedParams, op + ZSTD_blockHeaderSize /* Leave space for block header */, dstCapacity - ZSTD_blockHeaderSize, @@ -4866,7 +6088,7 @@ static size_t ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, } else { U32 cBlockHeader; /* Error checking and repcodes update */ - ZSTD_confirmRepcodesAndEntropyTables(cctx); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&cctx->blockState); if (cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; @@ -4967,6 +6189,7 @@ size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) #define ZSTD_MAX_CLEVEL 22 int ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; } int ZSTD_minCLevel(void) { return (int)-ZSTD_TARGETLENGTH_MAX; } +int ZSTD_defaultCLevel(void) { return ZSTD_CLEVEL_DEFAULT; } static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL+1] = { { /* "default" - for any srcSize > 256 KB */ @@ -4976,15 +6199,15 @@ static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEV { 20, 15, 16, 1, 6, 0, ZSTD_fast }, /* level 2 */ { 21, 16, 17, 1, 5, 0, ZSTD_dfast }, /* level 3 */ { 21, 18, 18, 1, 5, 0, ZSTD_dfast }, /* level 4 */ - { 21, 18, 19, 2, 5, 2, ZSTD_greedy }, /* level 5 */ - { 21, 19, 19, 3, 5, 4, ZSTD_greedy }, /* level 6 */ - { 21, 19, 19, 3, 5, 8, ZSTD_lazy }, /* level 7 */ - { 21, 19, 19, 3, 5, 16, ZSTD_lazy2 }, /* level 8 */ - { 21, 19, 20, 4, 5, 16, ZSTD_lazy2 }, /* level 9 */ - { 22, 20, 21, 4, 5, 16, ZSTD_lazy2 }, /* level 10 */ - { 22, 21, 22, 4, 5, 16, ZSTD_lazy2 }, /* level 11 */ - { 22, 21, 22, 5, 5, 16, ZSTD_lazy2 }, /* level 12 */ - { 22, 21, 22, 5, 5, 32, ZSTD_btlazy2 }, /* level 13 */ + { 21, 18, 19, 4, 5, 2, ZSTD_greedy }, /* level 5 */ + { 21, 19, 20, 5, 5, 4, ZSTD_greedy }, /* level 6 */ + { 21, 19, 20, 4, 5, 8, ZSTD_lazy }, /* level 7 */ + { 21, 19, 20, 5, 5, 16, ZSTD_lazy }, /* level 8 */ + { 21, 20, 21, 4, 5, 16, ZSTD_lazy2 }, /* level 9 */ + { 22, 21, 22, 4, 5, 16, ZSTD_lazy2 }, /* level 10 */ + { 22, 21, 22, 5, 5, 16, ZSTD_lazy2 }, /* level 11 */ + { 22, 21, 22, 6, 5, 32, ZSTD_lazy2 }, /* level 12 */ + { 22, 22, 22, 4, 5, 32, ZSTD_btlazy2 }, /* level 13 */ { 22, 22, 23, 5, 5, 32, ZSTD_btlazy2 }, /* level 14 */ { 22, 23, 23, 6, 5, 32, ZSTD_btlazy2 }, /* level 15 */ { 22, 22, 22, 5, 5, 48, ZSTD_btopt }, /* level 16 */ @@ -5001,8 +6224,8 @@ static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEV { 18, 13, 14, 1, 6, 0, ZSTD_fast }, /* level 1 */ { 18, 14, 14, 1, 5, 0, ZSTD_dfast }, /* level 2 */ { 18, 16, 16, 1, 4, 0, ZSTD_dfast }, /* level 3 */ - { 18, 16, 17, 2, 5, 2, ZSTD_greedy }, /* level 4.*/ - { 18, 18, 18, 3, 5, 2, ZSTD_greedy }, /* level 5.*/ + { 18, 16, 17, 3, 5, 2, ZSTD_greedy }, /* level 4.*/ + { 18, 17, 18, 5, 5, 2, ZSTD_greedy }, /* level 5.*/ { 18, 18, 19, 3, 5, 4, ZSTD_lazy }, /* level 6.*/ { 18, 18, 19, 4, 4, 4, ZSTD_lazy }, /* level 7 */ { 18, 18, 19, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ @@ -5029,11 +6252,11 @@ static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEV { 17, 15, 16, 2, 5, 0, ZSTD_dfast }, /* level 3 */ { 17, 17, 17, 2, 4, 0, ZSTD_dfast }, /* level 4 */ { 17, 16, 17, 3, 4, 2, ZSTD_greedy }, /* level 5 */ - { 17, 17, 17, 3, 4, 4, ZSTD_lazy }, /* level 6 */ - { 17, 17, 17, 3, 4, 8, ZSTD_lazy2 }, /* level 7 */ - { 17, 17, 17, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ - { 17, 17, 17, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ - { 17, 17, 17, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ + { 17, 16, 17, 3, 4, 4, ZSTD_lazy }, /* level 6 */ + { 17, 16, 17, 3, 4, 8, ZSTD_lazy2 }, /* level 7 */ + { 17, 16, 17, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ + { 17, 16, 17, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ + { 17, 16, 17, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ { 17, 17, 17, 5, 4, 8, ZSTD_btlazy2 }, /* level 11 */ { 17, 18, 17, 7, 4, 12, ZSTD_btlazy2 }, /* level 12 */ { 17, 18, 17, 3, 4, 12, ZSTD_btopt }, /* level 13.*/ @@ -5099,7 +6322,10 @@ static ZSTD_compressionParameters ZSTD_dedicatedDictSearch_getCParams(int const static int ZSTD_dedicatedDictSearch_isSupported( ZSTD_compressionParameters const* cParams) { - return (cParams->strategy >= ZSTD_greedy) && (cParams->strategy <= ZSTD_lazy2); + return (cParams->strategy >= ZSTD_greedy) + && (cParams->strategy <= ZSTD_lazy2) + && (cParams->hashLog > cParams->chainLog) + && (cParams->chainLog <= 24); } /** @@ -5117,6 +6343,9 @@ static void ZSTD_dedicatedDictSearch_revertCParams( case ZSTD_lazy: case ZSTD_lazy2: cParams->hashLog -= ZSTD_LAZY_DDSS_BUCKET_LOG; + if (cParams->hashLog < ZSTD_HASHLOG_MIN) { + cParams->hashLog = ZSTD_HASHLOG_MIN; + } break; case ZSTD_btlazy2: case ZSTD_btopt: @@ -5165,6 +6394,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, else row = compressionLevel; { ZSTD_compressionParameters cp = ZSTD_defaultCParameters[tableID][row]; + DEBUGLOG(5, "ZSTD_getCParams_internal selected tableID: %u row: %u strat: %u", tableID, row, (U32)cp.strategy); /* acceleration factor */ if (compressionLevel < 0) { int const clampedCompressionLevel = MAX(ZSTD_minCLevel(), compressionLevel); diff --git a/zstd_compress_internal.h b/zstd_compress_internal.h index 80d150a..f24da24 100644 --- a/zstd_compress_internal.h +++ b/zstd_compress_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -63,7 +63,7 @@ typedef struct { } ZSTD_localDict; typedef struct { - HUF_CElt CTable[HUF_CTABLE_SIZE_U32(255)]; + HUF_CElt CTable[HUF_CTABLE_SIZE_ST(255)]; HUF_repeat repeatMode; } ZSTD_hufCTables_t; @@ -81,6 +81,53 @@ typedef struct { ZSTD_fseCTables_t fse; } ZSTD_entropyCTables_t; +/*********************************************** +* Entropy buffer statistics structs and funcs * +***********************************************/ +/** ZSTD_hufCTablesMetadata_t : + * Stores Literals Block Type for a super-block in hType, and + * huffman tree description in hufDesBuffer. + * hufDesSize refers to the size of huffman tree description in bytes. + * This metadata is populated in ZSTD_buildBlockEntropyStats_literals() */ +typedef struct { + symbolEncodingType_e hType; + BYTE hufDesBuffer[ZSTD_MAX_HUF_HEADER_SIZE]; + size_t hufDesSize; +} ZSTD_hufCTablesMetadata_t; + +/** ZSTD_fseCTablesMetadata_t : + * Stores symbol compression modes for a super-block in {ll, ol, ml}Type, and + * fse tables in fseTablesBuffer. + * fseTablesSize refers to the size of fse tables in bytes. + * This metadata is populated in ZSTD_buildBlockEntropyStats_sequences() */ +typedef struct { + symbolEncodingType_e llType; + symbolEncodingType_e ofType; + symbolEncodingType_e mlType; + BYTE fseTablesBuffer[ZSTD_MAX_FSE_HEADERS_SIZE]; + size_t fseTablesSize; + size_t lastCountSize; /* This is to account for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */ +} ZSTD_fseCTablesMetadata_t; + +typedef struct { + ZSTD_hufCTablesMetadata_t hufMetadata; + ZSTD_fseCTablesMetadata_t fseMetadata; +} ZSTD_entropyCTablesMetadata_t; + +/** ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. + * @return : 0 on success or error code */ +size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize); + +/********************************* +* Compression internals structs * +*********************************/ + typedef struct { U32 off; /* Offset code (offset + ZSTD_REP_MOVE) for the match */ U32 len; /* Raw length of match */ @@ -141,14 +188,23 @@ typedef struct { } ZSTD_compressedBlockState_t; typedef struct { - BYTE const* nextSrc; /* next block here to continue on current prefix */ - BYTE const* base; /* All regular indexes relative to this position */ - BYTE const* dictBase; /* extDict indexes relative to this position */ - U32 dictLimit; /* below that point, need extDict */ - U32 lowLimit; /* below that point, no more valid data */ + BYTE const* nextSrc; /* next block here to continue on current prefix */ + BYTE const* base; /* All regular indexes relative to this position */ + BYTE const* dictBase; /* extDict indexes relative to this position */ + U32 dictLimit; /* below that point, need extDict */ + U32 lowLimit; /* below that point, no more valid data */ + U32 nbOverflowCorrections; /* Number of times overflow correction has run since + * ZSTD_window_init(). Useful for debugging coredumps + * and for ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY. + */ } ZSTD_window_t; +#define ZSTD_WINDOW_START_INDEX 2 + typedef struct ZSTD_matchState_t ZSTD_matchState_t; + +#define ZSTD_ROW_HASH_CACHE_SIZE 8 /* Size of prefetching hash cache for row-based matchfinder */ + struct ZSTD_matchState_t { ZSTD_window_t window; /* State for window round buffer management */ U32 loadedDictEnd; /* index of end of dictionary, within context's referential. @@ -160,9 +216,17 @@ struct ZSTD_matchState_t { */ U32 nextToUpdate; /* index from which to continue table update */ U32 hashLog3; /* dispatch table for matches of len==3 : larger == faster, more memory */ + + U32 rowHashLog; /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/ + U16* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ + U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */ + U32* hashTable; U32* hashTable3; U32* chainTable; + + U32 forceNonContiguous; /* Non-zero if we should force non-contiguous load for the next window update. */ + int dedicatedDictSearch; /* Indicates whether this matchState is using the * dedicated dictionary search structure. */ @@ -183,13 +247,22 @@ typedef struct { U32 checksum; } ldmEntry_t; +typedef struct { + BYTE const* split; + U32 hash; + U32 checksum; + ldmEntry_t* bucket; +} ldmMatchCandidate_t; + +#define LDM_BATCH_SIZE 64 + typedef struct { ZSTD_window_t window; /* State for the window round buffer management */ ldmEntry_t* hashTable; U32 loadedDictEnd; BYTE* bucketOffsets; /* Next position in bucket to insert entry */ - U64 hashPower; /* Used to compute the rolling hash. - * Depends on ldmParams.minMatchLength */ + size_t splitIndices[LDM_BATCH_SIZE]; + ldmMatchCandidate_t matchCandidates[LDM_BATCH_SIZE]; } ldmState_t; typedef struct { @@ -246,6 +319,15 @@ struct ZSTD_CCtx_params_s { ZSTD_sequenceFormat_e blockDelimiters; int validateSequences; + /* Block splitting */ + int splitBlocks; + + /* Param for deciding whether to use row-based matchfinder */ + ZSTD_useRowMatchFinderMode_e useRowMatchFinder; + + /* Always load a dictionary in ext-dict mode (not prefix mode)? */ + int deterministicRefPrefix; + /* Internal use, for createCCtxParams() and freeCCtxParams() only */ ZSTD_customMem customMem; }; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */ @@ -269,7 +351,9 @@ struct ZSTD_CCtx_s { int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */ ZSTD_CCtx_params requestedParams; ZSTD_CCtx_params appliedParams; + ZSTD_CCtx_params simpleApiParams; /* Param storage used by the simple API - not sticky. Must only be used in top-level simple API functions for storage. */ U32 dictID; + size_t dictContentSize; ZSTD_cwksp workspace; /* manages buffer for dynamic allocations */ size_t blockSize; @@ -321,6 +405,11 @@ struct ZSTD_CCtx_s { #ifdef ZSTD_MULTITHREAD ZSTDMT_CCtx* mtctx; #endif + + /* Tracing */ +#if ZSTD_TRACE + ZSTD_TraceCtx traceCtx; +#endif }; typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e; @@ -355,7 +444,7 @@ typedef enum { typedef size_t (*ZSTD_blockCompressor) ( ZSTD_matchState_t* bs, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_dictMode_e dictMode); +ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_useRowMatchFinderMode_e rowMatchfinderMode, ZSTD_dictMode_e dictMode); MEM_STATIC U32 ZSTD_LLcode(U32 litLength) @@ -532,8 +621,8 @@ void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const BYTE* litera /* literal Length */ if (litLength>0xFFFF) { - assert(seqStorePtr->longLengthID == 0); /* there can only be a single long length */ - seqStorePtr->longLengthID = 1; + assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ + seqStorePtr->longLengthType = ZSTD_llt_literalLength; seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); } seqStorePtr->sequences[0].litLength = (U16)litLength; @@ -543,8 +632,8 @@ void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const BYTE* litera /* match Length */ if (mlBase>0xFFFF) { - assert(seqStorePtr->longLengthID == 0); /* there can only be a single long length */ - seqStorePtr->longLengthID = 2; + assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ + seqStorePtr->longLengthType = ZSTD_llt_matchLength; seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); } seqStorePtr->sequences[0].matchLength = (U16)mlBase; @@ -795,6 +884,13 @@ MEM_STATIC void ZSTD_window_clear(ZSTD_window_t* window) window->dictLimit = end; } +MEM_STATIC U32 ZSTD_window_isEmpty(ZSTD_window_t const window) +{ + return window.dictLimit == ZSTD_WINDOW_START_INDEX && + window.lowLimit == ZSTD_WINDOW_START_INDEX && + (window.nextSrc - window.base) == ZSTD_WINDOW_START_INDEX; +} + /** * ZSTD_window_hasExtDict(): * Returns non-zero if the window has a non-empty extDict. @@ -818,15 +914,69 @@ MEM_STATIC ZSTD_dictMode_e ZSTD_matchState_dictMode(const ZSTD_matchState_t *ms) ZSTD_noDict; } +/* Defining this macro to non-zero tells zstd to run the overflow correction + * code much more frequently. This is very inefficient, and should only be + * used for tests and fuzzers. + */ +#ifndef ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY +# ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +# define ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY 1 +# else +# define ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY 0 +# endif +#endif + +/** + * ZSTD_window_canOverflowCorrect(): + * Returns non-zero if the indices are large enough for overflow correction + * to work correctly without impacting compression ratio. + */ +MEM_STATIC U32 ZSTD_window_canOverflowCorrect(ZSTD_window_t const window, + U32 cycleLog, + U32 maxDist, + U32 loadedDictEnd, + void const* src) +{ + U32 const cycleSize = 1u << cycleLog; + U32 const curr = (U32)((BYTE const*)src - window.base); + U32 const minIndexToOverflowCorrect = cycleSize + MAX(maxDist, cycleSize); + + /* Adjust the min index to backoff the overflow correction frequency, + * so we don't waste too much CPU in overflow correction. If this + * computation overflows we don't really care, we just need to make + * sure it is at least minIndexToOverflowCorrect. + */ + U32 const adjustment = window.nbOverflowCorrections + 1; + U32 const adjustedIndex = MAX(minIndexToOverflowCorrect * adjustment, + minIndexToOverflowCorrect); + U32 const indexLargeEnough = curr > adjustedIndex; + + /* Only overflow correct early if the dictionary is invalidated already, + * so we don't hurt compression ratio. + */ + U32 const dictionaryInvalidated = curr > maxDist + loadedDictEnd; + + return indexLargeEnough && dictionaryInvalidated; +} + /** * ZSTD_window_needOverflowCorrection(): * Returns non-zero if the indices are getting too large and need overflow * protection. */ MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window, + U32 cycleLog, + U32 maxDist, + U32 loadedDictEnd, + void const* src, void const* srcEnd) { U32 const curr = (U32)((BYTE const*)srcEnd - window.base); + if (ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY) { + if (ZSTD_window_canOverflowCorrect(window, cycleLog, maxDist, loadedDictEnd, src)) { + return 1; + } + } return curr > ZSTD_CURRENT_MAX; } @@ -838,7 +988,6 @@ MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window, * * The least significant cycleLog bits of the indices must remain the same, * which may be 0. Every index up to maxDist in the past must be valid. - * NOTE: (maxDist & cycleMask) must be zero. */ MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, U32 maxDist, void const* src) @@ -862,17 +1011,25 @@ MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, * 3. (cctx->lowLimit + 1< 3<<29 + 1<base); U32 const currentCycle0 = curr & cycleMask; /* Exclude zero so that newCurrent - maxDist >= 1. */ - U32 const currentCycle1 = currentCycle0 == 0 ? (1U << cycleLog) : currentCycle0; - U32 const newCurrent = currentCycle1 + maxDist; + U32 const currentCycle1 = currentCycle0 == 0 ? cycleSize : currentCycle0; + U32 const newCurrent = currentCycle1 + MAX(maxDist, cycleSize); U32 const correction = curr - newCurrent; - assert((maxDist & cycleMask) == 0); + /* maxDist must be a power of two so that: + * (newCurrent & cycleMask) == (curr & cycleMask) + * This is required to not corrupt the chains / binary tree. + */ + assert((maxDist & (maxDist - 1)) == 0); + assert((curr & cycleMask) == (newCurrent & cycleMask)); assert(curr > newCurrent); - /* Loose bound, should be around 1<<29 (see above) */ - assert(correction > 1<<28); + if (!ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY) { + /* Loose bound, should be around 1<<29 (see above) */ + assert(correction > 1<<28); + } window->base += correction; window->dictBase += correction; @@ -888,6 +1045,8 @@ MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, assert(window->lowLimit <= newCurrent); assert(window->dictLimit <= newCurrent); + ++window->nbOverflowCorrections; + DEBUGLOG(4, "Correction of 0x%x bytes to lowLimit=0x%x", correction, window->lowLimit); return correction; @@ -992,11 +1151,13 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) { ZSTD_memset(window, 0, sizeof(*window)); - window->base = (BYTE const*)""; - window->dictBase = (BYTE const*)""; - window->dictLimit = 1; /* start from 1, so that 1st position is valid */ - window->lowLimit = 1; /* it ensures first and later CCtx usages compress the same */ - window->nextSrc = window->base + 1; /* see issue #1241 */ + window->base = (BYTE const*)" "; + window->dictBase = (BYTE const*)" "; + ZSTD_STATIC_ASSERT(ZSTD_DUBT_UNSORTED_MARK < ZSTD_WINDOW_START_INDEX); /* Start above ZSTD_DUBT_UNSORTED_MARK */ + window->dictLimit = ZSTD_WINDOW_START_INDEX; /* start from >0, so that 1st position is valid */ + window->lowLimit = ZSTD_WINDOW_START_INDEX; /* it ensures first and later CCtx usages compress the same */ + window->nextSrc = window->base + ZSTD_WINDOW_START_INDEX; /* see issue #1241 */ + window->nbOverflowCorrections = 0; } /** @@ -1007,7 +1168,8 @@ MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) { * Returns non-zero if the segment is contiguous. */ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, - void const* src, size_t srcSize) + void const* src, size_t srcSize, + int forceNonContiguous) { BYTE const* const ip = (BYTE const*)src; U32 contiguous = 1; @@ -1017,7 +1179,7 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, assert(window->base != NULL); assert(window->dictBase != NULL); /* Check if blocks follow each other */ - if (src != window->nextSrc) { + if (src != window->nextSrc || forceNonContiguous) { /* not contiguous */ size_t const distanceFromBase = (size_t)(window->nextSrc - window->base); DEBUGLOG(5, "Non contiguous blocks, new segment starts at %u", window->dictLimit); @@ -1200,4 +1362,9 @@ size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSe * condition for correct operation : hashLog > 1 */ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat); +/** ZSTD_CCtx_trace() : + * Trace the end of a compression call. + */ +void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize); + #endif /* ZSTD_COMPRESS_H */ diff --git a/zstd_compress_literals.c b/zstd_compress_literals.c index 6dd1c14..52b0a80 100644 --- a/zstd_compress_literals.c +++ b/zstd_compress_literals.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -15,7 +15,7 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize) { - BYTE* const ostart = (BYTE* const)dst; + BYTE* const ostart = (BYTE*)dst; U32 const flSize = 1 + (srcSize>31) + (srcSize>4095); RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, ""); @@ -42,7 +42,7 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize) { - BYTE* const ostart = (BYTE* const)dst; + BYTE* const ostart = (BYTE*)dst; U32 const flSize = 1 + (srcSize>31) + (srcSize>4095); (void)dstCapacity; /* dstCapacity already guaranteed to be >=4, hence large enough */ @@ -73,7 +73,8 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, void* dst, size_t dstCapacity, const void* src, size_t srcSize, void* entropyWorkspace, size_t entropyWorkspaceSize, - const int bmi2) + const int bmi2, + unsigned suspectUncompressible) { size_t const minGain = ZSTD_minGain(srcSize, strategy); size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB); @@ -105,11 +106,11 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, HUF_compress1X_repeat( ostart+lhSize, dstCapacity-lhSize, src, srcSize, HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, - (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2) : + (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible) : HUF_compress4X_repeat( ostart+lhSize, dstCapacity-lhSize, src, srcSize, HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, - (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2); + (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible); if (repeat != HUF_repeat_none) { /* reused the existing table */ DEBUGLOG(5, "Reusing previous huffman table"); @@ -117,7 +118,7 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, } } - if ((cLitSize==0) | (cLitSize >= srcSize - minGain) | ERR_isError(cLitSize)) { + if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); } diff --git a/zstd_compress_literals.h b/zstd_compress_literals.h index 8b08705..9775fb9 100644 --- a/zstd_compress_literals.h +++ b/zstd_compress_literals.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -18,12 +18,14 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize); +/* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, ZSTD_hufCTables_t* nextHuf, ZSTD_strategy strategy, int disableLiteralCompression, void* dst, size_t dstCapacity, const void* src, size_t srcSize, void* entropyWorkspace, size_t entropyWorkspaceSize, - const int bmi2); + const int bmi2, + unsigned suspectUncompressible); #endif /* ZSTD_COMPRESS_LITERALS_H */ diff --git a/zstd_compress_sequences.c b/zstd_compress_sequences.c index be30c08..44e1728 100644 --- a/zstd_compress_sequences.c +++ b/zstd_compress_sequences.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -85,6 +85,8 @@ static size_t ZSTD_entropyCost(unsigned const* count, unsigned const max, size_t { unsigned cost = 0; unsigned s; + + assert(total > 0); for (s = 0; s <= max; ++s) { unsigned norm = (unsigned)((256 * count[s]) / total); if (count[s] != 0 && norm == 0) @@ -232,6 +234,11 @@ ZSTD_selectEncodingType( return set_compressed; } +typedef struct { + S16 norm[MaxSeq + 1]; + U32 wksp[FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(MaxSeq, MaxFSELog)]; +} ZSTD_BuildCTableWksp; + size_t ZSTD_buildCTable(void* dst, size_t dstCapacity, FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type, @@ -258,7 +265,7 @@ ZSTD_buildCTable(void* dst, size_t dstCapacity, FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, defaultNorm, defaultMax, defaultNormLog, entropyWorkspace, entropyWorkspaceSize), ""); /* note : could be pre-calculated */ return 0; case set_compressed: { - S16 norm[MaxSeq + 1]; + ZSTD_BuildCTableWksp* wksp = (ZSTD_BuildCTableWksp*)entropyWorkspace; size_t nbSeq_1 = nbSeq; const U32 tableLog = FSE_optimalTableLog(FSELog, nbSeq, max); if (count[codeTable[nbSeq-1]] > 1) { @@ -266,11 +273,13 @@ ZSTD_buildCTable(void* dst, size_t dstCapacity, nbSeq_1--; } assert(nbSeq_1 > 1); - assert(entropyWorkspaceSize >= FSE_BUILD_CTABLE_WORKSPACE_SIZE(MaxSeq, MaxFSELog)); - FORWARD_IF_ERROR(FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max, ZSTD_useLowProbCount(nbSeq_1)), ""); - { size_t const NCountSize = FSE_writeNCount(op, oend - op, norm, max, tableLog); /* overflow protected */ + assert(entropyWorkspaceSize >= sizeof(ZSTD_BuildCTableWksp)); + (void)entropyWorkspaceSize; + FORWARD_IF_ERROR(FSE_normalizeCount(wksp->norm, tableLog, count, nbSeq_1, max, ZSTD_useLowProbCount(nbSeq_1)), "FSE_normalizeCount failed"); + assert(oend >= op); + { size_t const NCountSize = FSE_writeNCount(op, (size_t)(oend - op), wksp->norm, max, tableLog); /* overflow protected */ FORWARD_IF_ERROR(NCountSize, "FSE_writeNCount failed"); - FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, norm, max, tableLog, entropyWorkspace, entropyWorkspaceSize), ""); + FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, wksp->norm, max, tableLog, wksp->wksp, sizeof(wksp->wksp)), "FSE_buildCTable_wksp failed"); return NCountSize; } } diff --git a/zstd_compress_sequences.h b/zstd_compress_sequences.h index b884e45..df84886 100644 --- a/zstd_compress_sequences.h +++ b/zstd_compress_sequences.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/zstd_compress_superblock.c b/zstd_compress_superblock.c index c227e4a..ec96db9 100644 --- a/zstd_compress_superblock.c +++ b/zstd_compress_superblock.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -15,288 +15,10 @@ #include "zstd_internal.h" /* ZSTD_getSequenceLength */ #include "hist.h" /* HIST_countFast_wksp */ -#include "zstd_compress_internal.h" +#include "zstd_compress_internal.h" /* ZSTD_[huf|fse|entropy]CTablesMetadata_t */ #include "zstd_compress_sequences.h" #include "zstd_compress_literals.h" -/*-************************************* -* Superblock entropy buffer structs -***************************************/ -/** ZSTD_hufCTablesMetadata_t : - * Stores Literals Block Type for a super-block in hType, and - * huffman tree description in hufDesBuffer. - * hufDesSize refers to the size of huffman tree description in bytes. - * This metadata is populated in ZSTD_buildSuperBlockEntropy_literal() */ -typedef struct { - symbolEncodingType_e hType; - BYTE hufDesBuffer[ZSTD_MAX_HUF_HEADER_SIZE]; - size_t hufDesSize; -} ZSTD_hufCTablesMetadata_t; - -/** ZSTD_fseCTablesMetadata_t : - * Stores symbol compression modes for a super-block in {ll, ol, ml}Type, and - * fse tables in fseTablesBuffer. - * fseTablesSize refers to the size of fse tables in bytes. - * This metadata is populated in ZSTD_buildSuperBlockEntropy_sequences() */ -typedef struct { - symbolEncodingType_e llType; - symbolEncodingType_e ofType; - symbolEncodingType_e mlType; - BYTE fseTablesBuffer[ZSTD_MAX_FSE_HEADERS_SIZE]; - size_t fseTablesSize; - size_t lastCountSize; /* This is to account for bug in 1.3.4. More detail in ZSTD_compressSubBlock_sequences() */ -} ZSTD_fseCTablesMetadata_t; - -typedef struct { - ZSTD_hufCTablesMetadata_t hufMetadata; - ZSTD_fseCTablesMetadata_t fseMetadata; -} ZSTD_entropyCTablesMetadata_t; - - -/** ZSTD_buildSuperBlockEntropy_literal() : - * Builds entropy for the super-block literals. - * Stores literals block type (raw, rle, compressed, repeat) and - * huffman description table to hufMetadata. - * @return : size of huffman description table or error code */ -static size_t ZSTD_buildSuperBlockEntropy_literal(void* const src, size_t srcSize, - const ZSTD_hufCTables_t* prevHuf, - ZSTD_hufCTables_t* nextHuf, - ZSTD_hufCTablesMetadata_t* hufMetadata, - const int disableLiteralsCompression, - void* workspace, size_t wkspSize) -{ - BYTE* const wkspStart = (BYTE*)workspace; - BYTE* const wkspEnd = wkspStart + wkspSize; - BYTE* const countWkspStart = wkspStart; - unsigned* const countWksp = (unsigned*)workspace; - const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); - BYTE* const nodeWksp = countWkspStart + countWkspSize; - const size_t nodeWkspSize = wkspEnd-nodeWksp; - unsigned maxSymbolValue = 255; - unsigned huffLog = HUF_TABLELOG_DEFAULT; - HUF_repeat repeat = prevHuf->repeatMode; - - DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy_literal (srcSize=%zu)", srcSize); - - /* Prepare nextEntropy assuming reusing the existing table */ - ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); - - if (disableLiteralsCompression) { - DEBUGLOG(5, "set_basic - disabled"); - hufMetadata->hType = set_basic; - return 0; - } - - /* small ? don't even attempt compression (speed opt) */ -# define COMPRESS_LITERALS_SIZE_MIN 63 - { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; - if (srcSize <= minLitSize) { - DEBUGLOG(5, "set_basic - too small"); - hufMetadata->hType = set_basic; - return 0; - } - } - - /* Scan input and build symbol stats */ - { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize); - FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); - if (largest == srcSize) { - DEBUGLOG(5, "set_rle"); - hufMetadata->hType = set_rle; - return 0; - } - if (largest <= (srcSize >> 7)+4) { - DEBUGLOG(5, "set_basic - no gain"); - hufMetadata->hType = set_basic; - return 0; - } - } - - /* Validate the previous Huffman table */ - if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { - repeat = HUF_repeat_none; - } - - /* Build Huffman Tree */ - ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); - huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); - { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, - maxSymbolValue, huffLog, - nodeWksp, nodeWkspSize); - FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); - huffLog = (U32)maxBits; - { /* Build and write the CTable */ - size_t const newCSize = HUF_estimateCompressedSize( - (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); - size_t const hSize = HUF_writeCTable( - hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), - (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog); - /* Check against repeating the previous CTable */ - if (repeat != HUF_repeat_none) { - size_t const oldCSize = HUF_estimateCompressedSize( - (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); - if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { - DEBUGLOG(5, "set_repeat - smaller"); - ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); - hufMetadata->hType = set_repeat; - return 0; - } - } - if (newCSize + hSize >= srcSize) { - DEBUGLOG(5, "set_basic - no gains"); - ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); - hufMetadata->hType = set_basic; - return 0; - } - DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); - hufMetadata->hType = set_compressed; - nextHuf->repeatMode = HUF_repeat_check; - return hSize; - } - } -} - -/** ZSTD_buildSuperBlockEntropy_sequences() : - * Builds entropy for the super-block sequences. - * Stores symbol compression modes and fse table to fseMetadata. - * @return : size of fse tables or error code */ -static size_t ZSTD_buildSuperBlockEntropy_sequences(seqStore_t* seqStorePtr, - const ZSTD_fseCTables_t* prevEntropy, - ZSTD_fseCTables_t* nextEntropy, - const ZSTD_CCtx_params* cctxParams, - ZSTD_fseCTablesMetadata_t* fseMetadata, - void* workspace, size_t wkspSize) -{ - BYTE* const wkspStart = (BYTE*)workspace; - BYTE* const wkspEnd = wkspStart + wkspSize; - BYTE* const countWkspStart = wkspStart; - unsigned* const countWksp = (unsigned*)workspace; - const size_t countWkspSize = (MaxSeq + 1) * sizeof(unsigned); - BYTE* const cTableWksp = countWkspStart + countWkspSize; - const size_t cTableWkspSize = wkspEnd-cTableWksp; - ZSTD_strategy const strategy = cctxParams->cParams.strategy; - FSE_CTable* CTable_LitLength = nextEntropy->litlengthCTable; - FSE_CTable* CTable_OffsetBits = nextEntropy->offcodeCTable; - FSE_CTable* CTable_MatchLength = nextEntropy->matchlengthCTable; - const BYTE* const ofCodeTable = seqStorePtr->ofCode; - const BYTE* const llCodeTable = seqStorePtr->llCode; - const BYTE* const mlCodeTable = seqStorePtr->mlCode; - size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; - BYTE* const ostart = fseMetadata->fseTablesBuffer; - BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); - BYTE* op = ostart; - - assert(cTableWkspSize >= (1 << MaxFSELog) * sizeof(FSE_FUNCTION_TYPE)); - DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy_sequences (nbSeq=%zu)", nbSeq); - ZSTD_memset(workspace, 0, wkspSize); - - fseMetadata->lastCountSize = 0; - /* convert length/distances into codes */ - ZSTD_seqToCodes(seqStorePtr); - /* build CTable for Literal Lengths */ - { U32 LLtype; - unsigned max = MaxLL; - size_t const mostFrequent = HIST_countFast_wksp(countWksp, &max, llCodeTable, nbSeq, workspace, wkspSize); /* can't fail */ - DEBUGLOG(5, "Building LL table"); - nextEntropy->litlength_repeatMode = prevEntropy->litlength_repeatMode; - LLtype = ZSTD_selectEncodingType(&nextEntropy->litlength_repeatMode, - countWksp, max, mostFrequent, nbSeq, - LLFSELog, prevEntropy->litlengthCTable, - LL_defaultNorm, LL_defaultNormLog, - ZSTD_defaultAllowed, strategy); - assert(set_basic < set_compressed && set_rle < set_compressed); - assert(!(LLtype < set_compressed && nextEntropy->litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ - { size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_LitLength, LLFSELog, (symbolEncodingType_e)LLtype, - countWksp, max, llCodeTable, nbSeq, LL_defaultNorm, LL_defaultNormLog, MaxLL, - prevEntropy->litlengthCTable, sizeof(prevEntropy->litlengthCTable), - cTableWksp, cTableWkspSize); - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for LitLens failed"); - if (LLtype == set_compressed) - fseMetadata->lastCountSize = countSize; - op += countSize; - fseMetadata->llType = (symbolEncodingType_e) LLtype; - } } - /* build CTable for Offsets */ - { U32 Offtype; - unsigned max = MaxOff; - size_t const mostFrequent = HIST_countFast_wksp(countWksp, &max, ofCodeTable, nbSeq, workspace, wkspSize); /* can't fail */ - /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */ - ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed; - DEBUGLOG(5, "Building OF table"); - nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode; - Offtype = ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMode, - countWksp, max, mostFrequent, nbSeq, - OffFSELog, prevEntropy->offcodeCTable, - OF_defaultNorm, OF_defaultNormLog, - defaultPolicy, strategy); - assert(!(Offtype < set_compressed && nextEntropy->offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */ - { size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)Offtype, - countWksp, max, ofCodeTable, nbSeq, OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, - prevEntropy->offcodeCTable, sizeof(prevEntropy->offcodeCTable), - cTableWksp, cTableWkspSize); - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for Offsets failed"); - if (Offtype == set_compressed) - fseMetadata->lastCountSize = countSize; - op += countSize; - fseMetadata->ofType = (symbolEncodingType_e) Offtype; - } } - /* build CTable for MatchLengths */ - { U32 MLtype; - unsigned max = MaxML; - size_t const mostFrequent = HIST_countFast_wksp(countWksp, &max, mlCodeTable, nbSeq, workspace, wkspSize); /* can't fail */ - DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend-op)); - nextEntropy->matchlength_repeatMode = prevEntropy->matchlength_repeatMode; - MLtype = ZSTD_selectEncodingType(&nextEntropy->matchlength_repeatMode, - countWksp, max, mostFrequent, nbSeq, - MLFSELog, prevEntropy->matchlengthCTable, - ML_defaultNorm, ML_defaultNormLog, - ZSTD_defaultAllowed, strategy); - assert(!(MLtype < set_compressed && nextEntropy->matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ - { size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_MatchLength, MLFSELog, (symbolEncodingType_e)MLtype, - countWksp, max, mlCodeTable, nbSeq, ML_defaultNorm, ML_defaultNormLog, MaxML, - prevEntropy->matchlengthCTable, sizeof(prevEntropy->matchlengthCTable), - cTableWksp, cTableWkspSize); - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for MatchLengths failed"); - if (MLtype == set_compressed) - fseMetadata->lastCountSize = countSize; - op += countSize; - fseMetadata->mlType = (symbolEncodingType_e) MLtype; - } } - assert((size_t) (op-ostart) <= sizeof(fseMetadata->fseTablesBuffer)); - return op-ostart; -} - - -/** ZSTD_buildSuperBlockEntropy() : - * Builds entropy for the super-block. - * @return : 0 on success or error code */ -static size_t -ZSTD_buildSuperBlockEntropy(seqStore_t* seqStorePtr, - const ZSTD_entropyCTables_t* prevEntropy, - ZSTD_entropyCTables_t* nextEntropy, - const ZSTD_CCtx_params* cctxParams, - ZSTD_entropyCTablesMetadata_t* entropyMetadata, - void* workspace, size_t wkspSize) -{ - size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart; - DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy"); - entropyMetadata->hufMetadata.hufDesSize = - ZSTD_buildSuperBlockEntropy_literal(seqStorePtr->litStart, litSize, - &prevEntropy->huf, &nextEntropy->huf, - &entropyMetadata->hufMetadata, - ZSTD_disableLiteralsCompression(cctxParams), - workspace, wkspSize); - FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildSuperBlockEntropy_literal failed"); - entropyMetadata->fseMetadata.fseTablesSize = - ZSTD_buildSuperBlockEntropy_sequences(seqStorePtr, - &prevEntropy->fse, &nextEntropy->fse, - cctxParams, - &entropyMetadata->fseMetadata, - workspace, wkspSize); - FORWARD_IF_ERROR(entropyMetadata->fseMetadata.fseTablesSize, "ZSTD_buildSuperBlockEntropy_sequences failed"); - return 0; -} - /** ZSTD_compressSubBlock_literal() : * Compresses literals section for a sub-block. * When we have to write the Huffman table we will sometimes choose a header @@ -304,7 +26,7 @@ ZSTD_buildSuperBlockEntropy(seqStore_t* seqStorePtr, * before we know the table size + compressed size, so we have a bound on the * table size. If we guessed incorrectly, we fall back to uncompressed literals. * - * We write the header when writeEntropy=1 and set entropyWrriten=1 when we succeeded + * We write the header when writeEntropy=1 and set entropyWritten=1 when we succeeded * in writing the header, otherwise it is set to 0. * * hufMetadata->hType has literals block type info. @@ -643,8 +365,9 @@ static size_t ZSTD_estimateSubBlockSize_sequences(const BYTE* ofCodeTable, void* workspace, size_t wkspSize, int writeEntropy) { - size_t sequencesSectionHeaderSize = 3; /* Use hard coded size of 3 bytes */ + size_t const sequencesSectionHeaderSize = 3; /* Use hard coded size of 3 bytes */ size_t cSeqSizeEstimate = 0; + if (nbSeq == 0) return sequencesSectionHeaderSize; cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, MaxOff, nbSeq, fseTables->offcodeCTable, NULL, OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, @@ -830,7 +553,7 @@ size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, unsigned lastBlock) { ZSTD_entropyCTablesMetadata_t entropyMetadata; - FORWARD_IF_ERROR(ZSTD_buildSuperBlockEntropy(&zc->seqStore, + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(&zc->seqStore, &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy, &zc->appliedParams, diff --git a/zstd_compress_superblock.h b/zstd_compress_superblock.h index 35d2072..ab549c2 100644 --- a/zstd_compress_superblock.h +++ b/zstd_compress_superblock.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/zstd_cwksp.h b/zstd_cwksp.h index ffb3a8d..e02829e 100644 --- a/zstd_cwksp.h +++ b/zstd_cwksp.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -35,6 +35,10 @@ extern "C" { #define ZSTD_CWKSP_ASAN_REDZONE_SIZE 128 #endif + +/* Set our tables and aligneds to align by 64 bytes */ +#define ZSTD_CWKSP_ALIGNMENT_BYTES 64 + /*-************************************* * Structures ***************************************/ @@ -117,10 +121,11 @@ typedef enum { * - Tables: these are any of several different datastructures (hash tables, * chain tables, binary trees) that all respect a common format: they are * uint32_t arrays, all of whose values are between 0 and (nextSrc - base). - * Their sizes depend on the cparams. + * Their sizes depend on the cparams. These tables are 64-byte aligned. * * - Aligned: these buffers are used for various purposes that require 4 byte - * alignment, but don't require any initialization before they're used. + * alignment, but don't require any initialization before they're used. These + * buffers are each aligned to 64 bytes. * * - Buffers: these buffers are used for various purposes that don't require * any alignment or initialization before they're used. This means they can @@ -133,8 +138,7 @@ typedef enum { * * 1. Objects * 2. Buffers - * 3. Aligned - * 4. Tables + * 3. Aligned/Tables * * Attempts to reserve objects of different types out of order will fail. */ @@ -187,6 +191,8 @@ MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t const align) { * Since tables aren't currently redzoned, you don't need to call through this * to figure out how much space you need for the matchState tables. Everything * else is though. + * + * Do not use for sizing aligned buffers. Instead, use ZSTD_cwksp_aligned_alloc_size(). */ MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) { if (size == 0) @@ -198,30 +204,110 @@ MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) { #endif } -MEM_STATIC void ZSTD_cwksp_internal_advance_phase( +/** + * Returns an adjusted alloc size that is the nearest larger multiple of 64 bytes. + * Used to determine the number of bytes required for a given "aligned". + */ +MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) { + return ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(size, ZSTD_CWKSP_ALIGNMENT_BYTES)); +} + +/** + * Returns the amount of additional space the cwksp must allocate + * for internal purposes (currently only alignment). + */ +MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { + /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes + * to align the beginning of tables section, as well as another n_2=[0, 63] bytes + * to align the beginning of the aligned secion. + * + * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and + * aligneds being sized in multiples of 64 bytes. + */ + size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES; + return slackSpace; +} + + +/** + * Return the number of additional bytes required to align a pointer to the given number of bytes. + * alignBytes must be a power of two. + */ +MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignBytes) { + size_t const alignBytesMask = alignBytes - 1; + size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask; + assert((alignBytes & alignBytesMask) == 0); + assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES); + return bytes; +} + +/** + * Internal function. Do not use directly. + * Reserves the given number of bytes within the aligned/buffer segment of the wksp, which + * counts from the end of the wksp. (as opposed to the object/table segment) + * + * Returns a pointer to the beginning of that space. + */ +MEM_STATIC void* ZSTD_cwksp_reserve_internal_buffer_space(ZSTD_cwksp* ws, size_t const bytes) { + void* const alloc = (BYTE*)ws->allocStart - bytes; + void* const bottom = ws->tableEnd; + DEBUGLOG(5, "cwksp: reserving %p %zd bytes, %zd bytes remaining", + alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes); + ZSTD_cwksp_assert_internal_consistency(ws); + assert(alloc >= bottom); + if (alloc < bottom) { + DEBUGLOG(4, "cwksp: alloc failed!"); + ws->allocFailed = 1; + return NULL; + } + if (alloc < ws->tableValidEnd) { + ws->tableValidEnd = alloc; + } + ws->allocStart = alloc; + return alloc; +} + +/** + * Moves the cwksp to the next phase, and does any necessary allocations. + * Returns a 0 on success, or zstd error + */ +MEM_STATIC size_t ZSTD_cwksp_internal_advance_phase( ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase) { assert(phase >= ws->phase); if (phase > ws->phase) { + /* Going from allocating objects to allocating buffers */ if (ws->phase < ZSTD_cwksp_alloc_buffers && phase >= ZSTD_cwksp_alloc_buffers) { ws->tableValidEnd = ws->objectEnd; } + + /* Going from allocating buffers to allocating aligneds/tables */ if (ws->phase < ZSTD_cwksp_alloc_aligned && phase >= ZSTD_cwksp_alloc_aligned) { - /* If unaligned allocations down from a too-large top have left us - * unaligned, we need to realign our alloc ptr. Technically, this - * can consume space that is unaccounted for in the neededSpace - * calculation. However, I believe this can only happen when the - * workspace is too large, and specifically when it is too large - * by a larger margin than the space that will be consumed. */ - /* TODO: cleaner, compiler warning friendly way to do this??? */ - ws->allocStart = (BYTE*)ws->allocStart - ((size_t)ws->allocStart & (sizeof(U32)-1)); - if (ws->allocStart < ws->tableValidEnd) { - ws->tableValidEnd = ws->allocStart; + { /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. */ + size_t const bytesToAlign = + ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES); + DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign); + ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */ + RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign), + memory_allocation, "aligned phase - alignment initial allocation failed!"); + } + { /* Align the start of the tables to 64 bytes. Use [0, 63] bytes */ + void* const alloc = ws->objectEnd; + size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES); + void* const end = (BYTE*)alloc + bytesToAlign; + DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign); + RETURN_ERROR_IF(end > ws->workspaceEnd, memory_allocation, + "table phase - alignment initial allocation failed!"); + ws->objectEnd = end; + ws->tableEnd = end; + ws->tableValidEnd = end; } } ws->phase = phase; + ZSTD_cwksp_assert_internal_consistency(ws); } + return 0; } /** @@ -237,38 +323,25 @@ MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) { MEM_STATIC void* ZSTD_cwksp_reserve_internal( ZSTD_cwksp* ws, size_t bytes, ZSTD_cwksp_alloc_phase_e phase) { void* alloc; - void* bottom = ws->tableEnd; - ZSTD_cwksp_internal_advance_phase(ws, phase); - alloc = (BYTE *)ws->allocStart - bytes; - - if (bytes == 0) + if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase)) || bytes == 0) { return NULL; + } #if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) /* over-reserve space */ - alloc = (BYTE *)alloc - 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE; + bytes += 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE; #endif - DEBUGLOG(5, "cwksp: reserving %p %zd bytes, %zd bytes remaining", - alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes); - ZSTD_cwksp_assert_internal_consistency(ws); - assert(alloc >= bottom); - if (alloc < bottom) { - DEBUGLOG(4, "cwksp: alloc failed!"); - ws->allocFailed = 1; - return NULL; - } - if (alloc < ws->tableValidEnd) { - ws->tableValidEnd = alloc; - } - ws->allocStart = alloc; + alloc = ZSTD_cwksp_reserve_internal_buffer_space(ws, bytes); #if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) /* Move alloc so there's ZSTD_CWKSP_ASAN_REDZONE_SIZE unused space on * either size. */ - alloc = (BYTE *)alloc + ZSTD_CWKSP_ASAN_REDZONE_SIZE; - if (ws->isStatic == ZSTD_cwksp_dynamic_alloc) { - __asan_unpoison_memory_region(alloc, bytes); + if (alloc) { + alloc = (BYTE *)alloc + ZSTD_CWKSP_ASAN_REDZONE_SIZE; + if (ws->isStatic == ZSTD_cwksp_dynamic_alloc) { + __asan_unpoison_memory_region(alloc, bytes); + } } #endif @@ -283,28 +356,36 @@ MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) { } /** - * Reserves and returns memory sized on and aligned on sizeof(unsigned). + * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). */ MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) { - assert((bytes & (sizeof(U32)-1)) == 0); - return ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, sizeof(U32)), ZSTD_cwksp_alloc_aligned); + void* ptr = ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES), + ZSTD_cwksp_alloc_aligned); + assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); + return ptr; } /** - * Aligned on sizeof(unsigned). These buffers have the special property that + * Aligned on 64 bytes. These buffers have the special property that * their values remain constrained, allowing us to re-use them without * memset()-ing them. */ MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) { const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned; - void* alloc = ws->tableEnd; - void* end = (BYTE *)alloc + bytes; - void* top = ws->allocStart; + void* alloc; + void* end; + void* top; + + if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { + return NULL; + } + alloc = ws->tableEnd; + end = (BYTE *)alloc + bytes; + top = ws->allocStart; DEBUGLOG(5, "cwksp: reserving %p table %zd bytes, %zd bytes remaining", alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes); assert((bytes & (sizeof(U32)-1)) == 0); - ZSTD_cwksp_internal_advance_phase(ws, phase); ZSTD_cwksp_assert_internal_consistency(ws); assert(end <= top); if (end > top) { @@ -320,6 +401,8 @@ MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) { } #endif + assert((bytes & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0); + assert(((size_t)alloc & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); return alloc; } @@ -503,7 +586,7 @@ MEM_STATIC void ZSTD_cwksp_free(ZSTD_cwksp* ws, ZSTD_customMem customMem) { /** * Moves the management of a workspace from one cwksp to another. The src cwksp - * is left in an invalid state (src must be re-init()'ed before its used again). + * is left in an invalid state (src must be re-init()'ed before it's used again). */ MEM_STATIC void ZSTD_cwksp_move(ZSTD_cwksp* dst, ZSTD_cwksp* src) { *dst = *src; @@ -527,6 +610,24 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { * Functions Checking Free Space ***************************************/ +/* ZSTD_alignmentSpaceWithinBounds() : + * Returns if the estimated space needed for a wksp is within an acceptable limit of the + * actual amount of space used. + */ +MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws, + size_t const estimatedSpace, int resizedWorkspace) { + if (resizedWorkspace) { + /* Resized/newly allocated wksp should have exact bounds */ + return ZSTD_cwksp_used(ws) == estimatedSpace; + } else { + /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes + * than estimatedSpace. See the comments in zstd_cwksp.h for details. + */ + return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63); + } +} + + MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws) { return (size_t)((BYTE*)ws->allocStart - (BYTE*)ws->tableEnd); } diff --git a/zstd_ddict.c b/zstd_ddict.c index b90f27b..f1ca935 100644 --- a/zstd_ddict.c +++ b/zstd_ddict.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/zstd_ddict.h b/zstd_ddict.h index b427f9a..a6bec10 100644 --- a/zstd_ddict.h +++ b/zstd_ddict.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/zstd_decompress.c b/zstd_decompress.c index be77d07..f93a83f 100644 --- a/zstd_decompress.c +++ b/zstd_decompress.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -62,6 +62,7 @@ #include "fse.h" #define HUF_STATIC_LINKING_ONLY #include "huf.h" +#include "xxhash.h" /* XXH64_reset, XXH64_update, XXH64_digest, XXH64 */ #include "zstd_internal.h" /* blockProperties_t */ #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ @@ -72,6 +73,144 @@ #endif + +/************************************* + * Multiple DDicts Hashset internals * + *************************************/ + +#define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4 +#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. + * Currently, that means a 0.75 load factor. + * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded + * the load factor of the ddict hash set. + */ + +#define DDICT_HASHSET_TABLE_BASE_SIZE 64 +#define DDICT_HASHSET_RESIZE_FACTOR 2 + +/* Hash function to determine starting position of dict insertion within the table + * Returns an index between [0, hashSet->ddictPtrTableSize] + */ +static size_t ZSTD_DDictHashSet_getIndex(const ZSTD_DDictHashSet* hashSet, U32 dictID) { + const U64 hash = XXH64(&dictID, sizeof(U32), 0); + /* DDict ptr table size is a multiple of 2, use size - 1 as mask to get index within [0, hashSet->ddictPtrTableSize) */ + return hash & (hashSet->ddictPtrTableSize - 1); +} + +/* Adds DDict to a hashset without resizing it. + * If inserting a DDict with a dictID that already exists in the set, replaces the one in the set. + * Returns 0 if successful, or a zstd error code if something went wrong. + */ +static size_t ZSTD_DDictHashSet_emplaceDDict(ZSTD_DDictHashSet* hashSet, const ZSTD_DDict* ddict) { + const U32 dictID = ZSTD_getDictID_fromDDict(ddict); + size_t idx = ZSTD_DDictHashSet_getIndex(hashSet, dictID); + const size_t idxRangeMask = hashSet->ddictPtrTableSize - 1; + RETURN_ERROR_IF(hashSet->ddictPtrCount == hashSet->ddictPtrTableSize, GENERIC, "Hash set is full!"); + DEBUGLOG(4, "Hashed index: for dictID: %u is %zu", dictID, idx); + while (hashSet->ddictPtrTable[idx] != NULL) { + /* Replace existing ddict if inserting ddict with same dictID */ + if (ZSTD_getDictID_fromDDict(hashSet->ddictPtrTable[idx]) == dictID) { + DEBUGLOG(4, "DictID already exists, replacing rather than adding"); + hashSet->ddictPtrTable[idx] = ddict; + return 0; + } + idx &= idxRangeMask; + idx++; + } + DEBUGLOG(4, "Final idx after probing for dictID %u is: %zu", dictID, idx); + hashSet->ddictPtrTable[idx] = ddict; + hashSet->ddictPtrCount++; + return 0; +} + +/* Expands hash table by factor of DDICT_HASHSET_RESIZE_FACTOR and + * rehashes all values, allocates new table, frees old table. + * Returns 0 on success, otherwise a zstd error code. + */ +static size_t ZSTD_DDictHashSet_expand(ZSTD_DDictHashSet* hashSet, ZSTD_customMem customMem) { + size_t newTableSize = hashSet->ddictPtrTableSize * DDICT_HASHSET_RESIZE_FACTOR; + const ZSTD_DDict** newTable = (const ZSTD_DDict**)ZSTD_customCalloc(sizeof(ZSTD_DDict*) * newTableSize, customMem); + const ZSTD_DDict** oldTable = hashSet->ddictPtrTable; + size_t oldTableSize = hashSet->ddictPtrTableSize; + size_t i; + + DEBUGLOG(4, "Expanding DDict hash table! Old size: %zu new size: %zu", oldTableSize, newTableSize); + RETURN_ERROR_IF(!newTable, memory_allocation, "Expanded hashset allocation failed!"); + hashSet->ddictPtrTable = newTable; + hashSet->ddictPtrTableSize = newTableSize; + hashSet->ddictPtrCount = 0; + for (i = 0; i < oldTableSize; ++i) { + if (oldTable[i] != NULL) { + FORWARD_IF_ERROR(ZSTD_DDictHashSet_emplaceDDict(hashSet, oldTable[i]), ""); + } + } + ZSTD_customFree((void*)oldTable, customMem); + DEBUGLOG(4, "Finished re-hash"); + return 0; +} + +/* Fetches a DDict with the given dictID + * Returns the ZSTD_DDict* with the requested dictID. If it doesn't exist, then returns NULL. + */ +static const ZSTD_DDict* ZSTD_DDictHashSet_getDDict(ZSTD_DDictHashSet* hashSet, U32 dictID) { + size_t idx = ZSTD_DDictHashSet_getIndex(hashSet, dictID); + const size_t idxRangeMask = hashSet->ddictPtrTableSize - 1; + DEBUGLOG(4, "Hashed index: for dictID: %u is %zu", dictID, idx); + for (;;) { + size_t currDictID = ZSTD_getDictID_fromDDict(hashSet->ddictPtrTable[idx]); + if (currDictID == dictID || currDictID == 0) { + /* currDictID == 0 implies a NULL ddict entry */ + break; + } else { + idx &= idxRangeMask; /* Goes to start of table when we reach the end */ + idx++; + } + } + DEBUGLOG(4, "Final idx after probing for dictID %u is: %zu", dictID, idx); + return hashSet->ddictPtrTable[idx]; +} + +/* Allocates space for and returns a ddict hash set + * The hash set's ZSTD_DDict* table has all values automatically set to NULL to begin with. + * Returns NULL if allocation failed. + */ +static ZSTD_DDictHashSet* ZSTD_createDDictHashSet(ZSTD_customMem customMem) { + ZSTD_DDictHashSet* ret = (ZSTD_DDictHashSet*)ZSTD_customMalloc(sizeof(ZSTD_DDictHashSet), customMem); + DEBUGLOG(4, "Allocating new hash set"); + ret->ddictPtrTable = (const ZSTD_DDict**)ZSTD_customCalloc(DDICT_HASHSET_TABLE_BASE_SIZE * sizeof(ZSTD_DDict*), customMem); + ret->ddictPtrTableSize = DDICT_HASHSET_TABLE_BASE_SIZE; + ret->ddictPtrCount = 0; + if (!ret || !ret->ddictPtrTable) { + return NULL; + } + return ret; +} + +/* Frees the table of ZSTD_DDict* within a hashset, then frees the hashset itself. + * Note: The ZSTD_DDict* within the table are NOT freed. + */ +static void ZSTD_freeDDictHashSet(ZSTD_DDictHashSet* hashSet, ZSTD_customMem customMem) { + DEBUGLOG(4, "Freeing ddict hash set"); + if (hashSet && hashSet->ddictPtrTable) { + ZSTD_customFree((void*)hashSet->ddictPtrTable, customMem); + } + if (hashSet) { + ZSTD_customFree(hashSet, customMem); + } +} + +/* Public function: Adds a DDict into the ZSTD_DDictHashSet, possibly triggering a resize of the hash set. + * Returns 0 on success, or a ZSTD error. + */ +static size_t ZSTD_DDictHashSet_addDDict(ZSTD_DDictHashSet* hashSet, const ZSTD_DDict* ddict, ZSTD_customMem customMem) { + DEBUGLOG(4, "Adding dict ID: %u to hashset with - Count: %zu Tablesize: %zu", ZSTD_getDictID_fromDDict(ddict), hashSet->ddictPtrCount, hashSet->ddictPtrTableSize); + if (hashSet->ddictPtrCount * DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT / hashSet->ddictPtrTableSize * DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT != 0) { + FORWARD_IF_ERROR(ZSTD_DDictHashSet_expand(hashSet, customMem), ""); + } + FORWARD_IF_ERROR(ZSTD_DDictHashSet_emplaceDDict(hashSet, ddict), ""); + return 0; +} + /*-************************************************************* * Context management ***************************************************************/ @@ -101,6 +240,7 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx) dctx->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT; dctx->outBufferMode = ZSTD_bm_buffered; dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum; + dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict; } static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) @@ -120,8 +260,8 @@ static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) dctx->noForwardProgress = 0; dctx->oversizedDuration = 0; dctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); + dctx->ddictSet = NULL; ZSTD_DCtx_resetParameters(dctx); - dctx->validateChecksum = 1; #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION dctx->dictContentEndForFuzzing = NULL; #endif @@ -178,6 +318,10 @@ size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx) if (dctx->legacyContext) ZSTD_freeLegacyStreamContext(dctx->legacyContext, dctx->previousLegacyVersion); #endif + if (dctx->ddictSet) { + ZSTD_freeDDictHashSet(dctx->ddictSet, cMem); + dctx->ddictSet = NULL; + } ZSTD_customFree(dctx, cMem); return 0; } @@ -190,6 +334,29 @@ void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx) ZSTD_memcpy(dstDCtx, srcDCtx, toCopy); /* no need to copy workspace */ } +/* Given a dctx with a digested frame params, re-selects the correct ZSTD_DDict based on + * the requested dict ID from the frame. If there exists a reference to the correct ZSTD_DDict, then + * accordingly sets the ddict to be used to decompress the frame. + * + * If no DDict is found, then no action is taken, and the ZSTD_DCtx::ddict remains as-is. + * + * ZSTD_d_refMultipleDDicts must be enabled for this function to be called. + */ +static void ZSTD_DCtx_selectFrameDDict(ZSTD_DCtx* dctx) { + assert(dctx->refMultipleDDicts && dctx->ddictSet); + DEBUGLOG(4, "Adjusting DDict based on requested dict ID from frame"); + if (dctx->ddict) { + const ZSTD_DDict* frameDDict = ZSTD_DDictHashSet_getDDict(dctx->ddictSet, dctx->fParams.dictID); + if (frameDDict) { + DEBUGLOG(4, "DDict found!"); + ZSTD_clearDict(dctx); + dctx->dictID = dctx->fParams.dictID; + dctx->ddict = frameDDict; + dctx->dictUses = ZSTD_use_indefinitely; + } + } +} + /*-************************************************************* * Frame header decoding @@ -213,6 +380,19 @@ unsigned ZSTD_isFrame(const void* buffer, size_t size) return 0; } +/*! ZSTD_isSkippableFrame() : + * Tells if the content of `buffer` starts with a valid Frame Identifier for a skippable frame. + * Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0. + */ +unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size) +{ + if (size < ZSTD_FRAMEIDSIZE) return 0; + { U32 const magic = MEM_readLE32(buffer); + if ((magic & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) return 1; + } + return 0; +} + /** ZSTD_frameHeaderSize_internal() : * srcSize must be large enough to reach header size fields. * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless. @@ -336,7 +516,6 @@ size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t src return ZSTD_getFrameHeader_advanced(zfhPtr, src, srcSize, ZSTD_f_zstd1); } - /** ZSTD_getFrameContentSize() : * compatible with legacy mode * @return : decompressed size of the single frame pointed to be `src` if known, otherwise @@ -377,6 +556,37 @@ static size_t readSkippableFrameSize(void const* src, size_t srcSize) } } +/*! ZSTD_readSkippableFrame() : + * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. + * + * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested + * in the magicVariant. + * + * Returns an error if destination buffer is not large enough, or if the frame is not skippable. + * + * @return : number of bytes written or a ZSTD error. + */ +ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, + const void* src, size_t srcSize) +{ + U32 const magicNumber = MEM_readLE32(src); + size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); + size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; + + /* check input validity */ + RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); + RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); + RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); + + /* deliver payload */ + if (skippableContentSize > 0 && dst != NULL) + ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); + if (magicVariant != NULL) + *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; + return skippableContentSize; +} + /** ZSTD_findDecompressedSize() : * compatible with legacy mode * `srcSize` must be the exact length of some number of ZSTD compressed and/or @@ -441,12 +651,19 @@ unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize) /** ZSTD_decodeFrameHeader() : * `headerSize` must be the size provided by ZSTD_frameHeaderSize(). + * If multiple DDict references are enabled, also will choose the correct DDict to use. * @return : 0 if success, or an error code, which can be tested using ZSTD_isError() */ static size_t ZSTD_decodeFrameHeader(ZSTD_DCtx* dctx, const void* src, size_t headerSize) { size_t const result = ZSTD_getFrameHeader_advanced(&(dctx->fParams), src, headerSize, dctx->format); if (ZSTD_isError(result)) return result; /* invalid header */ RETURN_ERROR_IF(result>0, srcSize_wrong, "headerSize too small"); + + /* Reference DDict requested by frame if dctx references multiple ddicts */ + if (dctx->refMultipleDDicts == ZSTD_rmd_refMultipleDDicts && dctx->ddictSet) { + ZSTD_DCtx_selectFrameDDict(dctx); + } + #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION /* Skip the dictID check in fuzzing mode, because it makes the search * harder. @@ -456,6 +673,7 @@ static size_t ZSTD_decodeFrameHeader(ZSTD_DCtx* dctx, const void* src, size_t he #endif dctx->validateChecksum = (dctx->fParams.checksumFlag && !dctx->forceIgnoreChecksum) ? 1 : 0; if (dctx->validateChecksum) XXH64_reset(&dctx->xxhState, 0); + dctx->processedCSize += headerSize; return 0; } @@ -578,7 +796,7 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) size_t ZSTD_insertBlock(ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize) { DEBUGLOG(5, "ZSTD_insertBlock: %u bytes", (unsigned)blockSize); - ZSTD_checkContinuity(dctx, blockStart); + ZSTD_checkContinuity(dctx, blockStart, blockSize); dctx->previousDstEnd = (const char*)blockStart + blockSize; return blockSize; } @@ -610,6 +828,32 @@ static size_t ZSTD_setRleBlock(void* dst, size_t dstCapacity, return regenSize; } +static void ZSTD_DCtx_trace_end(ZSTD_DCtx const* dctx, U64 uncompressedSize, U64 compressedSize, unsigned streaming) +{ +#if ZSTD_TRACE + if (dctx->traceCtx && ZSTD_trace_decompress_end != NULL) { + ZSTD_Trace trace; + ZSTD_memset(&trace, 0, sizeof(trace)); + trace.version = ZSTD_VERSION_NUMBER; + trace.streaming = streaming; + if (dctx->ddict) { + trace.dictionaryID = ZSTD_getDictID_fromDDict(dctx->ddict); + trace.dictionarySize = ZSTD_DDict_dictSize(dctx->ddict); + trace.dictionaryIsCold = dctx->ddictIsCold; + } + trace.uncompressedSize = (size_t)uncompressedSize; + trace.compressedSize = (size_t)compressedSize; + trace.dctx = dctx; + ZSTD_trace_decompress_end(dctx->traceCtx, &trace); + } +#else + (void)dctx; + (void)uncompressedSize; + (void)compressedSize; + (void)streaming; +#endif +} + /*! ZSTD_decompressFrame() : * @dctx must be properly initialized @@ -619,8 +863,9 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void** srcPtr, size_t *srcSizePtr) { - const BYTE* ip = (const BYTE*)(*srcPtr); - BYTE* const ostart = (BYTE* const)dst; + const BYTE* const istart = (const BYTE*)(*srcPtr); + const BYTE* ip = istart; + BYTE* const ostart = (BYTE*)dst; BYTE* const oend = dstCapacity != 0 ? ostart + dstCapacity : ostart; BYTE* op = ostart; size_t remainingSrcSize = *srcSizePtr; @@ -695,7 +940,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, ip += 4; remainingSrcSize -= 4; } - + ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0); /* Allow caller to get size read */ *srcPtr = ip; *srcSizePtr = remainingSrcSize; @@ -764,7 +1009,7 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, * use this in all cases but ddict */ FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDict(dctx, dict, dictSize), ""); } - ZSTD_checkContinuity(dctx, dst); + ZSTD_checkContinuity(dctx, dst, dstCapacity); { const size_t res = ZSTD_decompressFrame(dctx, dst, dstCapacity, &src, &srcSize); @@ -899,7 +1144,9 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c DEBUGLOG(5, "ZSTD_decompressContinue (srcSize:%u)", (unsigned)srcSize); /* Sanity check */ RETURN_ERROR_IF(srcSize != ZSTD_nextSrcSizeToDecompressWithInputSize(dctx, srcSize), srcSize_wrong, "not allowed"); - if (dstCapacity) ZSTD_checkContinuity(dctx, dst); + ZSTD_checkContinuity(dctx, dst, dstCapacity); + + dctx->processedCSize += srcSize; switch (dctx->stage) { @@ -1004,6 +1251,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c dctx->expected = 4; dctx->stage = ZSTDds_checkChecksum; } else { + ZSTD_DCtx_trace_end(dctx, dctx->decodedSize, dctx->processedCSize, /* streaming */ 1); dctx->expected = 0; /* ends here */ dctx->stage = ZSTDds_getFrameHeaderSize; } @@ -1023,6 +1271,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c DEBUGLOG(4, "ZSTD_decompressContinue: checksum : calculated %08X :: %08X read", (unsigned)h32, (unsigned)check32); RETURN_ERROR_IF(check32 != h32, checksum_wrong, ""); } + ZSTD_DCtx_trace_end(dctx, dctx->decodedSize, dctx->processedCSize, /* streaming */ 1); dctx->expected = 0; dctx->stage = ZSTDds_getFrameHeaderSize; return 0; @@ -1176,8 +1425,12 @@ static size_t ZSTD_decompress_insertDictionary(ZSTD_DCtx* dctx, const void* dict size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) { assert(dctx != NULL); +#if ZSTD_TRACE + dctx->traceCtx = (ZSTD_trace_decompress_begin != NULL) ? ZSTD_trace_decompress_begin(dctx) : 0; +#endif dctx->expected = ZSTD_startingInputLength(dctx->format); /* dctx->format must be properly set */ dctx->stage = ZSTDds_getFrameHeaderSize; + dctx->processedCSize = 0; dctx->decodedSize = 0; dctx->previousDstEnd = NULL; dctx->prefixStart = NULL; @@ -1391,6 +1644,16 @@ size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict) if (ddict) { dctx->ddict = ddict; dctx->dictUses = ZSTD_use_indefinitely; + if (dctx->refMultipleDDicts == ZSTD_rmd_refMultipleDDicts) { + if (dctx->ddictSet == NULL) { + dctx->ddictSet = ZSTD_createDDictHashSet(dctx->customMem); + if (!dctx->ddictSet) { + RETURN_ERROR(memory_allocation, "Failed to allocate memory for hash set!"); + } + } + assert(!dctx->staticSize); /* Impossible: ddictSet cannot have been allocated if static dctx */ + FORWARD_IF_ERROR(ZSTD_DDictHashSet_addDDict(dctx->ddictSet, ddict, dctx->customMem), ""); + } } return 0; } @@ -1436,6 +1699,10 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam) bounds.lowerBound = (int)ZSTD_d_validateChecksum; bounds.upperBound = (int)ZSTD_d_ignoreChecksum; return bounds; + case ZSTD_d_refMultipleDDicts: + bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict; + bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts; + return bounds; default:; } bounds.error = ERROR(parameter_unsupported); @@ -1473,6 +1740,9 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value case ZSTD_d_forceIgnoreChecksum: *value = (int)dctx->forceIgnoreChecksum; return 0; + case ZSTD_d_refMultipleDDicts: + *value = (int)dctx->refMultipleDDicts; + return 0; default:; } RETURN_ERROR(parameter_unsupported, ""); @@ -1499,6 +1769,13 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value CHECK_DBOUNDS(ZSTD_d_forceIgnoreChecksum, value); dctx->forceIgnoreChecksum = (ZSTD_forceIgnoreChecksum_e)value; return 0; + case ZSTD_d_refMultipleDDicts: + CHECK_DBOUNDS(ZSTD_d_refMultipleDDicts, value); + if (dctx->staticSize != 0) { + RETURN_ERROR(parameter_unsupported, "Static dctx does not support multiple DDicts!"); + } + dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value; + return 0; default:; } RETURN_ERROR(parameter_unsupported, ""); @@ -1680,6 +1957,9 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB } } #endif { size_t const hSize = ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format); + if (zds->refMultipleDDicts && zds->ddictSet) { + ZSTD_DCtx_selectFrameDDict(zds); + } DEBUGLOG(5, "header size : %u", (U32)hSize); if (ZSTD_isError(hSize)) { #if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) diff --git a/zstd_decompress_block.c b/zstd_decompress_block.c index 19ddb7a..8818dea 100644 --- a/zstd_decompress_block.c +++ b/zstd_decompress_block.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -577,7 +577,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, const void* src, size_t srcSize) { - const BYTE* const istart = (const BYTE* const)src; + const BYTE* const istart = (const BYTE*)src; const BYTE* const iend = istart + srcSize; const BYTE* ip = istart; int nbSeq; @@ -658,7 +658,6 @@ typedef struct { size_t litLength; size_t matchLength; size_t offset; - const BYTE* match; } seq_t; typedef struct { @@ -672,9 +671,6 @@ typedef struct { ZSTD_fseState stateOffb; ZSTD_fseState stateML; size_t prevOffset[ZSTD_REP_NUM]; - const BYTE* prefixStart; - const BYTE* dictEnd; - size_t pos; } seqState_t; /*! ZSTD_overlapCopy8() : @@ -909,20 +905,10 @@ ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqS } FORCE_INLINE_TEMPLATE void -ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD) +ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 nextState, U32 nbBits) { - ZSTD_seqSymbol const DInfo = DStatePtr->table[DStatePtr->state]; - U32 const nbBits = DInfo.nbBits; size_t const lowBits = BIT_readBits(bitD, nbBits); - DStatePtr->state = DInfo.nextState + lowBits; -} - -FORCE_INLINE_TEMPLATE void -ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, ZSTD_seqSymbol const DInfo) -{ - U32 const nbBits = DInfo.nbBits; - size_t const lowBits = BIT_readBits(bitD, nbBits); - DStatePtr->state = DInfo.nextState + lowBits; + DStatePtr->state = nextState + lowBits; } /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum @@ -936,116 +922,105 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, ZSTD : 0) typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e; -typedef enum { ZSTD_p_noPrefetch=0, ZSTD_p_prefetch=1 } ZSTD_prefetch_e; FORCE_INLINE_TEMPLATE seq_t -ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const ZSTD_prefetch_e prefetch) +ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) { seq_t seq; - ZSTD_seqSymbol const llDInfo = seqState->stateLL.table[seqState->stateLL.state]; - ZSTD_seqSymbol const mlDInfo = seqState->stateML.table[seqState->stateML.state]; - ZSTD_seqSymbol const ofDInfo = seqState->stateOffb.table[seqState->stateOffb.state]; - U32 const llBase = llDInfo.baseValue; - U32 const mlBase = mlDInfo.baseValue; - U32 const ofBase = ofDInfo.baseValue; - BYTE const llBits = llDInfo.nbAdditionalBits; - BYTE const mlBits = mlDInfo.nbAdditionalBits; - BYTE const ofBits = ofDInfo.nbAdditionalBits; - BYTE const totalBits = llBits+mlBits+ofBits; - - /* sequence */ - { size_t offset; - if (ofBits > 1) { - ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); - ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); - assert(ofBits <= MaxOff); - if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { - U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); - offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); - BIT_reloadDStream(&seqState->DStream); - if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); - assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ - } else { - offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); - } - seqState->prevOffset[2] = seqState->prevOffset[1]; - seqState->prevOffset[1] = seqState->prevOffset[0]; - seqState->prevOffset[0] = offset; - } else { - U32 const ll0 = (llBase == 0); - if (LIKELY((ofBits == 0))) { - if (LIKELY(!ll0)) - offset = seqState->prevOffset[0]; - else { - offset = seqState->prevOffset[1]; - seqState->prevOffset[1] = seqState->prevOffset[0]; - seqState->prevOffset[0] = offset; + const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state; + const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state; + const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state; + seq.matchLength = mlDInfo->baseValue; + seq.litLength = llDInfo->baseValue; + { U32 const ofBase = ofDInfo->baseValue; + BYTE const llBits = llDInfo->nbAdditionalBits; + BYTE const mlBits = mlDInfo->nbAdditionalBits; + BYTE const ofBits = ofDInfo->nbAdditionalBits; + BYTE const totalBits = llBits+mlBits+ofBits; + + U16 const llNext = llDInfo->nextState; + U16 const mlNext = mlDInfo->nextState; + U16 const ofNext = ofDInfo->nextState; + U32 const llnbBits = llDInfo->nbBits; + U32 const mlnbBits = mlDInfo->nbBits; + U32 const ofnbBits = ofDInfo->nbBits; + /* + * As gcc has better branch and block analyzers, sometimes it is only + * valuable to mark likelyness for clang, it gives around 3-4% of + * performance. + */ + + /* sequence */ + { size_t offset; + #if defined(__clang__) + if (LIKELY(ofBits > 1)) { + #else + if (ofBits > 1) { + #endif + ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); + ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); + assert(ofBits <= MaxOff); + if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { + U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); + offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); + BIT_reloadDStream(&seqState->DStream); + if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); + assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ + } else { + offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); } + seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset; } else { - offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); - { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; - temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ - if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; - seqState->prevOffset[1] = seqState->prevOffset[0]; - seqState->prevOffset[0] = offset = temp; - } } } - seq.offset = offset; - } - - seq.matchLength = mlBase; - if (mlBits > 0) - seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); - - if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) - BIT_reloadDStream(&seqState->DStream); - if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog))) - BIT_reloadDStream(&seqState->DStream); - /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ - ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); - - seq.litLength = llBase; - if (llBits > 0) - seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); - - if (MEM_32bits()) - BIT_reloadDStream(&seqState->DStream); - - DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", - (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); - - if (prefetch == ZSTD_p_prefetch) { - size_t const pos = seqState->pos + seq.litLength; - const BYTE* const matchBase = (seq.offset > pos) ? seqState->dictEnd : seqState->prefixStart; - seq.match = matchBase + pos - seq.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. - * No consequence though : no memory access will occur, offset is only used for prefetching */ - seqState->pos = pos + seq.matchLength; - } - - /* ANS state update - * gcc-9.0.0 does 2.5% worse with ZSTD_updateFseStateWithDInfo(). - * clang-9.2.0 does 7% worse with ZSTD_updateFseState(). - * Naturally it seems like ZSTD_updateFseStateWithDInfo() should be the - * better option, so it is the default for other compilers. But, if you - * measure that it is worse, please put up a pull request. - */ - { -#if defined(__GNUC__) && !defined(__clang__) - const int kUseUpdateFseState = 1; -#else - const int kUseUpdateFseState = 0; -#endif - if (kUseUpdateFseState) { - ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */ - ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ - ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */ - } else { - ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llDInfo); /* <= 9 bits */ - ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlDInfo); /* <= 9 bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ - ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofDInfo); /* <= 8 bits */ + U32 const ll0 = (llDInfo->baseValue == 0); + if (LIKELY((ofBits == 0))) { + offset = seqState->prevOffset[ll0]; + seqState->prevOffset[1] = seqState->prevOffset[!ll0]; + seqState->prevOffset[0] = offset; + } else { + offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); + { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; + temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ + if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset = temp; + } } } + seq.offset = offset; } + + #if defined(__clang__) + if (UNLIKELY(mlBits > 0)) + #else + if (mlBits > 0) + #endif + seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); + + if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) + BIT_reloadDStream(&seqState->DStream); + if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog))) + BIT_reloadDStream(&seqState->DStream); + /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ + ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); + + #if defined(__clang__) + if (UNLIKELY(llBits > 0)) + #else + if (llBits > 0) + #endif + seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); + + if (MEM_32bits()) + BIT_reloadDStream(&seqState->DStream); + + DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", + (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); + + ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ + ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ + ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ } return seq; @@ -1108,7 +1083,7 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx, { const BYTE* ip = (const BYTE*)seqStart; const BYTE* const iend = ip + seqSize; - BYTE* const ostart = (BYTE* const)dst; + BYTE* const ostart = (BYTE*)dst; BYTE* const oend = ostart + maxDstSize; BYTE* op = ostart; const BYTE* litPtr = dctx->litPtr; @@ -1122,7 +1097,6 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx, /* Regen sequences */ if (nbSeq) { seqState_t seqState; - size_t error = 0; dctx->fseEntropy = 1; { U32 i; for (i=0; ientropy.rep[i]; } RETURN_ERROR_IF( @@ -1156,13 +1130,14 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx, * If you see most cycles served out of the DSB you've hit the good case. * If it is pretty even then you may be in an okay case. * - * I've been able to reproduce this issue on the following CPUs: + * This issue has been reproduced on the following CPUs: * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9 * Use Instruments->Counters to get DSB/MITE cycles. * I never got performance swings, but I was able to * go from the good case of mostly DSB to half of the * cycles served from MITE. * - Coffeelake: Intel i9-9900k + * - Coffeelake: Intel i7-9700k * * I haven't been able to reproduce the instability or DSB misses on any * of the following CPUS: @@ -1175,33 +1150,35 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx, * * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4 */ + __asm__(".p2align 6"); + __asm__("nop"); __asm__(".p2align 5"); __asm__("nop"); +# if __GNUC__ >= 9 && __GNUC__ < 11 + /* better for gcc-9 and gcc-10, worse for clang and gcc-8, gcc-11 */ + __asm__(".p2align 3"); +# else __asm__(".p2align 4"); +# endif #endif for ( ; ; ) { - seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_noPrefetch); + seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd); #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) assert(!ZSTD_isError(oneSeqSize)); if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); - BIT_reloadDStream(&(seqState.DStream)); op += oneSeqSize; - /* gcc and clang both don't like early returns in this loop. - * Instead break and check for an error at the end of the loop. - */ - if (UNLIKELY(ZSTD_isError(oneSeqSize))) { - error = oneSeqSize; + if (UNLIKELY(!--nbSeq)) break; - } - if (UNLIKELY(!--nbSeq)) break; + BIT_reloadDStream(&(seqState.DStream)); } /* check if reached exact end */ DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq); - if (ZSTD_isError(error)) return error; RETURN_ERROR_IF(nbSeq, corruption_detected, ""); RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); /* save reps for next block */ @@ -1232,6 +1209,24 @@ ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT + +FORCE_INLINE_TEMPLATE size_t +ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, + const BYTE* const prefixStart, const BYTE* const dictEnd) +{ + prefetchPos += sequence.litLength; + { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart; + const BYTE* const match = matchBase + prefetchPos - sequence.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. + * No consequence though : memory address is only used for prefetching, not for dereferencing */ + PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */ + } + return prefetchPos + sequence.matchLength; +} + +/* This decoding function employs prefetching + * to reduce latency impact of cache misses. + * It's generally employed when block contains a significant portion of long-distance matches + * or when coupled with a "cold" dictionary */ FORCE_INLINE_TEMPLATE size_t ZSTD_decompressSequencesLong_body( ZSTD_DCtx* dctx, @@ -1242,7 +1237,7 @@ ZSTD_decompressSequencesLong_body( { const BYTE* ip = (const BYTE*)seqStart; const BYTE* const iend = ip + seqSize; - BYTE* const ostart = (BYTE* const)dst; + BYTE* const ostart = (BYTE*)dst; BYTE* const oend = ostart + maxDstSize; BYTE* op = ostart; const BYTE* litPtr = dctx->litPtr; @@ -1254,18 +1249,17 @@ ZSTD_decompressSequencesLong_body( /* Regen sequences */ if (nbSeq) { -#define STORED_SEQS 4 +#define STORED_SEQS 8 #define STORED_SEQS_MASK (STORED_SEQS-1) -#define ADVANCED_SEQS 4 +#define ADVANCED_SEQS STORED_SEQS seq_t sequences[STORED_SEQS]; int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS); seqState_t seqState; int seqNb; + size_t prefetchPos = (size_t)(op-prefixStart); /* track position relative to prefixStart */ + dctx->fseEntropy = 1; { int i; for (i=0; ientropy.rep[i]; } - seqState.prefixStart = prefixStart; - seqState.pos = (size_t)(op-prefixStart); - seqState.dictEnd = dictEnd; assert(dst != NULL); assert(iend >= ip); RETURN_ERROR_IF( @@ -1277,21 +1271,23 @@ ZSTD_decompressSequencesLong_body( /* prepare in advance */ for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNbpreviousDstEnd) { /* not contiguous */ + if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */ dctx->dictEnd = dctx->previousDstEnd; dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart)); dctx->prefixStart = dst; @@ -1533,7 +1529,7 @@ size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, const void* src, size_t srcSize) { size_t dSize; - ZSTD_checkContinuity(dctx, dst); + ZSTD_checkContinuity(dctx, dst, dstCapacity); dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0); dctx->previousDstEnd = (char*)dst + dSize; return dSize; diff --git a/zstd_decompress_block.h b/zstd_decompress_block.h index e6bd839..3bac9e9 100644 --- a/zstd_decompress_block.h +++ b/zstd_decompress_block.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/zstd_decompress_internal.h b/zstd_decompress_internal.h index b604b22..2215b09 100644 --- a/zstd_decompress_internal.h +++ b/zstd_decompress_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -99,6 +99,13 @@ typedef enum { ZSTD_use_once = 1 /* Use the dictionary once and set to ZSTD_dont_use */ } ZSTD_dictUses_e; +/* Hashset for storing references to multiple ZSTD_DDict within ZSTD_DCtx */ +typedef struct { + const ZSTD_DDict** ddictPtrTable; + size_t ddictPtrTableSize; + size_t ddictPtrCount; +} ZSTD_DDictHashSet; + struct ZSTD_DCtx_s { const ZSTD_seqSymbol* LLTptr; @@ -113,6 +120,7 @@ struct ZSTD_DCtx_s const void* dictEnd; /* end of previous segment */ size_t expected; ZSTD_frameHeader fParams; + U64 processedCSize; U64 decodedSize; blockType_e bType; /* used in ZSTD_decompressContinue(), store blockType between block header decoding and block decompression stages */ ZSTD_dStage stage; @@ -136,6 +144,8 @@ struct ZSTD_DCtx_s U32 dictID; int ddictIsCold; /* if == 1 : dictionary is "new" for working context, and presumed "cold" (not in cpu cache) */ ZSTD_dictUses_e dictUses; + ZSTD_DDictHashSet* ddictSet; /* Hash set for multiple ddicts */ + ZSTD_refMultipleDDicts_e refMultipleDDicts; /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */ /* streaming */ ZSTD_dStreamStage streamStage; @@ -166,6 +176,11 @@ struct ZSTD_DCtx_s void const* dictContentBeginForFuzzing; void const* dictContentEndForFuzzing; #endif + + /* Tracing */ +#if ZSTD_TRACE + ZSTD_TraceCtx traceCtx; +#endif }; /* typedef'd to ZSTD_DCtx within "zstd.h" */ @@ -184,7 +199,7 @@ size_t ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, * If yes, do nothing (continue on current segment). * If not, classify previous segment as "external dictionary", and start a new segment. * This function cannot fail. */ -void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst); +void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize); #endif /* ZSTD_DECOMPRESS_INTERNAL_H */ diff --git a/zstd_deps.h b/zstd_deps.h index 0fb8b78..1421134 100644 --- a/zstd_deps.h +++ b/zstd_deps.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Facebook, Inc. + * Copyright (c) Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/zstd_double_fast.c b/zstd_double_fast.c index ef12a52..b9e3c5e 100644 --- a/zstd_double_fast.c +++ b/zstd_double_fast.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -409,7 +409,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( hashSmall[hSmall] = hashLong[hLong] = curr; /* update hash table */ if ((((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex doesn't overlap dict + prefix */ - & (repIndex > dictStartIndex)) + & (offset_1 <= curr+1 - dictStartIndex)) /* note: we are searching at curr+1 */ && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; @@ -477,7 +477,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( U32 const repIndex2 = current2 - offset_2; const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) /* intentional overflow : ensure repIndex2 doesn't overlap dict + prefix */ - & (repIndex2 > dictStartIndex)) + & (offset_2 <= current2 - dictStartIndex)) && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; diff --git a/zstd_double_fast.h b/zstd_double_fast.h index 73a2002..b4ee5ef 100644 --- a/zstd_double_fast.h +++ b/zstd_double_fast.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/zstd_errors.h b/zstd_errors.h index 6d0d003..fa3686b 100644 --- a/zstd_errors.h +++ b/zstd_errors.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/zstd_fast.c b/zstd_fast.c index db7ce83..b6a1a7e 100644 --- a/zstd_fast.c +++ b/zstd_fast.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -242,7 +242,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( assert(endIndex - prefixStartIndex <= maxDistance); (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */ - /* ensure there will be no no underflow + /* ensure there will be no underflow * when translating a dict index into a local index */ assert(prefixStartIndex >= (U32)(dictEnd - dictBase)); @@ -416,9 +416,9 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( const BYTE* const repMatch = repBase + repIndex; hashTable[h] = curr; /* update hash table */ DEBUGLOG(7, "offset_1 = %u , curr = %u", offset_1, curr); - assert(offset_1 <= curr +1); /* check repIndex */ - if ( (((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > dictStartIndex)) + if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ + & (offset_1 <= curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */ && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4; @@ -453,7 +453,7 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( U32 const current2 = (U32)(ip-base); U32 const repIndex2 = current2 - offset_2; const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; - if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (repIndex2 > dictStartIndex)) /* intentional overflow */ + if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 <= curr - dictStartIndex)) /* intentional overflow */ && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; diff --git a/zstd_fast.h b/zstd_fast.h index 3e15cc8..9b56fb6 100644 --- a/zstd_fast.h +++ b/zstd_fast.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/zstd_internal.h b/zstd_internal.h index d94c592..3b56f1a 100644 --- a/zstd_internal.h +++ b/zstd_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -19,9 +19,6 @@ /*-************************************* * Dependencies ***************************************/ -#if !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON) -#include -#endif #include "compiler.h" #include "mem.h" #include "debug.h" /* assert, DEBUGLOG, RAWLOG, g_debuglevel */ @@ -36,6 +33,11 @@ # define XXH_STATIC_LINKING_ONLY /* XXH64_state_t */ #endif #include "xxhash.h" /* XXH_reset, update, digest */ +#ifndef ZSTD_NO_TRACE +# include "zstd_trace.h" +#else +# define ZSTD_TRACE 0 +#endif #if defined (__cplusplus) extern "C" { @@ -242,7 +244,7 @@ static UNUSED_ATTR const U32 OF_defaultNormLog = OF_DEFAULTNORMLOG; * Shared functions to include for inlining *********************************************/ static void ZSTD_copy8(void* dst, const void* src) { -#if !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON) +#if defined(ZSTD_ARCH_ARM_NEON) vst1_u8((uint8_t*)dst, vld1_u8((const uint8_t*)src)); #else ZSTD_memcpy(dst, src, 8); @@ -251,7 +253,7 @@ static void ZSTD_copy8(void* dst, const void* src) { #define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; } static void ZSTD_copy16(void* dst, const void* src) { -#if !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON) +#if defined(ZSTD_ARCH_ARM_NEON) vst1q_u8((uint8_t*)dst, vld1q_u8((const uint8_t*)src)); #else ZSTD_memcpy(dst, src, 16); @@ -347,11 +349,18 @@ typedef enum { * Private declarations *********************************************/ typedef struct seqDef_s { - U32 offset; /* Offset code of the sequence */ + U32 offset; /* offset == rawOffset + ZSTD_REP_NUM, or equivalently, offCode + 1 */ U16 litLength; U16 matchLength; } seqDef; +/* Controls whether seqStore has a single "long" litLength or matchLength. See seqStore_t. */ +typedef enum { + ZSTD_llt_none = 0, /* no longLengthType */ + ZSTD_llt_literalLength = 1, /* represents a long literal */ + ZSTD_llt_matchLength = 2 /* represents a long match */ +} ZSTD_longLengthType_e; + typedef struct { seqDef* sequencesStart; seqDef* sequences; /* ptr to end of sequences */ @@ -363,12 +372,12 @@ typedef struct { size_t maxNbSeq; size_t maxNbLit; - /* longLengthPos and longLengthID to allow us to represent either a single litLength or matchLength + /* longLengthPos and longLengthType to allow us to represent either a single litLength or matchLength * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment * the existing value of the litLength or matchLength by 0x10000. */ - U32 longLengthID; /* 0 == no longLength; 1 == Represent the long literal; 2 == Represent the long match; */ - U32 longLengthPos; /* Index of the sequence to apply long length modification to */ + ZSTD_longLengthType_e longLengthType; + U32 longLengthPos; /* Index of the sequence to apply long length modification to */ } seqStore_t; typedef struct { @@ -378,7 +387,7 @@ typedef struct { /** * Returns the ZSTD_sequenceLength for the given sequences. It handles the decoding of long sequences - * indicated by longLengthPos and longLengthID, and adds MINMATCH back to matchLength. + * indicated by longLengthPos and longLengthType, and adds MINMATCH back to matchLength. */ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore, seqDef const* seq) { @@ -386,10 +395,10 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore seqLen.litLength = seq->litLength; seqLen.matchLength = seq->matchLength + MINMATCH; if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) { - if (seqStore->longLengthID == 1) { + if (seqStore->longLengthType == ZSTD_llt_literalLength) { seqLen.litLength += 0xFFFF; } - if (seqStore->longLengthID == 2) { + if (seqStore->longLengthType == ZSTD_llt_matchLength) { seqLen.matchLength += 0xFFFF; } } diff --git a/zstd_lazy.c b/zstd_lazy.c index 49ec1b0..70319d5 100644 --- a/zstd_lazy.c +++ b/zstd_lazy.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -438,43 +438,9 @@ static size_t ZSTD_BtFindBestMatch_extDict_selectMLS ( } } - - -/* ********************************* -* Hash Chain +/*********************************** +* Dedicated dict search ***********************************/ -#define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)] - -/* Update chains up to ip (excluded) - Assumption : always within prefix (i.e. not within extDict) */ -FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( - ZSTD_matchState_t* ms, - const ZSTD_compressionParameters* const cParams, - const BYTE* ip, U32 const mls) -{ - U32* const hashTable = ms->hashTable; - const U32 hashLog = cParams->hashLog; - U32* const chainTable = ms->chainTable; - const U32 chainMask = (1 << cParams->chainLog) - 1; - const BYTE* const base = ms->window.base; - const U32 target = (U32)(ip - base); - U32 idx = ms->nextToUpdate; - - while(idx < target) { /* catch up */ - size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls); - NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; - hashTable[h] = idx; - idx++; - } - - ms->nextToUpdate = target; - return hashTable[ZSTD_hashPtr(ip, hashLog, mls)]; -} - -U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { - const ZSTD_compressionParameters* const cParams = &ms->cParams; - return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); -} void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip) { @@ -484,7 +450,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B U32* const chainTable = ms->chainTable; U32 const chainSize = 1 << ms->cParams.chainLog; U32 idx = ms->nextToUpdate; - U32 const minChain = chainSize < target ? target - chainSize : idx; + U32 const minChain = chainSize < target - idx ? target - chainSize : idx; U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG; U32 const cacheSize = bucketSize - 1; U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize; @@ -500,11 +466,10 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog); U32 const tmpChainSize = ((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog; U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx; - U32 hashIdx; assert(ms->cParams.chainLog <= 24); - assert(ms->cParams.hashLog >= ms->cParams.chainLog); + assert(ms->cParams.hashLog > ms->cParams.chainLog); assert(idx != 0); assert(tmpMinChain <= minChain); @@ -535,7 +500,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B if (count == cacheSize) { for (count = 0; count < chainLimit;) { if (i < minChain) { - if (!i || countBeyondMinChain++ > cacheSize) { + if (!i || ++countBeyondMinChain > cacheSize) { /* only allow pulling `cacheSize` number of entries * into the cache or chainTable beyond `minChain`, * to replace the entries pulled out of the @@ -591,6 +556,139 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B ms->nextToUpdate = target; } +/* Returns the longest match length found in the dedicated dict search structure. + * If none are longer than the argument ml, then ml will be returned. + */ +FORCE_INLINE_TEMPLATE +size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts, + const ZSTD_matchState_t* const dms, + const BYTE* const ip, const BYTE* const iLimit, + const BYTE* const prefixStart, const U32 curr, + const U32 dictLimit, const size_t ddsIdx) { + const U32 ddsLowestIndex = dms->window.dictLimit; + const BYTE* const ddsBase = dms->window.base; + const BYTE* const ddsEnd = dms->window.nextSrc; + const U32 ddsSize = (U32)(ddsEnd - ddsBase); + const U32 ddsIndexDelta = dictLimit - ddsSize; + const U32 bucketSize = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG); + const U32 bucketLimit = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1; + U32 ddsAttempt; + U32 matchIndex; + + for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) { + PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]); + } + + { + U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1]; + U32 const chainIndex = chainPackedPointer >> 8; + + PREFETCH_L1(&dms->chainTable[chainIndex]); + } + + for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) { + size_t currentMl=0; + const BYTE* match; + matchIndex = dms->hashTable[ddsIdx + ddsAttempt]; + match = ddsBase + matchIndex; + + if (!matchIndex) { + return ml; + } + + /* guaranteed by table construction */ + (void)ddsLowestIndex; + assert(matchIndex >= ddsLowestIndex); + assert(match+4 <= ddsEnd); + if (MEM_read32(match) == MEM_read32(ip)) { + /* assumption : matchIndex <= dictLimit-4 (by table construction) */ + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4; + } + + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; + *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE; + if (ip+currentMl == iLimit) { + /* best possible, avoids read overflow on next attempt */ + return ml; + } + } + } + + { + U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1]; + U32 chainIndex = chainPackedPointer >> 8; + U32 const chainLength = chainPackedPointer & 0xFF; + U32 const chainAttempts = nbAttempts - ddsAttempt; + U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts; + U32 chainAttempt; + + for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) { + PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]); + } + + for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) { + size_t currentMl=0; + const BYTE* match; + matchIndex = dms->chainTable[chainIndex]; + match = ddsBase + matchIndex; + + /* guaranteed by table construction */ + assert(matchIndex >= ddsLowestIndex); + assert(match+4 <= ddsEnd); + if (MEM_read32(match) == MEM_read32(ip)) { + /* assumption : matchIndex <= dictLimit-4 (by table construction) */ + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4; + } + + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; + *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE; + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } + } + return ml; +} + + +/* ********************************* +* Hash Chain +***********************************/ +#define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)] + +/* Update chains up to ip (excluded) + Assumption : always within prefix (i.e. not within extDict) */ +FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( + ZSTD_matchState_t* ms, + const ZSTD_compressionParameters* const cParams, + const BYTE* ip, U32 const mls) +{ + U32* const hashTable = ms->hashTable; + const U32 hashLog = cParams->hashLog; + U32* const chainTable = ms->chainTable; + const U32 chainMask = (1 << cParams->chainLog) - 1; + const BYTE* const base = ms->window.base; + const U32 target = (U32)(ip - base); + U32 idx = ms->nextToUpdate; + + while(idx < target) { /* catch up */ + size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls); + NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; + hashTable[h] = idx; + idx++; + } + + ms->nextToUpdate = target; + return hashTable[ZSTD_hashPtr(ip, hashLog, mls)]; +} + +U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); +} /* inlining is important to hardwire a hot branch (template emulation) */ FORCE_INLINE_TEMPLATE @@ -661,90 +759,8 @@ size_t ZSTD_HcFindBestMatch_generic ( } if (dictMode == ZSTD_dedicatedDictSearch) { - const U32 ddsLowestIndex = dms->window.dictLimit; - const BYTE* const ddsBase = dms->window.base; - const BYTE* const ddsEnd = dms->window.nextSrc; - const U32 ddsSize = (U32)(ddsEnd - ddsBase); - const U32 ddsIndexDelta = dictLimit - ddsSize; - const U32 bucketSize = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG); - const U32 bucketLimit = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1; - U32 ddsAttempt; - - for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) { - PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]); - } - - { - U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1]; - U32 const chainIndex = chainPackedPointer >> 8; - - PREFETCH_L1(&dms->chainTable[chainIndex]); - } - - for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) { - size_t currentMl=0; - const BYTE* match; - matchIndex = dms->hashTable[ddsIdx + ddsAttempt]; - match = ddsBase + matchIndex; - - if (!matchIndex) { - return ml; - } - - /* guaranteed by table construction */ - (void)ddsLowestIndex; - assert(matchIndex >= ddsLowestIndex); - assert(match+4 <= ddsEnd); - if (MEM_read32(match) == MEM_read32(ip)) { - /* assumption : matchIndex <= dictLimit-4 (by table construction) */ - currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4; - } - - /* save best solution */ - if (currentMl > ml) { - ml = currentMl; - *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE; - if (ip+currentMl == iLimit) { - /* best possible, avoids read overflow on next attempt */ - return ml; - } - } - } - - { - U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1]; - U32 chainIndex = chainPackedPointer >> 8; - U32 const chainLength = chainPackedPointer & 0xFF; - U32 const chainAttempts = nbAttempts - ddsAttempt; - U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts; - U32 chainAttempt; - - for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) { - PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]); - } - - for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) { - size_t currentMl=0; - const BYTE* match; - matchIndex = dms->chainTable[chainIndex]; - match = ddsBase + matchIndex; - - /* guaranteed by table construction */ - assert(matchIndex >= ddsLowestIndex); - assert(match+4 <= ddsEnd); - if (MEM_read32(match) == MEM_read32(ip)) { - /* assumption : matchIndex <= dictLimit-4 (by table construction) */ - currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4; - } - - /* save best solution */ - if (currentMl > ml) { - ml = currentMl; - *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE; - if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ - } - } - } + ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms, + ip, iLimit, prefixStart, curr, dictLimit, ddsIdx); } else if (dictMode == ZSTD_dictMatchState) { const U32* const dmsChainTable = dms->chainTable; const U32 dmsChainSize = (1 << dms->cParams.chainLog); @@ -845,11 +861,602 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS ( } } +/* ********************************* +* (SIMD) Row-based matchfinder +***********************************/ +/* Constants for row-based hash */ +#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */ +#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ +#define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) +#define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */ + +#define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1) + +typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 representing a mask of matches */ + +/* ZSTD_VecMask_next(): + * Starting from the LSB, returns the idx of the next non-zero bit. + * Basically counting the nb of trailing zeroes. + */ +static U32 ZSTD_VecMask_next(ZSTD_VecMask val) { + assert(val != 0); +# if defined(_MSC_VER) && defined(_WIN64) + unsigned long r=0; + return _BitScanForward64(&r, val) ? (U32)r : 0; /* _BitScanForward64 not defined outside of x86/64 */ +# elif (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4)))) + if (sizeof(size_t) == 4) { + U32 mostSignificantWord = (U32)(val >> 32); + U32 leastSignificantWord = (U32)val; + if (leastSignificantWord == 0) { + return 32 + (U32)__builtin_ctz(mostSignificantWord); + } else { + return (U32)__builtin_ctz(leastSignificantWord); + } + } else { + return (U32)__builtin_ctzll(val); + } +# else + /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count + * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer + */ + val = ~val & (val - 1ULL); /* Lowest set bit mask */ + val = val - ((val >> 1) & 0x5555555555555555); + val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL); + return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56); +# endif +} + +/* ZSTD_rotateRight_*(): + * Rotates a bitfield to the right by "count" bits. + * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts + */ +FORCE_INLINE_TEMPLATE +U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { + assert(count < 64); + count &= 0x3F; /* for fickle pattern recognition */ + return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); +} + +FORCE_INLINE_TEMPLATE +U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { + assert(count < 32); + count &= 0x1F; /* for fickle pattern recognition */ + return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); +} + +FORCE_INLINE_TEMPLATE +U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { + assert(count < 16); + count &= 0x0F; /* for fickle pattern recognition */ + return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); +} + +/* ZSTD_row_nextIndex(): + * Returns the next index to insert at within a tagTable row, and updates the "head" + * value to reflect the update. Essentially cycles backwards from [0, {entries per row}) + */ +FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) { + U32 const next = (*tagRow - 1) & rowMask; + *tagRow = (BYTE)next; + return next; +} + +/* ZSTD_isAligned(): + * Checks that a pointer is aligned to "align" bytes which must be a power of 2. + */ +MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) { + assert((align & (align - 1)) == 0); + return (((size_t)ptr) & (align - 1)) == 0; +} + +/* ZSTD_row_prefetch(): + * Performs prefetching for the hashTable and tagTable at a given row. + */ +FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) { + PREFETCH_L1(hashTable + relRow); + if (rowLog >= 5) { + PREFETCH_L1(hashTable + relRow + 16); + /* Note: prefetching more of the hash table does not appear to be beneficial for 128-entry rows */ + } + PREFETCH_L1(tagTable + relRow); + if (rowLog == 6) { + PREFETCH_L1(tagTable + relRow + 32); + } + assert(rowLog == 4 || rowLog == 5 || rowLog == 6); + assert(ZSTD_isAligned(hashTable + relRow, 64)); /* prefetched hash row always 64-byte aligned */ + assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on correct multiple of bytes (32,64,128) */ +} + +/* ZSTD_row_fillHashCache(): + * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries, + * but not beyond iLimit. + */ +static void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base, + U32 const rowLog, U32 const mls, + U32 idx, const BYTE* const iLimit) +{ + U32 const* const hashTable = ms->hashTable; + U16 const* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1); + U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch); + + for (; idx < lim; ++idx) { + U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash; + } + + DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1], + ms->hashCache[2], ms->hashCache[3], ms->hashCache[4], + ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]); +} + +/* ZSTD_row_nextCachedHash(): + * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at + * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable. + */ +FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, + U16 const* tagTable, BYTE const* base, + U32 idx, U32 const hashLog, + U32 const rowLog, U32 const mls) +{ + U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK]; + cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash; + return hash; + } +} + +/* ZSTD_row_update_internal(): + * Inserts the byte at ip into the appropriate position in the hash table. + * Determines the relative row, and the position within the {16, 32} entry row to insert at. + */ +FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip, + U32 const mls, U32 const rowLog, + U32 const rowMask, U32 const useCache) +{ + U32* const hashTable = ms->hashTable; + U16* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + const BYTE* const base = ms->window.base; + const U32 target = (U32)(ip - base); + U32 idx = ms->nextToUpdate; + + DEBUGLOG(6, "ZSTD_row_update_internal(): nextToUpdate=%u, current=%u", idx, target); + for (; idx < target; ++idx) { + U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, idx, hashLog, rowLog, mls) + : (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32* const row = hashTable + relRow; + BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte. + Explicit cast allows us to get exact desired position within each row */ + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); + + assert(hash == ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls)); + ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK; + row[pos] = idx; + } + ms->nextToUpdate = target; +} + +/* ZSTD_row_update(): + * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary + * processing. + */ +void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) { + const U32 rowLog = MAX(MIN(ms->cParams.searchLog, 6), 4); + const U32 rowMask = (1u << rowLog) - 1; + const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */); + + DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog); + ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */); +} + +/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches + * the hash at the nth position in a row of the tagTable. + * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield + * to match up with the actual layout of the entries within the hashTable */ +FORCE_INLINE_TEMPLATE +ZSTD_VecMask ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) { + const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET; + assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); + assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); +#if defined(ZSTD_ARCH_X86_SSE2) + if (rowEntries == 16) { + const __m128i chunk = _mm_loadu_si128((const __m128i*)(const void*)src); + const __m128i equalMask = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(tag)); + const U16 matches = (U16)_mm_movemask_epi8(equalMask); + return ZSTD_rotateRight_U16(matches, head); + } else if (rowEntries == 32) { + const __m128i chunk0 = _mm_loadu_si128((const __m128i*)(const void*)&src[0]); + const __m128i chunk1 = _mm_loadu_si128((const __m128i*)(const void*)&src[16]); + const __m128i equalMask0 = _mm_cmpeq_epi8(chunk0, _mm_set1_epi8(tag)); + const __m128i equalMask1 = _mm_cmpeq_epi8(chunk1, _mm_set1_epi8(tag)); + const U32 lo = (U32)_mm_movemask_epi8(equalMask0); + const U32 hi = (U32)_mm_movemask_epi8(equalMask1); + return ZSTD_rotateRight_U32((hi << 16) | lo, head); + } else { /* rowEntries == 64 */ + const __m128i chunk0 = _mm_loadu_si128((const __m128i*)(const void*)&src[0]); + const __m128i chunk1 = _mm_loadu_si128((const __m128i*)(const void*)&src[16]); + const __m128i chunk2 = _mm_loadu_si128((const __m128i*)(const void*)&src[32]); + const __m128i chunk3 = _mm_loadu_si128((const __m128i*)(const void*)&src[48]); + const __m128i comparisonMask = _mm_set1_epi8(tag); + const __m128i equalMask0 = _mm_cmpeq_epi8(chunk0, comparisonMask); + const __m128i equalMask1 = _mm_cmpeq_epi8(chunk1, comparisonMask); + const __m128i equalMask2 = _mm_cmpeq_epi8(chunk2, comparisonMask); + const __m128i equalMask3 = _mm_cmpeq_epi8(chunk3, comparisonMask); + const U64 mask0 = (U64)_mm_movemask_epi8(equalMask0); + const U64 mask1 = (U64)_mm_movemask_epi8(equalMask1); + const U64 mask2 = (U64)_mm_movemask_epi8(equalMask2); + const U64 mask3 = (U64)_mm_movemask_epi8(equalMask3); + return ZSTD_rotateRight_U64((mask3 << 48) | (mask2 << 32) | (mask1 << 16) | mask0, head); + } +#else +# if defined(ZSTD_ARCH_ARM_NEON) + if (MEM_isLittleEndian()) { + if (rowEntries == 16) { + const uint8x16_t chunk = vld1q_u8(src); + const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); + const uint16x8_t t0 = vshlq_n_u16(equalMask, 7); + const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14)); + const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14)); + const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28)); + const U16 hi = (U16)vgetq_lane_u8(t3, 8); + const U16 lo = (U16)vgetq_lane_u8(t3, 0); + return ZSTD_rotateRight_U16((hi << 8) | lo, head); + } else if (rowEntries == 32) { + const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src); + const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); + const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); + const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag)); + const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag)); + const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0)); + const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1)); + const uint8x8_t t0 = vreinterpret_u8_s8(pack0); + const uint8x8_t t1 = vreinterpret_u8_s8(pack1); + const uint8x8_t t2 = vsri_n_u8(t1, t0, 2); + const uint8x8x2_t t3 = vuzp_u8(t2, t0); + const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4); + const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0); + return ZSTD_rotateRight_U32(matches, head); + } else { /* rowEntries == 64 */ + const uint8x16x4_t chunk = vld4q_u8(src); + const uint8x16_t dup = vdupq_n_u8(tag); + const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); + const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); + const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); + const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); + + const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); + const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); + const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); + const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); + const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); + const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); + return ZSTD_rotateRight_U64(matches, head); + } + } +# endif + { /* SWAR */ + const size_t chunkSize = sizeof(size_t); + const size_t shiftAmount = ((chunkSize * 8) - chunkSize); + const size_t xFF = ~((size_t)0); + const size_t x01 = xFF / 0xFF; + const size_t x80 = x01 << 7; + const size_t splatChar = tag * x01; + ZSTD_VecMask matches = 0; + int i = rowEntries - chunkSize; + assert((sizeof(size_t) == 4) || (sizeof(size_t) == 8)); + if (MEM_isLittleEndian()) { /* runtime check so have two loops */ + const size_t extractMagic = (xFF / 0x7F) >> chunkSize; + do { + size_t chunk = MEM_readST(&src[i]); + chunk ^= splatChar; + chunk = (((chunk | x80) - x01) | chunk) & x80; + matches <<= chunkSize; + matches |= (chunk * extractMagic) >> shiftAmount; + i -= chunkSize; + } while (i >= 0); + } else { /* big endian: reverse bits during extraction */ + const size_t msb = xFF ^ (xFF >> 1); + const size_t extractMagic = (msb / 0x1FF) | msb; + do { + size_t chunk = MEM_readST(&src[i]); + chunk ^= splatChar; + chunk = (((chunk | x80) - x01) | chunk) & x80; + matches <<= chunkSize; + matches |= ((chunk >> 7) * extractMagic) >> shiftAmount; + i -= chunkSize; + } while (i >= 0); + } + matches = ~matches; + if (rowEntries == 16) { + return ZSTD_rotateRight_U16((U16)matches, head); + } else if (rowEntries == 32) { + return ZSTD_rotateRight_U32((U32)matches, head); + } else { + return ZSTD_rotateRight_U64((U64)matches, head); + } + } +#endif +} + +/* The high-level approach of the SIMD row based match finder is as follows: + * - Figure out where to insert the new entry: + * - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag" + * - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines + * which row to insert into. + * - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can + * be considered as a circular buffer with a "head" index that resides in the tagTable. + * - Also insert the "tag" into the equivalent row and position in the tagTable. + * - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry. + * The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively, + * for alignment/performance reasons, leaving some bytes unused. + * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and + * generate a bitfield that we can cycle through to check the collisions in the hash table. + * - Pick the longest match. + */ +FORCE_INLINE_TEMPLATE +size_t ZSTD_RowFindBestMatch_generic ( + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, + size_t* offsetPtr, + const U32 mls, const ZSTD_dictMode_e dictMode, + const U32 rowLog) +{ + U32* const hashTable = ms->hashTable; + U16* const tagTable = ms->tagTable; + U32* const hashCache = ms->hashCache; + const U32 hashLog = ms->rowHashLog; + const ZSTD_compressionParameters* const cParams = &ms->cParams; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const prefixStart = base + dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const U32 curr = (U32)(ip-base); + const U32 maxDistance = 1U << cParams->windowLog; + const U32 lowestValid = ms->window.lowLimit; + const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid; + const U32 isDictionary = (ms->loadedDictEnd != 0); + const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance; + const U32 rowEntries = (1U << rowLog); + const U32 rowMask = rowEntries - 1; + const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */ + U32 nbAttempts = 1U << cappedSearchLog; + size_t ml=4-1; + + /* DMS/DDS variables that may be referenced laster */ + const ZSTD_matchState_t* const dms = ms->dictMatchState; + + /* Initialize the following variables to satisfy static analyzer */ + size_t ddsIdx = 0; + U32 ddsExtraAttempts = 0; /* cctx hash tables are limited in searches, but allow extra searches into DDS */ + U32 dmsTag = 0; + U32* dmsRow = NULL; + BYTE* dmsTagRow = NULL; + + if (dictMode == ZSTD_dedicatedDictSearch) { + const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG; + { /* Prefetch DDS hashtable entry */ + ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG; + PREFETCH_L1(&dms->hashTable[ddsIdx]); + } + ddsExtraAttempts = cParams->searchLog > rowLog ? 1U << (cParams->searchLog - rowLog) : 0; + } + + if (dictMode == ZSTD_dictMatchState) { + /* Prefetch DMS rows */ + U32* const dmsHashTable = dms->hashTable; + U16* const dmsTagTable = dms->tagTable; + U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK; + dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow); + dmsRow = dmsHashTable + dmsRelRow; + ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog); + } + + /* Update the hashTable and tagTable up to (but not including) ip */ + ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); + { /* Get the hash for ip, compute the appropriate row */ + U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK; + U32* const row = hashTable + relRow; + BYTE* tagRow = (BYTE*)(tagTable + relRow); + U32 const head = *tagRow & rowMask; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; + ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries); + + /* Cycle through the matches and prefetch */ + for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { + U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; + U32 const matchIndex = row[matchPos]; + assert(numMatches < rowEntries); + if (matchIndex < lowLimit) + break; + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + PREFETCH_L1(base + matchIndex); + } else { + PREFETCH_L1(dictBase + matchIndex); + } + matchBuffer[numMatches++] = matchIndex; + } + + /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop + in ZSTD_row_update_internal() at the next search. */ + { + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); + tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag; + row[pos] = ms->nextToUpdate++; + } + + /* Return the longest match */ + for (; currMatch < numMatches; ++currMatch) { + U32 const matchIndex = matchBuffer[currMatch]; + size_t currentMl=0; + assert(matchIndex < curr); + assert(matchIndex >= lowLimit); + + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ + if (match[ml] == ip[ml]) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; + assert(match+4 <= dictEnd); + if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */ + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4; + } + + /* Save best solution */ + if (currentMl > ml) { + ml = currentMl; + *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE; + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } + } + + if (dictMode == ZSTD_dedicatedDictSearch) { + ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms, + ip, iLimit, prefixStart, curr, dictLimit, ddsIdx); + } else if (dictMode == ZSTD_dictMatchState) { + /* TODO: Measure and potentially add prefetching to DMS */ + const U32 dmsLowestIndex = dms->window.dictLimit; + const BYTE* const dmsBase = dms->window.base; + const BYTE* const dmsEnd = dms->window.nextSrc; + const U32 dmsSize = (U32)(dmsEnd - dmsBase); + const U32 dmsIndexDelta = dictLimit - dmsSize; + + { U32 const head = *dmsTagRow & rowMask; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; + ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries); + + for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { + U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; + U32 const matchIndex = dmsRow[matchPos]; + if (matchIndex < dmsLowestIndex) + break; + PREFETCH_L1(dmsBase + matchIndex); + matchBuffer[numMatches++] = matchIndex; + } + + /* Return the longest match */ + for (; currMatch < numMatches; ++currMatch) { + U32 const matchIndex = matchBuffer[currMatch]; + size_t currentMl=0; + assert(matchIndex >= dmsLowestIndex); + assert(matchIndex < curr); + + { const BYTE* const match = dmsBase + matchIndex; + assert(match+4 <= dmsEnd); + if (MEM_read32(match) == MEM_read32(ip)) + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4; + } + + if (currentMl > ml) { + ml = currentMl; + *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE; + if (ip+currentMl == iLimit) break; + } + } + } + } + return ml; +} + +/* Inlining is important to hardwire a hot branch (template emulation) */ +FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectMLS ( + ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* const iLimit, + const ZSTD_dictMode_e dictMode, size_t* offsetPtr, const U32 rowLog) +{ + switch(ms->cParams.minMatch) + { + default : /* includes case 3 */ + case 4 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, dictMode, rowLog); + case 5 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, dictMode, rowLog); + case 7 : + case 6 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, dictMode, rowLog); + } +} + +FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectRowLog ( + ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* const iLimit, + size_t* offsetPtr) +{ + const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 6); + switch(cappedSearchLog) + { + default : + case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 4); + case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 5); + case 6 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 6); + } +} + +FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dictMatchState_selectRowLog( + ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* const iLimit, + size_t* offsetPtr) +{ + const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 6); + switch(cappedSearchLog) + { + default : + case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 4); + case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 5); + case 6 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 6); + } +} + +FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog( + ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* const iLimit, + size_t* offsetPtr) +{ + const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 6); + switch(cappedSearchLog) + { + default : + case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 4); + case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 5); + case 6 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 6); + } +} + +FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_extDict_selectRowLog ( + ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* const iLimit, + size_t* offsetPtr) +{ + const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 6); + switch(cappedSearchLog) + { + default : + case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 4); + case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 5); + case 6 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 6); + } +} + /* ******************************* * Common parser - lazy strategy *********************************/ -typedef enum { search_hashChain, search_binaryTree } searchMethod_e; +typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e; FORCE_INLINE_TEMPLATE size_t ZSTD_compressBlock_lazy_generic( @@ -863,10 +1470,11 @@ ZSTD_compressBlock_lazy_generic( const BYTE* ip = istart; const BYTE* anchor = istart; const BYTE* const iend = istart + srcSize; - const BYTE* const ilimit = iend - 8; + const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8; const BYTE* const base = ms->window.base; const U32 prefixLowestIndex = ms->window.dictLimit; const BYTE* const prefixLowest = base + prefixLowestIndex; + const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5; typedef size_t (*searchMax_f)( ZSTD_matchState_t* ms, @@ -878,26 +1486,30 @@ ZSTD_compressBlock_lazy_generic( * that should never occur (extDict modes go to the other implementation * below and there is no DDSS for binary tree search yet). */ - const searchMax_f searchFuncs[4][2] = { + const searchMax_f searchFuncs[4][3] = { { ZSTD_HcFindBestMatch_selectMLS, - ZSTD_BtFindBestMatch_selectMLS + ZSTD_BtFindBestMatch_selectMLS, + ZSTD_RowFindBestMatch_selectRowLog }, { + NULL, NULL, NULL }, { ZSTD_HcFindBestMatch_dictMatchState_selectMLS, - ZSTD_BtFindBestMatch_dictMatchState_selectMLS + ZSTD_BtFindBestMatch_dictMatchState_selectMLS, + ZSTD_RowFindBestMatch_dictMatchState_selectRowLog }, { ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS, - NULL + NULL, + ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog } }; - searchMax_f const searchMax = searchFuncs[dictMode][searchMethod == search_binaryTree]; + searchMax_f const searchMax = searchFuncs[dictMode][(int)searchMethod]; U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0; const int isDMS = dictMode == ZSTD_dictMatchState; @@ -915,9 +1527,7 @@ ZSTD_compressBlock_lazy_generic( assert(searchMax != NULL); - DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u)", (U32)dictMode); - - /* init */ + DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod); ip += (dictAndPrefixLength == 0); if (dictMode == ZSTD_noDict) { U32 const curr = (U32)(ip - base); @@ -933,6 +1543,12 @@ ZSTD_compressBlock_lazy_generic( assert(offset_2 <= dictAndPrefixLength); } + if (searchMethod == search_rowHash) { + ZSTD_row_fillHashCache(ms, base, rowLog, + MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), + ms->nextToUpdate, ilimit); + } + /* Match Loop */ #if defined(__GNUC__) && defined(__x86_64__) /* I've measured random a 5% speed loss on levels 5 & 6 (greedy) when the @@ -1198,6 +1814,70 @@ size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); } +/* Row-based matchfinder */ +size_t ZSTD_compressBlock_lazy2_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_lazy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_greedy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_lazy2_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_lazy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_greedy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); +} + + +size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch); +} + +size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); +} + +size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); +} FORCE_INLINE_TEMPLATE size_t ZSTD_compressBlock_lazy_extDict_generic( @@ -1210,7 +1890,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const BYTE* ip = istart; const BYTE* anchor = istart; const BYTE* const iend = istart + srcSize; - const BYTE* const ilimit = iend - 8; + const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8; const BYTE* const base = ms->window.base; const U32 dictLimit = ms->window.dictLimit; const BYTE* const prefixStart = base + dictLimit; @@ -1218,18 +1898,28 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const BYTE* const dictEnd = dictBase + dictLimit; const BYTE* const dictStart = dictBase + ms->window.lowLimit; const U32 windowLog = ms->cParams.windowLog; + const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5; typedef size_t (*searchMax_f)( ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr); - searchMax_f searchMax = searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_extDict_selectMLS : ZSTD_HcFindBestMatch_extDict_selectMLS; - + const searchMax_f searchFuncs[3] = { + ZSTD_HcFindBestMatch_extDict_selectMLS, + ZSTD_BtFindBestMatch_extDict_selectMLS, + ZSTD_RowFindBestMatch_extDict_selectRowLog + }; + searchMax_f searchMax = searchFuncs[(int)searchMethod]; U32 offset_1 = rep[0], offset_2 = rep[1]; - DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic"); + DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod); /* init */ ip += (ip == prefixStart); + if (searchMethod == search_rowHash) { + ZSTD_row_fillHashCache(ms, base, rowLog, + MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), + ms->nextToUpdate, ilimit); + } /* Match Loop */ #if defined(__GNUC__) && defined(__x86_64__) @@ -1249,7 +1939,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const U32 repIndex = (U32)(curr+1 - offset_1); const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; const BYTE* const repMatch = repBase + repIndex; - if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */ + if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */ + & (offset_1 <= curr+1 - windowLow) ) /* note: we are searching at curr+1 */ if (MEM_read32(ip+1) == MEM_read32(repMatch)) { /* repcode detected we should take it */ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; @@ -1280,7 +1971,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const U32 repIndex = (U32)(curr - offset_1); const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; const BYTE* const repMatch = repBase + repIndex; - if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */ + if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ + & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ if (MEM_read32(ip) == MEM_read32(repMatch)) { /* repcode detected */ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; @@ -1311,7 +2003,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const U32 repIndex = (U32)(curr - offset_1); const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; const BYTE* const repMatch = repBase + repIndex; - if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */ + if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ + & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ if (MEM_read32(ip) == MEM_read32(repMatch)) { /* repcode detected */ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; @@ -1357,7 +2050,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const U32 repIndex = repCurrent - offset_2; const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; const BYTE* const repMatch = repBase + repIndex; - if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */ + if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ + & (offset_2 <= repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ if (MEM_read32(ip) == MEM_read32(repMatch)) { /* repcode detected we should take it */ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; @@ -1410,3 +2104,26 @@ size_t ZSTD_compressBlock_btlazy2_extDict( { return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); } + +size_t ZSTD_compressBlock_greedy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); +} + +size_t ZSTD_compressBlock_lazy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); +} + +size_t ZSTD_compressBlock_lazy2_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); +} diff --git a/zstd_lazy.h b/zstd_lazy.h index d0214d5..150f7b3 100644 --- a/zstd_lazy.h +++ b/zstd_lazy.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -26,6 +26,7 @@ extern "C" { #define ZSTD_LAZY_DDSS_BUCKET_LOG 2 U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip); +void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip); void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip); @@ -43,6 +44,15 @@ size_t ZSTD_compressBlock_lazy( size_t ZSTD_compressBlock_greedy( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); size_t ZSTD_compressBlock_btlazy2_dictMatchState( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -56,6 +66,15 @@ size_t ZSTD_compressBlock_lazy_dictMatchState( size_t ZSTD_compressBlock_greedy_dictMatchState( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -66,6 +85,15 @@ size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); size_t ZSTD_compressBlock_greedy_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -76,9 +104,19 @@ size_t ZSTD_compressBlock_lazy_extDict( size_t ZSTD_compressBlock_lazy2_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); size_t ZSTD_compressBlock_btlazy2_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); + #if defined (__cplusplus) } diff --git a/zstd_ldm.c b/zstd_ldm.c index 3776dd1..9714438 100644 --- a/zstd_ldm.c +++ b/zstd_ldm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -11,13 +11,126 @@ #include "zstd_ldm.h" #include "debug.h" +#include "xxhash.h" #include "zstd_fast.h" /* ZSTD_fillHashTable() */ #include "zstd_double_fast.h" /* ZSTD_fillDoubleHashTable() */ +#include "zstd_ldm_geartab.h" #define LDM_BUCKET_SIZE_LOG 3 #define LDM_MIN_MATCH_LENGTH 64 #define LDM_HASH_RLOG 7 -#define LDM_HASH_CHAR_OFFSET 10 + +typedef struct { + U64 rolling; + U64 stopMask; +} ldmRollingHashState_t; + +/** ZSTD_ldm_gear_init(): + * + * Initializes the rolling hash state such that it will honor the + * settings in params. */ +static void ZSTD_ldm_gear_init(ldmRollingHashState_t* state, ldmParams_t const* params) +{ + unsigned maxBitsInMask = MIN(params->minMatchLength, 64); + unsigned hashRateLog = params->hashRateLog; + + state->rolling = ~(U32)0; + + /* The choice of the splitting criterion is subject to two conditions: + * 1. it has to trigger on average every 2^(hashRateLog) bytes; + * 2. ideally, it has to depend on a window of minMatchLength bytes. + * + * In the gear hash algorithm, bit n depends on the last n bytes; + * so in order to obtain a good quality splitting criterion it is + * preferable to use bits with high weight. + * + * To match condition 1 we use a mask with hashRateLog bits set + * and, because of the previous remark, we make sure these bits + * have the highest possible weight while still respecting + * condition 2. + */ + if (hashRateLog > 0 && hashRateLog <= maxBitsInMask) { + state->stopMask = (((U64)1 << hashRateLog) - 1) << (maxBitsInMask - hashRateLog); + } else { + /* In this degenerate case we simply honor the hash rate. */ + state->stopMask = ((U64)1 << hashRateLog) - 1; + } +} + +/** ZSTD_ldm_gear_reset() + * Feeds [data, data + minMatchLength) into the hash without registering any + * splits. This effectively resets the hash state. This is used when skipping + * over data, either at the beginning of a block, or skipping sections. + */ +static void ZSTD_ldm_gear_reset(ldmRollingHashState_t* state, + BYTE const* data, size_t minMatchLength) +{ + U64 hash = state->rolling; + size_t n = 0; + +#define GEAR_ITER_ONCE() do { \ + hash = (hash << 1) + ZSTD_ldm_gearTab[data[n] & 0xff]; \ + n += 1; \ + } while (0) + while (n + 3 < minMatchLength) { + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + } + while (n < minMatchLength) { + GEAR_ITER_ONCE(); + } +#undef GEAR_ITER_ONCE +} + +/** ZSTD_ldm_gear_feed(): + * + * Registers in the splits array all the split points found in the first + * size bytes following the data pointer. This function terminates when + * either all the data has been processed or LDM_BATCH_SIZE splits are + * present in the splits array. + * + * Precondition: The splits array must not be full. + * Returns: The number of bytes processed. */ +static size_t ZSTD_ldm_gear_feed(ldmRollingHashState_t* state, + BYTE const* data, size_t size, + size_t* splits, unsigned* numSplits) +{ + size_t n; + U64 hash, mask; + + hash = state->rolling; + mask = state->stopMask; + n = 0; + +#define GEAR_ITER_ONCE() do { \ + hash = (hash << 1) + ZSTD_ldm_gearTab[data[n] & 0xff]; \ + n += 1; \ + if (UNLIKELY((hash & mask) == 0)) { \ + splits[*numSplits] = n; \ + *numSplits += 1; \ + if (*numSplits == LDM_BATCH_SIZE) \ + goto done; \ + } \ + } while (0) + + while (n + 3 < size) { + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + } + while (n < size) { + GEAR_ITER_ONCE(); + } + +#undef GEAR_ITER_ONCE + +done: + state->rolling = hash; + return n; +} void ZSTD_ldm_adjustParameters(ldmParams_t* params, ZSTD_compressionParameters const* cParams) @@ -54,41 +167,6 @@ size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize) return params.enableLdm ? (maxChunkSize / params.minMatchLength) : 0; } -/** ZSTD_ldm_getSmallHash() : - * numBits should be <= 32 - * If numBits==0, returns 0. - * @return : the most significant numBits of value. */ -static U32 ZSTD_ldm_getSmallHash(U64 value, U32 numBits) -{ - assert(numBits <= 32); - return numBits == 0 ? 0 : (U32)(value >> (64 - numBits)); -} - -/** ZSTD_ldm_getChecksum() : - * numBitsToDiscard should be <= 32 - * @return : the next most significant 32 bits after numBitsToDiscard */ -static U32 ZSTD_ldm_getChecksum(U64 hash, U32 numBitsToDiscard) -{ - assert(numBitsToDiscard <= 32); - return (hash >> (64 - 32 - numBitsToDiscard)) & 0xFFFFFFFF; -} - -/** ZSTD_ldm_getTag() ; - * Given the hash, returns the most significant numTagBits bits - * after (32 + hbits) bits. - * - * If there are not enough bits remaining, return the last - * numTagBits bits. */ -static U32 ZSTD_ldm_getTag(U64 hash, U32 hbits, U32 numTagBits) -{ - assert(numTagBits < 32 && hbits <= 32); - if (32 - hbits < numTagBits) { - return hash & (((U32)1 << numTagBits) - 1); - } else { - return (hash >> (32 - hbits - numTagBits)) & (((U32)1 << numTagBits) - 1); - } -} - /** ZSTD_ldm_getBucket() : * Returns a pointer to the start of the bucket associated with hash. */ static ldmEntry_t* ZSTD_ldm_getBucket( @@ -103,38 +181,12 @@ static void ZSTD_ldm_insertEntry(ldmState_t* ldmState, size_t const hash, const ldmEntry_t entry, ldmParams_t const ldmParams) { - BYTE* const bucketOffsets = ldmState->bucketOffsets; - *(ZSTD_ldm_getBucket(ldmState, hash, ldmParams) + bucketOffsets[hash]) = entry; - bucketOffsets[hash]++; - bucketOffsets[hash] &= ((U32)1 << ldmParams.bucketSizeLog) - 1; -} + BYTE* const pOffset = ldmState->bucketOffsets + hash; + unsigned const offset = *pOffset; + + *(ZSTD_ldm_getBucket(ldmState, hash, ldmParams) + offset) = entry; + *pOffset = (BYTE)((offset + 1) & ((1u << ldmParams.bucketSizeLog) - 1)); -/** ZSTD_ldm_makeEntryAndInsertByTag() : - * - * Gets the small hash, checksum, and tag from the rollingHash. - * - * If the tag matches (1 << ldmParams.hashRateLog)-1, then - * creates an ldmEntry from the offset, and inserts it into the hash table. - * - * hBits is the length of the small hash, which is the most significant hBits - * of rollingHash. The checksum is the next 32 most significant bits, followed - * by ldmParams.hashRateLog bits that make up the tag. */ -static void ZSTD_ldm_makeEntryAndInsertByTag(ldmState_t* ldmState, - U64 const rollingHash, - U32 const hBits, - U32 const offset, - ldmParams_t const ldmParams) -{ - U32 const tag = ZSTD_ldm_getTag(rollingHash, hBits, ldmParams.hashRateLog); - U32 const tagMask = ((U32)1 << ldmParams.hashRateLog) - 1; - if (tag == tagMask) { - U32 const hash = ZSTD_ldm_getSmallHash(rollingHash, hBits); - U32 const checksum = ZSTD_ldm_getChecksum(rollingHash, hBits); - ldmEntry_t entry; - entry.offset = offset; - entry.checksum = checksum; - ZSTD_ldm_insertEntry(ldmState, hash, entry, ldmParams); - } } /** ZSTD_ldm_countBackwardsMatch() : @@ -212,43 +264,42 @@ static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms, return 0; } -/** ZSTD_ldm_fillLdmHashTable() : - * - * Fills hashTable from (lastHashed + 1) to iend (non-inclusive). - * lastHash is the rolling hash that corresponds to lastHashed. - * - * Returns the rolling hash corresponding to position iend-1. */ -static U64 ZSTD_ldm_fillLdmHashTable(ldmState_t* state, - U64 lastHash, const BYTE* lastHashed, - const BYTE* iend, const BYTE* base, - U32 hBits, ldmParams_t const ldmParams) -{ - U64 rollingHash = lastHash; - const BYTE* cur = lastHashed + 1; - - while (cur < iend) { - rollingHash = ZSTD_rollingHash_rotate(rollingHash, cur[-1], - cur[ldmParams.minMatchLength-1], - state->hashPower); - ZSTD_ldm_makeEntryAndInsertByTag(state, - rollingHash, hBits, - (U32)(cur - base), ldmParams); - ++cur; - } - return rollingHash; -} - void ZSTD_ldm_fillHashTable( - ldmState_t* state, const BYTE* ip, + ldmState_t* ldmState, const BYTE* ip, const BYTE* iend, ldmParams_t const* params) { + U32 const minMatchLength = params->minMatchLength; + U32 const hBits = params->hashLog - params->bucketSizeLog; + BYTE const* const base = ldmState->window.base; + BYTE const* const istart = ip; + ldmRollingHashState_t hashState; + size_t* const splits = ldmState->splitIndices; + unsigned numSplits; + DEBUGLOG(5, "ZSTD_ldm_fillHashTable"); - if ((size_t)(iend - ip) >= params->minMatchLength) { - U64 startingHash = ZSTD_rollingHash_compute(ip, params->minMatchLength); - ZSTD_ldm_fillLdmHashTable( - state, startingHash, ip, iend - params->minMatchLength, state->window.base, - params->hashLog - params->bucketSizeLog, - *params); + + ZSTD_ldm_gear_init(&hashState, params); + while (ip < iend) { + size_t hashed; + unsigned n; + + numSplits = 0; + hashed = ZSTD_ldm_gear_feed(&hashState, ip, iend - ip, splits, &numSplits); + + for (n = 0; n < numSplits; n++) { + if (ip + splits[n] >= istart + minMatchLength) { + BYTE const* const split = ip + splits[n] - minMatchLength; + U64 const xxhash = XXH64(split, minMatchLength, 0); + U32 const hash = (U32)(xxhash & (((U32)1 << hBits) - 1)); + ldmEntry_t entry; + + entry.offset = (U32)(split - base); + entry.checksum = (U32)(xxhash >> 32); + ZSTD_ldm_insertEntry(ldmState, hash, entry, *params); + } + } + + ip += hashed; } } @@ -274,11 +325,8 @@ static size_t ZSTD_ldm_generateSequences_internal( /* LDM parameters */ int const extDict = ZSTD_window_hasExtDict(ldmState->window); U32 const minMatchLength = params->minMatchLength; - U64 const hashPower = ldmState->hashPower; + U32 const entsPerBucket = 1U << params->bucketSizeLog; U32 const hBits = params->hashLog - params->bucketSizeLog; - U32 const ldmBucketSize = 1U << params->bucketSizeLog; - U32 const hashRateLog = params->hashRateLog; - U32 const ldmTagMask = (1U << params->hashRateLog) - 1; /* Prefix and extDict parameters */ U32 const dictLimit = ldmState->window.dictLimit; U32 const lowestIndex = extDict ? ldmState->window.lowLimit : dictLimit; @@ -290,45 +338,69 @@ static size_t ZSTD_ldm_generateSequences_internal( /* Input bounds */ BYTE const* const istart = (BYTE const*)src; BYTE const* const iend = istart + srcSize; - BYTE const* const ilimit = iend - MAX(minMatchLength, HASH_READ_SIZE); + BYTE const* const ilimit = iend - HASH_READ_SIZE; /* Input positions */ BYTE const* anchor = istart; BYTE const* ip = istart; - /* Rolling hash */ - BYTE const* lastHashed = NULL; - U64 rollingHash = 0; - - while (ip <= ilimit) { - size_t mLength; - U32 const curr = (U32)(ip - base); - size_t forwardMatchLength = 0, backwardMatchLength = 0; - ldmEntry_t* bestEntry = NULL; - if (ip != istart) { - rollingHash = ZSTD_rollingHash_rotate(rollingHash, lastHashed[0], - lastHashed[minMatchLength], - hashPower); - } else { - rollingHash = ZSTD_rollingHash_compute(ip, minMatchLength); + /* Rolling hash state */ + ldmRollingHashState_t hashState; + /* Arrays for staged-processing */ + size_t* const splits = ldmState->splitIndices; + ldmMatchCandidate_t* const candidates = ldmState->matchCandidates; + unsigned numSplits; + + if (srcSize < minMatchLength) + return iend - anchor; + + /* Initialize the rolling hash state with the first minMatchLength bytes */ + ZSTD_ldm_gear_init(&hashState, params); + ZSTD_ldm_gear_reset(&hashState, ip, minMatchLength); + ip += minMatchLength; + + while (ip < ilimit) { + size_t hashed; + unsigned n; + + numSplits = 0; + hashed = ZSTD_ldm_gear_feed(&hashState, ip, ilimit - ip, + splits, &numSplits); + + for (n = 0; n < numSplits; n++) { + BYTE const* const split = ip + splits[n] - minMatchLength; + U64 const xxhash = XXH64(split, minMatchLength, 0); + U32 const hash = (U32)(xxhash & (((U32)1 << hBits) - 1)); + + candidates[n].split = split; + candidates[n].hash = hash; + candidates[n].checksum = (U32)(xxhash >> 32); + candidates[n].bucket = ZSTD_ldm_getBucket(ldmState, hash, *params); + PREFETCH_L1(candidates[n].bucket); } - lastHashed = ip; - /* Do not insert and do not look for a match */ - if (ZSTD_ldm_getTag(rollingHash, hBits, hashRateLog) != ldmTagMask) { - ip++; - continue; - } + for (n = 0; n < numSplits; n++) { + size_t forwardMatchLength = 0, backwardMatchLength = 0, + bestMatchLength = 0, mLength; + U32 offset; + BYTE const* const split = candidates[n].split; + U32 const checksum = candidates[n].checksum; + U32 const hash = candidates[n].hash; + ldmEntry_t* const bucket = candidates[n].bucket; + ldmEntry_t const* cur; + ldmEntry_t const* bestEntry = NULL; + ldmEntry_t newEntry; + + newEntry.offset = (U32)(split - base); + newEntry.checksum = checksum; + + /* If a split point would generate a sequence overlapping with + * the previous one, we merely register it in the hash table and + * move on */ + if (split < anchor) { + ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params); + continue; + } - /* Get the best entry and compute the match lengths */ - { - ldmEntry_t* const bucket = - ZSTD_ldm_getBucket(ldmState, - ZSTD_ldm_getSmallHash(rollingHash, hBits), - *params); - ldmEntry_t* cur; - size_t bestMatchLength = 0; - U32 const checksum = ZSTD_ldm_getChecksum(rollingHash, hBits); - - for (cur = bucket; cur < bucket + ldmBucketSize; ++cur) { + for (cur = bucket; cur < bucket + entsPerBucket; cur++) { size_t curForwardMatchLength, curBackwardMatchLength, curTotalMatchLength; if (cur->checksum != checksum || cur->offset <= lowestIndex) { @@ -342,31 +414,23 @@ static size_t ZSTD_ldm_generateSequences_internal( cur->offset < dictLimit ? dictEnd : iend; BYTE const* const lowMatchPtr = cur->offset < dictLimit ? dictStart : lowPrefixPtr; - - curForwardMatchLength = ZSTD_count_2segments( - ip, pMatch, iend, - matchEnd, lowPrefixPtr); + curForwardMatchLength = + ZSTD_count_2segments(split, pMatch, iend, matchEnd, lowPrefixPtr); if (curForwardMatchLength < minMatchLength) { continue; } - curBackwardMatchLength = - ZSTD_ldm_countBackwardsMatch_2segments(ip, anchor, - pMatch, lowMatchPtr, - dictStart, dictEnd); - curTotalMatchLength = curForwardMatchLength + - curBackwardMatchLength; + curBackwardMatchLength = ZSTD_ldm_countBackwardsMatch_2segments( + split, anchor, pMatch, lowMatchPtr, dictStart, dictEnd); } else { /* !extDict */ BYTE const* const pMatch = base + cur->offset; - curForwardMatchLength = ZSTD_count(ip, pMatch, iend); + curForwardMatchLength = ZSTD_count(split, pMatch, iend); if (curForwardMatchLength < minMatchLength) { continue; } curBackwardMatchLength = - ZSTD_ldm_countBackwardsMatch(ip, anchor, pMatch, - lowPrefixPtr); - curTotalMatchLength = curForwardMatchLength + - curBackwardMatchLength; + ZSTD_ldm_countBackwardsMatch(split, anchor, pMatch, lowPrefixPtr); } + curTotalMatchLength = curForwardMatchLength + curBackwardMatchLength; if (curTotalMatchLength > bestMatchLength) { bestMatchLength = curTotalMatchLength; @@ -375,57 +439,54 @@ static size_t ZSTD_ldm_generateSequences_internal( bestEntry = cur; } } - } - - /* No match found -- continue searching */ - if (bestEntry == NULL) { - ZSTD_ldm_makeEntryAndInsertByTag(ldmState, rollingHash, - hBits, curr, - *params); - ip++; - continue; - } - /* Match found */ - mLength = forwardMatchLength + backwardMatchLength; - ip -= backwardMatchLength; + /* No match found -- insert an entry into the hash table + * and process the next candidate match */ + if (bestEntry == NULL) { + ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params); + continue; + } - { - /* Store the sequence: - * ip = curr - backwardMatchLength - * The match is at (bestEntry->offset - backwardMatchLength) - */ - U32 const matchIndex = bestEntry->offset; - U32 const offset = curr - matchIndex; - rawSeq* const seq = rawSeqStore->seq + rawSeqStore->size; - - /* Out of sequence storage */ - if (rawSeqStore->size == rawSeqStore->capacity) - return ERROR(dstSize_tooSmall); - seq->litLength = (U32)(ip - anchor); - seq->matchLength = (U32)mLength; - seq->offset = offset; - rawSeqStore->size++; - } + /* Match found */ + offset = (U32)(split - base) - bestEntry->offset; + mLength = forwardMatchLength + backwardMatchLength; + { + rawSeq* const seq = rawSeqStore->seq + rawSeqStore->size; + + /* Out of sequence storage */ + if (rawSeqStore->size == rawSeqStore->capacity) + return ERROR(dstSize_tooSmall); + seq->litLength = (U32)(split - backwardMatchLength - anchor); + seq->matchLength = (U32)mLength; + seq->offset = offset; + rawSeqStore->size++; + } - /* Insert the current entry into the hash table */ - ZSTD_ldm_makeEntryAndInsertByTag(ldmState, rollingHash, hBits, - (U32)(lastHashed - base), - *params); + /* Insert the current entry into the hash table --- it must be + * done after the previous block to avoid clobbering bestEntry */ + ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params); - assert(ip + backwardMatchLength == lastHashed); + anchor = split + forwardMatchLength; - /* Fill the hash table from lastHashed+1 to ip+mLength*/ - /* Heuristic: don't need to fill the entire table at end of block */ - if (ip + mLength <= ilimit) { - rollingHash = ZSTD_ldm_fillLdmHashTable( - ldmState, rollingHash, lastHashed, - ip + mLength, base, hBits, *params); - lastHashed = ip + mLength - 1; + /* If we find a match that ends after the data that we've hashed + * then we have a repeating, overlapping, pattern. E.g. all zeros. + * If one repetition of the pattern matches our `stopMask` then all + * repetitions will. We don't need to insert them all into out table, + * only the first one. So skip over overlapping matches. + * This is a major speed boost (20x) for compressing a single byte + * repeated, when that byte ends up in the table. + */ + if (anchor > ip + hashed) { + ZSTD_ldm_gear_reset(&hashState, anchor - minMatchLength, minMatchLength); + /* Continue the outter loop at anchor (ip + hashed == anchor). */ + ip = anchor - hashed; + break; + } } - ip += mLength; - anchor = ip; + + ip += hashed; } + return iend - anchor; } @@ -474,7 +535,7 @@ size_t ZSTD_ldm_generateSequences( assert(chunkStart < iend); /* 1. Perform overflow correction if necessary. */ - if (ZSTD_window_needOverflowCorrection(ldmState->window, chunkEnd)) { + if (ZSTD_window_needOverflowCorrection(ldmState->window, 0, maxDist, ldmState->loadedDictEnd, chunkStart, chunkEnd)) { U32 const ldmHSize = 1U << params->hashLog; U32 const correction = ZSTD_window_correctOverflow( &ldmState->window, /* cycleLog */ 0, maxDist, chunkStart); @@ -596,12 +657,13 @@ void ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) { size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_useRowMatchFinderMode_e useRowMatchFinder, void const* src, size_t srcSize) { const ZSTD_compressionParameters* const cParams = &ms->cParams; unsigned const minMatch = cParams->minMatch; ZSTD_blockCompressor const blockCompressor = - ZSTD_selectBlockCompressor(cParams->strategy, ZSTD_matchState_dictMode(ms)); + ZSTD_selectBlockCompressor(cParams->strategy, useRowMatchFinder, ZSTD_matchState_dictMode(ms)); /* Input bounds */ BYTE const* const istart = (BYTE const*)src; BYTE const* const iend = istart + srcSize; @@ -620,7 +682,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, assert(rawSeqStore->pos <= rawSeqStore->size); assert(rawSeqStore->size <= rawSeqStore->capacity); - /* Loop through each sequence and apply the block compressor to the lits */ + /* Loop through each sequence and apply the block compressor to the literals */ while (rawSeqStore->pos < rawSeqStore->size && ip < iend) { /* maybeSplitSequence updates rawSeqStore->pos */ rawSeq const sequence = maybeSplitSequence(rawSeqStore, diff --git a/zstd_ldm.h b/zstd_ldm.h index e95cfd1..700a98b 100644 --- a/zstd_ldm.h +++ b/zstd_ldm.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -66,6 +66,7 @@ size_t ZSTD_ldm_generateSequences( */ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_useRowMatchFinderMode_e useRowMatchFinder, void const* src, size_t srcSize); /** @@ -73,7 +74,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, * * Skip past `srcSize` bytes worth of sequences in `rawSeqStore`. * Avoids emitting matches less than `minMatch` bytes. - * Must be called for data with is not passed to ZSTD_ldm_blockCompress(). + * Must be called for data that is not passed to ZSTD_ldm_blockCompress(). */ void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch); diff --git a/zstd_ldm_geartab.h b/zstd_ldm_geartab.h new file mode 100644 index 0000000..2b15c02 --- /dev/null +++ b/zstd_ldm_geartab.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_LDM_GEARTAB_H +#define ZSTD_LDM_GEARTAB_H + +#include "compiler.h" /* UNUSED_ATTR */ +#include "mem.h" /* U64 */ + +static UNUSED_ATTR const U64 ZSTD_ldm_gearTab[256] = { + 0xf5b8f72c5f77775c, 0x84935f266b7ac412, 0xb647ada9ca730ccc, + 0xb065bb4b114fb1de, 0x34584e7e8c3a9fd0, 0x4e97e17c6ae26b05, + 0x3a03d743bc99a604, 0xcecd042422c4044f, 0x76de76c58524259e, + 0x9c8528f65badeaca, 0x86563706e2097529, 0x2902475fa375d889, + 0xafb32a9739a5ebe6, 0xce2714da3883e639, 0x21eaf821722e69e, + 0x37b628620b628, 0x49a8d455d88caf5, 0x8556d711e6958140, + 0x4f7ae74fc605c1f, 0x829f0c3468bd3a20, 0x4ffdc885c625179e, + 0x8473de048a3daf1b, 0x51008822b05646b2, 0x69d75d12b2d1cc5f, + 0x8c9d4a19159154bc, 0xc3cc10f4abbd4003, 0xd06ddc1cecb97391, + 0xbe48e6e7ed80302e, 0x3481db31cee03547, 0xacc3f67cdaa1d210, + 0x65cb771d8c7f96cc, 0x8eb27177055723dd, 0xc789950d44cd94be, + 0x934feadc3700b12b, 0x5e485f11edbdf182, 0x1e2e2a46fd64767a, + 0x2969ca71d82efa7c, 0x9d46e9935ebbba2e, 0xe056b67e05e6822b, + 0x94d73f55739d03a0, 0xcd7010bdb69b5a03, 0x455ef9fcd79b82f4, + 0x869cb54a8749c161, 0x38d1a4fa6185d225, 0xb475166f94bbe9bb, + 0xa4143548720959f1, 0x7aed4780ba6b26ba, 0xd0ce264439e02312, + 0x84366d746078d508, 0xa8ce973c72ed17be, 0x21c323a29a430b01, + 0x9962d617e3af80ee, 0xab0ce91d9c8cf75b, 0x530e8ee6d19a4dbc, + 0x2ef68c0cf53f5d72, 0xc03a681640a85506, 0x496e4e9f9c310967, + 0x78580472b59b14a0, 0x273824c23b388577, 0x66bf923ad45cb553, + 0x47ae1a5a2492ba86, 0x35e304569e229659, 0x4765182a46870b6f, + 0x6cbab625e9099412, 0xddac9a2e598522c1, 0x7172086e666624f2, + 0xdf5003ca503b7837, 0x88c0c1db78563d09, 0x58d51865acfc289d, + 0x177671aec65224f1, 0xfb79d8a241e967d7, 0x2be1e101cad9a49a, + 0x6625682f6e29186b, 0x399553457ac06e50, 0x35dffb4c23abb74, + 0x429db2591f54aade, 0xc52802a8037d1009, 0x6acb27381f0b25f3, + 0xf45e2551ee4f823b, 0x8b0ea2d99580c2f7, 0x3bed519cbcb4e1e1, + 0xff452823dbb010a, 0x9d42ed614f3dd267, 0x5b9313c06257c57b, + 0xa114b8008b5e1442, 0xc1fe311c11c13d4b, 0x66e8763ea34c5568, + 0x8b982af1c262f05d, 0xee8876faaa75fbb7, 0x8a62a4d0d172bb2a, + 0xc13d94a3b7449a97, 0x6dbbba9dc15d037c, 0xc786101f1d92e0f1, + 0xd78681a907a0b79b, 0xf61aaf2962c9abb9, 0x2cfd16fcd3cb7ad9, + 0x868c5b6744624d21, 0x25e650899c74ddd7, 0xba042af4a7c37463, + 0x4eb1a539465a3eca, 0xbe09dbf03b05d5ca, 0x774e5a362b5472ba, + 0x47a1221229d183cd, 0x504b0ca18ef5a2df, 0xdffbdfbde2456eb9, + 0x46cd2b2fbee34634, 0xf2aef8fe819d98c3, 0x357f5276d4599d61, + 0x24a5483879c453e3, 0x88026889192b4b9, 0x28da96671782dbec, + 0x4ef37c40588e9aaa, 0x8837b90651bc9fb3, 0xc164f741d3f0e5d6, + 0xbc135a0a704b70ba, 0x69cd868f7622ada, 0xbc37ba89e0b9c0ab, + 0x47c14a01323552f6, 0x4f00794bacee98bb, 0x7107de7d637a69d5, + 0x88af793bb6f2255e, 0xf3c6466b8799b598, 0xc288c616aa7f3b59, + 0x81ca63cf42fca3fd, 0x88d85ace36a2674b, 0xd056bd3792389e7, + 0xe55c396c4e9dd32d, 0xbefb504571e6c0a6, 0x96ab32115e91e8cc, + 0xbf8acb18de8f38d1, 0x66dae58801672606, 0x833b6017872317fb, + 0xb87c16f2d1c92864, 0xdb766a74e58b669c, 0x89659f85c61417be, + 0xc8daad856011ea0c, 0x76a4b565b6fe7eae, 0xa469d085f6237312, + 0xaaf0365683a3e96c, 0x4dbb746f8424f7b8, 0x638755af4e4acc1, + 0x3d7807f5bde64486, 0x17be6d8f5bbb7639, 0x903f0cd44dc35dc, + 0x67b672eafdf1196c, 0xa676ff93ed4c82f1, 0x521d1004c5053d9d, + 0x37ba9ad09ccc9202, 0x84e54d297aacfb51, 0xa0b4b776a143445, + 0x820d471e20b348e, 0x1874383cb83d46dc, 0x97edeec7a1efe11c, + 0xb330e50b1bdc42aa, 0x1dd91955ce70e032, 0xa514cdb88f2939d5, + 0x2791233fd90db9d3, 0x7b670a4cc50f7a9b, 0x77c07d2a05c6dfa5, + 0xe3778b6646d0a6fa, 0xb39c8eda47b56749, 0x933ed448addbef28, + 0xaf846af6ab7d0bf4, 0xe5af208eb666e49, 0x5e6622f73534cd6a, + 0x297daeca42ef5b6e, 0x862daef3d35539a6, 0xe68722498f8e1ea9, + 0x981c53093dc0d572, 0xfa09b0bfbf86fbf5, 0x30b1e96166219f15, + 0x70e7d466bdc4fb83, 0x5a66736e35f2a8e9, 0xcddb59d2b7c1baef, + 0xd6c7d247d26d8996, 0xea4e39eac8de1ba3, 0x539c8bb19fa3aff2, + 0x9f90e4c5fd508d8, 0xa34e5956fbaf3385, 0x2e2f8e151d3ef375, + 0x173691e9b83faec1, 0xb85a8d56bf016379, 0x8382381267408ae3, + 0xb90f901bbdc0096d, 0x7c6ad32933bcec65, 0x76bb5e2f2c8ad595, + 0x390f851a6cf46d28, 0xc3e6064da1c2da72, 0xc52a0c101cfa5389, + 0xd78eaf84a3fbc530, 0x3781b9e2288b997e, 0x73c2f6dea83d05c4, + 0x4228e364c5b5ed7, 0x9d7a3edf0da43911, 0x8edcfeda24686756, + 0x5e7667a7b7a9b3a1, 0x4c4f389fa143791d, 0xb08bc1023da7cddc, + 0x7ab4be3ae529b1cc, 0x754e6132dbe74ff9, 0x71635442a839df45, + 0x2f6fb1643fbe52de, 0x961e0a42cf7a8177, 0xf3b45d83d89ef2ea, + 0xee3de4cf4a6e3e9b, 0xcd6848542c3295e7, 0xe4cee1664c78662f, + 0x9947548b474c68c4, 0x25d73777a5ed8b0b, 0xc915b1d636b7fc, + 0x21c2ba75d9b0d2da, 0x5f6b5dcf608a64a1, 0xdcf333255ff9570c, + 0x633b922418ced4ee, 0xc136dde0b004b34a, 0x58cc83b05d4b2f5a, + 0x5eb424dda28e42d2, 0x62df47369739cd98, 0xb4e0b42485e4ce17, + 0x16e1f0c1f9a8d1e7, 0x8ec3916707560ebf, 0x62ba6e2df2cc9db3, + 0xcbf9f4ff77d83a16, 0x78d9d7d07d2bbcc4, 0xef554ce1e02c41f4, + 0x8d7581127eccf94d, 0xa9b53336cb3c8a05, 0x38c42c0bf45c4f91, + 0x640893cdf4488863, 0x80ec34bc575ea568, 0x39f324f5b48eaa40, + 0xe9d9ed1f8eff527f, 0x9224fc058cc5a214, 0xbaba00b04cfe7741, + 0x309a9f120fcf52af, 0xa558f3ec65626212, 0x424bec8b7adabe2f, + 0x41622513a6aea433, 0xb88da2d5324ca798, 0xd287733b245528a4, + 0x9a44697e6d68aec3, 0x7b1093be2f49bb28, 0x50bbec632e3d8aad, + 0x6cd90723e1ea8283, 0x897b9e7431b02bf3, 0x219efdcb338a7047, + 0x3b0311f0a27c0656, 0xdb17bf91c0db96e7, 0x8cd4fd6b4e85a5b2, + 0xfab071054ba6409d, 0x40d6fe831fa9dfd9, 0xaf358debad7d791e, + 0xeb8d0e25a65e3e58, 0xbbcbd3df14e08580, 0xcf751f27ecdab2b, + 0x2b4da14f2613d8f4 +}; + +#endif /* ZSTD_LDM_GEARTAB_H */ diff --git a/zstd_legacy.h b/zstd_legacy.h index ecd9682..6ba3637 100644 --- a/zstd_legacy.h +++ b/zstd_legacy.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/zstd_opt.c b/zstd_opt.c index e55c459..301f985 100644 --- a/zstd_opt.c +++ b/zstd_opt.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Przemyslaw Skibinski, Yann Collet, Facebook, Inc. + * Copyright (c) Przemyslaw Skibinski, Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -126,7 +126,7 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, optPtr->litSum = 0; for (lit=0; lit<=MaxLit; lit++) { U32 const scaleLog = 11; /* scale to 2K */ - U32 const bitCost = HUF_getNbBits(optPtr->symbolCosts->huf.CTable, lit); + U32 const bitCost = HUF_getNbBitsFromCTable(optPtr->symbolCosts->huf.CTable, lit); assert(bitCost <= scaleLog); optPtr->litFreq[lit] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/; optPtr->litSum += optPtr->litFreq[lit]; @@ -253,11 +253,10 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP * Provides the cost of the match part (offset + matchLength) of a sequence * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence. * optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) */ -FORCE_INLINE_TEMPLATE U32 -ZSTD_getMatchPrice(U32 const offset, - U32 const matchLength, - const optState_t* const optPtr, - int const optLevel) +static U32 ZSTD_getMatchPrice(U32 const offset, + U32 const matchLength, + const optState_t* const optPtr, + int const optLevel) { U32 price; U32 const offCode = ZSTD_highbit32(offset+1); @@ -364,11 +363,13 @@ static U32 ZSTD_insertAndFindFirstIndexHash3 (ZSTD_matchState_t* ms, * Binary Tree search ***************************************/ /** ZSTD_insertBt1() : add one or multiple positions to tree. - * ip : assumed <= iend-8 . + * @param ip assumed <= iend-8 . + * @param target The target of ZSTD_updateTree_internal() - we are filling to this position * @return : nb of positions added */ static U32 ZSTD_insertBt1( ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iend, + U32 const target, U32 const mls, const int extDict) { const ZSTD_compressionParameters* const cParams = &ms->cParams; @@ -391,7 +392,10 @@ static U32 ZSTD_insertBt1( U32* smallerPtr = bt + 2*(curr&btMask); U32* largerPtr = smallerPtr + 1; U32 dummy32; /* to be nullified at the end */ - U32 const windowLow = ms->window.lowLimit; + /* windowLow is based on target because we're only need positions that will be + * in the window at the end of the tree update. + */ + U32 const windowLow = ZSTD_getLowestMatchIndex(ms, target, cParams->windowLog); U32 matchEndIdx = curr+8+1; size_t bestLength = 8; U32 nbCompares = 1U << cParams->searchLog; @@ -404,6 +408,7 @@ static U32 ZSTD_insertBt1( DEBUGLOG(8, "ZSTD_insertBt1 (%u)", curr); + assert(curr <= target); assert(ip <= iend-8); /* required for h calculation */ hashTable[h] = curr; /* Update Hash Table */ @@ -479,11 +484,9 @@ static U32 ZSTD_insertBt1( } } -FORCE_INLINE_TEMPLATE -void ZSTD_updateTree_internal( - ZSTD_matchState_t* ms, - const BYTE* const ip, const BYTE* const iend, - const U32 mls, const ZSTD_dictMode_e dictMode) +static void ZSTD_updateTree_internal(ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + const U32 mls, const ZSTD_dictMode_e dictMode) { const BYTE* const base = ms->window.base; U32 const target = (U32)(ip - base); @@ -492,7 +495,7 @@ void ZSTD_updateTree_internal( idx, target, dictMode); while(idx < target) { - U32 const forward = ZSTD_insertBt1(ms, base+idx, iend, mls, dictMode == ZSTD_extDict); + U32 const forward = ZSTD_insertBt1(ms, base+idx, iend, target, mls, dictMode == ZSTD_extDict); assert(idx < (U32)(idx + forward)); idx += forward; } @@ -505,8 +508,7 @@ void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) { ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.minMatch, ZSTD_noDict); } -FORCE_INLINE_TEMPLATE -U32 ZSTD_insertBtAndGetAllMatches ( +static U32 ZSTD_insertBtAndGetAllMatches ( ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ ZSTD_matchState_t* ms, U32* nextToUpdate3, @@ -739,7 +741,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( } -FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches ( +static U32 ZSTD_BtGetAllMatches ( ZSTD_match_t* matches, /* store result (match found, increasing size) in this table */ ZSTD_matchState_t* ms, U32* nextToUpdate3, @@ -893,7 +895,7 @@ static void ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, ZSTD_match_ */ U32 posOvershoot = currPosInBlock - optLdm->endPosInBlock; ZSTD_optLdm_skipRawSeqStoreBytes(&optLdm->seqStore, posOvershoot); - } + } ZSTD_opt_getNextMatchAndUpdateSeqStore(optLdm, currPosInBlock, remainingBytes); } ZSTD_optLdm_maybeAddMatch(matches, nbMatches, optLdm, currPosInBlock); @@ -926,7 +928,7 @@ listStats(const U32* table, int lastEltID) #endif -FORCE_INLINE_TEMPLATE size_t +static size_t ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], diff --git a/zstd_opt.h b/zstd_opt.h index 9aba8a9..627255f 100644 --- a/zstd_opt.h +++ b/zstd_opt.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/zstd_trace.h b/zstd_trace.h new file mode 100644 index 0000000..f9121f7 --- /dev/null +++ b/zstd_trace.h @@ -0,0 +1,163 @@ +/* + * Copyright (c) Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_TRACE_H +#define ZSTD_TRACE_H + +#if defined (__cplusplus) +extern "C" { +#endif + +#include + +/* weak symbol support + * For now, enable conservatively: + * - Only GNUC + * - Only ELF + * - Only x86-64 and i386 + * Also, explicitly disable on platforms known not to work so they aren't + * forgotten in the future. + */ +#if !defined(ZSTD_HAVE_WEAK_SYMBOLS) && \ + defined(__GNUC__) && defined(__ELF__) && \ + (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)) && \ + !defined(__APPLE__) && !defined(_WIN32) && !defined(__MINGW32__) && \ + !defined(__CYGWIN__) && !defined(_AIX) +# define ZSTD_HAVE_WEAK_SYMBOLS 1 +#else +# define ZSTD_HAVE_WEAK_SYMBOLS 0 +#endif +#if ZSTD_HAVE_WEAK_SYMBOLS +# define ZSTD_WEAK_ATTR __attribute__((__weak__)) +#else +# define ZSTD_WEAK_ATTR +#endif + +/* Only enable tracing when weak symbols are available. */ +#ifndef ZSTD_TRACE +# define ZSTD_TRACE ZSTD_HAVE_WEAK_SYMBOLS +#endif + +#if ZSTD_TRACE + +struct ZSTD_CCtx_s; +struct ZSTD_DCtx_s; +struct ZSTD_CCtx_params_s; + +typedef struct { + /** + * ZSTD_VERSION_NUMBER + * + * This is guaranteed to be the first member of ZSTD_trace. + * Otherwise, this struct is not stable between versions. If + * the version number does not match your expectation, you + * should not interpret the rest of the struct. + */ + unsigned version; + /** + * Non-zero if streaming (de)compression is used. + */ + unsigned streaming; + /** + * The dictionary ID. + */ + unsigned dictionaryID; + /** + * Is the dictionary cold? + * Only set on decompression. + */ + unsigned dictionaryIsCold; + /** + * The dictionary size or zero if no dictionary. + */ + size_t dictionarySize; + /** + * The uncompressed size of the data. + */ + size_t uncompressedSize; + /** + * The compressed size of the data. + */ + size_t compressedSize; + /** + * The fully resolved CCtx parameters (NULL on decompression). + */ + struct ZSTD_CCtx_params_s const* params; + /** + * The ZSTD_CCtx pointer (NULL on decompression). + */ + struct ZSTD_CCtx_s const* cctx; + /** + * The ZSTD_DCtx pointer (NULL on compression). + */ + struct ZSTD_DCtx_s const* dctx; +} ZSTD_Trace; + +/** + * A tracing context. It must be 0 when tracing is disabled. + * Otherwise, any non-zero value returned by a tracing begin() + * function is presented to any subsequent calls to end(). + * + * Any non-zero value is treated as tracing is enabled and not + * interpreted by the library. + * + * Two possible uses are: + * * A timestamp for when the begin() function was called. + * * A unique key identifying the (de)compression, like the + * address of the [dc]ctx pointer if you need to track + * more information than just a timestamp. + */ +typedef unsigned long long ZSTD_TraceCtx; + +/** + * Trace the beginning of a compression call. + * @param cctx The dctx pointer for the compression. + * It can be used as a key to map begin() to end(). + * @returns Non-zero if tracing is enabled. The return value is + * passed to ZSTD_trace_compress_end(). + */ +ZSTD_WEAK_ATTR ZSTD_TraceCtx ZSTD_trace_compress_begin( + struct ZSTD_CCtx_s const* cctx); + +/** + * Trace the end of a compression call. + * @param ctx The return value of ZSTD_trace_compress_begin(). + * @param trace The zstd tracing info. + */ +ZSTD_WEAK_ATTR void ZSTD_trace_compress_end( + ZSTD_TraceCtx ctx, + ZSTD_Trace const* trace); + +/** + * Trace the beginning of a decompression call. + * @param dctx The dctx pointer for the decompression. + * It can be used as a key to map begin() to end(). + * @returns Non-zero if tracing is enabled. The return value is + * passed to ZSTD_trace_compress_end(). + */ +ZSTD_WEAK_ATTR ZSTD_TraceCtx ZSTD_trace_decompress_begin( + struct ZSTD_DCtx_s const* dctx); + +/** + * Trace the end of a decompression call. + * @param ctx The return value of ZSTD_trace_decompress_begin(). + * @param trace The zstd tracing info. + */ +ZSTD_WEAK_ATTR void ZSTD_trace_decompress_end( + ZSTD_TraceCtx ctx, + ZSTD_Trace const* trace); + +#endif /* ZSTD_TRACE */ + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_TRACE_H */ diff --git a/zstd_v01.c b/zstd_v01.c index 3eb34eb..d6e1920 100644 --- a/zstd_v01.c +++ b/zstd_v01.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/zstd_v01.h b/zstd_v01.h index 7910351..f777eb6 100644 --- a/zstd_v01.h +++ b/zstd_v01.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/zstd_v02.c b/zstd_v02.c index 3bb580f..2091ee9 100644 --- a/zstd_v02.c +++ b/zstd_v02.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/zstd_v02.h b/zstd_v02.h index 5f8f6cd..1b37195 100644 --- a/zstd_v02.h +++ b/zstd_v02.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/zstd_v03.c b/zstd_v03.c index 7dedf03..272d6da 100644 --- a/zstd_v03.c +++ b/zstd_v03.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/zstd_v03.h b/zstd_v03.h index 5fc7273..7a00d43 100644 --- a/zstd_v03.h +++ b/zstd_v03.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/zstd_v04.c b/zstd_v04.c index 7abb698..ad07e33 100644 --- a/zstd_v04.c +++ b/zstd_v04.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/zstd_v04.h b/zstd_v04.h index 15fce0d..66b97ab 100644 --- a/zstd_v04.h +++ b/zstd_v04.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/zstd_v05.c b/zstd_v05.c index b2053be..70a01ee 100644 --- a/zstd_v05.c +++ b/zstd_v05.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -2833,7 +2833,7 @@ static size_t ZSTDv05_decodeFrameHeader_Part2(ZSTDv05_DCtx* zc, const void* src, static size_t ZSTDv05_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr) { - const BYTE* const in = (const BYTE* const)src; + const BYTE* const in = (const BYTE*)src; BYTE headerFlags; U32 cSize; @@ -3002,7 +3002,7 @@ static size_t ZSTDv05_decodeSeqHeaders(int* nbSeq, const BYTE** dumpsPtr, size_t FSEv05_DTable* DTableLL, FSEv05_DTable* DTableML, FSEv05_DTable* DTableOffb, const void* src, size_t srcSize, U32 flagStaticTable) { - const BYTE* const istart = (const BYTE* const)src; + const BYTE* const istart = (const BYTE*)src; const BYTE* ip = istart; const BYTE* const iend = istart + srcSize; U32 LLtype, Offtype, MLtype; @@ -3310,7 +3310,7 @@ static size_t ZSTDv05_decompressSequences( { const BYTE* ip = (const BYTE*)seqStart; const BYTE* const iend = ip + seqSize; - BYTE* const ostart = (BYTE* const)dst; + BYTE* const ostart = (BYTE*)dst; BYTE* op = ostart; BYTE* const oend = ostart + maxDstSize; size_t errorCode, dumpsLength=0; @@ -3423,7 +3423,7 @@ static size_t ZSTDv05_decompress_continueDCtx(ZSTDv05_DCtx* dctx, { const BYTE* ip = (const BYTE*)src; const BYTE* iend = ip + srcSize; - BYTE* const ostart = (BYTE* const)dst; + BYTE* const ostart = (BYTE*)dst; BYTE* op = ostart; BYTE* const oend = ostart + maxDstSize; size_t remainingSize = srcSize; diff --git a/zstd_v05.h b/zstd_v05.h index 6c46e8f..d555175 100644 --- a/zstd_v05.h +++ b/zstd_v05.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/zstd_v06.c b/zstd_v06.c index cf17623..3b684b6 100644 --- a/zstd_v06.c +++ b/zstd_v06.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -3029,7 +3029,7 @@ typedef struct * Provides the size of compressed block from block header `src` */ static size_t ZSTDv06_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr) { - const BYTE* const in = (const BYTE* const)src; + const BYTE* const in = (const BYTE*)src; U32 cSize; if (srcSize < ZSTDv06_blockHeaderSize) return ERROR(srcSize_wrong); @@ -3223,7 +3223,7 @@ static size_t ZSTDv06_decodeSeqHeaders(int* nbSeqPtr, FSEv06_DTable* DTableLL, FSEv06_DTable* DTableML, FSEv06_DTable* DTableOffb, U32 flagRepeatTable, const void* src, size_t srcSize) { - const BYTE* const istart = (const BYTE* const)src; + const BYTE* const istart = (const BYTE*)src; const BYTE* const iend = istart + srcSize; const BYTE* ip = istart; @@ -3445,7 +3445,7 @@ static size_t ZSTDv06_decompressSequences( { const BYTE* ip = (const BYTE*)seqStart; const BYTE* const iend = ip + seqSize; - BYTE* const ostart = (BYTE* const)dst; + BYTE* const ostart = (BYTE*)dst; BYTE* const oend = ostart + maxDstSize; BYTE* op = ostart; const BYTE* litPtr = dctx->litPtr; @@ -3561,7 +3561,7 @@ static size_t ZSTDv06_decompressFrame(ZSTDv06_DCtx* dctx, { const BYTE* ip = (const BYTE*)src; const BYTE* const iend = ip + srcSize; - BYTE* const ostart = (BYTE* const)dst; + BYTE* const ostart = (BYTE*)dst; BYTE* op = ostart; BYTE* const oend = ostart + dstCapacity; size_t remainingSize = srcSize; diff --git a/zstd_v06.h b/zstd_v06.h index 2fd99e6..9e32b76 100644 --- a/zstd_v06.h +++ b/zstd_v06.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/zstd_v07.c b/zstd_v07.c index f486eb9..c87b033 100644 --- a/zstd_v07.c +++ b/zstd_v07.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -3258,7 +3258,7 @@ typedef struct * Provides the size of compressed block from block header `src` */ static size_t ZSTDv07_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr) { - const BYTE* const in = (const BYTE* const)src; + const BYTE* const in = (const BYTE*)src; U32 cSize; if (srcSize < ZSTDv07_blockHeaderSize) return ERROR(srcSize_wrong); @@ -3453,7 +3453,7 @@ static size_t ZSTDv07_decodeSeqHeaders(int* nbSeqPtr, FSEv07_DTable* DTableLL, FSEv07_DTable* DTableML, FSEv07_DTable* DTableOffb, U32 flagRepeatTable, const void* src, size_t srcSize) { - const BYTE* const istart = (const BYTE* const)src; + const BYTE* const istart = (const BYTE*)src; const BYTE* const iend = istart + srcSize; const BYTE* ip = istart; @@ -3672,7 +3672,7 @@ static size_t ZSTDv07_decompressSequences( { const BYTE* ip = (const BYTE*)seqStart; const BYTE* const iend = ip + seqSize; - BYTE* const ostart = (BYTE* const)dst; + BYTE* const ostart = (BYTE*)dst; BYTE* const oend = ostart + maxDstSize; BYTE* op = ostart; const BYTE* litPtr = dctx->litPtr; @@ -3799,7 +3799,7 @@ static size_t ZSTDv07_decompressFrame(ZSTDv07_DCtx* dctx, { const BYTE* ip = (const BYTE*)src; const BYTE* const iend = ip + srcSize; - BYTE* const ostart = (BYTE* const)dst; + BYTE* const ostart = (BYTE*)dst; BYTE* const oend = ostart + dstCapacity; BYTE* op = ostart; size_t remainingSize = srcSize; diff --git a/zstd_v07.h b/zstd_v07.h index 9da50c4..bc35cfa 100644 --- a/zstd_v07.h +++ b/zstd_v07.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the diff --git a/zstdmt_compress.c b/zstdmt_compress.c index 084e48b..8e5dd15 100644 --- a/zstdmt_compress.c +++ b/zstdmt_compress.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -472,8 +472,6 @@ ZSTDMT_serialState_reset(serialState_t* serialState, ZSTD_ldm_adjustParameters(¶ms.ldmParams, ¶ms.cParams); assert(params.ldmParams.hashLog >= params.ldmParams.bucketSizeLog); assert(params.ldmParams.hashRateLog < 32); - serialState->ldmState.hashPower = - ZSTD_rollingHash_primePower(params.ldmParams.minMatchLength); } else { ZSTD_memset(¶ms.ldmParams, 0, sizeof(params.ldmParams)); } @@ -486,10 +484,10 @@ ZSTDMT_serialState_reset(serialState_t* serialState, size_t const hashSize = ((size_t)1 << hashLog) * sizeof(ldmEntry_t); unsigned const bucketLog = params.ldmParams.hashLog - params.ldmParams.bucketSizeLog; - size_t const bucketSize = (size_t)1 << bucketLog; unsigned const prevBucketLog = serialState->params.ldmParams.hashLog - serialState->params.ldmParams.bucketSizeLog; + size_t const numBuckets = (size_t)1 << bucketLog; /* Size the seq pool tables */ ZSTDMT_setNbSeq(seqPool, ZSTD_ldm_getMaxNbSeq(params.ldmParams, jobSize)); /* Reset the window */ @@ -501,20 +499,20 @@ ZSTDMT_serialState_reset(serialState_t* serialState, } if (serialState->ldmState.bucketOffsets == NULL || prevBucketLog < bucketLog) { ZSTD_customFree(serialState->ldmState.bucketOffsets, cMem); - serialState->ldmState.bucketOffsets = (BYTE*)ZSTD_customMalloc(bucketSize, cMem); + serialState->ldmState.bucketOffsets = (BYTE*)ZSTD_customMalloc(numBuckets, cMem); } if (!serialState->ldmState.hashTable || !serialState->ldmState.bucketOffsets) return 1; /* Zero the tables */ ZSTD_memset(serialState->ldmState.hashTable, 0, hashSize); - ZSTD_memset(serialState->ldmState.bucketOffsets, 0, bucketSize); + ZSTD_memset(serialState->ldmState.bucketOffsets, 0, numBuckets); /* Update window state and fill hash table with dict */ serialState->ldmState.loadedDictEnd = 0; if (dictSize > 0) { if (dictContentType == ZSTD_dct_rawContent) { BYTE const* const dictEnd = (const BYTE*)dict + dictSize; - ZSTD_window_update(&serialState->ldmState.window, dict, dictSize); + ZSTD_window_update(&serialState->ldmState.window, dict, dictSize, /* forceNonContiguous */ 0); ZSTD_ldm_fillHashTable(&serialState->ldmState, (const BYTE*)dict, dictEnd, ¶ms.ldmParams); serialState->ldmState.loadedDictEnd = params.forceWindow ? 0 : (U32)(dictEnd - serialState->ldmState.window.base); } else { @@ -571,7 +569,7 @@ static void ZSTDMT_serialState_update(serialState_t* serialState, assert(seqStore.seq != NULL && seqStore.pos == 0 && seqStore.size == 0 && seqStore.capacity > 0); assert(src.size <= serialState->params.jobSize); - ZSTD_window_update(&serialState->ldmState.window, src.start, src.size); + ZSTD_window_update(&serialState->ldmState.window, src.start, src.size, /* forceNonContiguous */ 0); error = ZSTD_ldm_generateSequences( &serialState->ldmState, &seqStore, &serialState->params.ldmParams, src.start, src.size); @@ -683,6 +681,8 @@ static void ZSTDMT_compressionJob(void* jobDescription) if (job->jobID != 0) jobParams.fParams.checksumFlag = 0; /* Don't run LDM for the chunks, since we handle it externally */ jobParams.ldmParams.enableLdm = 0; + /* Correct nbWorkers to 0. */ + jobParams.nbWorkers = 0; /* init */ @@ -695,6 +695,10 @@ static void ZSTDMT_compressionJob(void* jobDescription) { size_t const forceWindowError = ZSTD_CCtxParams_setParameter(&jobParams, ZSTD_c_forceMaxWindow, !job->firstJob); if (ZSTD_isError(forceWindowError)) JOB_ERROR(forceWindowError); } + if (!job->firstJob) { + size_t const err = ZSTD_CCtxParams_setParameter(&jobParams, ZSTD_c_deterministicRefPrefix, 0); + if (ZSTD_isError(err)) JOB_ERROR(err); + } { size_t const initError = ZSTD_compressBegin_advanced_internal(cctx, job->prefix.start, job->prefix.size, ZSTD_dct_rawContent, /* load dictionary in "content-only" mode (no header analysis) */ ZSTD_dtlm_fast, @@ -750,6 +754,13 @@ static void ZSTDMT_compressionJob(void* jobDescription) if (ZSTD_isError(cSize)) JOB_ERROR(cSize); lastCBlockSize = cSize; } } + if (!job->firstJob) { + /* Double check that we don't have an ext-dict, because then our + * repcode invalidation doesn't work. + */ + assert(!ZSTD_window_hasExtDict(cctx->blockState.matchState.window)); + } + ZSTD_CCtx_trace(cctx, 0); _endJob: ZSTDMT_serialState_ensureFinished(job->serial, job->jobID, job->cSize); @@ -1239,9 +1250,8 @@ size_t ZSTDMT_initCStream_internal( if (params.rsyncable) { /* Aim for the targetsectionSize as the average job size. */ - U32 const jobSizeMB = (U32)(mtctx->targetSectionSize >> 20); - U32 const rsyncBits = ZSTD_highbit32(jobSizeMB) + 20; - assert(jobSizeMB >= 1); + U32 const jobSizeKB = (U32)(mtctx->targetSectionSize >> 10); + U32 const rsyncBits = (assert(jobSizeKB >= 1), ZSTD_highbit32(jobSizeKB) + 10); DEBUGLOG(4, "rsyncLog = %u", rsyncBits); mtctx->rsync.hash = 0; mtctx->rsync.hitMask = (1ULL << rsyncBits) - 1; diff --git a/zstdmt_compress.h b/zstdmt_compress.h index e306964..5b9565f 100644 --- a/zstdmt_compress.h +++ b/zstdmt_compress.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the @@ -32,11 +32,11 @@ /* === Constants === */ -#ifndef ZSTDMT_NBWORKERS_MAX -# define ZSTDMT_NBWORKERS_MAX 200 +#ifndef ZSTDMT_NBWORKERS_MAX /* a different value can be selected at compile time */ +# define ZSTDMT_NBWORKERS_MAX ((sizeof(void*)==4) /*32-bit*/ ? 64 : 256) #endif -#ifndef ZSTDMT_JOBSIZE_MIN -# define ZSTDMT_JOBSIZE_MIN (1 MB) +#ifndef ZSTDMT_JOBSIZE_MIN /* a different value can be selected at compile time */ +# define ZSTDMT_JOBSIZE_MIN (512 KB) #endif #define ZSTDMT_JOBLOG_MAX (MEM_32bits() ? 29 : 30) #define ZSTDMT_JOBSIZE_MAX (MEM_32bits() ? (512 MB) : (1024 MB))