From 62c92fc6d2a7a4956db68ba8353be72a0698b6d2 Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Tue, 29 Oct 2024 13:41:23 +0800 Subject: [PATCH] Remove unused XKCP symbols KeccakP-1600-times4-SIMD256.c is copied from XKCP and contains a lot of symbols that we are not actually using. This commit trims it down to just KeccakP1600times4_PermuteAll_24rounds. Signed-off-by: Matthias J. Kannwischer --- .../x86_64/xkcp/KeccakP-1600-times4-SIMD256.c | 659 ------------------ .../x86_64/xkcp/KeccakP-1600-times4-SnP.h | 88 +-- .../x86_64/xkcp/KeccakP-1600-unrolling.macros | 156 ----- 3 files changed, 1 insertion(+), 902 deletions(-) diff --git a/fips202/native/x86_64/xkcp/KeccakP-1600-times4-SIMD256.c b/fips202/native/x86_64/xkcp/KeccakP-1600-times4-SIMD256.c index 7a125e678..17a1eb1c0 100644 --- a/fips202/native/x86_64/xkcp/KeccakP-1600-times4-SIMD256.c +++ b/fips202/native/x86_64/xkcp/KeccakP-1600-times4-SIMD256.c @@ -40,22 +40,13 @@ and related or neighboring rights to the source code in this file. #error Expecting a little-endian platform #endif -typedef unsigned char UINT8; typedef unsigned long long int UINT64; -typedef __m128i V128; typedef __m256i V256; -#define laneIndex(instanceIndex, lanePosition) \ - ((lanePosition) * 4 + (instanceIndex)) - #if defined(KeccakP1600times4_useAVX2) #define ANDnu256(a, b) _mm256_andnot_si256(a, b) #define CONST256(a) _mm256_load_si256((const V256 *)&(a)) #define CONST256_64(a) (V256) _mm256_broadcast_sd((const double *)(&a)) -#define LOAD256(a) _mm256_load_si256((const V256 *)&(a)) -#define LOAD256u(a) _mm256_loadu_si256((const V256 *)&(a)) -#define LOAD4_64(a, b, c, d) \ - _mm256_set_epi64x((UINT64)(a), (UINT64)(b), (UINT64)(c), (UINT64)(d)) #define ROL64in256(d, a, o) \ d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64 - (o))) #define ROL64in256_8(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho8)) @@ -65,435 +56,12 @@ static const UINT64 rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19}; #define STORE256(a, b) _mm256_store_si256((V256 *)&(a), b) -#define STORE256u(a, b) _mm256_storeu_si256((V256 *)&(a), b) -#define STORE2_128(ah, al, v) \ - _mm256_storeu2_m128d((V128 *)&(ah), (V128 *)&(al), v) #define XOR256(a, b) _mm256_xor_si256(a, b) #define XOReq256(a, b) a = _mm256_xor_si256(a, b) -#define UNPACKL(a, b) _mm256_unpacklo_epi64((a), (b)) -#define UNPACKH(a, b) _mm256_unpackhi_epi64((a), (b)) -#define PERM128(a, b, c) \ - (V256) _mm256_permute2f128_ps((__m256)(a), (__m256)(b), c) -#define SHUFFLE64(a, b, c) \ - (V256) _mm256_shuffle_pd((__m256d)(a), (__m256d)(b), c) - -#define UNINTLEAVE() \ - lanesL01 = UNPACKL(lanes0, lanes1), lanesH01 = UNPACKH(lanes0, lanes1), \ - lanesL23 = UNPACKL(lanes2, lanes3), lanesH23 = UNPACKH(lanes2, lanes3), \ - lanes0 = PERM128(lanesL01, lanesL23, 0x20), \ - lanes2 = PERM128(lanesL01, lanesL23, 0x31), \ - lanes1 = PERM128(lanesH01, lanesH23, 0x20), \ - lanes3 = PERM128(lanesH01, lanesH23, 0x31) - -#define INTLEAVE() \ - lanesL01 = PERM128(lanes0, lanes2, 0x20), \ - lanesH01 = PERM128(lanes1, lanes3, 0x20), \ - lanesL23 = PERM128(lanes0, lanes2, 0x31), \ - lanesH23 = PERM128(lanes1, lanes3, 0x31), \ - lanes0 = SHUFFLE64(lanesL01, lanesH01, 0x00), \ - lanes1 = SHUFFLE64(lanesL01, lanesH01, 0x0F), \ - lanes2 = SHUFFLE64(lanesL23, lanesH23, 0x00), \ - lanes3 = SHUFFLE64(lanesL23, lanesH23, 0x0F) - #endif #define SnP_laneLengthInBytes 8 -void KeccakP1600times4_InitializeAll(void *states) { - memset(states, 0, KeccakP1600times4_statesSizeInBytes); -} - -void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, - const unsigned char *data, unsigned int offset, - unsigned int length) { - unsigned int sizeLeft = length; - unsigned int lanePosition = offset / SnP_laneLengthInBytes; - unsigned int offsetInLane = offset % SnP_laneLengthInBytes; - const unsigned char *curData = data; - UINT64 *statesAsLanes = (UINT64 *)states; - - if ((sizeLeft > 0) && (offsetInLane != 0)) { - unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane; - UINT64 lane = 0; - if (bytesInLane > sizeLeft) - bytesInLane = sizeLeft; - memcpy((unsigned char *)&lane + offsetInLane, curData, bytesInLane); - statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane; - sizeLeft -= bytesInLane; - lanePosition++; - curData += bytesInLane; - } - - while (sizeLeft >= SnP_laneLengthInBytes) { - UINT64 lane = *((const UINT64 *)curData); - statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane; - sizeLeft -= SnP_laneLengthInBytes; - lanePosition++; - curData += SnP_laneLengthInBytes; - } - - if (sizeLeft > 0) { - UINT64 lane = 0; - memcpy(&lane, curData, sizeLeft); - statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane; - } -} - -void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, - unsigned int laneCount, - unsigned int laneOffset) { - V256 *stateAsLanes = (V256 *)states; - unsigned int i; - const UINT64 *curData0 = (const UINT64 *)data; - const UINT64 *curData1 = - (const UINT64 *)(data + laneOffset * SnP_laneLengthInBytes); - const UINT64 *curData2 = - (const UINT64 *)(data + laneOffset * 2 * SnP_laneLengthInBytes); - const UINT64 *curData3 = - (const UINT64 *)(data + laneOffset * 3 * SnP_laneLengthInBytes); - V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; - -#define Xor_In(argIndex) \ - XOReq256(stateAsLanes[argIndex], \ - LOAD4_64(curData3[argIndex], curData2[argIndex], \ - curData1[argIndex], curData0[argIndex])) - -#define Xor_In4(argIndex) \ - lanes0 = LOAD256u(curData0[argIndex]), \ - lanes1 = LOAD256u(curData1[argIndex]), \ - lanes2 = LOAD256u(curData2[argIndex]), \ - lanes3 = LOAD256u(curData3[argIndex]), INTLEAVE(), \ - XOReq256(stateAsLanes[argIndex + 0], lanes0), \ - XOReq256(stateAsLanes[argIndex + 1], lanes1), \ - XOReq256(stateAsLanes[argIndex + 2], lanes2), \ - XOReq256(stateAsLanes[argIndex + 3], lanes3) - - if (laneCount >= 16) { - Xor_In4(0); - Xor_In4(4); - Xor_In4(8); - Xor_In4(12); - if (laneCount >= 20) { - Xor_In4(16); - for (i = 20; i < laneCount; i++) - Xor_In(i); - } else { - for (i = 16; i < laneCount; i++) - Xor_In(i); - } - } else { - for (i = 0; i < laneCount; i++) - Xor_In(i); - } -#undef Xor_In -#undef Xor_In4 -} - -void KeccakP1600times4_OverwriteBytes(void *states, unsigned int instanceIndex, - const unsigned char *data, - unsigned int offset, - unsigned int length) { - unsigned int sizeLeft = length; - unsigned int lanePosition = offset / SnP_laneLengthInBytes; - unsigned int offsetInLane = offset % SnP_laneLengthInBytes; - const unsigned char *curData = data; - UINT64 *statesAsLanes = (UINT64 *)states; - - if ((sizeLeft > 0) && (offsetInLane != 0)) { - unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane; - if (bytesInLane > sizeLeft) - bytesInLane = sizeLeft; - memcpy(((unsigned char - *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + - offsetInLane, - curData, bytesInLane); - sizeLeft -= bytesInLane; - lanePosition++; - curData += bytesInLane; - } - - while (sizeLeft >= SnP_laneLengthInBytes) { - UINT64 lane = *((const UINT64 *)curData); - statesAsLanes[laneIndex(instanceIndex, lanePosition)] = lane; - sizeLeft -= SnP_laneLengthInBytes; - lanePosition++; - curData += SnP_laneLengthInBytes; - } - - if (sizeLeft > 0) { - memcpy(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], curData, - sizeLeft); - } -} - -void KeccakP1600times4_OverwriteLanesAll(void *states, - const unsigned char *data, - unsigned int laneCount, - unsigned int laneOffset) { - V256 *stateAsLanes = (V256 *)states; - unsigned int i; - const UINT64 *curData0 = (const UINT64 *)data; - const UINT64 *curData1 = - (const UINT64 *)(data + laneOffset * SnP_laneLengthInBytes); - const UINT64 *curData2 = - (const UINT64 *)(data + laneOffset * 2 * SnP_laneLengthInBytes); - const UINT64 *curData3 = - (const UINT64 *)(data + laneOffset * 3 * SnP_laneLengthInBytes); - V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; - -#define OverWr(argIndex) \ - STORE256(stateAsLanes[argIndex], \ - LOAD4_64(curData3[argIndex], curData2[argIndex], \ - curData1[argIndex], curData0[argIndex])) - -#define OverWr4(argIndex) \ - lanes0 = LOAD256u(curData0[argIndex]), \ - lanes1 = LOAD256u(curData1[argIndex]), \ - lanes2 = LOAD256u(curData2[argIndex]), \ - lanes3 = LOAD256u(curData3[argIndex]), INTLEAVE(), \ - STORE256(stateAsLanes[argIndex + 0], lanes0), \ - STORE256(stateAsLanes[argIndex + 1], lanes1), \ - STORE256(stateAsLanes[argIndex + 2], lanes2), \ - STORE256(stateAsLanes[argIndex + 3], lanes3) - - if (laneCount >= 16) { - OverWr4(0); - OverWr4(4); - OverWr4(8); - OverWr4(12); - if (laneCount >= 20) { - OverWr4(16); - for (i = 20; i < laneCount; i++) - OverWr(i); - } else { - for (i = 16; i < laneCount; i++) - OverWr(i); - } - } else { - for (i = 0; i < laneCount; i++) - OverWr(i); - } -#undef OverWr -#undef OverWr4 -} - -void KeccakP1600times4_OverwriteWithZeroes(void *states, - unsigned int instanceIndex, - unsigned int byteCount) { - unsigned int sizeLeft = byteCount; - unsigned int lanePosition = 0; - UINT64 *statesAsLanes = (UINT64 *)states; - - while (sizeLeft >= SnP_laneLengthInBytes) { - statesAsLanes[laneIndex(instanceIndex, lanePosition)] = 0; - sizeLeft -= SnP_laneLengthInBytes; - lanePosition++; - } - - if (sizeLeft > 0) { - memset(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], 0, sizeLeft); - } -} - -void KeccakP1600times4_ExtractBytes(const void *states, - unsigned int instanceIndex, - unsigned char *data, unsigned int offset, - unsigned int length) { - unsigned int sizeLeft = length; - unsigned int lanePosition = offset / SnP_laneLengthInBytes; - unsigned int offsetInLane = offset % SnP_laneLengthInBytes; - unsigned char *curData = data; - const UINT64 *statesAsLanes = (const UINT64 *)states; - - if ((sizeLeft > 0) && (offsetInLane != 0)) { - unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane; - if (bytesInLane > sizeLeft) - bytesInLane = sizeLeft; - memcpy(curData, - ((unsigned char - *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + - offsetInLane, - bytesInLane); - sizeLeft -= bytesInLane; - lanePosition++; - curData += bytesInLane; - } - - while (sizeLeft >= SnP_laneLengthInBytes) { - *(UINT64 *)curData = statesAsLanes[laneIndex(instanceIndex, lanePosition)]; - sizeLeft -= SnP_laneLengthInBytes; - lanePosition++; - curData += SnP_laneLengthInBytes; - } - - if (sizeLeft > 0) { - memcpy(curData, &statesAsLanes[laneIndex(instanceIndex, lanePosition)], - sizeLeft); - } -} - -void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, - unsigned int laneCount, - unsigned int laneOffset) { - UINT64 *curData0 = (UINT64 *)data; - UINT64 *curData1 = (UINT64 *)(data + laneOffset * 1 * SnP_laneLengthInBytes); - UINT64 *curData2 = (UINT64 *)(data + laneOffset * 2 * SnP_laneLengthInBytes); - UINT64 *curData3 = (UINT64 *)(data + laneOffset * 3 * SnP_laneLengthInBytes); - - const V256 *stateAsLanes = (const V256 *)states; - const UINT64 *stateAsLanes64 = (const UINT64 *)states; - V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; - unsigned int i; - -#define Extr(argIndex) \ - curData0[argIndex] = stateAsLanes64[4 * (argIndex)], \ - curData1[argIndex] = stateAsLanes64[4 * (argIndex) + 1], \ - curData2[argIndex] = stateAsLanes64[4 * (argIndex) + 2], \ - curData3[argIndex] = stateAsLanes64[4 * (argIndex) + 3] - -#define Extr4(argIndex) \ - lanes0 = LOAD256(stateAsLanes[argIndex + 0]), \ - lanes1 = LOAD256(stateAsLanes[argIndex + 1]), \ - lanes2 = LOAD256(stateAsLanes[argIndex + 2]), \ - lanes3 = LOAD256(stateAsLanes[argIndex + 3]), UNINTLEAVE(), \ - STORE256u(curData0[argIndex], lanes0), \ - STORE256u(curData1[argIndex], lanes1), \ - STORE256u(curData2[argIndex], lanes2), STORE256u(curData3[argIndex], lanes3) - - if (laneCount >= 16) { - Extr4(0); - Extr4(4); - Extr4(8); - Extr4(12); - if (laneCount >= 20) { - Extr4(16); - for (i = 20; i < laneCount; i++) - Extr(i); - } else { - for (i = 16; i < laneCount; i++) - Extr(i); - } - } else { - for (i = 0; i < laneCount; i++) - Extr(i); - } -#undef Extr -#undef Extr4 -} - -void KeccakP1600times4_ExtractAndAddBytes( - const void *states, unsigned int instanceIndex, const unsigned char *input, - unsigned char *output, unsigned int offset, unsigned int length) { - unsigned int sizeLeft = length; - unsigned int lanePosition = offset / SnP_laneLengthInBytes; - unsigned int offsetInLane = offset % SnP_laneLengthInBytes; - const unsigned char *curInput = input; - unsigned char *curOutput = output; - const UINT64 *statesAsLanes = (const UINT64 *)states; - - if ((sizeLeft > 0) && (offsetInLane != 0)) { - unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane; - UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)] >> - (8 * offsetInLane); - if (bytesInLane > sizeLeft) - bytesInLane = sizeLeft; - sizeLeft -= bytesInLane; - do { - *(curOutput++) = *(curInput++) ^ (unsigned char)lane; - lane >>= 8; - } while (--bytesInLane != 0); - lanePosition++; - } - - while (sizeLeft >= SnP_laneLengthInBytes) { - *((UINT64 *)curOutput) = - *((UINT64 *)curInput) ^ - statesAsLanes[laneIndex(instanceIndex, lanePosition)]; - sizeLeft -= SnP_laneLengthInBytes; - lanePosition++; - curInput += SnP_laneLengthInBytes; - curOutput += SnP_laneLengthInBytes; - } - - if (sizeLeft != 0) { - UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)]; - do { - *(curOutput++) = *(curInput++) ^ (unsigned char)lane; - lane >>= 8; - } while (--sizeLeft != 0); - } -} - -void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, - const unsigned char *input, - unsigned char *output, - unsigned int laneCount, - unsigned int laneOffset) { - const UINT64 *curInput0 = (UINT64 *)input; - const UINT64 *curInput1 = - (UINT64 *)(input + laneOffset * 1 * SnP_laneLengthInBytes); - const UINT64 *curInput2 = - (UINT64 *)(input + laneOffset * 2 * SnP_laneLengthInBytes); - const UINT64 *curInput3 = - (UINT64 *)(input + laneOffset * 3 * SnP_laneLengthInBytes); - UINT64 *curOutput0 = (UINT64 *)output; - UINT64 *curOutput1 = - (UINT64 *)(output + laneOffset * 1 * SnP_laneLengthInBytes); - UINT64 *curOutput2 = - (UINT64 *)(output + laneOffset * 2 * SnP_laneLengthInBytes); - UINT64 *curOutput3 = - (UINT64 *)(output + laneOffset * 3 * SnP_laneLengthInBytes); - - const V256 *stateAsLanes = (const V256 *)states; - const UINT64 *stateAsLanes64 = (const UINT64 *)states; - V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; - unsigned int i; - -#define ExtrXor(argIndex) \ - curOutput0[argIndex] = curInput0[argIndex] ^ stateAsLanes64[4 * (argIndex)], \ - curOutput1[argIndex] = \ - curInput1[argIndex] ^ stateAsLanes64[4 * (argIndex) + 1], \ - curOutput2[argIndex] = \ - curInput2[argIndex] ^ stateAsLanes64[4 * (argIndex) + 2], \ - curOutput3[argIndex] = \ - curInput3[argIndex] ^ stateAsLanes64[4 * (argIndex) + 3] - -#define ExtrXor4(argIndex) \ - lanes0 = LOAD256(stateAsLanes[argIndex + 0]), \ - lanes1 = LOAD256(stateAsLanes[argIndex + 1]), \ - lanes2 = LOAD256(stateAsLanes[argIndex + 2]), \ - lanes3 = LOAD256(stateAsLanes[argIndex + 3]), UNINTLEAVE(), \ - lanesL01 = LOAD256u(curInput0[argIndex]), \ - lanesH01 = LOAD256u(curInput1[argIndex]), \ - lanesL23 = LOAD256u(curInput2[argIndex]), \ - lanesH23 = LOAD256u(curInput3[argIndex]), XOReq256(lanes0, lanesL01), \ - XOReq256(lanes1, lanesH01), XOReq256(lanes2, lanesL23), \ - XOReq256(lanes3, lanesH23), STORE256u(curOutput0[argIndex], lanes0), \ - STORE256u(curOutput1[argIndex], lanes1), \ - STORE256u(curOutput2[argIndex], lanes2), \ - STORE256u(curOutput3[argIndex], lanes3) - - if (laneCount >= 16) { - ExtrXor4(0); - ExtrXor4(4); - ExtrXor4(8); - ExtrXor4(12); - if (laneCount >= 20) { - ExtrXor4(16); - for (i = 20; i < laneCount; i++) - ExtrXor(i); - } else { - for (i = 16; i < laneCount; i++) - ExtrXor(i); - } - } else { - for (i = 0; i < laneCount; i++) - ExtrXor(i); - } -#undef ExtrXor -#undef ExtrXor4 -} - #define declareABCDE \ V256 Aba, Abe, Abi, Abo, Abu; \ V256 Aga, Age, Agi, Ago, Agu; \ @@ -866,233 +434,6 @@ void KeccakP1600times4_PermuteAll_24rounds(void *states) { copyFromState(A, statesAsLanes) rounds24 copyToState(statesAsLanes, A) } -void KeccakP1600times4_PermuteAll_12rounds(void *states) { - V256 *statesAsLanes = (V256 *)states; - declareABCDE -#ifndef KeccakP1600times4_fullUnrolling - unsigned int i; -#endif - - copyFromState(A, statesAsLanes) rounds12 copyToState(statesAsLanes, A) -} - -size_t KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, - unsigned int laneOffsetParallel, - unsigned int laneOffsetSerial, - const unsigned char *data, - size_t dataByteLen) { - if (laneCount == 21) { -#if 0 - const unsigned char *dataStart = data; - const UINT64 *curData0 = (const UINT64 *)data; - const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes); - const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes); - const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes); - - while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { - V256 *stateAsLanes = (V256 *)states; - V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; -#define Xor_In(argIndex) \ - XOReq256(stateAsLanes[argIndex], \ - LOAD4_64(curData3[argIndex], curData2[argIndex], \ - curData1[argIndex], curData0[argIndex])) -#define Xor_In4(argIndex) \ - lanes0 = LOAD256u(curData0[argIndex]), \ - lanes1 = LOAD256u(curData1[argIndex]), \ - lanes2 = LOAD256u(curData2[argIndex]), \ - lanes3 = LOAD256u(curData3[argIndex]), INTLEAVE(), \ - XOReq256(stateAsLanes[argIndex + 0], lanes0), \ - XOReq256(stateAsLanes[argIndex + 1], lanes1), \ - XOReq256(stateAsLanes[argIndex + 2], lanes2), \ - XOReq256(stateAsLanes[argIndex + 3], lanes3) - Xor_In4( 0 ); - Xor_In4( 4 ); - Xor_In4( 8 ); - Xor_In4( 12 ); - Xor_In4( 16 ); - Xor_In( 20 ); -#undef Xor_In -#undef Xor_In4 - KeccakP1600times4_PermuteAll_24rounds(states); - curData0 += laneOffsetSerial; - curData1 += laneOffsetSerial; - curData2 += laneOffsetSerial; - curData3 += laneOffsetSerial; - dataByteLen -= laneOffsetSerial*8; - } - return (const unsigned char *)curData0 - dataStart; -#else - // unsigned int i; - const unsigned char *dataStart = data; - const UINT64 *curData0 = (const UINT64 *)data; - const UINT64 *curData1 = - (const UINT64 *)(data + laneOffsetParallel * 1 * SnP_laneLengthInBytes); - const UINT64 *curData2 = - (const UINT64 *)(data + laneOffsetParallel * 2 * SnP_laneLengthInBytes); - const UINT64 *curData3 = - (const UINT64 *)(data + laneOffsetParallel * 3 * SnP_laneLengthInBytes); - V256 *statesAsLanes = (V256 *)states; - declareABCDE - - copyFromState(A, statesAsLanes) while ( - dataByteLen >= (laneOffsetParallel * 3 + laneCount) * 8) { -#define XOR_In(Xxx, argIndex) \ - XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], \ - curData1[argIndex], curData0[argIndex])) - XOR_In(Aba, 0); - XOR_In(Abe, 1); - XOR_In(Abi, 2); - XOR_In(Abo, 3); - XOR_In(Abu, 4); - XOR_In(Aga, 5); - XOR_In(Age, 6); - XOR_In(Agi, 7); - XOR_In(Ago, 8); - XOR_In(Agu, 9); - XOR_In(Aka, 10); - XOR_In(Ake, 11); - XOR_In(Aki, 12); - XOR_In(Ako, 13); - XOR_In(Aku, 14); - XOR_In(Ama, 15); - XOR_In(Ame, 16); - XOR_In(Ami, 17); - XOR_In(Amo, 18); - XOR_In(Amu, 19); - XOR_In(Asa, 20); -#undef XOR_In - rounds24 curData0 += laneOffsetSerial; - curData1 += laneOffsetSerial; - curData2 += laneOffsetSerial; - curData3 += laneOffsetSerial; - dataByteLen -= laneOffsetSerial * 8; - } - copyToState(statesAsLanes, A) return (const unsigned char *)curData0 - - dataStart; -#endif - } else { - // unsigned int i; - const unsigned char *dataStart = data; - - while (dataByteLen >= (laneOffsetParallel * 3 + laneCount) * 8) { - KeccakP1600times4_AddLanesAll(states, data, laneCount, - laneOffsetParallel); - KeccakP1600times4_PermuteAll_24rounds(states); - data += laneOffsetSerial * 8; - dataByteLen -= laneOffsetSerial * 8; - } - return data - dataStart; - } -} - -size_t KeccakP1600times4_12rounds_FastLoop_Absorb( - void *states, unsigned int laneCount, unsigned int laneOffsetParallel, - unsigned int laneOffsetSerial, const unsigned char *data, - size_t dataByteLen) { - if (laneCount == 21) { -#if 0 - const unsigned char *dataStart = data; - const UINT64 *curData0 = (const UINT64 *)data; - const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes); - const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes); - const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes); - - while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { - V256 *stateAsLanes = states; - V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; -#define Xor_In(argIndex) \ - XOReq256(stateAsLanes[argIndex], \ - LOAD4_64(curData3[argIndex], curData2[argIndex], \ - curData1[argIndex], curData0[argIndex])) -#define Xor_In4(argIndex) \ - lanes0 = LOAD256u(curData0[argIndex]), \ - lanes1 = LOAD256u(curData1[argIndex]), \ - lanes2 = LOAD256u(curData2[argIndex]), \ - lanes3 = LOAD256u(curData3[argIndex]), INTLEAVE(), \ - XOReq256(stateAsLanes[argIndex + 0], lanes0), \ - XOReq256(stateAsLanes[argIndex + 1], lanes1), \ - XOReq256(stateAsLanes[argIndex + 2], lanes2), \ - XOReq256(stateAsLanes[argIndex + 3], lanes3) - Xor_In4( 0 ); - Xor_In4( 4 ); - Xor_In4( 8 ); - Xor_In4( 12 ); - Xor_In4( 16 ); - Xor_In( 20 ); -#undef Xor_In -#undef Xor_In4 - KeccakP1600times4_PermuteAll_12rounds(states); - curData0 += laneOffsetSerial; - curData1 += laneOffsetSerial; - curData2 += laneOffsetSerial; - curData3 += laneOffsetSerial; - dataByteLen -= laneOffsetSerial*8; - } - return (const unsigned char *)curData0 - dataStart; -#else - // unsigned int i; - const unsigned char *dataStart = data; - const UINT64 *curData0 = (const UINT64 *)data; - const UINT64 *curData1 = - (const UINT64 *)(data + laneOffsetParallel * 1 * SnP_laneLengthInBytes); - const UINT64 *curData2 = - (const UINT64 *)(data + laneOffsetParallel * 2 * SnP_laneLengthInBytes); - const UINT64 *curData3 = - (const UINT64 *)(data + laneOffsetParallel * 3 * SnP_laneLengthInBytes); - V256 *statesAsLanes = states; - declareABCDE - - copyFromState(A, statesAsLanes) while ( - dataByteLen >= (laneOffsetParallel * 3 + laneCount) * 8) { -#define XOR_In(Xxx, argIndex) \ - XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], \ - curData1[argIndex], curData0[argIndex])) - XOR_In(Aba, 0); - XOR_In(Abe, 1); - XOR_In(Abi, 2); - XOR_In(Abo, 3); - XOR_In(Abu, 4); - XOR_In(Aga, 5); - XOR_In(Age, 6); - XOR_In(Agi, 7); - XOR_In(Ago, 8); - XOR_In(Agu, 9); - XOR_In(Aka, 10); - XOR_In(Ake, 11); - XOR_In(Aki, 12); - XOR_In(Ako, 13); - XOR_In(Aku, 14); - XOR_In(Ama, 15); - XOR_In(Ame, 16); - XOR_In(Ami, 17); - XOR_In(Amo, 18); - XOR_In(Amu, 19); - XOR_In(Asa, 20); -#undef XOR_In - rounds12 curData0 += laneOffsetSerial; - curData1 += laneOffsetSerial; - curData2 += laneOffsetSerial; - curData3 += laneOffsetSerial; - dataByteLen -= laneOffsetSerial * 8; - } - copyToState(statesAsLanes, A) return (const unsigned char *)curData0 - - dataStart; -#endif - } else { - // unsigned int i; - const unsigned char *dataStart = data; - - while (dataByteLen >= (laneOffsetParallel * 3 + laneCount) * 8) { - KeccakP1600times4_AddLanesAll(states, data, laneCount, - laneOffsetParallel); - KeccakP1600times4_PermuteAll_12rounds(states); - data += laneOffsetSerial * 8; - dataByteLen -= laneOffsetSerial * 8; - } - return data - dataStart; - } -} - #else // Dummy constant to keep compiler happy despite empty CU diff --git a/fips202/native/x86_64/xkcp/KeccakP-1600-times4-SnP.h b/fips202/native/x86_64/xkcp/KeccakP-1600-times4-SnP.h index 35c2620e3..83f029946 100644 --- a/fips202/native/x86_64/xkcp/KeccakP-1600-times4-SnP.h +++ b/fips202/native/x86_64/xkcp/KeccakP-1600-times4-SnP.h @@ -21,100 +21,14 @@ and related or neighboring rights to the source code in this file. /** For the documentation, see PlSnP-documentation.h. */ +#include #include "KeccakP-SIMD256-config.h" #include "namespace.h" -#define KeccakP1600times4_implementation \ - "256-bit SIMD implementation (" KeccakP1600times4_implementation_config ")" -#define KeccakP1600times4_statesSizeInBytes 800 #define KeccakP1600times4_statesAlignment 32 -#define KeccakF1600times4_FastLoop_supported -#define KeccakP1600times4_12rounds_FastLoop_supported - -#include - -#define KeccakP1600times4_InitializeAll \ - FIPS202_NAMESPACE(KeccakP1600times4_InitializeAll) -void KeccakP1600times4_InitializeAll(void *states); - -#define KeccakP1600times4_AddBytes FIPS202_NAMESPACE(KeccakP1600times4_AddBytes) -void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, - const unsigned char *data, unsigned int offset, - unsigned int length); - -#define KeccakP1600times4_AddLanesAll \ - FIPS202_NAMESPACE(KeccakP1600times4_AddLanesAll) -void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, - unsigned int laneCount, - unsigned int laneOffset); - -#define KeccakP1600times4_OverwriteBytes \ - FIPS202_NAMESPACE(KeccakP1600times4_OverwriteBytes) -void KeccakP1600times4_OverwriteBytes(void *states, unsigned int instanceIndex, - const unsigned char *data, - unsigned int offset, unsigned int length); - -#define KeccakP1600times4_OverwriteLanesAll \ - FIPS202_NAMESPACE(KeccakP1600times4_OverwriteLanesAll) -void KeccakP1600times4_OverwriteLanesAll(void *states, - const unsigned char *data, - unsigned int laneCount, - unsigned int laneOffset); - -#define KeccakP1600times4_OverwriteWithZeroes \ - FIPS202_NAMESPACE(KeccakP1600times4_OverwriteWithZeroes) -void KeccakP1600times4_OverwriteWithZeroes(void *states, - unsigned int instanceIndex, - unsigned int byteCount); - -#define KeccakP1600times4_PermuteAll_12rounds \ - FIPS202_NAMESPACE(KeccakP1600times4_PermuteAll_12rounds) -void KeccakP1600times4_PermuteAll_12rounds(void *states); #define KeccakP1600times4_PermuteAll_24rounds \ FIPS202_NAMESPACE(KeccakP1600times4_PermuteAll_24rounds) void KeccakP1600times4_PermuteAll_24rounds(void *states); -#define KeccakP1600times4_ExtractBytes \ - FIPS202_NAMESPACE(KeccakP1600times4_ExtractBytes) -void KeccakP1600times4_ExtractBytes(const void *states, - unsigned int instanceIndex, - unsigned char *data, unsigned int offset, - unsigned int length); - -#define KeccakP1600times4_ExtractLanesAll \ - FIPS202_NAMESPACE(KeccakP1600times4_ExtractLanesAll) -void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, - unsigned int laneCount, - unsigned int laneOffset); - -#define KeccakP1600times4_ExtractAndAddBytes \ - FIPS202_NAMESPACE(KeccakP1600times4_ExtractAndAddBytes) -void KeccakP1600times4_ExtractAndAddBytes( - const void *states, unsigned int instanceIndex, const unsigned char *input, - unsigned char *output, unsigned int offset, unsigned int length); - -#define KeccakP1600times4_ExtractAndAddLanesAll \ - FIPS202_NAMESPACE(KeccakP1600times4_ExtractAndAddLanesAll) -void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, - const unsigned char *input, - unsigned char *output, - unsigned int laneCount, - unsigned int laneOffset); - -#define KeccakF1600times4_FastLoop_Absorb \ - FIPS202_NAMESPACE(KeccakF1600times4_FastLoop_Absorb) -size_t KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, - unsigned int laneOffsetParallel, - unsigned int laneOffsetSerial, - const unsigned char *data, - size_t dataByteLen); - -#define KeccakP1600times4_12rounds_FastLoop_Absorb \ - FIPS202_NAMESPACE(KeccakP1600times4_12rounds_FastLoop_Absorb) -size_t KeccakP1600times4_12rounds_FastLoop_Absorb( - void *states, unsigned int laneCount, unsigned int laneOffsetParallel, - unsigned int laneOffsetSerial, const unsigned char *data, - size_t dataByteLen); - #endif diff --git a/fips202/native/x86_64/xkcp/KeccakP-1600-unrolling.macros b/fips202/native/x86_64/xkcp/KeccakP-1600-unrolling.macros index ba0b387df..e5768a774 100644 --- a/fips202/native/x86_64/xkcp/KeccakP-1600-unrolling.macros +++ b/fips202/native/x86_64/xkcp/KeccakP-1600-unrolling.macros @@ -15,7 +15,6 @@ and related or neighboring rights to the source code in this file. http://creativecommons.org/publicdomain/zero/1.0/ */ -#if (defined(FullUnrolling)) #define rounds24 \ prepareTheta \ thetaRhoPiChiIotaPrepareTheta( 0, A, E) \ @@ -43,158 +42,3 @@ http://creativecommons.org/publicdomain/zero/1.0/ thetaRhoPiChiIotaPrepareTheta(22, A, E) \ thetaRhoPiChiIota(23, E, A) \ -#define rounds12 \ - prepareTheta \ - thetaRhoPiChiIotaPrepareTheta(12, A, E) \ - thetaRhoPiChiIotaPrepareTheta(13, E, A) \ - thetaRhoPiChiIotaPrepareTheta(14, A, E) \ - thetaRhoPiChiIotaPrepareTheta(15, E, A) \ - thetaRhoPiChiIotaPrepareTheta(16, A, E) \ - thetaRhoPiChiIotaPrepareTheta(17, E, A) \ - thetaRhoPiChiIotaPrepareTheta(18, A, E) \ - thetaRhoPiChiIotaPrepareTheta(19, E, A) \ - thetaRhoPiChiIotaPrepareTheta(20, A, E) \ - thetaRhoPiChiIotaPrepareTheta(21, E, A) \ - thetaRhoPiChiIotaPrepareTheta(22, A, E) \ - thetaRhoPiChiIota(23, E, A) \ - -#elif (Unrolling == 12) -#define rounds24 \ - prepareTheta \ - for(i=0; i<24; i+=12) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+ 1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+ 2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+ 3, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+ 4, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+ 5, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+ 6, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+ 7, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+ 8, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+ 9, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+10, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+11, E, A) \ - } \ - -#define rounds12 \ - prepareTheta \ - thetaRhoPiChiIotaPrepareTheta(12, A, E) \ - thetaRhoPiChiIotaPrepareTheta(13, E, A) \ - thetaRhoPiChiIotaPrepareTheta(14, A, E) \ - thetaRhoPiChiIotaPrepareTheta(15, E, A) \ - thetaRhoPiChiIotaPrepareTheta(16, A, E) \ - thetaRhoPiChiIotaPrepareTheta(17, E, A) \ - thetaRhoPiChiIotaPrepareTheta(18, A, E) \ - thetaRhoPiChiIotaPrepareTheta(19, E, A) \ - thetaRhoPiChiIotaPrepareTheta(20, A, E) \ - thetaRhoPiChiIotaPrepareTheta(21, E, A) \ - thetaRhoPiChiIotaPrepareTheta(22, A, E) \ - thetaRhoPiChiIota(23, E, A) \ - -#elif (Unrolling == 6) -#define rounds24 \ - prepareTheta \ - for(i=0; i<24; i+=6) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \ - } \ - -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=6) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \ - } \ - -#elif (Unrolling == 4) -#define rounds24 \ - prepareTheta \ - for(i=0; i<24; i+=4) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ - } \ - -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=4) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ - } \ - -#elif (Unrolling == 3) -#define rounds24 \ - prepareTheta \ - for(i=0; i<24; i+=3) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - copyStateVariables(A, E) \ - } \ - -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=3) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - copyStateVariables(A, E) \ - } \ - -#elif (Unrolling == 2) -#define rounds24 \ - prepareTheta \ - for(i=0; i<24; i+=2) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - } \ - -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=2) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - } \ - -#elif (Unrolling == 1) -#define rounds24 \ - prepareTheta \ - for(i=0; i<24; i++) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - copyStateVariables(A, E) \ - } \ - -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i++) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - copyStateVariables(A, E) \ - } \ - -#else -#error "Unrolling is not correctly specified!" -#endif - -#define roundsN(__nrounds) \ - prepareTheta \ - i = 24 - (__nrounds); \ - if ((i&1) != 0) { \ - thetaRhoPiChiIotaPrepareTheta(i, A, E) \ - copyStateVariables(A, E) \ - ++i; \ - } \ - for( /* empty */; i<24; i+=2) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - }