diff --git a/src/ballet/chacha20/Local.mk b/src/ballet/chacha20/Local.mk index 82efdefb65..04a15b5619 100644 --- a/src/ballet/chacha20/Local.mk +++ b/src/ballet/chacha20/Local.mk @@ -1,6 +1,6 @@ $(call add-hdrs,fd_chacha20.h fd_chacha20rng.h) -ifdef FD_HAS_AVX -$(call add-objs,fd_chacha20_avx fd_chacha20rng,fd_ballet) +ifdef FD_HAS_SSE +$(call add-objs,fd_chacha20_sse fd_chacha20rng,fd_ballet) else $(call add-objs,fd_chacha20 fd_chacha20rng,fd_ballet) endif diff --git a/src/ballet/chacha20/fd_chacha20.c b/src/ballet/chacha20/fd_chacha20.c index 4fd5eb900f..67d48ebdee 100644 --- a/src/ballet/chacha20/fd_chacha20.c +++ b/src/ballet/chacha20/fd_chacha20.c @@ -19,8 +19,11 @@ fd_chacha20_quarter_round( uint * a, void * fd_chacha20_block( void * _block, void const * _key, - uint idx, - void const * _nonce ) { + void const * _idx_nonce ) { + + uint * block = __builtin_assume_aligned( _block, 32UL ); /* header contract: 32 byte align */ + uint const * key = __builtin_assume_aligned( _key, 32UL ); + uint const * idx_nonce = __builtin_assume_aligned( _idx_nonce, 16UL ); /* Construct the input ChaCha20 block state as the following matrix of little endian uint entries: @@ -36,18 +39,13 @@ fd_chacha20_block( void * _block, b is the block index n is the nonce */ - uint * block = (uint *)_block; block[ 0 ] = 0x61707865U; block[ 1 ] = 0x3320646eU; block[ 2 ] = 0x79622d32U; block[ 3 ] = 0x6b206574U; - uint const * key = (uint const *)_key; - memcpy( block+ 4, key, 8*sizeof(uint) ); - - block[ 12 ] = idx; - uint const * nonce = (uint const *)_nonce; - memcpy( block+13, nonce, 3*sizeof(uint) ); + memcpy( block+ 4, key, 8*sizeof(uint) ); + memcpy( block+12, idx_nonce, 4*sizeof(uint) ); /* Remember the input state for later use */ diff --git a/src/ballet/chacha20/fd_chacha20.h b/src/ballet/chacha20/fd_chacha20.h index e8cc656365..ff54d061ea 100644 --- a/src/ballet/chacha20/fd_chacha20.h +++ b/src/ballet/chacha20/fd_chacha20.h @@ -15,20 +15,18 @@ FD_PROTOTYPES_BEGIN /* fd_chacha20_block is the ChaCha20 block function. 
- - block points to the first byte of the output block of 64 bytes size - and 64 bytes alignment - - key points to the first byte of the encryption key of 32 bytes size - - idx is the block index - - nonce points to the first byte of the block nonce of 24 bytes size - and 4 bytes alignment + - block points to the output block (64 byte size, 32 byte align) + - key points to the encryption key (32 byte size, 32 byte align) + - idx_nonce points to the block index and block nonce + (bytes 0-3 are the 32-bit index, bytes 4-15 the 96-bit nonce) + (16 byte size, 16 byte align) FIXME this should probably do multiple blocks */ void * fd_chacha20_block( void * block, void const * key, - uint idx, - void const * nonce ); + void const * idx_nonce ); /* Encryption/decryption functions not implemented for now as they are not yet required. */ diff --git a/src/ballet/chacha20/fd_chacha20_avx.c b/src/ballet/chacha20/fd_chacha20_sse.c similarity index 85% rename from src/ballet/chacha20/fd_chacha20_avx.c rename to src/ballet/chacha20/fd_chacha20_sse.c index 824bbe6fd8..8e9211e605 100644 --- a/src/ballet/chacha20/fd_chacha20_avx.c +++ b/src/ballet/chacha20/fd_chacha20_sse.c @@ -5,8 +5,11 @@ void * fd_chacha20_block( void * _block, void const * _key, - uint idx, - void const * _nonce ) { + void const * _idx_nonce ) { + + uint * block = __builtin_assume_aligned( _block, 32UL ); /* header contract: 32 byte align */ + uint const * key = __builtin_assume_aligned( _key, 32UL ); + uint const * idx_nonce = __builtin_assume_aligned( _idx_nonce, 16UL ); /* Construct the input ChaCha20 block state as the following matrix of little endian uint entries: @@ -24,9 +27,9 @@ fd_chacha20_block( void * _block, /* Remember the input state for later use */ vu_t row0_init = vu( 0x61707865U, 0x3320646eU, 0x79622d32U, 0x6b206574U ); - vu_t row1_init = vu_ldu( (uint const *)_key ); - vu_t row2_init = vu_ldu( (uint const *)_key + 4UL ); - vu_t row3_init = vu_insert( vu_ldu( (uint const *)_nonce - 1UL ), 0, idx ); + vu_t row1_init = vu_ld( key ); + vu_t 
row2_init = vu_ld( key+4 ); + vu_t row3_init = vu_ld( idx_nonce ); vu_t row0 = row0_init; vu_t row1 = row1_init; @@ -80,10 +83,10 @@ fd_chacha20_block( void * _block, row2 = vu_add( row2, row2_init ); row3 = vu_add( row3, row3_init ); - vu_stu( (uint *)_block , row0 ); - vu_stu( (uint *)_block+ 4UL, row1 ); - vu_stu( (uint *)_block+ 8UL, row2 ); - vu_stu( (uint *)_block+12UL, row3 ); + vu_st( block, row0 ); + vu_st( block+ 4, row1 ); + vu_st( block+ 8, row2 ); + vu_st( block+12, row3 ); return _block; } diff --git a/src/ballet/chacha20/fd_chacha20rng.c b/src/ballet/chacha20/fd_chacha20rng.c index 004467b40a..81bffcf4ed 100644 --- a/src/ballet/chacha20/fd_chacha20rng.c +++ b/src/ballet/chacha20/fd_chacha20rng.c @@ -64,7 +64,7 @@ fd_chacha20rng_init( fd_chacha20rng_t * rng, memcpy( rng->key, key, FD_CHACHA20_KEY_SZ ); rng->buf_off = 0UL; rng->buf_fill = 0UL; - rng->idx = 0U ; + memset( rng->idx_nonce, 0, 16UL ); fd_chacha20rng_private_refill( rng ); return rng; } @@ -72,15 +72,14 @@ fd_chacha20rng_init( fd_chacha20rng_t * rng, void fd_chacha20rng_private_refill( fd_chacha20rng_t * rng ) { ulong fill_target = FD_CHACHA20RNG_BUFSZ - FD_CHACHA20_BLOCK_SZ; - uint nonce[ 3 ]={0}; ulong buf_avail; while( (buf_avail=(rng->buf_fill - rng->buf_off))<fill_target ) { fd_chacha20_block( rng->buf + (rng->buf_fill % FD_CHACHA20RNG_BUFSZ), rng->key, - rng->idx++, - &nonce ); + rng->idx_nonce ); rng->buf_fill += (uint)FD_CHACHA20_BLOCK_SZ; + rng->idx_nonce[0]++; } } diff --git a/src/ballet/chacha20/fd_chacha20rng.h b/src/ballet/chacha20/fd_chacha20rng.h index dd8e188c29..b4d0622aac 100644 --- a/src/ballet/chacha20/fd_chacha20rng.h +++ b/src/ballet/chacha20/fd_chacha20rng.h @@ -41,7 +41,7 @@ struct __attribute__((aligned(32UL))) fd_chacha20rng_private { int mode; /* ChaCha20 block index */ - uint idx; + uint idx_nonce[ 4UL ] __attribute__((aligned(16UL))); }; typedef struct fd_chacha20rng_private fd_chacha20rng_t; @@ -178,7 +178,7 @@ fd_chacha20rng_ulong_roll( fd_chacha20rng_t * rng, = 2^64-1 - (2^64-n)%n, since n<2^64 = 
2^64-1 - ((2^64-1)-n+1)%n Which is back to having a mod... But at least if n is a - compile-time constant than the whole zone computation becomes a + compile-time constant then the whole zone computation becomes a compile-time constant. When MODE_SHIFT is set, we use uses almost the largest possible diff --git a/src/ballet/chacha20/test_chacha20.c b/src/ballet/chacha20/test_chacha20.c index 8f76cae3a9..ab2880ec42 100644 --- a/src/ballet/chacha20/test_chacha20.c +++ b/src/ballet/chacha20/test_chacha20.c @@ -13,10 +13,10 @@ test_chacha20_block( void ) { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f }; - uchar const nonce[ 24UL ] __attribute__((aligned(4))) = { + uchar const idx_nonce[ 16UL ] __attribute__((aligned(16))) = { + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x00 }; - uint const block_idx = 1U; /* Output */ @@ -32,7 +32,7 @@ test_chacha20_block( void ) { 0xcb, 0xd0, 0x83, 0xe8, 0xa2, 0x50, 0x3c, 0x4e, }; - fd_chacha20_block( &block, &key, block_idx, &nonce ); + fd_chacha20_block( block, key, idx_nonce ); if( FD_UNLIKELY( 0!=memcmp( block, expected, 64UL ) ) ) FD_LOG_ERR(( "FAIL" @@ -55,10 +55,9 @@ bench_chacha20_block( void ) { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f }; - uchar const nonce[ 24UL ] __attribute__((aligned(4))) = { - 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x00 + uint idx_nonce[ 4UL ] __attribute__((aligned(16))) = { /* fd_chacha20_block requires 16 byte align */ + 0x01, 0x09, 0x00, 0x00, }; - uint block_idx = 1U; uchar block[ 64UL ] __attribute__((aligned(32))); for( ulong idx=0U; idx<2UL; idx++ ) { @@ -66,12 +65,18 @@ bench_chacha20_block( void ) { key[ 0 ]++; /* warmup */ - for( ulong rem=100000UL; rem; rem-- ) 
fd_chacha20_block( &block, &key, block_idx++, &nonce ); + for( ulong rem=100000UL; rem; rem-- ) { + idx_nonce[0]++; + fd_chacha20_block( block, key, idx_nonce ); + } /* for real */ ulong iter = 1000000UL; long dt = -fd_log_wallclock(); - for( ulong rem=iter; rem; rem-- ) fd_chacha20_block( &block, &key, block_idx++, &nonce ); + for( ulong rem=iter; rem; rem-- ) { + idx_nonce[0]++; + fd_chacha20_block( block, key, idx_nonce ); + } dt += fd_log_wallclock(); double gbps = ((double)(8UL*FD_CHACHA20_BLOCK_SZ*iter)) / ((double)dt); double ns = (double)dt / ((double)iter * (double)FD_CHACHA20_BLOCK_SZ);