From a61bf4402c5839941dec6b45e4cb7507e376baa4 Mon Sep 17 00:00:00 2001 From: Fabian Druschke Date: Sat, 7 Sep 2024 23:15:58 +0200 Subject: [PATCH] Implement RC4 PRNG with AVX2 and SSE4.2 Optimizations This commit introduces a high-performance RC4-based pseudorandom number generator (PRNG) optimized for modern CPU architectures. Key changes and improvements over the traditional RC4 implementation include: - **CTR Mode**: Added a counter-based mode to ensure unique pseudorandom streams and prevent repetition. - **RC4-Drop**: Discarded the first 256 bytes of the stream to mitigate known biases in the initial output of RC4. - **SIMD Optimizations**: Leveraged SSE4.2 and AVX2 instructions to process data in parallel, improving throughput by handling 16 bytes (SSE4.2) or 32 bytes (AVX2) per iteration. - **Hardware Prefetching**: Implemented prefetching to optimize memory access to the S-Box, reducing cache misses and latency. - **PRNG Purpose**: Designed specifically as a pseudorandom number generator (PRNG) for non-cryptographic purposes. This RC4 PRNG is now faster and more suitable for generating large volumes of random data, taking full advantage of modern hardware capabilities. It is **not** intended for cryptographic security purposes. 
--- src/Makefile.am | 4 +- src/create_pdf.c | 2 +- src/gui.c | 44 +++++- src/options.c | 14 +- src/prng.c | 160 +++++++++++++++++++++ src/prng.h | 7 + src/rc4/rc4_prng.c | 345 +++++++++++++++++++++++++++++++++++++++++++++ src/rc4/rc4_prng.h | 56 ++++++++ 8 files changed, 627 insertions(+), 5 deletions(-) create mode 100644 src/rc4/rc4_prng.c create mode 100644 src/rc4/rc4_prng.h diff --git a/src/Makefile.am b/src/Makefile.am index ac652c91..2772f5a4 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,10 +1,10 @@ # what flags you want to pass to the C compiler & linker #CFLAGS = -lncurses -lparted -AM_CFLAGS = +AM_CFLAGS = -march=native -O2 AM_LDFLAGS = # this lists the binaries to produce, the (non-PHONY, binary) targets in # the previous manual Makefile bin_PROGRAMS = nwipe -nwipe_SOURCES = context.h logging.h options.h prng.h version.h temperature.h nwipe.c gui.c method.h pass.c device.c gui.h isaac_rand/isaac_standard.h isaac_rand/isaac_rand.h isaac_rand/isaac_rand.c isaac_rand/isaac64.h isaac_rand/isaac64.c mt19937ar-cok/mt19937ar-cok.c nwipe.h mt19937ar-cok/mt19937ar-cok.h alfg/add_lagg_fibonacci_prng.h alfg/add_lagg_fibonacci_prng.c xor/xoroshiro256_prng.h xor/xoroshiro256_prng.c pass.h device.h logging.c method.c options.c prng.c version.c temperature.c PDFGen/pdfgen.h PDFGen/pdfgen.c create_pdf.c create_pdf.h embedded_images/shred_db.jpg.c embedded_images/shred_db.jpg.h embedded_images/tick_erased.jpg.c embedded_images/tick_erased.jpg.h embedded_images/redcross.c embedded_images/redcross.h hpa_dco.h hpa_dco.c miscellaneous.h miscellaneous.c embedded_images/nwipe_exclamation.jpg.h embedded_images/nwipe_exclamation.jpg.c conf.h conf.c customers.h customers.c hddtemp_scsi/hddtemp.h hddtemp_scsi/scsi.h hddtemp_scsi/scsicmds.h hddtemp_scsi/get_scsi_temp.c hddtemp_scsi/scsi.c hddtemp_scsi/scsicmds.c +nwipe_SOURCES = context.h logging.h options.h prng.h version.h temperature.h nwipe.c gui.c method.h pass.c device.c gui.h rc4/rc4_prng.h rc4/rc4_prng.c 
isaac_rand/isaac_standard.h isaac_rand/isaac_rand.h isaac_rand/isaac_rand.c isaac_rand/isaac64.h isaac_rand/isaac64.c mt19937ar-cok/mt19937ar-cok.c nwipe.h mt19937ar-cok/mt19937ar-cok.h alfg/add_lagg_fibonacci_prng.h alfg/add_lagg_fibonacci_prng.c xor/xoroshiro256_prng.h xor/xoroshiro256_prng.c pass.h device.h logging.c method.c options.c prng.c version.c temperature.c PDFGen/pdfgen.h PDFGen/pdfgen.c create_pdf.c create_pdf.h embedded_images/shred_db.jpg.c embedded_images/shred_db.jpg.h embedded_images/tick_erased.jpg.c embedded_images/tick_erased.jpg.h embedded_images/redcross.c embedded_images/redcross.h hpa_dco.h hpa_dco.c miscellaneous.h miscellaneous.c embedded_images/nwipe_exclamation.jpg.h embedded_images/nwipe_exclamation.jpg.c conf.h conf.c customers.h customers.c hddtemp_scsi/hddtemp.h hddtemp_scsi/scsi.h hddtemp_scsi/scsicmds.h hddtemp_scsi/get_scsi_temp.c hddtemp_scsi/scsi.c hddtemp_scsi/scsicmds.c nwipe_LDADD = $(PARTED_LIBS) $(LIBCONFIG) diff --git a/src/create_pdf.c b/src/create_pdf.c index b732ed74..8f3aa8b4 100644 --- a/src/create_pdf.c +++ b/src/create_pdf.c @@ -476,7 +476,7 @@ int create_pdf( nwipe_context_t* ptr ) { if( nwipe_options.prng == &nwipe_xoroshiro256_prng ) { - snprintf( prng_type, sizeof( prng_type ), "XORshiro256" ); + snprintf( prng_type, sizeof( prng_type ), "XORoshiro256" ); } else { diff --git a/src/gui.c b/src/gui.c index b8fa735b..86cf086b 100644 --- a/src/gui.c +++ b/src/gui.c @@ -1616,11 +1616,12 @@ void nwipe_gui_prng( void ) extern nwipe_prng_t nwipe_aes_ctr_prng; extern nwipe_prng_t nwipe_xoroshiro256_prng; extern nwipe_prng_t nwipe_add_lagg_fibonacci_prng; + extern nwipe_prng_t nwipe_rc4_prng; extern int terminate_signal; /* The number of implemented PRNGs. */ - const int count = 5; + const int count = 6; /* The first tabstop. */ const int tab1 = 2; @@ -1662,6 +1663,10 @@ void nwipe_gui_prng( void ) { focus = 4; } + if( nwipe_options.prng == &nwipe_rc4_prng ) + { + focus = 5; + } do { /* Clear the main window. 
*/ @@ -1678,6 +1683,7 @@ void nwipe_gui_prng( void ) mvwprintw( main_window, yy++, tab1, " %s", nwipe_isaac64.label ); mvwprintw( main_window, yy++, tab1, " %s", nwipe_add_lagg_fibonacci_prng.label ); mvwprintw( main_window, yy++, tab1, " %s", nwipe_xoroshiro256_prng.label ); + mvwprintw( main_window, yy++, tab1, " %s", nwipe_rc4_prng.label ); yy++; /* Print the cursor. */ @@ -1852,6 +1858,38 @@ void nwipe_gui_prng( void ) tab1, "especially for legacy systems, due to its efficiency and minimal demands. " ); break; + + case 5: + + mvwprintw( main_window, + yy++, + tab1, + "RC4, originally designed by Ron Rivest, is a widely used symmetric stream cipher " ); + mvwprintw( main_window, + yy++, + tab1, + "algorithm that can also function as a pseudo-random number generator (PRNG). " ); + mvwprintw( main_window, + yy++, + tab1, + "Although it was primarily intended for encryption, RC4 has been adapted for various " ); + mvwprintw( main_window, + yy++, + tab1, + "applications that require random number generation. The algorithm features a variable " ); + mvwprintw( main_window, + yy++, + tab1, + "key length and generates numbers in a fast, byte-oriented manner. It is suitable for " ); + mvwprintw( main_window, + yy++, + tab1, + "scenarios requiring simplicity and speed, but newer PRNGs may offer better randomness " ); + mvwprintw( main_window, + yy++, + tab1, + "for cryptographic purposes. 
" ); + break; } /* switch */ @@ -1922,6 +1960,10 @@ void nwipe_gui_prng( void ) { nwipe_options.prng = &nwipe_xoroshiro256_prng; } + if( focus == 5 ) + { + nwipe_options.prng = &nwipe_rc4_prng; + } return; case KEY_BACKSPACE: diff --git a/src/options.c b/src/options.c index c855d0e2..7a1a20e9 100644 --- a/src/options.c +++ b/src/options.c @@ -44,6 +44,7 @@ int nwipe_options_parse( int argc, char** argv ) extern nwipe_prng_t nwipe_isaac64; extern nwipe_prng_t nwipe_add_lagg_fibonacci_prng; extern nwipe_prng_t nwipe_xoroshiro256_prng; + extern nwipe_prng_t nwipe_rc4_prng; /* The getopt() result holder. */ int nwipe_opt; @@ -503,6 +504,11 @@ int nwipe_options_parse( int argc, char** argv ) nwipe_options.prng = &nwipe_xoroshiro256_prng; break; } + if( strcmp( optarg, "rc4_prng" ) == 0 ) + { + nwipe_options.prng = &nwipe_rc4_prng; + break; + } /* Else we do not know this PRNG. */ fprintf( stderr, "Error: Unknown prng '%s'.\n", optarg ); @@ -554,6 +560,7 @@ void nwipe_options_log( void ) extern nwipe_prng_t nwipe_isaac64; extern nwipe_prng_t nwipe_add_lagg_fibonacci_prng; extern nwipe_prng_t nwipe_xoroshiro256_prng; + extern nwipe_prng_t nwipe_rc4_prng; /** * Prints a manifest of options to the log. @@ -623,6 +630,11 @@ void nwipe_options_log( void ) { nwipe_log( NWIPE_LOG_NOTICE, " prng = Isaac" ); } + if( nwipe_options.prng == &nwipe_rc4_prng ) + { + nwipe_log( NWIPE_LOG_NOTICE, " prng = RC4" ); + } + else { if( nwipe_options.prng == &nwipe_isaac64 ) @@ -714,7 +726,7 @@ void display_help() puts( " -l, --logfile=FILE Filename to log to. Default is STDOUT\n" ); puts( " -P, --PDFreportpath=PATH Path to write PDF reports to. 
Default is \".\"" ); puts( " If set to \"noPDF\" no PDF reports are written.\n" ); - puts( " -p, --prng=METHOD PRNG option (mersenne|twister|isaac|isaac64|add_lagg_fibonacci_prng)\n" ); + puts( " -p, --prng=METHOD PRNG option (mersenne|twister|isaac|isaac64|add_lagg_fibonacci_prng|rc4_prng)\n" ); puts( " -q, --quiet Anonymize logs and the GUI by removing unique data, i.e." ); puts( " serial numbers, LU WWN Device ID, and SMBIOS/DMI data" ); puts( " XXXXXX = S/N exists, ????? = S/N not obtainable\n" ); diff --git a/src/prng.c b/src/prng.c index abf1b6cc..dc3e24f2 100644 --- a/src/prng.c +++ b/src/prng.c @@ -21,12 +21,14 @@ #include "prng.h" #include "context.h" #include "logging.h" +#include <stdint.h> #include "mt19937ar-cok/mt19937ar-cok.h" #include "isaac_rand/isaac_rand.h" #include "isaac_rand/isaac64.h" #include "alfg/add_lagg_fibonacci_prng.h" //Lagged Fibonacci generator prototype #include "xor/xoroshiro256_prng.h" //XORoshiro-256 prototype +#include "rc4/rc4_prng.h" //RC4 prototype nwipe_prng_t nwipe_twister = { "Mersenne Twister (mt19937ar-cok)", nwipe_twister_init, nwipe_twister_read }; @@ -40,6 +42,70 @@ nwipe_prng_t nwipe_add_lagg_fibonacci_prng = { "Lagged Fibonacci generator", /* XOROSHIRO-256 PRNG Structure */ nwipe_prng_t nwipe_xoroshiro256_prng = { "XORoshiro-256", nwipe_xoroshiro256_prng_init, nwipe_xoroshiro256_prng_read }; +/* RC4 PRNG Structure */ +nwipe_prng_t nwipe_rc4_prng = { "RC4", nwipe_rc4_prng_init, nwipe_rc4_prng_read }; + +#if defined( __AVX2__ ) || defined( __SSE4_2__ ) +#include <cpuid.h> +#if defined( __AVX2__ ) +#include <immintrin.h> // For _xgetbv and AVX intrinsics +#endif + +// Function to check if SSE4.2 is supported +int check_sse42_support() +{ + uint32_t eax, ebx, ecx, edx; + __cpuid( 1, eax, ebx, ecx, edx ); + + // Check bit 20 of ECX register for SSE4.2 support + return ( ecx & ( 1 << 20 ) ) != 0; +} + +// Function to check if AVX2 is supported +int check_avx2_support() +{ +#if defined( __AVX2__ ) + uint32_t eax, ebx, ecx, edx; + + // First check if OS 
supports XGETBV and AVX + __cpuid( 1, eax, ebx, ecx, edx ); + + // Check if the OS uses XSAVE/XRSTOR to manage XMM and YMM state + if( ( ecx & ( 1 << 27 ) ) == 0 ) + { + return 0; // AVX not supported + } + + // Check if XGETBV indicates the OS supports XMM, YMM state + uint64_t xcr_feature_mask = _xgetbv( 0 ); + if( ( xcr_feature_mask & 0x6 ) != 0x6 ) + { + return 0; // AVX not enabled in the OS + } + + // Check if AVX2 is supported (bit 5 of EBX from CPUID leaf 7) + __cpuid_count( 7, 0, eax, ebx, ecx, edx ); + return ( ebx & ( 1 << 5 ) ) != 0; +#else + return 0; // AVX2 not supported by this compiler or platform +#endif +} + +#else + +// Fallback if neither AVX2 nor SSE4.2 is available or supported by the compiler/platform +int check_sse42_support() +{ + return 0; // SSE4.2 is not supported +} + +int check_avx2_support() +{ + return 0; // AVX2 is not supported +} + +#endif + /* Print given number of bytes from unsigned integer number to a byte stream buffer starting with low-endian. */ static inline void u32_to_buffer( u8* restrict buffer, u32 val, const int len ) { @@ -340,3 +406,97 @@ int nwipe_xoroshiro256_prng_read( NWIPE_PRNG_READ_SIGNATURE ) return 0; // Success } + +int nwipe_rc4_prng_init( NWIPE_PRNG_INIT_SIGNATURE ) +{ + nwipe_log( NWIPE_LOG_NOTICE, "Initialising RC4 PRNG" ); + + if( *state == NULL ) + { + /* This is the first time that we have been called. 
*/ + *state = malloc( sizeof( rc4_state_t ) ); + } + rc4_init( (rc4_state_t*) *state, (uint64_t*) ( seed->s ), seed->length / sizeof( uint64_t ) ); + + return 0; +} + +// The main RC4 PRNG read function with AVX2 and SSE4.2 detection +int nwipe_rc4_prng_read( NWIPE_PRNG_READ_SIGNATURE ) +{ + u8* restrict bufpos = buffer; // Buffer position pointer + size_t words = count / SIZE_OF_RC4_PRNG; // Number of 4096-byte blocks + + // Check if the CPU supports AVX2 or SSE4.2 + int use_avx2 = check_avx2_support(); + int use_sse4 = check_sse42_support(); + + /* Loop to fill the buffer with blocks directly from the RC4 algorithm */ + for( size_t ii = 0; ii < words; ++ii ) + { + if( use_avx2 ) + { +#if defined( __AVX2__ ) + // Use AVX2-optimized version + rc4_genrand_4096_to_buf_avx2( (rc4_state_t*) *state, bufpos ); +#else + // Fallback to generic version if AVX2 is not compiled + rc4_genrand_4096_to_buf( (rc4_state_t*) *state, bufpos ); +#endif + } + else if( use_sse4 ) + { +#if defined( __SSE4_2__ ) + // Use SSE4.2-optimized version + rc4_genrand_4096_to_buf_sse42( (rc4_state_t*) *state, bufpos ); +#else + // Fallback to generic version if SSE4.2 is not compiled + rc4_genrand_4096_to_buf( (rc4_state_t*) *state, bufpos ); +#endif + } + else + { + // Fallback to generic version + rc4_genrand_4096_to_buf( (rc4_state_t*) *state, bufpos ); + } + bufpos += SIZE_OF_RC4_PRNG; // Move to the next block + } + + /* Handle remaining bytes if count is not a multiple of SIZE_OF_RC4_PRNG */ + const size_t remain = count % SIZE_OF_RC4_PRNG; + if( remain > 0 ) + { + unsigned char temp_output[SIZE_OF_RC4_PRNG]; // Temporary buffer for the last block + + if( use_avx2 ) + { +#if defined( __AVX2__ ) + // Use AVX2-optimized version + rc4_genrand_4096_to_buf_avx2( (rc4_state_t*) *state, temp_output ); +#else + // Fallback to generic version if AVX2 is not compiled + rc4_genrand_4096_to_buf( (rc4_state_t*) *state, temp_output ); +#endif + } + else if( use_sse4 ) + { +#if defined( __SSE4_2__ ) + // 
Use SSE4.2-optimized version + rc4_genrand_4096_to_buf_sse42( (rc4_state_t*) *state, temp_output ); +#else + // Fallback to generic version if SSE4.2 is not compiled + rc4_genrand_4096_to_buf( (rc4_state_t*) *state, temp_output ); +#endif + } + else + { + // Fallback to generic version + rc4_genrand_4096_to_buf( (rc4_state_t*) *state, temp_output ); + } + + // Copy the remaining bytes to the buffer + memcpy( bufpos, temp_output, remain ); + } + + return 0; // Success +} diff --git a/src/prng.h b/src/prng.h index a9add099..461d6321 100644 --- a/src/prng.h +++ b/src/prng.h @@ -63,6 +63,10 @@ int nwipe_add_lagg_fibonacci_prng_read( NWIPE_PRNG_READ_SIGNATURE ); int nwipe_xoroshiro256_prng_init( NWIPE_PRNG_INIT_SIGNATURE ); int nwipe_xoroshiro256_prng_read( NWIPE_PRNG_READ_SIGNATURE ); +/* RC4 prototypes. */ +int nwipe_rc4_prng_init( NWIPE_PRNG_INIT_SIGNATURE ); +int nwipe_rc4_prng_read( NWIPE_PRNG_READ_SIGNATURE ); + /* Size of the twister is not derived from the architecture, but it is strictly 4 bytes */ #define SIZE_OF_TWISTER 4 @@ -76,4 +80,7 @@ int nwipe_xoroshiro256_prng_read( NWIPE_PRNG_READ_SIGNATURE ); /* Size of the XOROSHIRO-256 is not derived from the architecture, but it is strictly 32 bytes */ #define SIZE_OF_XOROSHIRO256_PRNG 32 +/* Size of the RC4 is not derived from the architecture, but it is strictly 4096 bytes */ +#define SIZE_OF_RC4_PRNG 4096 + #endif /* PRNG_H_ */ diff --git a/src/rc4/rc4_prng.c b/src/rc4/rc4_prng.c new file mode 100644 index 00000000..c920baae --- /dev/null +++ b/src/rc4/rc4_prng.c @@ -0,0 +1,345 @@ +/* + * RC4 PRNG Implementation (Optimized with AVX2 for nwipe) + * Original RC4 Algorithm Author: Ron Rivest (1987) + * Adaptation Author: Fabian Druschke + * Date: 2024-09-07 + * + * This version of the RC4 PRNG is optimized for high performance, leveraging modern + * hardware features such as AVX2 and SSE4.2, and introduces several improvements + * over the traditional RC4 algorithm: + * + * 1. 
**CTR Mode**: A counter-based mode is used to ensure uniqueness of generated + * pseudorandom streams, preventing repetition issues common with static key usage. + * + * 2. **RC4-Drop**: The first 256 bytes of the RC4 output are discarded to avoid + * known initial biases in the classic RC4 stream, improving the quality of the output. + * + * 3. **SIMD Optimizations (SSE4.2 and AVX2)**: The algorithm is enhanced to take + * advantage of modern CPUs by processing 16 bytes (SSE4.2) or 32 bytes (AVX2) + * in parallel, significantly boosting performance for large data generation tasks. + * + * 4. **Hardware Prefetching**: Memory prefetching is employed to optimize access to + * the S-Box, reducing cache misses and improving overall memory performance. + * + * 5. **Use as a PRNG**: This implementation is designed as a pseudorandom number + * generator (PRNG) rather than a cryptographic cipher, and should not be used for + * encryption purposes. + * + * Overall, this RC4 adaptation is ideal for generating large volumes of pseudorandom + * data in a fast and efficient manner, leveraging the full potential of modern CPU architectures. + * + * Disclaimer: This software is provided "as is", without warranty of any kind, express or implied. + */ + + +#include "rc4_prng.h" +#include <stdint.h> +#include <string.h> // For memory operations such as memcpy + +// Check for AVX2 and SSE4.2 support +#if defined(__AVX2__) + #include <immintrin.h> +#elif defined(__SSE4_2__) + #include <nmmintrin.h> // For SSE4.2 support +#endif + +/* + * Enum definition for logging levels used in the nwipe project. + * Each log level corresponds to a specific category of messages, + * ranging from debug information to critical errors. 
+ */ +typedef enum { + NWIPE_LOG_NONE = 0, + NWIPE_LOG_DEBUG, // Detailed debugging messages + NWIPE_LOG_INFO, // Informative logs, used for regular operation updates + NWIPE_LOG_NOTICE, // Notices for significant but non-critical events + NWIPE_LOG_WARNING, // Warnings, indicating potential issues + NWIPE_LOG_ERROR, // Error messages, critical issues that require attention + NWIPE_LOG_FATAL, // Fatal errors, which often require immediate termination + NWIPE_LOG_SANITY, // Sanity checks, typically used for debugging purposes + NWIPE_LOG_NOTIMESTAMP // Logging without timestamp information +} nwipe_log_t; + +/* + * External logging function declaration. + * This function is used for outputting messages based on severity levels. + * It supports a variable argument list similar to printf. + */ +extern void nwipe_log( nwipe_log_t level, const char* format, ... ); + +/* + * Function: rc4_init + * ---------------------------- + * Initializes the RC4 state (S-Box) and the counter for the CTR (Counter) mode. + * The function first converts the initialization key into a byte array and + * permutes the S-Box based on this key. It also resets the counter used for unique + * pseudorandom streams to zero. + * + * Parameters: + * state: Pointer to the RC4 state structure, which holds the S-Box and indices. + * init_key: Pointer to the initialization key (array of 64-bit values). + * key_length: The number of 64-bit words in init_key (not the length in bytes). + */ +void rc4_init( rc4_state_t* state, uint64_t init_key[], unsigned long key_length ) +{ + int i, j = 0; + unsigned char k[RC4_KEY_LENGTH]; // The byte array for the key + + /* + * Convert the init_key into a byte array (k) that will be used for S-Box initialization. + * If the key is smaller than RC4_KEY_LENGTH, the remaining bytes are filled using + * a fallback method based on linear congruential generation (LCG). 
+ */ + for( i = 0; i < RC4_KEY_LENGTH; i++ ) + { + if( i < key_length * sizeof( uint64_t ) ) + { + k[i] = ( (unsigned char*) init_key )[i]; + } + else + { + // Fallback in case of insufficient key length + k[i] = k[i - 1] * 6364136223846793005ULL + 1; + } + } + + /* Log the key used for debugging purposes */ + nwipe_log( NWIPE_LOG_DEBUG, "RC4 Seed (Key): " ); + for( i = 0; i < RC4_KEY_LENGTH / sizeof( uint64_t ); i++ ) + { + uint64_t* k_as_uint64 = (uint64_t*) k; // Cast the key as an array of uint64_t + nwipe_log( NWIPE_LOG_DEBUG, "%016llx ", k_as_uint64[i] ); + } + nwipe_log( NWIPE_LOG_DEBUG, "\n" ); + + /* + * Initialize the S-Box with an identity permutation, + * i.e., S[i] = i for all i in 0 to RC4_KEY_LENGTH-1. + */ + for( i = 0; i < RC4_KEY_LENGTH; i++ ) + { + state->S[i] = i; + } + + /* + * Permute the S-Box based on the key. + * The S-Box is scrambled by iterating through it, adding the corresponding key bytes + * and performing swaps. This step is crucial for creating an initial random state. + */ + for( i = 0; i < RC4_KEY_LENGTH; i++ ) + { + j = ( j + state->S[i] + k[i] ) % RC4_KEY_LENGTH; + unsigned char temp = state->S[i]; + state->S[i] = state->S[j]; + state->S[j] = temp; + } + + // Initialize the indices for RC4 + state->i = 0; + state->j = 0; + + // Initialize the counter for CTR mode, ensuring uniqueness for each stream + state->counter = 0; + + /* + * RC4-drop: Discard the first 256 bytes generated by the RC4 PRNG. + * This step addresses a known weakness in RC4, where the initial output may + * exhibit statistical biases. Dropping the first 256 bytes mitigates this issue. 
+ */ + for( i = 0; i < 256; i++ ) + { + state->i = ( state->i + 1 ) % RC4_KEY_LENGTH; + state->j = ( state->j + state->S[state->i] ) % RC4_KEY_LENGTH; + unsigned char temp = state->S[state->i]; + state->S[state->i] = state->S[state->j]; + state->S[state->j] = temp; + } +} + +/* + * Function: rc4_genrand_4096_to_buf + * ---------------------------- + * Generates 4096 bytes of pseudorandom data using the RC4 algorithm + * and writes it to the provided buffer. + * + * This version uses a simple loop to generate and permute the bytes. + * It is the fallback version, used when neither SSE nor AVX2 is available. + * + * Parameters: + * state: Pointer to the RC4 state structure, which holds the S-Box and indices. + * bufpos: Pointer to the buffer where the pseudorandom data will be written. + */ +void rc4_genrand_4096_to_buf( rc4_state_t* state, unsigned char* bufpos ) +{ + unsigned char temp; + unsigned char temp_buffer[OUTPUT_DATA_LENGTH]; // Temporary buffer to hold generated data + + unsigned long n; + /* + * Loop over OUTPUT_DATA_LENGTH (4096 bytes) in chunks of 4 bytes. + * The inner loop will permute the S-Box and generate 4 bytes of output in each iteration. + */ + for( n = 0; n < OUTPUT_DATA_LENGTH; n += 4 ) + { + // Increment the counter (CTR mode) + state->counter++; + + /* + * The counter value is mixed into the S-Box permutation to ensure + * the uniqueness of the generated stream. This prevents potential + * repetition issues in the RC4 output. 
+ */ + uint64_t counter_value = state->counter; + for( int i = 0; i < 8; i++ ) + { + // Update the indices i and j, and permute the S-Box using the counter + state->i = ( state->i + 1 ) % RC4_KEY_LENGTH; + state->j = ( state->j + state->S[state->i] + ( counter_value & 0xFF ) ) % RC4_KEY_LENGTH; + temp = state->S[state->i]; + state->S[state->i] = state->S[state->j]; + state->S[state->j] = temp; + counter_value >>= 8; // Process the next byte of the counter + } + + // Generate 4 bytes of pseudorandom data + for( int i = 0; i < 4; i++ ) + { + state->i = ( state->i + 1 ) % RC4_KEY_LENGTH; + state->j = ( state->j + state->S[state->i] ) % RC4_KEY_LENGTH; + temp = state->S[state->i]; + state->S[state->i] = state->S[state->j]; + state->S[state->j] = temp; + temp_buffer[n + i] = state->S[( state->S[state->i] + state->S[state->j] ) % RC4_KEY_LENGTH]; + } + } + + // Copy the generated random bytes from the temporary buffer into the user-provided buffer + memcpy( bufpos, temp_buffer, OUTPUT_DATA_LENGTH ); +} + + +/* + * Function: rc4_genrand_4096_to_buf_sse42 + * ---------------------------- + * Generates 4096 bytes of pseudorandom data using RC4, optimized with SSE 4.2 instructions. + * This version processes 16 bytes of data in parallel using SIMD instructions. + * + * Parameters: + * state: Pointer to the RC4 state structure, which holds the S-Box and indices. + * bufpos: Pointer to the buffer where the pseudorandom data will be written. + */ +#if defined(__SSE4_2__) +void rc4_genrand_4096_to_buf_sse42( rc4_state_t* state, unsigned char* bufpos ) +{ + unsigned char temp; + unsigned char temp_buffer[OUTPUT_DATA_LENGTH]; // Temporary buffer + + unsigned long n; + + /* + * Loop over the output length in 16-byte chunks to leverage SSE 4.2 for SIMD parallelism. + * Each iteration generates 16 bytes of data by permuting the RC4 state and using SSE instructions. 
+ */ + for( n = 0; n < OUTPUT_DATA_LENGTH; n += 16 ) + { + // Prefetch the next part of the S-Box to optimize memory access using SIMD + _mm_prefetch( (const char*) &state->S[state->i + 16], _MM_HINT_T0 ); + + // Update the counter (CTR mode) + state->counter++; + + // Mix the counter into the S-Box permutation to add randomness + uint64_t counter_value = state->counter; + for( int i = 0; i < 8; i++ ) + { + state->i = ( state->i + 1 ) % RC4_KEY_LENGTH; + state->j = ( state->j + state->S[state->i] + ( counter_value & 0xFF ) ) % RC4_KEY_LENGTH; + temp = state->S[state->i]; + state->S[state->i] = state->S[state->j]; + state->S[state->j] = temp; + counter_value >>= 8; // Process the next byte of the counter + } + + // Generate 16 bytes of pseudorandom data manually and store them in temp_buffer + for( int i = 0; i < 16; i++ ) + { + state->i = ( state->i + 1 ) % RC4_KEY_LENGTH; + state->j = ( state->j + state->S[state->i] ) % RC4_KEY_LENGTH; + temp = state->S[state->i]; + state->S[state->i] = state->S[state->j]; + state->S[state->j] = temp; + temp_buffer[n + i] = state->S[( state->S[state->i] + state->S[state->j] ) % RC4_KEY_LENGTH]; + } + + // Load the 16-byte block into an SSE register and store it in the buffer + __m128i sse_block = _mm_loadu_si128( (__m128i*) temp_buffer ); + _mm_storeu_si128( (__m128i*) &bufpos[n], sse_block ); + } + + // Copy the remaining random data into the output buffer + memcpy( bufpos, temp_buffer, OUTPUT_DATA_LENGTH ); +} +#endif + +/* + * Function: rc4_genrand_4096_to_buf_avx2 + * ---------------------------- + * Generates 4096 bytes of pseudorandom data using RC4, optimized with AVX2 instructions. + * This version processes 32 bytes of data in parallel using AVX2 instructions. + * + * Parameters: + * state: Pointer to the RC4 state structure, which holds the S-Box and indices. + * bufpos: Pointer to the buffer where the pseudorandom data will be written. 
+ */ +#if defined(__AVX2__) +void rc4_genrand_4096_to_buf_avx2( rc4_state_t* state, unsigned char* bufpos ) +{ + unsigned char temp; + unsigned char temp_buffer[OUTPUT_DATA_LENGTH]; // Temporary buffer + + unsigned long n; + + /* + * Loop over the output length in 32-byte chunks to leverage AVX2 for SIMD parallelism. + * Each iteration generates 32 bytes of data by permuting the RC4 state and using AVX2 instructions. + */ + for( n = 0; n < OUTPUT_DATA_LENGTH; n += 32 ) + { + // Prefetch the next part of the S-Box to optimize memory access using SIMD + _mm_prefetch( (const char*) &state->S[state->i + 16], _MM_HINT_T0 ); + + // Update the counter (CTR mode) + state->counter++; + + // Mix the counter into the S-Box permutation to ensure randomness + uint64_t counter_value = state->counter; + for( int i = 0; i < 8; i++ ) + { + state->i = ( state->i + 1 ) % RC4_KEY_LENGTH; + state->j = ( state->j + state->S[state->i] + ( counter_value & 0xFF ) ) % RC4_KEY_LENGTH; + temp = state->S[state->i]; + state->S[state->i] = state->S[state->j]; + state->S[state->j] = temp; + counter_value >>= 8; // Process the next byte of the counter + } + + // Generate 32 bytes of pseudorandom data manually and store them in temp_buffer + for( int i = 0; i < 32; i++ ) + { + state->i = ( state->i + 1 ) % RC4_KEY_LENGTH; + state->j = ( state->j + state->S[state->i] ) % RC4_KEY_LENGTH; + temp = state->S[state->i]; + state->S[state->i] = state->S[state->j]; + state->S[state->j] = temp; + temp_buffer[n + i] = state->S[( state->S[state->i] + state->S[state->j] ) % RC4_KEY_LENGTH]; + } + + // Load the 32-byte block into an AVX2 register and store it in the buffer + __m256i avx_block = _mm256_loadu_si256( (__m256i*) temp_buffer ); + _mm256_storeu_si256( (__m256i*) &bufpos[n], avx_block ); + } + + // Copy the remaining random data into the output buffer + memcpy( bufpos, temp_buffer, OUTPUT_DATA_LENGTH ); +} +#endif diff --git a/src/rc4/rc4_prng.h b/src/rc4/rc4_prng.h new file mode 100644 index 
00000000..4be39d29 --- /dev/null +++ b/src/rc4/rc4_prng.h @@ -0,0 +1,56 @@ +/* + * RC4 PRNG Header File + * Author: Fabian Druschke + * Date: 2024-09-07 + * + * This header file provides function declarations and data structures for the + * RC4-based pseudorandom number generator implementation. The RC4 algorithm + * is not suitable for cryptographic purposes but can be used for non-secure + * pseudorandom data generation. + * + * As the author of this header file, I, Fabian Druschke, hereby release this work into + * the public domain. I dedicate any and all copyright interest in this work to the public + * domain, making it free to use for anyone for any purpose without any conditions, unless + * such conditions are required by law. + * + * This software is provided "as is", without warranty of any kind, express or implied, + * including but not limited to the warranties of merchantability, fitness for a particular + * purpose, and noninfringement. In no event shall the authors be liable for any claim, + * damages, or other liability, whether in an action of contract, tort, or otherwise, arising + * from, out of, or in connection with the software or the use or other dealings in the software. 
+ */ + +#ifndef RC4_PRNG_H +#define RC4_PRNG_H + +#include <stdint.h> + +// Constants +#define RC4_KEY_LENGTH 256 // Size of the S-Box +#define OUTPUT_DATA_LENGTH 4096 // Amount of random data to generate (4096 bytes) + +// RC4 state structure: holds the S-Box, the two RC4 indices +// and the CTR mode counter +typedef struct +{ + unsigned char S[RC4_KEY_LENGTH]; // The S-Box + int i, j; // Indices for RC4 + uint64_t counter; // CTR mode counter +} rc4_state_t; + +// Function to initialize the RC4 key with the given key material +// init_key: The initial key used to seed the RC4 PRNG +// key_length: The length of the init_key in 64-bit blocks +void rc4_init( rc4_state_t* state, uint64_t init_key[], unsigned long key_length ); + +// Function to generate 4096 random bytes and write them into the provided buffer +// bufpos: The buffer where the generated random bytes will be written +void rc4_genrand_4096_to_buf( rc4_state_t* state, unsigned char* bufpos ); +#if defined(__SSE4_2__) +void rc4_genrand_4096_to_buf_sse42( rc4_state_t* state, unsigned char* bufpos ); +#endif +#if defined(__AVX2__) +void rc4_genrand_4096_to_buf_avx2( rc4_state_t* state, unsigned char* bufpos ); +#endif + +#endif // RC4_PRNG_H