diff --git a/src/asm/program_epilogue_linux.inc b/src/asm/program_epilogue_linux.inc index eaacae54..0dc838bd 100644 --- a/src/asm/program_epilogue_linux.inc +++ b/src/asm/program_epilogue_linux.inc @@ -1,10 +1,12 @@ ;# restore callee-saved registers - System V AMD64 ABI - pop r15 - pop r14 - pop r13 - pop r12 - pop rbp - pop rbx + mov r15, qword ptr [rsp+280] + mov r14, qword ptr [rsp+272] + mov r13, qword ptr [rsp+264] + mov r12, qword ptr [rsp+256] + mov rbp, qword ptr [rsp+232] + mov rbx, qword ptr [rsp+224] + + add rsp, 456 ;# program finished - ret 0 \ No newline at end of file + ret diff --git a/src/asm/program_epilogue_store.inc b/src/asm/program_epilogue_store.inc index b94fa4d9..5fe0e4a4 100644 --- a/src/asm/program_epilogue_store.inc +++ b/src/asm/program_epilogue_store.inc @@ -1,5 +1,5 @@ ;# save VM register values - pop rcx + mov rcx, qword ptr [rsp+448] mov qword ptr [rcx+0], r8 mov qword ptr [rcx+8], r9 mov qword ptr [rcx+16], r10 diff --git a/src/asm/program_epilogue_win64.inc b/src/asm/program_epilogue_win64.inc index 8d70a0a3..727f0068 100644 --- a/src/asm/program_epilogue_win64.inc +++ b/src/asm/program_epilogue_win64.inc @@ -1,24 +1,24 @@ ;# restore callee-saved registers - Microsoft x64 calling convention - movdqu xmm15, xmmword ptr [rsp] - movdqu xmm14, xmmword ptr [rsp+16] - movdqu xmm13, xmmword ptr [rsp+32] - movdqu xmm12, xmmword ptr [rsp+48] - movdqu xmm11, xmmword ptr [rsp+64] - add rsp, 80 - movdqu xmm10, xmmword ptr [rsp] - movdqu xmm9, xmmword ptr [rsp+16] - movdqu xmm8, xmmword ptr [rsp+32] - movdqu xmm7, xmmword ptr [rsp+48] - movdqu xmm6, xmmword ptr [rsp+64] - add rsp, 80 - pop r15 - pop r14 - pop r13 - pop r12 - pop rsi - pop rdi - pop rbp - pop rbx + movdqa xmm15, xmmword ptr [rsp+432] + movdqa xmm14, xmmword ptr [rsp+416] + movdqa xmm13, xmmword ptr [rsp+400] + movdqa xmm12, xmmword ptr [rsp+384] + movdqa xmm11, xmmword ptr [rsp+368] + movdqa xmm10, xmmword ptr [rsp+352] + movdqa xmm9, xmmword ptr [rsp+336] + movdqa xmm8, xmmword ptr [rsp+320] + movdqa xmm7, xmmword ptr [rsp+304] + movdqa xmm6, xmmword ptr [rsp+288] + mov r15, qword ptr [rsp+280] + mov r14, qword ptr [rsp+272] + mov r13, qword ptr [rsp+264] + mov r12, qword ptr [rsp+256] + mov rdi, qword ptr [rsp+248] + mov rsi, qword ptr [rsp+240] + mov rbp, qword ptr [rsp+232] + mov rbx, qword ptr [rsp+224] + + add rsp, 456 ;# program finished ret diff --git a/src/asm/program_loop_load.inc b/src/asm/program_loop_load.inc index c2933231..f872d071 100644 --- a/src/asm/program_loop_load.inc +++ b/src/asm/program_loop_load.inc @@ -1,5 +1,5 @@ lea rcx, [rsi+rax] - push rcx + mov qword ptr [rsp+8], rcx xor r8, qword ptr [rcx+0] xor r9, qword ptr [rcx+8] xor r10, qword ptr [rcx+16] @@ -9,7 +9,7 @@ xor r14, qword ptr [rcx+48] xor r15, qword ptr [rcx+56] lea rcx, [rsi+rdx] - push rcx + mov qword ptr [rsp+16], rcx cvtdq2pd xmm0, qword ptr [rcx+0] cvtdq2pd xmm1, qword ptr [rcx+8] cvtdq2pd xmm2, qword ptr [rcx+16] diff --git a/src/asm/program_loop_store.inc b/src/asm/program_loop_store.inc index 1ba1635c..a894e3eb 100644 --- a/src/asm/program_loop_store.inc +++ b/src/asm/program_loop_store.inc @@ -1,4 +1,4 @@ - pop rcx + mov rcx, qword ptr [rsp+16] mov qword ptr [rcx+0], r8 mov qword ptr [rcx+8], r9 mov qword ptr [rcx+16], r10 @@ -7,7 +7,7 @@ mov qword ptr [rcx+40], r13 mov qword ptr [rcx+48], r14 mov qword ptr [rcx+56], r15 - pop rcx + mov rcx, qword ptr [rsp+8] xorpd xmm0, xmm4 xorpd xmm1, xmm5 xorpd xmm2, xmm6 diff --git a/src/asm/program_loop_store_hard_aes.inc b/src/asm/program_loop_store_hard_aes.inc new file mode 100644 index 00000000..f916de33 --- /dev/null +++ b/src/asm/program_loop_store_hard_aes.inc @@ -0,0 +1,30 @@ + mov rcx, qword ptr [rsp+16] + mov qword ptr [rcx+0], r8 + mov qword ptr [rcx+8], r9 + mov qword ptr [rcx+16], r10 + mov qword ptr [rcx+24], r11 + mov qword ptr [rcx+32], r12 + mov qword ptr [rcx+40], r13 + mov qword ptr [rcx+48], r14 + mov qword ptr [rcx+56], r15 + mov rcx, qword ptr [rsp+8] + aesenc xmm0, xmm4 + aesdec xmm1, xmm4 + aesenc xmm2, xmm4 + aesdec xmm3, xmm4 + aesenc xmm0, xmm5 + aesdec xmm1, xmm5 + aesenc xmm2, xmm5 + aesdec xmm3, xmm5 + aesenc xmm0, xmm6 + aesdec xmm1, xmm6 + aesenc xmm2, xmm6 + aesdec xmm3, xmm6 + aesenc xmm0, xmm7 + aesdec xmm1, xmm7 + aesenc xmm2, xmm7 + aesdec xmm3, xmm7 + movapd xmmword ptr [rcx+0], xmm0 + movapd xmmword ptr [rcx+16], xmm1 + movapd xmmword ptr [rcx+32], xmm2 + movapd xmmword ptr [rcx+48], xmm3 diff --git a/src/asm/program_loop_store_soft_aes.inc b/src/asm/program_loop_store_soft_aes.inc new file mode 100644 index 00000000..b4e62499 --- /dev/null +++ b/src/asm/program_loop_store_soft_aes.inc @@ -0,0 +1,97 @@ + mov rcx, qword ptr [rsp+16] + mov qword ptr [rcx+0], r8 + mov qword ptr [rcx+8], r9 + mov qword ptr [rcx+16], r10 + mov qword ptr [rcx+24], r11 + mov qword ptr [rcx+32], r12 + mov qword ptr [rcx+40], r13 + mov qword ptr [rcx+48], r14 + mov qword ptr [rcx+56], r15 + mov rcx, qword ptr [rsp+8] + + movapd xmmword ptr [rsp+96], xmm0 + movapd xmmword ptr [rsp+112], xmm1 + movapd xmmword ptr [rsp+128], xmm2 + movapd xmmword ptr [rsp+144], xmm3 + movapd xmmword ptr [rsp+160], xmm4 + movapd xmmword ptr [rsp+176], xmm5 + movapd xmmword ptr [rsp+192], xmm6 + movapd xmmword ptr [rsp+208], xmm7 + + mov qword ptr [rsp+24], rax + mov qword ptr [rsp+40], rcx + mov qword ptr [rsp+48], rdx + mov qword ptr [rsp+56], rbp + mov qword ptr [rsp+72], rsi + mov qword ptr [rsp+80], rdi + mov qword ptr [rsp+88], rbx + + lea rsi, [rsp+160] + lea rdi, [rsp+96] + call soft_aes_enc + lea rsi, [rsp+160] + lea rdi, [rsp+112] + call soft_aes_dec + lea rsi, [rsp+160] + lea rdi, [rsp+128] + call soft_aes_enc + lea rsi, [rsp+160] + lea rdi, [rsp+144] + call soft_aes_dec + + lea rsi, [rsp+176] + lea rdi, [rsp+96] + call soft_aes_enc + lea rsi, [rsp+176] + lea rdi, [rsp+112] + call soft_aes_dec + lea rsi, [rsp+176] + lea rdi, [rsp+128] + call soft_aes_enc + lea rsi, [rsp+176] + lea rdi, [rsp+144] + call soft_aes_dec + + lea rsi, [rsp+192] + lea rdi, [rsp+96] + call soft_aes_enc + lea rsi, [rsp+192] + lea rdi, [rsp+112] + call soft_aes_dec + lea rsi, [rsp+192] + lea rdi, [rsp+128] + call soft_aes_enc + lea rsi, [rsp+192] + lea rdi, [rsp+144] + call soft_aes_dec + + lea rsi, [rsp+208] + lea rdi, [rsp+96] + call soft_aes_enc + lea rsi, [rsp+208] + lea rdi, [rsp+112] + call soft_aes_dec + lea rsi, [rsp+208] + lea rdi, [rsp+128] + call soft_aes_enc + lea rsi, [rsp+208] + lea rdi, [rsp+144] + call soft_aes_dec + + mov rax, qword ptr [rsp+24] + mov rcx, qword ptr [rsp+40] + mov rdx, qword ptr [rsp+48] + mov rbp, qword ptr [rsp+56] + mov rsi, qword ptr [rsp+72] + mov rdi, qword ptr [rsp+80] + mov rbx, qword ptr [rsp+88] + + movapd xmm0, xmmword ptr [rsp+96] + movapd xmm1, xmmword ptr [rsp+112] + movapd xmm2, xmmword ptr [rsp+128] + movapd xmm3, xmmword ptr [rsp+144] + + movapd xmmword ptr [rcx+0], xmm0 + movapd xmmword ptr [rcx+16], xmm1 + movapd xmmword ptr [rcx+32], xmm2 + movapd xmmword ptr [rcx+48], xmm3 diff --git a/src/asm/program_prologue_linux.inc b/src/asm/program_prologue_linux.inc index 033584a7..fcab0549 100644 --- a/src/asm/program_prologue_linux.inc +++ b/src/asm/program_prologue_linux.inc @@ -1,18 +1,19 @@ ;# callee-saved registers - System V AMD64 ABI - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 + sub rsp, 456 + mov qword ptr [rsp+224], rbx + mov qword ptr [rsp+232], rbp + mov qword ptr [rsp+256], r12 + mov qword ptr [rsp+264], r13 + mov qword ptr [rsp+272], r14 + mov qword ptr [rsp+280], r15 ;# function arguments - mov rbx, rcx ;# loop counter - push rdi ;# RegisterFile& registerFile + mov rbx, rcx ;# loop counter + mov qword ptr [rsp+448], rdi ;# RegisterFile& registerFile mov rcx, rdi - mov rbp, qword ptr [rsi] ;# "mx", "ma" - mov rdi, qword ptr [rsi+8] ;# uint8_t* dataset - mov rsi, rdx ;# uint8_t* scratchpad + mov rbp, qword ptr [rsi] ;# "mx", "ma" + mov rdi, qword ptr [rsi+8] ;# uint8_t* dataset + mov rsi, rdx ;# uint8_t* scratchpad mov rax, rbp ror rbp, 32 diff --git a/src/asm/program_prologue_win64.inc b/src/asm/program_prologue_win64.inc index 10f21d37..4d3c5592 100644 --- a/src/asm/program_prologue_win64.inc +++ b/src/asm/program_prologue_win64.inc @@ -1,31 +1,30 @@ ;# callee-saved registers - Microsoft x64 calling convention - push rbx - push rbp - push rdi - push rsi - push r12 - push r13 - push r14 - push r15 - sub rsp, 80 - movdqu xmmword ptr [rsp+64], xmm6 - movdqu xmmword ptr [rsp+48], xmm7 - movdqu xmmword ptr [rsp+32], xmm8 - movdqu xmmword ptr [rsp+16], xmm9 - movdqu xmmword ptr [rsp+0], xmm10 - sub rsp, 80 - movdqu xmmword ptr [rsp+64], xmm11 - movdqu xmmword ptr [rsp+48], xmm12 - movdqu xmmword ptr [rsp+32], xmm13 - movdqu xmmword ptr [rsp+16], xmm14 - movdqu xmmword ptr [rsp+0], xmm15 + sub rsp, 456 + mov qword ptr [rsp+224], rbx + mov qword ptr [rsp+232], rbp + mov qword ptr [rsp+240], rsi + mov qword ptr [rsp+248], rdi + mov qword ptr [rsp+256], r12 + mov qword ptr [rsp+264], r13 + mov qword ptr [rsp+272], r14 + mov qword ptr [rsp+280], r15 + movdqa xmmword ptr [rsp+288], xmm6 + movdqa xmmword ptr [rsp+304], xmm7 + movdqa xmmword ptr [rsp+320], xmm8 + movdqa xmmword ptr [rsp+336], xmm9 + movdqa xmmword ptr [rsp+352], xmm10 + movdqa xmmword ptr [rsp+368], xmm11 + movdqa xmmword ptr [rsp+384], xmm12 + movdqa xmmword ptr [rsp+400], xmm13 + movdqa xmmword ptr [rsp+416], xmm14 + movdqa xmmword ptr [rsp+432], xmm15 ;# function arguments - push rcx ;# RegisterFile& registerFile - mov rbp, qword ptr [rdx] ;# "mx", "ma" - mov rdi, qword ptr [rdx+8] ;# uint8_t* dataset - mov rsi, r8 ;# uint8_t* scratchpad - mov rbx, r9 ;# loop counter + mov qword ptr [rsp+448], rcx ;# RegisterFile& registerFile + mov rbp, qword ptr [rdx] ;# "mx", "ma" + mov rdi, qword ptr [rdx+8] ;# uint8_t* dataset + mov rsi, r8 ;# uint8_t* scratchpad + mov rbx, r9 ;# loop counter mov rax, rbp ror rbp, 32 diff --git a/src/asm/program_read_dataset_sshash_fin.inc b/src/asm/program_read_dataset_sshash_fin.inc index f5a067d2..da008d15 100644 --- a/src/asm/program_read_dataset_sshash_fin.inc +++ b/src/asm/program_read_dataset_sshash_fin.inc @@ -1,10 +1,9 @@ - mov rbx, qword ptr [rsp+64] - xor r8, qword ptr [rsp+56] - xor r9, qword ptr [rsp+48] + xor r8, qword ptr [rsp+24] + xor r9, qword ptr [rsp+32] xor r10, qword ptr [rsp+40] - xor r11, qword ptr [rsp+32] - xor r12, qword ptr [rsp+24] - xor r13, qword ptr [rsp+16] - xor r14, qword ptr [rsp+8] - xor r15, qword ptr [rsp+0] - add rsp, 72 \ No newline at end of file + xor r11, qword ptr [rsp+48] + xor r12, qword ptr [rsp+56] + xor r13, qword ptr [rsp+64] + xor r14, qword ptr [rsp+72] + xor r15, qword ptr [rsp+80] + mov rbx, qword ptr [rsp+88] diff --git a/src/asm/program_read_dataset_sshash_init.inc b/src/asm/program_read_dataset_sshash_init.inc index 9491f3d2..8ee558b2 100644 --- a/src/asm/program_read_dataset_sshash_init.inc +++ b/src/asm/program_read_dataset_sshash_init.inc @@ -1,13 +1,12 @@ - sub rsp, 72 - mov qword ptr [rsp+64], rbx - mov qword ptr [rsp+56], r8 - mov qword ptr [rsp+48], r9 + mov qword ptr [rsp+24], r8 + mov qword ptr [rsp+32], r9 mov qword ptr [rsp+40], r10 - mov qword ptr [rsp+32], r11 - mov qword ptr [rsp+24], r12 - mov qword ptr [rsp+16], r13 - mov qword ptr [rsp+8], r14 - mov qword ptr [rsp+0], r15 + mov qword ptr [rsp+48], r11 + mov qword ptr [rsp+56], r12 + mov qword ptr [rsp+64], r13 + mov qword ptr [rsp+72], r14 + mov qword ptr [rsp+80], r15 + mov qword ptr [rsp+88], rbx ror rbp, 32 ;# swap "ma" and "mx" xor rbp, rax ;# modify "mx" mov rbx, rbp ;# ebx = ma diff --git a/src/asm/program_soft_aes_dec.inc b/src/asm/program_soft_aes_dec.inc new file mode 100644 index 00000000..77975485 --- /dev/null +++ b/src/asm/program_soft_aes_dec.inc @@ -0,0 +1,49 @@ + mov eax, dword ptr [rsi+0] + mov ebx, dword ptr [rsi+4] + mov ecx, dword ptr [rsi+8] + mov edx, dword ptr [rsi+12] + + mov rsi, qword ptr [aes_lut_dec+RIP_REL] + + movzx ebp, byte ptr [rdi+0] + xor eax, dword ptr [rsi+rbp*4] + movzx ebp, byte ptr [rdi+1] + xor ebx, dword ptr [rsi+rbp*4+1024] + movzx ebp, byte ptr [rdi+2] + xor ecx, dword ptr [rsi+rbp*4+2048] + movzx ebp, byte ptr [rdi+3] + xor edx, dword ptr [rsi+rbp*4+3072] + + movzx ebp, byte ptr [rdi+4] + xor ebx, dword ptr [rsi+rbp*4] + movzx ebp, byte ptr [rdi+5] + xor ecx, dword ptr [rsi+rbp*4+1024] + movzx ebp, byte ptr [rdi+6] + xor edx, dword ptr [rsi+rbp*4+2048] + movzx ebp, byte ptr [rdi+7] + xor eax, dword ptr [rsi+rbp*4+3072] + + movzx ebp, byte ptr [rdi+8] + xor ecx, dword ptr [rsi+rbp*4] + movzx ebp, byte ptr [rdi+9] + xor edx, dword ptr [rsi+rbp*4+1024] + movzx ebp, byte ptr [rdi+10] + xor eax, dword ptr [rsi+rbp*4+2048] + movzx ebp, byte ptr [rdi+11] + xor ebx, dword ptr [rsi+rbp*4+3072] + + movzx ebp, byte ptr [rdi+12] + xor edx, dword ptr [rsi+rbp*4] + movzx ebp, byte ptr [rdi+13] + xor eax, dword ptr [rsi+rbp*4+1024] + movzx ebp, byte ptr [rdi+14] + xor ebx, dword ptr [rsi+rbp*4+2048] + movzx ebp, byte ptr [rdi+15] + xor ecx, dword ptr [rsi+rbp*4+3072] + + mov dword ptr [rdi+0], eax + mov dword ptr [rdi+4], ebx + mov dword ptr [rdi+8], ecx + mov dword ptr [rdi+12], edx + + ret diff --git a/src/asm/program_soft_aes_enc.inc b/src/asm/program_soft_aes_enc.inc new file mode 100644 index 00000000..c4c27b90 --- /dev/null +++ b/src/asm/program_soft_aes_enc.inc @@ -0,0 +1,49 @@ + mov eax, dword ptr [rsi+0] + mov ebx, dword ptr [rsi+4] + mov ecx, dword ptr [rsi+8] + mov edx, dword ptr [rsi+12] + + mov rsi, qword ptr [aes_lut_enc+RIP_REL] + + movzx ebp, byte ptr [rdi+0] + xor eax, dword ptr [rsi+rbp*4] + movzx ebp, byte ptr [rdi+1] + xor edx, dword ptr [rsi+rbp*4+1024] + movzx ebp, byte ptr [rdi+2] + xor ecx, dword ptr [rsi+rbp*4+2048] + movzx ebp, byte ptr [rdi+3] + xor ebx, dword ptr [rsi+rbp*4+3072] + + movzx ebp, byte ptr [rdi+4] + xor ebx, dword ptr [rsi+rbp*4] + movzx ebp, byte ptr [rdi+5] + xor eax, dword ptr [rsi+rbp*4+1024] + movzx ebp, byte ptr [rdi+6] + xor edx, dword ptr [rsi+rbp*4+2048] + movzx ebp, byte ptr [rdi+7] + xor ecx, dword ptr [rsi+rbp*4+3072] + + movzx ebp, byte ptr [rdi+8] + xor ecx, dword ptr [rsi+rbp*4] + movzx ebp, byte ptr [rdi+9] + xor ebx, dword ptr [rsi+rbp*4+1024] + movzx ebp, byte ptr [rdi+10] + xor eax, dword ptr [rsi+rbp*4+2048] + movzx ebp, byte ptr [rdi+11] + xor edx, dword ptr [rsi+rbp*4+3072] + + movzx ebp, byte ptr [rdi+12] + xor edx, dword ptr [rsi+rbp*4] + movzx ebp, byte ptr [rdi+13] + xor ecx, dword ptr [rsi+rbp*4+1024] + movzx ebp, byte ptr [rdi+14] + xor ebx, dword ptr [rsi+rbp*4+2048] + movzx ebp, byte ptr [rdi+15] + xor eax, dword ptr [rsi+rbp*4+3072] + + mov dword ptr [rdi+0], eax + mov dword ptr [rdi+4], ebx + mov dword ptr [rdi+8], ecx + mov dword ptr [rdi+12], edx + + ret diff --git a/src/bytecode_machine.hpp b/src/bytecode_machine.hpp index 5e82e0d3..d9d64426 100644 --- a/src/bytecode_machine.hpp +++ b/src/bytecode_machine.hpp @@ -259,7 +259,10 @@ namespace randomx { } static void exe_CFROUND(RANDOMX_EXE_ARGS) { - rx_set_rounding_mode(rotr(*ibc.isrc, ibc.imm) % 4); + uint64_t isrc = rotr(*ibc.isrc, ibc.imm); + if ((isrc & 60) == 0) { + rx_set_rounding_mode(isrc % 4); + } } static void exe_ISTORE(RANDOMX_EXE_ARGS) { diff --git a/src/common.hpp b/src/common.hpp index a77feb3b..a9ca6f93 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -31,6 +31,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include "blake2/endian.h" #include "configuration.h" #include "randomx.h" @@ -114,10 +115,14 @@ namespace randomx { #endif #endif + class SuperscalarProgram; + using SuperscalarProgramList = std::array; + #if defined(_M_X64) || defined(__x86_64__) #define RANDOMX_HAVE_COMPILER 1 + template class JitCompilerX86; - using JitCompiler = JitCompilerX86; + using JitCompiler = JitCompilerX86; #elif defined(__aarch64__) #define RANDOMX_HAVE_COMPILER 1 class JitCompilerA64; diff --git a/src/dataset.hpp b/src/dataset.hpp index d01911f9..26bc0b28 100644 --- a/src/dataset.hpp +++ b/src/dataset.hpp @@ -49,7 +49,7 @@ struct randomx_cache { randomx::JitCompiler* jit; randomx::CacheInitializeFunc* initialize; randomx::DatasetInitFunc* datasetInit; - randomx::SuperscalarProgram programs[RANDOMX_CACHE_ACCESSES]; + randomx::SuperscalarProgramList programs; std::vector reciprocalCache; std::string cacheKey; randomx_argon2_impl* argonImpl; diff --git a/src/intrin_portable.h b/src/intrin_portable.h index 8c09ae88..c000452e 100644 --- a/src/intrin_portable.h +++ b/src/intrin_portable.h @@ -124,6 +124,8 @@ FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) { return _mm_castsi128_pd(_mm_set1_epi64x(x)); } +#define rx_cast_vec_i2f _mm_castsi128_pd +#define rx_cast_vec_f2i _mm_castpd_si128 #define rx_xor_vec_f128 _mm_xor_pd #define rx_and_vec_f128 _mm_and_pd #define rx_or_vec_f128 _mm_or_pd @@ -625,6 +627,16 @@ FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) { return v; } +FORCE_INLINE rx_vec_f128 rx_cast_vec_i2f(rx_vec_i128 a) { + rx_vec_f128 x; + x.i = a; + return x; +} + +FORCE_INLINE rx_vec_i128 rx_cast_vec_f2i(rx_vec_f128 a) { + return a.i; +} + FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { rx_vec_f128 x; x.i.u64[0] = a.i.u64[0] ^ b.i.u64[0]; diff --git a/src/jit_compiler_fallback.hpp b/src/jit_compiler_fallback.hpp index 57a6dbf9..4ed7b3b4 100644 --- a/src/jit_compiler_fallback.hpp +++ b/src/jit_compiler_fallback.hpp @@ -50,8 +50,8 @@ namespace randomx { void generateProgramLight(Program&, ProgramConfiguration&, uint32_t) { } - template - void generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector &) { + + void generateSuperscalarHash(SuperscalarProgramList& programs, std::vector &) { } void generateDatasetInitCode() { diff --git a/src/jit_compiler_x86.cpp b/src/jit_compiler_x86.cpp index 96c6492f..82c770a7 100644 --- a/src/jit_compiler_x86.cpp +++ b/src/jit_compiler_x86.cpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "program.hpp" #include "reciprocal.h" #include "virtual_memory.h" +#include "soft_aes.h" namespace randomx { /* @@ -76,6 +77,51 @@ namespace randomx { */ + /* + STACK LAYOUT (offsets from rsp): + 456 -> return address + 448 -> RegisterFile& registerFile + 432 -> xmm15 store + 416 -> xmm14 store + 400 -> xmm13 store + 384 -> xmm12 store + 368 -> xmm11 store + 352 -> xmm10 store + 336 -> xmm9 store + 320 -> xmm8 store + 304 -> xmm7 store + 288 -> xmm6 store + 280 -> r15 store + 272 -> r14 store + 264 -> r13 store + 256 -> r12 store + 248 -> rdi store + 240 -> rsi store + 232 -> rbp store + 224 -> rbx store + 208 -> xmm7 spill + 192 -> xmm6 spill + 176 -> xmm5 spill + 160 -> xmm4 spill + 144 -> xmm3 spill + 128 -> xmm2 spill + 112 -> xmm1 spill + 96 -> xmm0 spill + 88 -> rbx spill + 80 -> r15/rdi spill + 72 -> r14/rsi spill + 64 -> r13 spill + 56 -> r12/rbp spill + 48 -> r11/rdx spill + 40 -> r10/rcx spill + 32 -> r9 spill + 24 -> r8/rax spill + 16 -> spAddr1 + 8 -> spAddr0 + 4 -> (empty) + 0 -> mxcsr + */ + //Calculate the required code buffer size that is sufficient for the largest possible program: constexpr size_t MaxRandomXInstrCodeSize = 32; //FDIV_M requires up to 32 bytes of x86 code @@ -109,7 +155,11 @@ namespace randomx { const uint8_t* codeReadDatasetLightSshFin = ADDR(randomx_program_read_dataset_sshash_fin); const uint8_t* codeDatasetInit = ADDR(randomx_dataset_init); const uint8_t* codeLoopStore = ADDR(randomx_program_loop_store); + const uint8_t* codeLoopStoreHardAes = ADDR(randomx_program_loop_store_hard_aes); + const uint8_t* codeLoopStoreSoftAes = ADDR(randomx_program_loop_store_soft_aes); const uint8_t* codeLoopEnd = ADDR(randomx_program_loop_end); + const uint8_t* codeSoftAes = ADDR(randomx_program_soft_aes); + const uint8_t* codeSoftAesEnd = ADDR(randomx_program_soft_aes_end); const uint8_t* codeEpilogue = ADDR(randomx_program_epilogue); const uint8_t* codeProgramEnd = ADDR(randomx_program_end); const uint8_t* codeShhLoad = ADDR(randomx_sshash_load); @@ -122,7 +172,11 @@ namespace randomx { const int32_t readDatasetSize = codeReadDatasetLightSshInit - codeReadDataset; const int32_t readDatasetLightInitSize = codeReadDatasetLightSshFin - codeReadDatasetLightSshInit; const int32_t readDatasetLightFinSize = codeLoopStore - codeReadDatasetLightSshFin; - const int32_t loopStoreSize = codeLoopEnd - codeLoopStore; + const int32_t loopStoreSize = codeLoopStoreHardAes - codeLoopStore; + const int32_t loopStoreHardAesSize = codeLoopStoreSoftAes - codeLoopStoreHardAes; + const int32_t loopStoreSoftAesSize = codeLoopEnd - codeLoopStoreSoftAes; + const int32_t softAesSize = codeSoftAesEnd - codeSoftAes; + const int32_t datasetInitSize = codeEpilogue - codeDatasetInit; const int32_t epilogueSize = codeShhLoad - codeEpilogue; const int32_t codeSshLoadSize = codeShhPrefetch - codeShhLoad; @@ -183,7 +237,7 @@ namespace randomx { static const uint8_t REX_MAXPD[] = { 0x66, 0x41, 0x0f, 0x5f }; static const uint8_t REX_DIVPD[] = { 0x66, 0x41, 0x0f, 0x5e }; static const uint8_t SQRTPD[] = { 0x66, 0x0f, 0x51 }; - static const uint8_t AND_OR_MOV_LDMXCSR[] = { 0x25, 0x00, 0x60, 0x00, 0x00, 0x0D, 0xC0, 0x9F, 0x00, 0x00, 0x50, 0x0F, 0xAE, 0x14, 0x24, 0x58 }; + static const uint8_t AND_OR_MOV_LDMXCSR[] = { 0x25, 0x00, 0x60, 0x00, 0x00, 0x0d, 0xc0, 0x9f, 0x00, 0x00, 0x89, 0x04, 0x24, 0x0f, 0xae, 0x14, 0x24 }; static const uint8_t ROL_RAX[] = { 0x48, 0xc1, 0xc0 }; static const uint8_t XOR_ECX_ECX[] = { 0x33, 0xC9 }; static const uint8_t REX_CMP_R32I[] = { 0x41, 0x81 }; @@ -193,6 +247,7 @@ namespace randomx { static const uint8_t REX_XOR_EAX[] = { 0x41, 0x33 }; static const uint8_t SUB_EBX[] = { 0x83, 0xEB, 0x01 }; static const uint8_t JNZ[] = { 0x0f, 0x85 }; + static const uint8_t JNZ_SHORT = 0x75; static const uint8_t JMP = 0xe9; static const uint8_t REX_XOR_RAX_R64[] = { 0x49, 0x33 }; static const uint8_t REX_XCHG[] = { 0x4d, 0x87 }; @@ -207,6 +262,7 @@ namespace randomx { static const uint8_t LEA_32[] = { 0x41, 0x8d }; static const uint8_t MOVNTI[] = { 0x4c, 0x0f, 0xc3 }; static const uint8_t ADD_EBX_I[] = { 0x81, 0xc3 }; + static const uint8_t TEST_EAX_60SL13[] = { 0xa9, 0x00, 0x80, 0x07, 0x00 }; static const uint8_t NOP1[] = { 0x90 }; static const uint8_t NOP2[] = { 0x66, 0x90 }; @@ -219,11 +275,24 @@ namespace randomx { static const uint8_t* NOPX[] = { NOP1, NOP2, NOP3, NOP4, NOP5, NOP6, NOP7, NOP8 }; - size_t JitCompilerX86::getCodeSize() { + template + size_t JitCompilerX86::getCodeSize() { return CodeSize; } - JitCompilerX86::JitCompilerX86() { + template + void JitCompilerX86::alignCode(int align) { + int rem = codePos % align; + while (rem != 0) { + int nopSize = align - rem; + if (nopSize > 8) nopSize = 8; + emit(NOPX[nopSize - 1], nopSize); + rem = codePos % align; + } + } + + template + JitCompilerX86::JitCompilerX86() { code = (uint8_t*)allocMemoryPages(CodeSize); if (code == nullptr) throw std::runtime_error("allocMemoryPages"); @@ -231,30 +300,36 @@ namespace randomx { memcpy(code + epilogueOffset, codeEpilogue, epilogueSize); } - JitCompilerX86::~JitCompilerX86() { + template + JitCompilerX86::~JitCompilerX86() { freePagedMemory(code, CodeSize); } - void JitCompilerX86::enableAll() { + template + void JitCompilerX86::enableAll() { setPagesRWX(code, CodeSize); } - void JitCompilerX86::enableWriting() { + template + void JitCompilerX86::enableWriting() { setPagesRW(code, CodeSize); } - void JitCompilerX86::enableExecution() { + template + void JitCompilerX86::enableExecution() { setPagesRX(code, CodeSize); } - void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg) { + template + void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg) { generateProgramPrologue(prog, pcfg); memcpy(code + codePos, codeReadDataset, readDatasetSize); codePos += readDatasetSize; generateProgramEpilogue(prog, pcfg); } - void JitCompilerX86::generateProgramLight(Program& prog, ProgramConfiguration& pcfg, uint32_t datasetOffset) { + template + void JitCompilerX86::generateProgramLight(Program& prog, ProgramConfiguration& pcfg, uint32_t datasetOffset) { generateProgramPrologue(prog, pcfg); emit(codeReadDatasetLightSshInit, readDatasetLightInitSize); emit(ADD_EBX_I); @@ -265,43 +340,36 @@ namespace randomx { generateProgramEpilogue(prog, pcfg); } - template - void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector &reciprocalCache) { + template + void JitCompilerX86::generateSuperscalarHash(SuperscalarProgramList &programs, std::vector &reciprocalCache) { memcpy(code + superScalarHashOffset, codeShhInit, codeSshInitSize); codePos = superScalarHashOffset + codeSshInitSize; - for (unsigned j = 0; j < N; ++j) { + for (unsigned j = 0; j < programs.size(); ++j) { SuperscalarProgram& prog = programs[j]; for (unsigned i = 0; i < prog.getSize(); ++i) { Instruction& instr = prog(i); generateSuperscalarCode(instr, reciprocalCache); } emit(codeShhLoad, codeSshLoadSize); - if (j < N - 1) { + if (j < programs.size() - 1) { emit(REX_MOV_RR64); emitByte(0xd8 + prog.getAddressRegister()); emit(codeShhPrefetch, codeSshPrefetchSize); #ifdef RANDOMX_ALIGN - int align = (codePos % 16); - while (align != 0) { - int nopSize = 16 - align; - if (nopSize > 8) nopSize = 8; - emit(NOPX[nopSize - 1], nopSize); - align = (codePos % 16); - } + alignCode(16); #endif } } emitByte(RET); } - template - void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram(&programs)[RANDOMX_CACHE_ACCESSES], std::vector &reciprocalCache); - - void JitCompilerX86::generateDatasetInitCode() { + template + void JitCompilerX86::generateDatasetInitCode() { memcpy(code, codeDatasetInit, datasetInitSize); } - void JitCompilerX86::generateProgramPrologue(Program& prog, ProgramConfiguration& pcfg) { + template + void JitCompilerX86::generateProgramPrologue(Program& prog, ProgramConfiguration& pcfg) { instructionOffsets.clear(); for (unsigned i = 0; i < RegistersCount; ++i) { registerUsage[i] = -1; @@ -323,28 +391,49 @@ namespace randomx { emitByte(0xc0 + pcfg.readReg3); } - void JitCompilerX86::generateProgramEpilogue(Program& prog, ProgramConfiguration& pcfg) { + template + void JitCompilerX86::generateProgramEpilogue(Program& prog, ProgramConfiguration& pcfg) { emit(REX_MOV_RR64); emitByte(0xc0 + pcfg.readReg0); emit(REX_XOR_RAX_R64); emitByte(0xc0 + pcfg.readReg1); emit(ADDR(randomx_prefetch_scratchpad), ADDR(randomx_prefetch_scratchpad_end) - ADDR(randomx_prefetch_scratchpad)); - memcpy(code + codePos, codeLoopStore, loopStoreSize); - codePos += loopStoreSize; + if (vmFlags & RANDOMX_FLAG_V2) { + if (vmFlags & RANDOMX_FLAG_HARD_AES) { + memcpy(code + codePos, codeLoopStoreHardAes, loopStoreHardAesSize); + codePos += loopStoreHardAesSize; + } + else { + memcpy(code + codePos, codeLoopStoreSoftAes, loopStoreSoftAesSize); + codePos += loopStoreSoftAesSize; + } + } + else { + memcpy(code + codePos, codeLoopStore, loopStoreSize); + codePos += loopStoreSize; + } emit(SUB_EBX); emit(JNZ); emit32(prologueSize - codePos - 4); emitByte(JMP); emit32(epilogueOffset - codePos - 4); + if ((vmFlags & RANDOMX_FLAG_V2) && !(vmFlags & RANDOMX_FLAG_HARD_AES)) { + memcpy(code + codePos, codeSoftAes, softAesSize); + codePos += softAesSize; + emit64((uint64_t)randomx_aes_lut_enc); + emit64((uint64_t)randomx_aes_lut_dec); + } } - void JitCompilerX86::generateCode(Instruction& instr, int i) { + template + void JitCompilerX86::generateCode(Instruction& instr, int i) { instructionOffsets.push_back(codePos); auto generator = engine[instr.opcode]; (this->*generator)(instr, i); } - void JitCompilerX86::generateSuperscalarCode(Instruction& instr, std::vector &reciprocalCache) { + template + void JitCompilerX86::generateSuperscalarCode(Instruction& instr, std::vector &reciprocalCache) { switch ((SuperscalarInstructionType)instr.opcode) { case randomx::SuperscalarInstructionType::ISUB_R: @@ -438,7 +527,8 @@ namespace randomx { } } - void JitCompilerX86::genAddressReg(Instruction& instr, bool rax = true) { + template + void JitCompilerX86::genAddressReg(Instruction& instr, bool rax) { emit(LEA_32); emitByte(0x80 + instr.src + (rax ? 0 : 8)); if (instr.src == RegisterNeedsSib) { @@ -452,7 +542,8 @@ namespace randomx { emit32(instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask); } - void JitCompilerX86::genAddressRegDst(Instruction& instr) { + template + void JitCompilerX86::genAddressRegDst(Instruction& instr) { emit(LEA_32); emitByte(0x80 + instr.dst); if (instr.dst == RegisterNeedsSib) { @@ -468,11 +559,13 @@ namespace randomx { } } - void JitCompilerX86::genAddressImm(Instruction& instr) { + template + void JitCompilerX86::genAddressImm(Instruction& instr) { emit32(instr.getImm32() & ScratchpadL3Mask); } - void JitCompilerX86::h_IADD_RS(Instruction& instr, int i) { + template + void JitCompilerX86::h_IADD_RS(Instruction& instr, int i) { registerUsage[instr.dst] = i; emit(REX_LEA); if (instr.dst == RegisterNeedsDisplacement) @@ -484,10 +577,11 @@ namespace randomx { emit32(instr.getImm32()); } - void JitCompilerX86::h_IADD_M(Instruction& instr, int i) { + template + void JitCompilerX86::h_IADD_M(Instruction& instr, int i) { registerUsage[instr.dst] = i; if (instr.src != instr.dst) { - genAddressReg(instr); + genAddressReg(instr, true); emit(REX_ADD_RM); emitByte(0x04 + 8 * instr.dst); emitByte(0x06); @@ -499,11 +593,13 @@ namespace randomx { } } - void JitCompilerX86::genSIB(int scale, int index, int base) { + template + void JitCompilerX86::genSIB(int scale, int index, int base) { emitByte((scale << 6) | (index << 3) | base); } - void JitCompilerX86::h_ISUB_R(Instruction& instr, int i) { + template + void JitCompilerX86::h_ISUB_R(Instruction& instr, int i) { registerUsage[instr.dst] = i; if (instr.src != instr.dst) { emit(REX_SUB_RR); @@ -516,10 +612,11 @@ namespace randomx { } } - void JitCompilerX86::h_ISUB_M(Instruction& instr, int i) { + template + void JitCompilerX86::h_ISUB_M(Instruction& instr, int i) { registerUsage[instr.dst] = i; if (instr.src != instr.dst) { - genAddressReg(instr); + genAddressReg(instr, true); emit(REX_SUB_RM); emitByte(0x04 + 8 * instr.dst); emitByte(0x06); @@ -531,7 +628,8 @@ namespace randomx { } } - void JitCompilerX86::h_IMUL_R(Instruction& instr, int i) { + template + void JitCompilerX86::h_IMUL_R(Instruction& instr, int i) { registerUsage[instr.dst] = i; if (instr.src != instr.dst) { emit(REX_IMUL_RR); @@ -544,10 +642,11 @@ namespace randomx { } } - void JitCompilerX86::h_IMUL_M(Instruction& instr, int i) { + template + void JitCompilerX86::h_IMUL_M(Instruction& instr, int i) { registerUsage[instr.dst] = i; if (instr.src != instr.dst) { - genAddressReg(instr); + genAddressReg(instr, true); emit(REX_IMUL_RM); emitByte(0x04 + 8 * instr.dst); emitByte(0x06); @@ -559,7 +658,8 @@ namespace randomx { } } - void JitCompilerX86::h_IMULH_R(Instruction& instr, int i) { + template + void JitCompilerX86::h_IMULH_R(Instruction& instr, int i) { registerUsage[instr.dst] = i; emit(REX_MOV_RR64); emitByte(0xc0 + instr.dst); @@ -569,7 +669,8 @@ namespace randomx { emitByte(0xc2 + 8 * instr.dst); } - void JitCompilerX86::h_IMULH_M(Instruction& instr, int i) { + template + void JitCompilerX86::h_IMULH_M(Instruction& instr, int i) { registerUsage[instr.dst] = i; if (instr.src != instr.dst) { genAddressReg(instr, false); @@ -588,7 +689,8 @@ namespace randomx { emitByte(0xc2 + 8 * instr.dst); } - void JitCompilerX86::h_ISMULH_R(Instruction& instr, int i) { + template + void JitCompilerX86::h_ISMULH_R(Instruction& instr, int i) { registerUsage[instr.dst] = i; emit(REX_MOV_RR64); emitByte(0xc0 + instr.dst); @@ -598,7 +700,8 @@ namespace randomx { emitByte(0xc2 + 8 * instr.dst); } - void JitCompilerX86::h_ISMULH_M(Instruction& instr, int i) { + template + void JitCompilerX86::h_ISMULH_M(Instruction& instr, int i) { registerUsage[instr.dst] = i; if (instr.src != instr.dst) { genAddressReg(instr, false); @@ -617,7 +720,8 @@ namespace randomx { emitByte(0xc2 + 8 * instr.dst); } - void JitCompilerX86::h_IMUL_RCP(Instruction& instr, int i) { + template + void JitCompilerX86::h_IMUL_RCP(Instruction& instr, int i) { uint64_t divisor = instr.getImm32(); if (!isZeroOrPowerOf2(divisor)) { registerUsage[instr.dst] = i; @@ -628,13 +732,15 @@ namespace randomx { } } - void JitCompilerX86::h_INEG_R(Instruction& instr, int i) { + template + void JitCompilerX86::h_INEG_R(Instruction& instr, int i) { registerUsage[instr.dst] = i; emit(REX_NEG); emitByte(0xd8 + instr.dst); } - void JitCompilerX86::h_IXOR_R(Instruction& instr, int i) { + template + void JitCompilerX86::h_IXOR_R(Instruction& instr, int i) { registerUsage[instr.dst] = i; if (instr.src != instr.dst) { emit(REX_XOR_RR); @@ -647,10 +753,11 @@ namespace randomx { } } - void JitCompilerX86::h_IXOR_M(Instruction& instr, int i) { + template + void JitCompilerX86::h_IXOR_M(Instruction& instr, int i) { registerUsage[instr.dst] = i; if (instr.src != instr.dst) { - genAddressReg(instr); + genAddressReg(instr, true); emit(REX_XOR_RM); emitByte(0x04 + 8 * instr.dst); emitByte(0x06); @@ -662,7 +769,8 @@ namespace randomx { } } - void JitCompilerX86::h_IROR_R(Instruction& instr, int i) { + template + void JitCompilerX86::h_IROR_R(Instruction& instr, int i) { registerUsage[instr.dst] = i; if (instr.src != instr.dst) { emit(REX_MOV_RR); @@ -677,7 +785,8 @@ namespace randomx { } } - void JitCompilerX86::h_IROL_R(Instruction& instr, int i) { + template + void JitCompilerX86::h_IROL_R(Instruction& instr, int i) { registerUsage[instr.dst] = i; if (instr.src != instr.dst) { emit(REX_MOV_RR); @@ -692,7 +801,8 @@ namespace randomx { } } - void JitCompilerX86::h_ISWAP_R(Instruction& instr, int i) { + template + void JitCompilerX86::h_ISWAP_R(Instruction& instr, int i) { if (instr.src != instr.dst) { registerUsage[instr.dst] = i; registerUsage[instr.src] = i; @@ -701,82 +811,106 @@ namespace randomx { } } - void JitCompilerX86::h_FSWAP_R(Instruction& instr, int i) { + template + void JitCompilerX86::h_FSWAP_R(Instruction& instr, int i) { emit(SHUFPD); emitByte(0xc0 + 9 * instr.dst); emitByte(1); } - void JitCompilerX86::h_FADD_R(Instruction& instr, int i) { + template + void JitCompilerX86::h_FADD_R(Instruction& instr, int i) { instr.dst %= RegisterCountFlt; instr.src %= RegisterCountFlt; emit(REX_ADDPD); emitByte(0xc0 + instr.src + 8 * instr.dst); } - void JitCompilerX86::h_FADD_M(Instruction& instr, int i) { + template + void JitCompilerX86::h_FADD_M(Instruction& instr, int i) { instr.dst %= RegisterCountFlt; - genAddressReg(instr); + genAddressReg(instr, true); emit(REX_CVTDQ2PD_XMM12); emit(REX_ADDPD); emitByte(0xc4 + 8 * instr.dst); } - void JitCompilerX86::h_FSUB_R(Instruction& instr, int i) { + template + void JitCompilerX86::h_FSUB_R(Instruction& instr, int i) { instr.dst %= RegisterCountFlt; instr.src %= RegisterCountFlt; emit(REX_SUBPD); emitByte(0xc0 + instr.src + 8 * instr.dst); } - void JitCompilerX86::h_FSUB_M(Instruction& instr, int i) { + template + void JitCompilerX86::h_FSUB_M(Instruction& instr, int i) { instr.dst %= RegisterCountFlt; - genAddressReg(instr); + genAddressReg(instr, true); emit(REX_CVTDQ2PD_XMM12); emit(REX_SUBPD); emitByte(0xc4 + 8 * instr.dst); } - void JitCompilerX86::h_FSCAL_R(Instruction& instr, int i) { + template + void JitCompilerX86::h_FSCAL_R(Instruction& instr, int i) { instr.dst %= RegisterCountFlt; emit(REX_XORPS); emitByte(0xc7 + 8 * instr.dst); } - void JitCompilerX86::h_FMUL_R(Instruction& instr, int i) { + template + void JitCompilerX86::h_FMUL_R(Instruction& instr, int i) { instr.dst %= RegisterCountFlt; instr.src %= RegisterCountFlt; emit(REX_MULPD); emitByte(0xe0 + instr.src + 8 * instr.dst); } - void JitCompilerX86::h_FDIV_M(Instruction& instr, int i) { + template + void JitCompilerX86::h_FDIV_M(Instruction& instr, int i) { instr.dst %= RegisterCountFlt; - genAddressReg(instr); + genAddressReg(instr, true); emit(REX_CVTDQ2PD_XMM12); emit(REX_ANDPS_XMM12); emit(REX_DIVPD); emitByte(0xe4 + 8 * instr.dst); } - void JitCompilerX86::h_FSQRT_R(Instruction& instr, int i) { + template + void JitCompilerX86::h_FSQRT_R(Instruction& instr, int i) { instr.dst %= RegisterCountFlt; emit(SQRTPD); emitByte(0xe4 + 9 * instr.dst); } - void JitCompilerX86::h_CFROUND(Instruction& instr, int i) { + template + void JitCompilerX86::h_CFROUND(Instruction& instr, int i) { + //mov rax {src} emit(REX_MOV_RR64); emitByte(0xc0 + instr.src); int rotate = (13 - (instr.getImm32() & 63)) & 63; if (rotate != 0) { + //rol rax, {imm32} emit(ROL_RAX); emitByte(rotate); } + if (vmFlags & RANDOMX_FLAG_V2) { + //test eax, 491520 + //jnz next + emit(TEST_EAX_60SL13); + emitByte(JNZ_SHORT); + emitByte(sizeof(AND_OR_MOV_LDMXCSR)); + } + //and eax, 24576 + //or eax, 40896 + //mov dword ptr [rsp], eax + //ldmxcsr dword ptr [rsp] emit(AND_OR_MOV_LDMXCSR); } - void JitCompilerX86::h_CBRANCH(Instruction& instr, int i) { + template + void JitCompilerX86::h_CBRANCH(Instruction& instr, int i) { int reg = instr.dst; int target = registerUsage[reg] + 1; emit(REX_ADD_I); @@ -797,21 +931,24 @@ namespace randomx { } } - void JitCompilerX86::h_ISTORE(Instruction& instr, int i) { + template + void JitCompilerX86::h_ISTORE(Instruction& instr, int i) { genAddressRegDst(instr); emit(REX_MOV_MR); emitByte(0x04 + 8 * instr.src); emitByte(0x06); } - void JitCompilerX86::h_NOP(Instruction& instr, int i) { + template + void JitCompilerX86::h_NOP(Instruction& instr, int i) { emit(NOP1); } #include "instruction_weights.hpp" #define INST_HANDLE(x) REPN(&JitCompilerX86::h_##x, WT(x)) - InstructionGeneratorX86 JitCompilerX86::engine[256] = { + template + InstructionGeneratorX86 JitCompilerX86::engine[256] = { INST_HANDLE(IADD_RS) INST_HANDLE(IADD_M) INST_HANDLE(ISUB_R) @@ -844,4 +981,7 @@ namespace randomx { INST_HANDLE(NOP) }; + template class JitCompilerX86; + template class JitCompilerX86; + template class JitCompilerX86; } diff --git a/src/jit_compiler_x86.hpp b/src/jit_compiler_x86.hpp index e95685f9..20884660 100644 --- a/src/jit_compiler_x86.hpp +++ b/src/jit_compiler_x86.hpp @@ -38,19 +38,21 @@ namespace randomx { class Program; struct ProgramConfiguration; class SuperscalarProgram; + template class JitCompilerX86; class Instruction; - typedef void(JitCompilerX86::*InstructionGeneratorX86)(Instruction&, int); + template + using InstructionGeneratorX86 = void(JitCompilerX86::*)(Instruction&, int); + template class JitCompilerX86 { public: JitCompilerX86(); ~JitCompilerX86(); void generateProgram(Program&, ProgramConfiguration&); void generateProgramLight(Program&, ProgramConfiguration&, uint32_t); - template - void generateSuperscalarHash(SuperscalarProgram (&programs)[N], std::vector &); + void generateSuperscalarHash(SuperscalarProgramList &programs, std::vector &); void generateDatasetInitCode(); ProgramFunc* getProgramFunc() { return (ProgramFunc*)code; @@ -66,7 +68,7 @@ namespace randomx { void enableExecution(); void enableAll(); private: - static InstructionGeneratorX86 engine[256]; + static InstructionGeneratorX86 engine[256]; std::vector instructionOffsets; int registerUsage[RegistersCount]; uint8_t* code; @@ -82,6 +84,8 @@ namespace randomx { void generateCode(Instruction&, int); void generateSuperscalarCode(Instruction &, std::vector &); + void alignCode(int align); + void emitByte(uint8_t val) { code[codePos] = val; codePos++; diff --git a/src/jit_compiler_x86_static.S b/src/jit_compiler_x86_static.S index 9193b21a..6d0f1dde 100644 --- a/src/jit_compiler_x86_static.S +++ b/src/jit_compiler_x86_static.S @@ -47,7 +47,11 @@ .global DECL(randomx_program_read_dataset_sshash_init) .global DECL(randomx_program_read_dataset_sshash_fin) .global DECL(randomx_program_loop_store) +.global DECL(randomx_program_loop_store_hard_aes) +.global DECL(randomx_program_loop_store_soft_aes) .global DECL(randomx_program_loop_end) +.global DECL(randomx_program_soft_aes) +.global DECL(randomx_program_soft_aes_end) .global DECL(randomx_dataset_init) .global DECL(randomx_program_epilogue) .global DECL(randomx_sshash_load) @@ -64,6 +68,7 @@ #define RANDOMX_CACHE_MASK (RANDOMX_ARGON_MEMORY*16-1) #define RANDOMX_ALIGN 4096 #define SUPERSCALAR_OFFSET ((((RANDOMX_ALIGN + 32 * RANDOMX_PROGRAM_SIZE) - 1) / (RANDOMX_ALIGN) + 1) * (RANDOMX_ALIGN)) +#define RIP_REL rip #define db .byte @@ -119,8 +124,28 @@ DECL(randomx_program_read_dataset_sshash_fin): DECL(randomx_program_loop_store): #include "asm/program_loop_store.inc" +DECL(randomx_program_loop_store_hard_aes): + #include "asm/program_loop_store_hard_aes.inc" + +DECL(randomx_program_loop_store_soft_aes): + #include "asm/program_loop_store_soft_aes.inc" + DECL(randomx_program_loop_end): - nop + sub ebx, 1 + jnz rx_program_end + jmp rx_program_end + +DECL(randomx_program_soft_aes): +soft_aes_enc: + #include "asm/program_soft_aes_enc.inc" +soft_aes_dec: + #include "asm/program_soft_aes_dec.inc" + +DECL(randomx_program_soft_aes_end): +aes_lut_enc: + db 0, 0, 0, 0, 0, 0, 0, 0 +aes_lut_dec: + db 0, 0, 0, 0, 0, 0, 0, 0 .balign 64 DECL(randomx_dataset_init): @@ -223,6 +248,7 @@ DECL(randomx_program_end): rx_program_end: nop +.balign 64 DECL(randomx_reciprocal_fast): #if !defined(WINABI) mov rcx, rdi diff --git a/src/jit_compiler_x86_static.asm b/src/jit_compiler_x86_static.asm index a37c716c..950e57f3 100644 --- a/src/jit_compiler_x86_static.asm +++ b/src/jit_compiler_x86_static.asm @@ -39,7 +39,11 @@ PUBLIC randomx_program_read_dataset_sshash_init PUBLIC randomx_program_read_dataset_sshash_fin PUBLIC randomx_dataset_init PUBLIC randomx_program_loop_store +PUBLIC randomx_program_loop_store_hard_aes +PUBLIC randomx_program_loop_store_soft_aes PUBLIC randomx_program_loop_end +PUBLIC randomx_program_soft_aes +PUBLIC randomx_program_soft_aes_end PUBLIC randomx_program_epilogue PUBLIC randomx_sshash_load PUBLIC randomx_sshash_prefetch @@ -55,6 +59,7 @@ RANDOMX_DATASET_BASE_MASK EQU (RANDOMX_DATASET_BASE_SIZE-64) RANDOMX_CACHE_MASK EQU (RANDOMX_ARGON_MEMORY*16-1) RANDOMX_ALIGN EQU 4096 SUPERSCALAR_OFFSET EQU ((((RANDOMX_ALIGN + 32 * RANDOMX_PROGRAM_SIZE) - 1) / (RANDOMX_ALIGN) + 1) * (RANDOMX_ALIGN)) +RIP_REL EQU 0 randomx_prefetch_scratchpad PROC mov rdx, rax @@ -114,10 +119,34 @@ randomx_program_loop_store PROC include asm/program_loop_store.inc randomx_program_loop_store ENDP +randomx_program_loop_store_hard_aes PROC + include asm/program_loop_store_hard_aes.inc +randomx_program_loop_store_hard_aes ENDP + +randomx_program_loop_store_soft_aes PROC + include asm/program_loop_store_soft_aes.inc +randomx_program_loop_store_soft_aes ENDP + randomx_program_loop_end PROC - nop + sub ebx, 1 + jnz rx_program_end + jmp rx_program_end randomx_program_loop_end ENDP +randomx_program_soft_aes PROC +soft_aes_enc:: + include asm/program_soft_aes_enc.inc +soft_aes_dec:: + include asm/program_soft_aes_dec.inc +randomx_program_soft_aes ENDP + +randomx_program_soft_aes_end PROC +aes_lut_enc:: + db 0, 0, 0, 0, 0, 0, 0, 0 +aes_lut_dec:: + db 0, 0, 0, 0, 0, 0, 0, 0 +randomx_program_soft_aes_end ENDP + ALIGN 64 randomx_dataset_init PROC push rbx @@ -212,6 +241,7 @@ rx_program_end:: nop randomx_program_end ENDP +ALIGN 64 randomx_reciprocal_fast PROC include asm/randomx_reciprocal.inc randomx_reciprocal_fast ENDP diff --git a/src/jit_compiler_x86_static.hpp b/src/jit_compiler_x86_static.hpp index fe32a8b7..2e8ba8b0 100644 --- a/src/jit_compiler_x86_static.hpp +++ b/src/jit_compiler_x86_static.hpp @@ -39,7 +39,11 @@ extern "C" { void randomx_program_read_dataset_sshash_init(); void randomx_program_read_dataset_sshash_fin(); void randomx_program_loop_store(); + void randomx_program_loop_store_hard_aes(); + void randomx_program_loop_store_soft_aes(); void randomx_program_loop_end(); + void randomx_program_soft_aes(); + void randomx_program_soft_aes_end(); void randomx_dataset_init(); void randomx_program_epilogue(); void randomx_sshash_load(); diff --git a/src/randomx.h b/src/randomx.h index 313bcd2e..ed0cf4a1 100644 --- a/src/randomx.h +++ b/src/randomx.h @@ -48,7 +48,8 @@ typedef enum { RANDOMX_FLAG_SECURE = 16, RANDOMX_FLAG_ARGON2_SSSE3 = 32, RANDOMX_FLAG_ARGON2_AVX2 = 64, - RANDOMX_FLAG_ARGON2 = 96 + RANDOMX_FLAG_ARGON2 = 96, + RANDOMX_FLAG_V2 = 128, } randomx_flags; typedef struct randomx_dataset randomx_dataset; diff --git a/src/soft_aes.cpp b/src/soft_aes.cpp index 3e82fa2e..a9078445 100644 --- a/src/soft_aes.cpp +++ b/src/soft_aes.cpp @@ -28,26 +28,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "soft_aes.h" -alignas(16) const uint8_t sbox[256] = { - 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, - 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, - 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, - 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, - 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, - 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, - 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, - 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, - 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, - 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, - 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, - 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, - 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, - 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, - 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, - 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16, -}; - -alignas(16) const uint32_t lutEnc0[256] = { +extern "C" const uint32_t randomx_aes_lut_enc[4][256] = { +{ 0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6, 0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591, 0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56, 0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec, 0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa, 0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb, @@ -80,9 +62,8 @@ alignas(16) const uint32_t lutEnc0[256] = { 0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9, 0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5, 0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a, 0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0, 0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e, 0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c, -}; - -alignas(16) const uint32_t lutEnc1[256] = { +}, +{ 0x6363c6a5, 0x7c7cf884, 0x7777ee99, 0x7b7bf68d, 0xf2f2ff0d, 0x6b6bd6bd, 0x6f6fdeb1, 0xc5c59154, 0x30306050, 0x01010203, 0x6767cea9, 0x2b2b567d, 0xfefee719, 0xd7d7b562, 0xabab4de6, 0x7676ec9a, 0xcaca8f45, 0x82821f9d, 0xc9c98940, 0x7d7dfa87, 0xfafaef15, 0x5959b2eb, 0x47478ec9, 0xf0f0fb0b, @@ -115,9 +96,8 @@ alignas(16) const uint32_t lutEnc1[256] = { 0x9b9b2db6, 0x1e1e3c22, 0x87871592, 0xe9e9c920, 0xcece8749, 0x5555aaff, 0x28285078, 0xdfdfa57a, 0x8c8c038f, 0xa1a159f8, 0x89890980, 0x0d0d1a17, 0xbfbf65da, 0xe6e6d731, 0x424284c6, 0x6868d0b8, 0x414182c3, 0x999929b0, 0x2d2d5a77, 0x0f0f1e11, 0xb0b07bcb, 0x5454a8fc, 0xbbbb6dd6, 0x16162c3a, -}; - -alignas(16) const uint32_t lutEnc2[256] = { +}, +{ 0x63c6a563, 0x7cf8847c, 0x77ee9977, 0x7bf68d7b, 0xf2ff0df2, 0x6bd6bd6b, 0x6fdeb16f, 0xc59154c5, 0x30605030, 0x01020301, 0x67cea967, 0x2b567d2b, 0xfee719fe, 0xd7b562d7, 0xab4de6ab, 0x76ec9a76, 0xca8f45ca, 0x821f9d82, 0xc98940c9, 0x7dfa877d, 0xfaef15fa, 0x59b2eb59, 0x478ec947, 0xf0fb0bf0, @@ -150,9 +130,8 @@ alignas(16) const uint32_t lutEnc2[256] = { 0x9b2db69b, 0x1e3c221e, 0x87159287, 0xe9c920e9, 0xce8749ce, 0x55aaff55, 0x28507828, 0xdfa57adf, 0x8c038f8c, 0xa159f8a1, 0x89098089, 0x0d1a170d, 0xbf65dabf, 0xe6d731e6, 0x4284c642, 0x68d0b868, 0x4182c341, 0x9929b099, 0x2d5a772d, 0x0f1e110f, 0xb07bcbb0, 0x54a8fc54, 0xbb6dd6bb, 0x162c3a16, -}; - -alignas(16) const uint32_t lutEnc3[256] = { +}, +{ 0xc6a56363, 0xf8847c7c, 0xee997777, 0xf68d7b7b, 0xff0df2f2, 0xd6bd6b6b, 0xdeb16f6f, 0x9154c5c5, 0x60503030, 0x02030101, 0xcea96767, 0x567d2b2b, 0xe719fefe, 0xb562d7d7, 0x4de6abab, 0xec9a7676, 0x8f45caca, 0x1f9d8282, 0x8940c9c9, 0xfa877d7d, 0xef15fafa, 0xb2eb5959, 0x8ec94747, 0xfb0bf0f0, @@ -185,9 +164,10 @@ alignas(16) const uint32_t lutEnc3[256] = { 0x2db69b9b, 0x3c221e1e, 0x15928787, 0xc920e9e9, 0x8749cece, 0xaaff5555, 0x50782828, 0xa57adfdf, 0x038f8c8c, 0x59f8a1a1, 0x09808989, 0x1a170d0d, 0x65dabfbf, 0xd731e6e6, 0x84c64242, 0xd0b86868, 0x82c34141, 0x29b09999, 0x5a772d2d, 0x1e110f0f, 0x7bcbb0b0, 0xa8fc5454, 0x6dd6bbbb, 0x2c3a1616, -}; +} }; -alignas(16) const uint32_t lutDec0[256] = { +extern "C" const uint32_t randomx_aes_lut_dec[4][256] = { +{ 0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a, 0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b, 0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5, 0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5, 0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d, 0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b, @@ -220,9 +200,8 @@ alignas(16) const uint32_t lutDec0[256] = { 0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773, 0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478, 0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2, 0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff, 0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664, 0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0, -}; - -alignas(16) const uint32_t lutDec1[256] = { +}, +{ 0xa7f45150, 0x65417e53, 0xa4171ac3, 0x5e273a96, 0x6bab3bcb, 0x459d1ff1, 0x58faacab, 0x03e34b93, 0xfa302055, 0x6d76adf6, 0x76cc8891, 0x4c02f525, 0xd7e54ffc, 0xcb2ac5d7, 0x44352680, 0xa362b58f, 0x5ab1de49, 0x1bba2567, 0x0eea4598, 0xc0fe5de1, 0x752fc302, 0xf04c8112, 0x97468da3, 0xf9d36bc6, @@ -255,9 +234,8 @@ alignas(16) const uint32_t lutDec1[256] = { 0xdfd29c59, 0x73f2553f, 0xce141879, 0x37c773bf, 0xcdf753ea, 0xaafd5f5b, 0x6f3ddf14, 0xdb447886, 0xf3afca81, 0xc468b93e, 0x3424382c, 0x40a3c25f, 0xc31d1672, 0x25e2bc0c, 0x493c288b, 0x950dff41, 0x01a83971, 0xb30c08de, 0xe4b4d89c, 0xc1566490, 0x84cb7b61, 0xb632d570, 0x5c6c4874, 0x57b8d042, -}; - -alignas(16) const uint32_t lutDec2[256] = { +}, +{ 0xf45150a7, 0x417e5365, 0x171ac3a4, 0x273a965e, 0xab3bcb6b, 0x9d1ff145, 0xfaacab58, 0xe34b9303, 0x302055fa, 0x76adf66d, 0xcc889176, 0x02f5254c, 0xe54ffcd7, 0x2ac5d7cb, 0x35268044, 0x62b58fa3, 0xb1de495a, 0xba25671b, 0xea45980e, 0xfe5de1c0, 0x2fc30275, 0x4c8112f0, 0x468da397, 0xd36bc6f9, @@ -290,9 +268,8 @@ alignas(16) const uint32_t lutDec2[256] = { 0xd29c59df, 0xf2553f73, 0x141879ce, 0xc773bf37, 0xf753eacd, 0xfd5f5baa, 0x3ddf146f, 0x447886db, 0xafca81f3, 0x68b93ec4, 0x24382c34, 0xa3c25f40, 0x1d1672c3, 0xe2bc0c25, 0x3c288b49, 0x0dff4195, 0xa8397101, 0x0c08deb3, 0xb4d89ce4, 0x566490c1, 0xcb7b6184, 0x32d570b6, 0x6c48745c, 0xb8d04257, -}; - -alignas(16) const uint32_t lutDec3[256] = { +}, +{ 0x5150a7f4, 0x7e536541, 0x1ac3a417, 0x3a965e27, 0x3bcb6bab, 0x1ff1459d, 0xacab58fa, 0x4b9303e3, 0x2055fa30, 0xadf66d76, 0x889176cc, 0xf5254c02, 0x4ffcd7e5, 0xc5d7cb2a, 0x26804435, 0xb58fa362, 0xde495ab1, 0x25671bba, 0x45980eea, 0x5de1c0fe, 0xc302752f, 0x8112f04c, 0x8da39746, 0x6bc6f9d3, @@ -325,7 +302,12 @@ alignas(16) const uint32_t lutDec3[256] = { 0x9c59dfd2, 0x553f73f2, 0x1879ce14, 0x73bf37c7, 0x53eacdf7, 0x5f5baafd, 0xdf146f3d, 0x7886db44, 0xca81f3af, 0xb93ec468, 0x382c3424, 0xc25f40a3, 0x1672c31d, 0xbc0c25e2, 0x288b493c, 0xff41950d, 0x397101a8, 0x08deb30c, 0xd89ce4b4, 0x6490c156, 0x7b6184cb, 0xd570b632, 0x48745c6c, 0xd04257b8, -}; +} }; + +#define lutEnc0 randomx_aes_lut_enc[0] +#define lutEnc1 randomx_aes_lut_enc[1] +#define lutEnc2 randomx_aes_lut_enc[2] +#define lutEnc3 randomx_aes_lut_enc[3] rx_vec_i128 soft_aesenc(rx_vec_i128 in, rx_vec_i128 key) { uint32_t s0, s1, s2, s3; @@ -345,6 +327,11 @@ rx_vec_i128 soft_aesenc(rx_vec_i128 in, rx_vec_i128 key) { return rx_xor_vec_i128(out, key); } +#define lutDec0 randomx_aes_lut_dec[0] +#define lutDec1 randomx_aes_lut_dec[1] +#define lutDec2 randomx_aes_lut_dec[2] +#define lutDec3 randomx_aes_lut_dec[3] + rx_vec_i128 soft_aesdec(rx_vec_i128 in, rx_vec_i128 key) { uint32_t s0, s1, s2, s3; diff --git a/src/soft_aes.h b/src/soft_aes.h index 254f8d63..7acd295b 100644 --- a/src/soft_aes.h +++ b/src/soft_aes.h @@ -31,6 +31,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "intrin_portable.h" +extern "C" const uint32_t randomx_aes_lut_enc[4][256]; +extern "C" const uint32_t randomx_aes_lut_dec[4][256]; + rx_vec_i128 soft_aesenc(rx_vec_i128 in, rx_vec_i128 key); rx_vec_i128 soft_aesdec(rx_vec_i128 in, rx_vec_i128 key); diff --git a/src/tests/benchmark.cpp b/src/tests/benchmark.cpp index d25d0c2c..29ec404a 100644 --- a/src/tests/benchmark.cpp +++ b/src/tests/benchmark.cpp @@ -395,7 +395,7 @@ int main(int argc, char** argv) { std::cout << "Calculated result: "; result.print(std::cout); if (noncesCount == 1000 && seedValue == 0 && !commit) - std::cout << "Reference result: 10b649a3f15c7c7f88277812f2e74b337a0f20ce909af09199cccb960771cfa1" << std::endl; + std::cout << "Reference result: ff5326fbba7402e7af3373b25f10dbf71be0a4be91fc5a0db6af8b9faf708ed3" << std::endl; if (!miningMode) { std::cout << "Performance: " << 1000 * elapsed / noncesCount << " ms per hash" << std::endl; } diff --git a/src/vm_interpreted.cpp b/src/vm_interpreted.cpp index 64243c3e..aa0c6235 100644 --- a/src/vm_interpreted.cpp +++ b/src/vm_interpreted.cpp @@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dataset.hpp" #include "intrin_portable.h" #include "reciprocal.h" +#include "soft_aes.h" namespace randomx { @@ -92,11 +93,28 @@ namespace randomx { for (unsigned i = 0; i < RegistersCount; ++i) store64(scratchpad + spAddr1 + 8 * i, nreg.r[i]); + rx_vec_i128 ekey[RegisterCountFlt]; + rx_vec_i128 freg[RegisterCountFlt]; + + for (unsigned i = 0; i < RegisterCountFlt; ++i) { + ekey[i] = rx_cast_vec_f2i(nreg.e[i]); + freg[i] = rx_cast_vec_f2i(nreg.f[i]); + } + + for (unsigned i = 0; i < RegisterCountFlt; ++i) { + freg[0] = aesenc(freg[0], ekey[i]); + freg[1] = aesdec(freg[1], ekey[i]); + freg[2] = aesenc(freg[2], ekey[i]); + freg[3] = aesdec(freg[3], ekey[i]); + } + for (unsigned i = 0; i < RegisterCountFlt; ++i) - nreg.f[i] = rx_xor_vec_f128(nreg.f[i], nreg.e[i]); + nreg.f[i] = rx_cast_vec_i2f(freg[i]); + + uint8_t* fStoreAddr = scratchpad + spAddr0; for (unsigned i = 0; i < RegisterCountFlt; ++i) - rx_store_vec_f128((double*)(scratchpad + spAddr0 + 16 * i), nreg.f[i]); + rx_store_vec_f128((double*)(fStoreAddr + 16 * i), nreg.f[i]); spAddr0 = 0; spAddr1 = 0;