Skip to content

Commit

Permalink
replace simd_memcpy with official memcpy
Browse files Browse the repository at this point in the history
Signed-off-by: Duc Tri Nguyen <[email protected]>
  • Loading branch information
cothan committed Jun 10, 2024
1 parent de77796 commit f1c232d
Showing 1 changed file with 11 additions and 75 deletions.
86 changes: 11 additions & 75 deletions mlkem/asm/clean/rej_uniform_asm.s
Original file line number Diff line number Diff line change
@@ -1,78 +1,13 @@
// SPDX-License-Identifier: Apache-2.0

/*************************************************
* Name: simd_memcpy
*
* Function: void simd_memcpy(void *dst, void *src, size_t len);
* Description: memcpy for small buffers using SIMD instructions.
*              Lengths below 32 are copied one byte at a time.
*              Otherwise dst is first advanced byte-wise to a
*              16-byte boundary, then 32-byte chunks are moved
*              through q0/q1, and the remaining (< 32) bytes are
*              copied one byte at a time.
*
* Arguments: - void *dst: pointer to destination buffer (x0)
*            - void *src: pointer to source buffer (x1)
*            - size_t len: number of bytes to copy (x2), unsigned
*
* Modifies:  x0, x1, x2 (advanced / consumed), w12, x13, x14, x15,
*            q0, q1 and the condition flags. Nothing is returned.
**************************************************/
        .p2align 4
        .global simd_memcpy
        .global _simd_memcpy
simd_memcpy:
_simd_memcpy:
        /* Input registers */
        dst             .req x0
        src             .req x1
        copylen         .req x2

        /* Temporary registers, no need to preserve */
        data            .req w12
        aligned_addr    .req x13
        aligned_len     .req x14
        unaligned_len   .req x15

        // len is a size_t: test for exactly zero instead of a signed "<= 0",
        // which would also skip every length that has bit 63 set.
        cbz     copylen, simd_memcpy_end
        cmp     copylen, #32
        b.lo    simd_memcpy_bytes               // fewer than 32 bytes: byte loop only

        // Advance dst to the next 16-byte boundary.
        and     aligned_addr, dst, #15          // aligned_addr = dst % 16
        cbz     aligned_addr, simd_memcpy_chunks // already aligned

        mov     unaligned_len, #16
        sub     unaligned_len, unaligned_len, aligned_addr // 1..15 bytes to boundary
        // Invariant: copylen >= 32 > unaligned_len, so this cannot underflow
        // and no extra bounds check is needed here.
        sub     copylen, copylen, unaligned_len

simd_memcpy_head:
        ldrb    data, [src], #1
        strb    data, [dst], #1
        subs    unaligned_len, unaligned_len, #1
        b.ne    simd_memcpy_head

simd_memcpy_chunks:
        // Main copy loop using 128-bit Neon registers
        lsr     aligned_len, copylen, #5        // number of whole 32-byte chunks
        cbz     aligned_len, simd_memcpy_tail

simd_memcpy_loop:
        ldp     q0, q1, [src], #32
        stp     q0, q1, [dst], #32
        subs    aligned_len, aligned_len, #1
        b.ne    simd_memcpy_loop

simd_memcpy_tail:
        // Remaining bytes after the 32-byte chunks
        and     copylen, copylen, #31
        cbz     copylen, simd_memcpy_end

simd_memcpy_bytes:
        // Invariant: copylen > 0 on entry to this loop
        ldrb    data, [src], #1
        strb    data, [dst], #1
        subs    copylen, copylen, #1
        b.ne    simd_memcpy_bytes

simd_memcpy_end:
        ret

        /* Release the register aliases so they do not leak into later code */
        .unreq  dst
        .unreq  src
        .unreq  copylen
        .unreq  data
        .unreq  aligned_addr
        .unreq  aligned_len
        .unreq  unaligned_len

/*
 * MEMCPY dst, src, len
 *
 * Emits a branch-with-link to the C library memcpy. Only the symbol
 * spelling depends on the platform: a leading underscore is used when
 * __APPLE__ is defined.
 *
 * The dst/src/len operands are NOT expanded inside the macro body; they
 * only document the call site. The branch is emitted as-is, so the
 * argument registers must already be loaded before the macro is used.
 * bl overwrites the link register x30.
 */
#ifdef __APPLE__
#define MEMCPY_SYMBOL _memcpy
#else
#define MEMCPY_SYMBOL memcpy
#endif
.macro MEMCPY dst, src, len
        bl      MEMCPY_SYMBOL
.endm
/*************************************************
* Name: rej_uniform_asm
*
Expand Down Expand Up @@ -362,7 +297,8 @@ _rej_uniform_asm:

ubfiz x2, min, #1, #32
mov x1, sp_copy
bl simd_memcpy

MEMCPY x0, x1, x2

return:

Expand Down

0 comments on commit f1c232d

Please sign in to comment.