Skip to content

Commit

Permalink
AArch64: Add native poly_frombytes() implementation
Browse files Browse the repository at this point in the history
This commit adds an AArch64 implementation for `poly_frombytes()`.
Like the already existing `poly_tobytes()`, we do not yet optimize
it using SLOTHY, but work with the clean version in both the clean
and the optimized backend. Applying SLOTHY to both needs work
on the (micro)architecture models first.

Signed-off-by: Hanno Becker <[email protected]>
  • Loading branch information
hanno-becker committed Dec 17, 2024
1 parent 2dd10c1 commit 08da95f
Show file tree
Hide file tree
Showing 5 changed files with 130 additions and 0 deletions.
6 changes: 6 additions & 0 deletions mlkem/native/aarch64/src/arith_native_aarch64.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,12 @@ void poly_tobytes_asm_clean(uint8_t *r, const int16_t *a);
#define poly_tobytes_asm_opt MLKEM_NAMESPACE(poly_tobytes_asm_opt)
void poly_tobytes_asm_opt(uint8_t *r, const int16_t *a);

/* Namespaced symbol name of the "clean" AArch64 assembly poly_frombytes(). */
#define poly_frombytes_asm_clean MLKEM_NAMESPACE(poly_frombytes_asm_clean)
/*
 * Unpack the byte string r into the 16-bit coefficient array a.
 * The assembly routine reads 16 * 24 = 384 bytes from r and stores
 * 16 * 32 = 512 bytes to a, i.e. 256 halfwords holding 12-bit values.
 */
void poly_frombytes_asm_clean(int16_t *a, uint8_t const *r);

/* Namespaced symbol name of the "opt" AArch64 assembly poly_frombytes(). */
#define poly_frombytes_asm_opt MLKEM_NAMESPACE(poly_frombytes_asm_opt)
/*
 * Same contract as poly_frombytes_asm_clean(): reads 384 bytes from r and
 * stores 256 halfwords holding 12-bit values to a.
 */
void poly_frombytes_asm_opt(int16_t *a, uint8_t const *r);

#define polyvec_basemul_acc_montgomery_cached_asm_clean \
MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean)
void polyvec_basemul_acc_montgomery_cached_asm_clean(int16_t *r,
Expand Down
7 changes: 7 additions & 0 deletions mlkem/native/aarch64/src/clean_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#define MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE
#define MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED
#define MLKEM_USE_NATIVE_POLY_TOBYTES
#define MLKEM_USE_NATIVE_POLY_FROMBYTES
#define MLKEM_USE_NATIVE_REJ_UNIFORM

static INLINE void ntt_native(poly *data)
Expand Down Expand Up @@ -64,6 +65,12 @@ static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
poly_tobytes_asm_clean(r, a->coeffs);
}

/*
 * Native-backend hook for poly_frombytes() in the "clean" AArch64 backend.
 * Hands the coefficient storage of a and the input bytes r to the assembly
 * routine poly_frombytes_asm_clean().
 */
static INLINE void poly_frombytes_native(poly *a,
                                         uint8_t const r[MLKEM_POLYBYTES])
{
  int16_t *coeffs = a->coeffs;

  poly_frombytes_asm_clean(coeffs, r);
}

static INLINE int rej_uniform_native(int16_t *r, unsigned int len,
const uint8_t *buf, unsigned int buflen)
{
Expand Down
7 changes: 7 additions & 0 deletions mlkem/native/aarch64/src/opt_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#define MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE
#define MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED
#define MLKEM_USE_NATIVE_POLY_TOBYTES
#define MLKEM_USE_NATIVE_POLY_FROMBYTES
#define MLKEM_USE_NATIVE_REJ_UNIFORM

#define NTT_BOUND_NATIVE (6 * MLKEM_Q)
Expand Down Expand Up @@ -65,6 +66,12 @@ static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
poly_tobytes_asm_opt(r, a->coeffs);
}

/*
 * Native-backend hook for poly_frombytes() in the "opt" AArch64 backend.
 * Hands the coefficient storage of a and the input bytes r to the assembly
 * routine poly_frombytes_asm_opt().
 */
static INLINE void poly_frombytes_native(poly *a,
                                         uint8_t const r[MLKEM_POLYBYTES])
{
  int16_t *coeffs = a->coeffs;

  poly_frombytes_asm_opt(coeffs, r);
}

static INLINE int rej_uniform_native(int16_t *r, unsigned int len,
const uint8_t *buf, unsigned int buflen)
{
Expand Down
55 changes: 55 additions & 0 deletions mlkem/native/aarch64/src/poly_clean.S
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,61 @@ poly_tobytes_asm_clean_asm_loop_start:
.unreq src
.unreq count

/********************************************
 *             poly_frombytes()             *
 ********************************************/

/*
 * void poly_frombytes_asm_clean(int16_t *a, uint8_t const *r)
 *
 * Unpacks consecutive 3-byte groups (t0, t1, t2) read from r into pairs of
 * 12-bit values, each stored in a 16-bit lane at a:
 *
 *   a[2 * i + 0] = (t0 | (t1 << 8)) & 0xFFF
 *   a[2 * i + 1] = (t1 >> 4) | (t2 << 4)
 *
 * Each of the 16 loop iterations consumes 24 bytes and produces 16
 * halfwords (32 bytes): 384 bytes in, 256 halfwords out in total.
 *
 * In:       x0 = a (destination), x1 = r (source)
 * Clobbers: x0, x1, x2, v0-v4
 */
        .global MLKEM_NAMESPACE(poly_frombytes_asm_clean)
        .global _MLKEM_NAMESPACE(poly_frombytes_asm_clean)

        out0    .req v0
        out1    .req v1

        in0     .req v2
        in1     .req v3
        in2     .req v4

        dst     .req x0
        src     .req x1
        count   .req x2

MLKEM_NAMESPACE(poly_frombytes_asm_clean):
_MLKEM_NAMESPACE(poly_frombytes_asm_clean):

        mov     count, #16
poly_frombytes_asm_clean_asm_loop_start:
        // De-interleaving load of 8 three-byte groups:
        // in0 = all t0 bytes, in1 = all t1 bytes, in2 = all t2 bytes
        ld3     {in0.8b, in1.8b, in2.8b}, [src], #24

        // This is little-endian specific:
        // byte-interleaving t0 and t1 forms the halfwords t0 | (t1 << 8)
        zip1    out0.16b, in0.16b, in1.16b
        // Clear bits 12-15: a[2 * i + 0] = (t0 | (t1 << 8)) & 0xFFF
        bic     out0.8h, #0xF0, lsl #8

        // out1 = (t2 << 4), widened to 16-bit
        ushll   out1.8h, in2.8b, #4
        // in1 = (t1 >> 4), in 8-bit
        ushr    in1.8b, in1.8b, #4
        // in1 = (t1 >> 4), zero-extended to 16-bit
        uxtl    in1.8h, in1.8b
        // a[2 * i + 1] = (t1 >> 4) | (t2 << 4); the operands occupy
        // disjoint bits (0-3 vs. 4-11), so EOR acts as OR here
        eor     out1.16b, out1.16b, in1.16b

        // Interleaving store: out0[0], out1[0], out0[1], out1[1], ...
        st2     {out0.8h, out1.8h}, [dst], #32

        // CBNZ tests the register directly, so no flag-setting SUBS is needed
        sub     count, count, #1
        cbnz    count, poly_frombytes_asm_clean_asm_loop_start
        ret

        // Release exactly the aliases introduced for this routine
        .unreq out0
        .unreq out1
        .unreq in0
        .unreq in1
        .unreq in2
        .unreq dst
        .unreq src
        .unreq count

/**********************************
* poly_tomont() *
**********************************/
Expand Down
55 changes: 55 additions & 0 deletions mlkem/native/aarch64/src/poly_opt.S
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,61 @@ poly_tobytes_asm_opt_asm_loop_start:
.unreq src
.unreq count

/********************************************
 *             poly_frombytes()             *
 ********************************************/

/*
 * void poly_frombytes_asm_opt(int16_t *a, uint8_t const *r)
 *
 * Unpacks consecutive 3-byte groups (t0, t1, t2) read from r into pairs of
 * 12-bit values, each stored in a 16-bit lane at a:
 *
 *   a[2 * i + 0] = (t0 | (t1 << 8)) & 0xFFF
 *   a[2 * i + 1] = (t1 >> 4) | (t2 << 4)
 *
 * Each of the 16 loop iterations consumes 24 bytes and produces 16
 * halfwords (32 bytes): 384 bytes in, 256 halfwords out in total.
 *
 * In:       x0 = a (destination), x1 = r (source)
 * Clobbers: x0, x1, x2, v0-v4
 */
        .global MLKEM_NAMESPACE(poly_frombytes_asm_opt)
        .global _MLKEM_NAMESPACE(poly_frombytes_asm_opt)

        out0    .req v0
        out1    .req v1

        in0     .req v2
        in1     .req v3
        in2     .req v4

        dst     .req x0
        src     .req x1
        count   .req x2

MLKEM_NAMESPACE(poly_frombytes_asm_opt):
_MLKEM_NAMESPACE(poly_frombytes_asm_opt):

        mov     count, #16
poly_frombytes_asm_opt_asm_loop_start:
        // De-interleaving load of 8 three-byte groups:
        // in0 = all t0 bytes, in1 = all t1 bytes, in2 = all t2 bytes
        ld3     {in0.8b, in1.8b, in2.8b}, [src], #24

        // This is little-endian specific:
        // byte-interleaving t0 and t1 forms the halfwords t0 | (t1 << 8)
        zip1    out0.16b, in0.16b, in1.16b
        // Clear bits 12-15: a[2 * i + 0] = (t0 | (t1 << 8)) & 0xFFF
        bic     out0.8h, #0xF0, lsl #8

        // out1 = (t2 << 4), widened to 16-bit
        ushll   out1.8h, in2.8b, #4
        // in1 = (t1 >> 4), in 8-bit
        ushr    in1.8b, in1.8b, #4
        // in1 = (t1 >> 4), zero-extended to 16-bit
        uxtl    in1.8h, in1.8b
        // a[2 * i + 1] = (t1 >> 4) | (t2 << 4); the operands occupy
        // disjoint bits (0-3 vs. 4-11), so EOR acts as OR here
        eor     out1.16b, out1.16b, in1.16b

        // Interleaving store: out0[0], out1[0], out0[1], out1[1], ...
        st2     {out0.8h, out1.8h}, [dst], #32

        // CBNZ tests the register directly, so no flag-setting SUBS is needed
        sub     count, count, #1
        cbnz    count, poly_frombytes_asm_opt_asm_loop_start
        ret

        // Release exactly the aliases introduced for this routine
        .unreq out0
        .unreq out1
        .unreq in0
        .unreq in1
        .unreq in2
        .unreq dst
        .unreq src
        .unreq count

/**********************************
* poly_tomont() *
**********************************/
Expand Down

0 comments on commit 08da95f

Please sign in to comment.