Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cortex-M7 Dilithium Part 1: Non-NTT functions #130

Merged
merged 8 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
177 changes: 177 additions & 0 deletions example.py
Original file line number Diff line number Diff line change
Expand Up @@ -1551,6 +1551,174 @@ def core(self, slothy):
#############################################################################################


class pointwise_montgomery_dilithium(Example):
    """Example entry for the ``pointwise_montgomery_dilithium`` assembly input."""

    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
        """Register the example.

        A non-empty ``var`` is appended (with an underscore) to both the input
        file name and the example name; the example name additionally gets
        the label of ``target`` as a suffix.
        """
        variant_suffix = f"_{var}" if var != "" else ""
        infile = "pointwise_montgomery_dilithium" + variant_suffix
        name = infile + f"_{target_label_dict[target]}"

        super().__init__(
            infile,
            name,
            rename=True,
            arch=arch,
            target=target,
            timeout=timeout,
            funcname="pqcrystals_dilithium_asm_pointwise_montgomery",
        )

    def core(self, slothy):
        """Configure ``slothy`` and optimize the loop labelled ``1``."""
        cfg = slothy.config
        cfg.outputs = ["r14", "r12"]
        cfg.inputs_are_outputs = True
        cfg.sw_pipelining.enabled = True

        slothy.optimize_loop("1")


class pointwise_acc_montgomery_dilithium(Example):
    """Example entry for the ``pointwise_acc_montgomery_dilithium`` assembly input."""

    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
        """Register the example.

        A non-empty ``var`` is appended (with an underscore) to both the input
        file name and the example name; the example name additionally gets
        the label of ``target`` as a suffix.
        """
        variant_suffix = f"_{var}" if var != "" else ""
        infile = "pointwise_acc_montgomery_dilithium" + variant_suffix
        name = infile + f"_{target_label_dict[target]}"

        super().__init__(
            infile,
            name,
            rename=True,
            arch=arch,
            target=target,
            timeout=timeout,
            funcname="pqcrystals_dilithium_asm_pointwise_acc_montgomery",
        )

    def core(self, slothy):
        """Configure ``slothy`` and optimize the loop labelled ``1``."""
        cfg = slothy.config
        cfg.outputs = ["r12"]
        cfg.inputs_are_outputs = True
        cfg.sw_pipelining.enabled = True

        slothy.optimize_loop("1")



class basemul_257_dilithium(Example):
    """Example entry for the ``basemul_257_dilithium`` assembly input."""

    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
        """Register the example.

        A non-empty ``var`` is appended (with an underscore) to both the input
        file name and the example name; the example name additionally gets
        the label of ``target`` as a suffix.
        """
        variant_suffix = f"_{var}" if var != "" else ""
        infile = "basemul_257_dilithium" + variant_suffix
        name = infile + f"_{target_label_dict[target]}"

        super().__init__(
            infile,
            name,
            rename=True,
            arch=arch,
            target=target,
            timeout=timeout,
            funcname="__asm_point_mul_257_16",
        )

    def core(self, slothy):
        """Configure ``slothy`` and optimize the ``_point_mul_16_loop`` loop."""
        cfg = slothy.config
        cfg.outputs = ["r12", "r14"]
        cfg.inputs_are_outputs = True

        cfg.sw_pipelining.enabled = True
        slothy.optimize_loop("_point_mul_16_loop")


class basemul_257_asymmetric_dilithium(Example):
    """Example entry for the ``basemul_257_asymmetric_dilithium`` assembly input."""

    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
        """Register the example.

        A non-empty ``var`` is appended (with an underscore) to both the input
        file name and the example name; the example name additionally gets
        the label of ``target`` as a suffix.
        """
        variant_suffix = f"_{var}" if var != "" else ""
        infile = "basemul_257_asymmetric_dilithium" + variant_suffix
        name = infile + f"_{target_label_dict[target]}"

        super().__init__(
            infile,
            name,
            rename=True,
            arch=arch,
            target=target,
            timeout=timeout,
            funcname="__asm_asymmetric_mul_257_16",
        )

    def core(self, slothy):
        """Configure ``slothy`` and optimize the ``_asymmetric_mul_16_loop`` loop."""
        cfg = slothy.config
        cfg.outputs = ["r14", "r12"]
        cfg.inputs_are_outputs = True

        cfg.sw_pipelining.enabled = True
        # Explicitly switched off for this example.
        cfg.unsafe_address_offset_fixup = False
        slothy.optimize_loop("_asymmetric_mul_16_loop")




class pointwise_769_dilithium(Example):
    """Example entry for the ``pointwise_769_dilithium`` assembly input."""

    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
        """Register the example.

        A non-empty ``var`` is appended (with an underscore) to both the input
        file name and the example name; the example name additionally gets
        the label of ``target`` as a suffix.
        """
        variant_suffix = f"_{var}" if var != "" else ""
        infile = "pointwise_769_dilithium" + variant_suffix
        name = infile + f"_{target_label_dict[target]}"

        super().__init__(
            infile,
            name,
            rename=True,
            arch=arch,
            target=target,
            timeout=timeout,
            funcname="small_pointmul_asm_769",
        )

    def core(self, slothy):
        """Configure ``slothy`` and optimize ``_point_mul_16_loop`` as a ``CmpLoop``."""
        cfg = slothy.config
        cfg.inputs_are_outputs = True
        cfg.variable_size = True

        # Read-modify-write so that r3 joins whatever registers are already
        # reserved, then hand the updated collection back to the config.
        reserved = cfg.reserved_regs
        reserved.add("r3")
        cfg.reserved_regs = reserved

        cfg.sw_pipelining.enabled = True
        cfg.constraints.stalls_first_attempt = 16
        slothy.optimize_loop(
            "_point_mul_16_loop",
            forced_loop_type=Arch_Armv7M.CmpLoop,
        )



class pointwise_769_asymmetric_dilithium(Example):
    """Example entry for the ``pointwise_769_asymmetric_dilithium`` assembly input."""

    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
        """Register the example.

        A non-empty ``var`` is appended (with an underscore) to both the input
        file name and the example name; the example name additionally gets
        the label of ``target`` as a suffix.
        """
        variant_suffix = f"_{var}" if var != "" else ""
        infile = "pointwise_769_asymmetric_dilithium" + variant_suffix
        name = infile + f"_{target_label_dict[target]}"

        super().__init__(
            infile,
            name,
            rename=True,
            arch=arch,
            target=target,
            timeout=timeout,
            funcname="small_asymmetric_mul_asm_769",
        )

    def core(self, slothy):
        """Configure ``slothy`` and optimize the ``_asymmetric_mul_16_loop`` loop."""
        cfg = slothy.config
        cfg.outputs = ["r10"]
        cfg.inputs_are_outputs = True

        cfg.sw_pipelining.enabled = True
        slothy.optimize_loop("_asymmetric_mul_16_loop")


class reduce32_dilithium(Example):
    """Example entry for the ``reduce32_dilithium`` assembly input."""

    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
        """Register the example.

        A non-empty ``var`` is appended (with an underscore) to both the input
        file name and the example name; the example name additionally gets
        the label of ``target`` as a suffix.
        """
        variant_suffix = f"_{var}" if var != "" else ""
        infile = "reduce32_dilithium" + variant_suffix
        name = infile + f"_{target_label_dict[target]}"

        super().__init__(
            infile,
            name,
            rename=True,
            arch=arch,
            target=target,
            timeout=timeout,
            funcname="pqcrystals_dilithium_asm_reduce32",
        )

    def core(self, slothy):
        """Configure ``slothy`` and optimize the loop labelled ``1``."""
        cfg = slothy.config
        cfg.outputs = ["r10"]
        cfg.inputs_are_outputs = True
        cfg.constraints.stalls_first_attempt = 4
        cfg.sw_pipelining.enabled = True
        slothy.optimize_loop("1")


class caddq_dilithium(Example):
    """Example entry for the ``caddq_dilithium`` assembly input."""

    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
        """Register the example.

        A non-empty ``var`` is appended (with an underscore) to both the input
        file name and the example name; the example name additionally gets
        the label of ``target`` as a suffix.
        """
        variant_suffix = f"_{var}" if var != "" else ""
        infile = "caddq_dilithium" + variant_suffix
        name = infile + f"_{target_label_dict[target]}"

        super().__init__(
            infile,
            name,
            rename=True,
            arch=arch,
            target=target,
            timeout=timeout,
            funcname="pqcrystals_dilithium_asm_caddq",
        )

    def core(self, slothy):
        """Configure ``slothy`` and optimize the loop labelled ``1``."""
        cfg = slothy.config
        cfg.outputs = ["r10"]
        cfg.inputs_are_outputs = True
        cfg.sw_pipelining.enabled = True
        slothy.optimize_loop("1")
def main():
examples = [ Example0(),
Example1(),
Expand Down Expand Up @@ -1706,6 +1874,15 @@ def main():
fft_floatingpoint_radix4(),
# Fixed point
fft_fixedpoint_radix4(),

pointwise_montgomery_dilithium(),
pointwise_acc_montgomery_dilithium(),
basemul_257_dilithium(),
basemul_257_asymmetric_dilithium(),
pointwise_769_dilithium(),
pointwise_769_asymmetric_dilithium(),
reduce32_dilithium(),
caddq_dilithium(),
]

all_example_names = [e.name for e in examples]
Expand Down
45 changes: 45 additions & 0 deletions examples/naive/armv7m/basemul_257_asymmetric_dilithium.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
// 2
// barrett_32: a <- a - Q * round(a * Qbar / 2^32), with tmp as scratch.
// smmulr produces the rounded high 32 bits of the signed 64-bit product and
// mls subtracts tmp * Q from a.
// Presumably Qbar is a precomputed approximation of 2^32 / Q -- TODO confirm.
// NOTE(review): this macro is defined but not invoked anywhere in this file.
.macro barrett_32 a, Qbar, Q, tmp
smmulr \tmp, \a, \Qbar
mls \a, \tmp, \Q, \a
.endm

.syntax unified
.cpu cortex-m4

.align 2
.global __asm_asymmetric_mul_257_16
.type __asm_asymmetric_mul_257_16, %function
// Register usage as visible in the body:
//   r0       - pointer stored through (two post-incremented pairs per iteration)
//   r1,r2,r3 - pointers loaded from (two words each per iteration)
//   r12      - end value of r0, used as the loop bound
__asm_asymmetric_mul_257_16:
push.w {r4-r11, lr}

// Size in bytes of one 32-bit word.
.equ width, 4

// Loop bound: r0 advances 4*width bytes per iteration, so the loop body
// runs 64 times before r0 reaches r0 + 256*width.
add.w r12, r0, #256*width
_asymmetric_mul_16_loop:

// Load two consecutive words from each of r1, r2 and r3; the second load
// of each pair post-increments the pointer past both words.
ldr.w r7, [r1, #width]
ldr.w r4, [r1], #2*width
ldr.w r8, [r2, #width]
ldr.w r5, [r2], #2*width
ldr.w r9, [r3, #width]
ldr.w r6, [r3], #2*width

// First word triple (r4, r5, r6):
//   r10 = r4.lo * r6.lo + r4.hi * r6.hi   (dual 16-bit multiply-add)
//   r11 = r4.lo * r5.hi + r4.hi * r5.lo   (same, halves of r5 exchanged)
smuad r10, r4, r6
smuadx r11, r4, r5

// Store r10 at [r0] and r11 at [r0 + width], then advance r0 by two words.
str.w r11, [r0, #width]
str.w r10, [r0], #2*width // @slothy:core=True

// Second word triple (r7, r8, r9): same computation as above.
smuad r10, r7, r9
smuadx r11, r7, r8

str.w r11, [r0, #width]
str.w r10, [r0], #2*width // @slothy:core=True

// Repeat until the store pointer reaches the precomputed end address.
cmp.w r0, r12
bne.w _asymmetric_mul_16_loop

// Restore callee-saved registers and return (saved lr is popped into pc).
pop.w {r4-r11, pc}

.size __asm_asymmetric_mul_257_16, .-__asm_asymmetric_mul_257_16
61 changes: 61 additions & 0 deletions examples/naive/armv7m/basemul_257_dilithium.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// 2
// barrett_32: a <- a - Q * round(a * Qbar / 2^32), with tmp as scratch.
// smmulr produces the rounded high 32 bits of the signed 64-bit product and
// mls subtracts tmp * Q from a.
// Presumably Qbar is a precomputed approximation of 2^32 / Q -- TODO confirm.
.macro barrett_32 a, Qbar, Q, tmp
smmulr \tmp, \a, \Qbar
mls \a, \tmp, \Q, \a
.endm

.syntax unified
.cpu cortex-m4

.align 2
.global __asm_point_mul_257_16
.type __asm_point_mul_257_16, %function
// Register usage as visible in the body:
//   r0  - pointer stored through (four words per iteration)
//   r1  - pointer loaded from (four words per iteration)
//   r2  - passed as Qbar to barrett_32
//   r3  - passed as Q to barrett_32
//   r14 - second pointer loaded from (two words per iteration), read from the stack
//   r12 - end value of r14, used as the loop bound
__asm_point_mul_257_16:
push.w {r4-r11, lr}

// The push above stored 9 registers (36 bytes), so [sp, #36] is the first
// word the caller left on the stack -- presumably the fifth C argument
// under the AAPCS; confirm against the C prototype.
ldr.w r14, [sp, #36]

// Size in bytes of one 32-bit word.
.equ width, 4

// Loop bound: r14 advances 2*width bytes per iteration, so the loop body
// runs 32 times before r14 reaches r14 + 64*width.
add.w r12, r14, #64*width
_point_mul_16_loop:

// Load four words from r1 (r4, r5, r7, r8) and two from r14 (r6, r9);
// the last load from each pointer post-increments it past the loaded words.
ldr.w r7, [r1, #2*width]
ldr.w r8, [r1, #3*width]
ldr.w r9, [r14, #1*width]
ldr.w r5, [r1, #1*width]
ldr.w r4, [r1], #4*width
ldr.w r6, [r14], #2*width

// r10 = r4.hi * r6.lo, reduced by barrett_32, then packed into the top
// halfword of r4 (the bottom halfword of r4 is kept unchanged).
smultb r10, r4, r6
barrett_32 r10, r2, r3, r11
pkhbt r4, r4, r10, lsl #16

// The second product of the pair uses the negated word from r14.
neg r6, r6

smultb r10, r5, r6
barrett_32 r10, r2, r3, r11
pkhbt r5, r5, r10, lsl #16

// Store r4 at [r0] and r5 at [r0 + width], then advance r0 by two words.
str.w r5, [r0, #1*width]
str.w r4, [r0], #2*width

// Same sequence for the second pair (r7, r8) with the multiplier word r9.
smultb r10, r7, r9
barrett_32 r10, r2, r3, r11
pkhbt r7, r7, r10, lsl #16

neg r9, r9

smultb r10, r8, r9
barrett_32 r10, r2, r3, r11
pkhbt r8, r8, r10, lsl #16

str.w r8, [r0, #1*width]
str.w r7, [r0], #2*width

// Repeat until the r14 pointer reaches the precomputed end address.
cmp.w r14, r12
bne.w _point_mul_16_loop

// Restore callee-saved registers and return (saved lr is popped into pc).
pop.w {r4-r11, pc}

.size __asm_point_mul_257_16, .-__asm_point_mul_257_16
52 changes: 52 additions & 0 deletions examples/naive/armv7m/caddq_dilithium.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
.syntax unified
.thumb

// caddq: conditionally add q to a.
// (a asr #31) is all ones when a is negative and zero otherwise, so tmp
// becomes q for negative a and 0 for non-negative a; tmp is then added to a.
.macro caddq a, tmp, q
and \tmp, \q, \a, asr #31
add \a, \a, \tmp
.endm

// void asm_caddq(int32_t a[N]);
// Applies caddq in place to 32 * 8 = 256 consecutive 32-bit words at r0
// (presumably N == 256 -- confirm against the C declaration).
.global pqcrystals_dilithium_asm_caddq
.type pqcrystals_dilithium_asm_caddq, %function
.align 2
pqcrystals_dilithium_asm_caddq:
push {r4-r11, r14}

// r12 = 8380417, passed as q to every caddq below.
movw r12,#:lower16:8380417
movt r12,#:upper16:8380417

// r10 = loop counter: 32 iterations of 8 words each.
movw r10, #32
1:
// Load 8 consecutive words into r1..r8.
ldr.w r1, [r0]
ldr.w r2, [r0, #1*4]
ldr.w r3, [r0, #2*4]
ldr.w r4, [r0, #3*4]
ldr.w r5, [r0, #4*4]
ldr.w r6, [r0, #5*4]
ldr.w r7, [r0, #6*4]
ldr.w r8, [r0, #7*4]

// Conditionally add q to each word; r9 is the shared scratch register.
caddq r1, r9, r12
caddq r2, r9, r12
caddq r3, r9, r12
caddq r4, r9, r12
caddq r5, r9, r12
caddq r6, r9, r12
caddq r7, r9, r12
caddq r8, r9, r12

// Write the results back in place. The word at offset 0 is stored last
// because that store post-increments r0 past all 8 words.
str.w r2, [r0, #1*4]
str.w r3, [r0, #2*4]
str.w r4, [r0, #3*4]
str.w r5, [r0, #4*4]
str.w r6, [r0, #5*4]
str.w r7, [r0, #6*4]
str.w r8, [r0, #7*4]
str r1, [r0], #8*4
// Decrement the counter and loop while it is non-zero.
subs r10, #1
bne.w 1b

// Restore callee-saved registers and return (saved lr is popped into pc).
pop {r4-r11, pc}

.size pqcrystals_dilithium_asm_caddq, .-pqcrystals_dilithium_asm_caddq
Loading
Loading