Skip to content

Commit

Permalink
Merge pull request #130 from slothy-optimizer/dilithium-m7
Browse files Browse the repository at this point in the history
Cortex-M7 Dilithium Part 1: Non-NTT functions
  • Loading branch information
mkannwischer authored Dec 18, 2024
2 parents 0d01c50 + c420990 commit 601e6a9
Show file tree
Hide file tree
Showing 17 changed files with 1,992 additions and 0 deletions.
177 changes: 177 additions & 0 deletions example.py
Original file line number Diff line number Diff line change
Expand Up @@ -1551,6 +1551,174 @@ def core(self, slothy):
#############################################################################################


class pointwise_montgomery_dilithium(Example):
    """Example wrapper for the ``pointwise_montgomery_dilithium`` assembly input."""

    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
        """Derive the input-file and example names and hand them to ``Example``.

        A non-empty ``var`` is appended as ``_<var>`` to both names; the label
        looked up in ``target_label_dict`` for ``target`` is appended to the
        example name only.
        """
        base = "pointwise_montgomery_dilithium"
        variant_suffix = f"_{var}" if var != "" else ""
        infile = base + variant_suffix
        name = f"{infile}_{target_label_dict[target]}"

        super().__init__(
            infile,
            name,
            rename=True,
            arch=arch,
            target=target,
            timeout=timeout,
            funcname="pqcrystals_dilithium_asm_pointwise_montgomery",
        )

    def core(self, slothy):
        """Set the optimizer configuration and optimize the loop labelled ``1``."""
        cfg = slothy.config
        cfg.outputs = ["r14", "r12"]
        cfg.inputs_are_outputs = True
        cfg.sw_pipelining.enabled = True

        slothy.optimize_loop("1")

class pointwise_acc_montgomery_dilithium(Example):
    """Example wrapper for the ``pointwise_acc_montgomery_dilithium`` assembly input."""

    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
        """Derive the input-file and example names and hand them to ``Example``.

        A non-empty ``var`` is appended as ``_<var>`` to both names; the label
        looked up in ``target_label_dict`` for ``target`` is appended to the
        example name only.
        """
        base = "pointwise_acc_montgomery_dilithium"
        variant_suffix = f"_{var}" if var != "" else ""
        infile = base + variant_suffix
        name = f"{infile}_{target_label_dict[target]}"

        super().__init__(
            infile,
            name,
            rename=True,
            arch=arch,
            target=target,
            timeout=timeout,
            funcname="pqcrystals_dilithium_asm_pointwise_acc_montgomery",
        )

    def core(self, slothy):
        """Set the optimizer configuration and optimize the loop labelled ``1``."""
        cfg = slothy.config
        cfg.outputs = ["r12"]
        cfg.inputs_are_outputs = True
        cfg.sw_pipelining.enabled = True

        slothy.optimize_loop("1")


class basemul_257_dilithium(Example):
    """Example wrapper for the ``basemul_257_dilithium`` assembly input."""

    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
        """Derive the input-file and example names and hand them to ``Example``.

        A non-empty ``var`` is appended as ``_<var>`` to both names; the label
        looked up in ``target_label_dict`` for ``target`` is appended to the
        example name only.
        """
        base = "basemul_257_dilithium"
        variant_suffix = f"_{var}" if var != "" else ""
        infile = base + variant_suffix
        name = f"{infile}_{target_label_dict[target]}"

        super().__init__(
            infile,
            name,
            rename=True,
            arch=arch,
            target=target,
            timeout=timeout,
            funcname="__asm_point_mul_257_16",
        )

    def core(self, slothy):
        """Set the optimizer configuration and optimize ``_point_mul_16_loop``."""
        cfg = slothy.config
        cfg.outputs = ["r12", "r14"]
        cfg.inputs_are_outputs = True
        cfg.sw_pipelining.enabled = True

        slothy.optimize_loop("_point_mul_16_loop")

class basemul_257_asymmetric_dilithium(Example):
    """Example wrapper for the ``basemul_257_asymmetric_dilithium`` assembly input."""

    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
        """Derive the input-file and example names and hand them to ``Example``.

        A non-empty ``var`` is appended as ``_<var>`` to both names; the label
        looked up in ``target_label_dict`` for ``target`` is appended to the
        example name only.
        """
        base = "basemul_257_asymmetric_dilithium"
        variant_suffix = f"_{var}" if var != "" else ""
        infile = base + variant_suffix
        name = f"{infile}_{target_label_dict[target]}"

        super().__init__(
            infile,
            name,
            rename=True,
            arch=arch,
            target=target,
            timeout=timeout,
            funcname="__asm_asymmetric_mul_257_16",
        )

    def core(self, slothy):
        """Set the optimizer configuration and optimize ``_asymmetric_mul_16_loop``."""
        cfg = slothy.config
        cfg.outputs = ["r14", "r12"]
        cfg.inputs_are_outputs = True
        cfg.sw_pipelining.enabled = True
        # Explicitly switched off for this example.
        cfg.unsafe_address_offset_fixup = False

        slothy.optimize_loop("_asymmetric_mul_16_loop")



class pointwise_769_dilithium(Example):
    """Example wrapper for the ``pointwise_769_dilithium`` assembly input."""

    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
        """Derive the input-file and example names and hand them to ``Example``.

        A non-empty ``var`` is appended as ``_<var>`` to both names; the label
        looked up in ``target_label_dict`` for ``target`` is appended to the
        example name only.
        """
        base = "pointwise_769_dilithium"
        variant_suffix = f"_{var}" if var != "" else ""
        infile = base + variant_suffix
        name = f"{infile}_{target_label_dict[target]}"

        super().__init__(
            infile,
            name,
            rename=True,
            arch=arch,
            target=target,
            timeout=timeout,
            funcname="small_pointmul_asm_769",
        )

    def core(self, slothy):
        """Set the optimizer configuration and optimize ``_point_mul_16_loop``.

        ``r3`` is added to the reserved registers, and the loop is optimized
        with ``Arch_Armv7M.CmpLoop`` forced as the loop type.
        """
        slothy.config.inputs_are_outputs = True
        slothy.config.variable_size = True

        # Read-modify-write: fetch the reserved set, extend it, assign it back.
        reserved = slothy.config.reserved_regs
        reserved.add("r3")
        slothy.config.reserved_regs = reserved

        slothy.config.sw_pipelining.enabled = True
        slothy.config.constraints.stalls_first_attempt = 16

        slothy.optimize_loop(
            "_point_mul_16_loop",
            forced_loop_type=Arch_Armv7M.CmpLoop,
        )


class pointwise_769_asymmetric_dilithium(Example):
    """Example wrapper for the ``pointwise_769_asymmetric_dilithium`` assembly input."""

    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
        """Derive the input-file and example names and hand them to ``Example``.

        A non-empty ``var`` is appended as ``_<var>`` to both names; the label
        looked up in ``target_label_dict`` for ``target`` is appended to the
        example name only.
        """
        base = "pointwise_769_asymmetric_dilithium"
        variant_suffix = f"_{var}" if var != "" else ""
        infile = base + variant_suffix
        name = f"{infile}_{target_label_dict[target]}"

        super().__init__(
            infile,
            name,
            rename=True,
            arch=arch,
            target=target,
            timeout=timeout,
            funcname="small_asymmetric_mul_asm_769",
        )

    def core(self, slothy):
        """Set the optimizer configuration and optimize ``_asymmetric_mul_16_loop``."""
        cfg = slothy.config
        cfg.outputs = ["r10"]
        cfg.inputs_are_outputs = True
        cfg.sw_pipelining.enabled = True

        slothy.optimize_loop("_asymmetric_mul_16_loop")

class reduce32_dilithium(Example):
    """Example wrapper for the ``reduce32_dilithium`` assembly input."""

    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
        """Derive the input-file and example names and hand them to ``Example``.

        A non-empty ``var`` is appended as ``_<var>`` to both names; the label
        looked up in ``target_label_dict`` for ``target`` is appended to the
        example name only.
        """
        base = "reduce32_dilithium"
        variant_suffix = f"_{var}" if var != "" else ""
        infile = base + variant_suffix
        name = f"{infile}_{target_label_dict[target]}"

        super().__init__(
            infile,
            name,
            rename=True,
            arch=arch,
            target=target,
            timeout=timeout,
            funcname="pqcrystals_dilithium_asm_reduce32",
        )

    def core(self, slothy):
        """Set the optimizer configuration and optimize the loop labelled ``1``."""
        cfg = slothy.config
        cfg.outputs = ["r10"]
        cfg.inputs_are_outputs = True
        cfg.constraints.stalls_first_attempt = 4
        cfg.sw_pipelining.enabled = True

        slothy.optimize_loop("1")

class caddq_dilithium(Example):
    """Example wrapper for the ``caddq_dilithium`` assembly input."""

    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
        """Derive the input-file and example names and hand them to ``Example``.

        A non-empty ``var`` is appended as ``_<var>`` to both names; the label
        looked up in ``target_label_dict`` for ``target`` is appended to the
        example name only.
        """
        base = "caddq_dilithium"
        variant_suffix = f"_{var}" if var != "" else ""
        infile = base + variant_suffix
        name = f"{infile}_{target_label_dict[target]}"

        super().__init__(
            infile,
            name,
            rename=True,
            arch=arch,
            target=target,
            timeout=timeout,
            funcname="pqcrystals_dilithium_asm_caddq",
        )

    def core(self, slothy):
        """Set the optimizer configuration and optimize the loop labelled ``1``."""
        cfg = slothy.config
        cfg.outputs = ["r10"]
        cfg.inputs_are_outputs = True
        cfg.sw_pipelining.enabled = True

        slothy.optimize_loop("1")
def main():
examples = [ Example0(),
Example1(),
Expand Down Expand Up @@ -1706,6 +1874,15 @@ def main():
fft_floatingpoint_radix4(),
# Fixed point
fft_fixedpoint_radix4(),

pointwise_montgomery_dilithium(),
pointwise_acc_montgomery_dilithium(),
basemul_257_dilithium(),
basemul_257_asymmetric_dilithium(),
pointwise_769_dilithium(),
pointwise_769_asymmetric_dilithium(),
reduce32_dilithium(),
caddq_dilithium(),
]

all_example_names = [e.name for e in examples]
Expand Down
45 changes: 45 additions & 0 deletions examples/naive/armv7m/basemul_257_asymmetric_dilithium.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
// 2
// barrett_32: reduce register "a" in place by subtracting a multiple of "Q".
//   tmp = most-significant 32 bits, rounded, of the signed product a * Qbar
//   a   = a - tmp * Q
// Qbar is presumably a precomputed approximation of 2^32 / Q -- confirm with
// the callers that supply it.
// NOTE(review): the routine that follows in this file does not invoke this macro.
.macro barrett_32 a, Qbar, Q, tmp
smmulr \tmp, \a, \Qbar
mls \a, \tmp, \Q, \a
.endm

.syntax unified
.cpu cortex-m4

// __asm_asymmetric_mul_257_16
//
// Register usage as visible in the code below. The C-level meaning of the
// arguments is not documented here; presumably r0 is the output pointer and
// r1-r3 are input pointers -- confirm against the C prototype.
//   r0      store pointer, advanced by 4 words (16 bytes) per iteration
//   r1-r3   load pointers, each advanced by 2 words (8 bytes) per iteration
//   r12     end address for r0: initial r0 + 256*width bytes
// With 16 bytes stored per iteration the loop body runs 64 times.
.align 2
.global __asm_asymmetric_mul_257_16
.type __asm_asymmetric_mul_257_16, %function
__asm_asymmetric_mul_257_16:
// Save r4-r11 and lr; the matching pop at the end returns by loading pc.
push.w {r4-r11, lr}

.equ width, 4

// r12 marks the address at which the loop terminates.
add.w r12, r0, #256*width
_asymmetric_mul_16_loop:

// From each of r1, r2, r3: load the word at offset +width first, then the word
// at offset 0 with a post-increment that steps the pointer past both words.
ldr.w r7, [r1, #width]
ldr.w r4, [r1], #2*width
ldr.w r8, [r2, #width]
ldr.w r5, [r2], #2*width
ldr.w r9, [r3, #width]
ldr.w r6, [r3], #2*width

// First word: each register is treated as two signed 16-bit halves.
//   r10 = r4.lo*r6.lo + r4.hi*r6.hi   (smuad)
//   r11 = r4.lo*r5.hi + r4.hi*r5.lo   (smuadx: halves of r5 exchanged)
smuad r10, r4, r6
smuadx r11, r4, r5

// Store r11 at [r0 + width] and r10 at [r0], then advance r0 by two words.
str.w r11, [r0, #width]
str.w r10, [r0], #2*width // @slothy:core=True

// Second word: same computation with r7 (from r1), r9 (from r3), r8 (from r2).
smuad r10, r7, r9
smuadx r11, r7, r8

str.w r11, [r0, #width]
str.w r10, [r0], #2*width // @slothy:core=True

// Loop until the store pointer reaches the end address.
cmp.w r0, r12
bne.w _asymmetric_mul_16_loop

pop.w {r4-r11, pc}

.size __asm_asymmetric_mul_257_16, .-__asm_asymmetric_mul_257_16
61 changes: 61 additions & 0 deletions examples/naive/armv7m/basemul_257_dilithium.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// 2
// barrett_32: reduce register "a" in place by subtracting a multiple of "Q".
//   tmp = most-significant 32 bits, rounded, of the signed product a * Qbar
//   a   = a - tmp * Q
// Qbar is presumably a precomputed approximation of 2^32 / Q -- confirm with
// the callers that supply it.
.macro barrett_32 a, Qbar, Q, tmp
smmulr \tmp, \a, \Qbar
mls \a, \tmp, \Q, \a
.endm

.syntax unified
.cpu cortex-m4

// __asm_point_mul_257_16
//
// Register usage as visible in the code below. The C-level meaning of the
// arguments is not documented here; presumably r0 is the output pointer and
// r1 an input pointer -- confirm against the C prototype.
//   r0      store pointer, advanced by 4 words per iteration
//   r1      load pointer, advanced by 4 words per iteration
//   r2, r3  passed to barrett_32 as Qbar and Q respectively
//   r14     second load pointer (read from the stack), advanced by 2 words per iteration
//   r12     end address for r14: initial r14 + 64*width bytes
//   r10,r11 scratch (product / barrett_32 temporary)
// With r14 advancing 8 bytes per iteration the loop body runs 32 times.
.align 2
.global __asm_point_mul_257_16
.type __asm_point_mul_257_16, %function
__asm_point_mul_257_16:
// Save r4-r11 and lr (9 registers = 36 bytes).
push.w {r4-r11, lr}

// Load the word that was at the top of the stack on entry (36 bytes above the
// new sp) -- presumably the fifth, stack-passed argument; confirm with caller.
ldr.w r14, [sp, #36]

.equ width, 4

// r12 marks the address at which the loop terminates.
add.w r12, r14, #64*width
_point_mul_16_loop:

// Load four words from r1 (r4, r5, r7, r8) and two from r14 (r6, r9); the
// offset-0 loads come last and post-increment their pointers.
ldr.w r7, [r1, #2*width]
ldr.w r8, [r1, #3*width]
ldr.w r9, [r14, #1*width]
ldr.w r5, [r1, #1*width]
ldr.w r4, [r1], #4*width
ldr.w r6, [r14], #2*width

// r10 = top half of r4 * bottom half of r6, reduced via barrett_32; then the
// top half of r4 is replaced by the low 16 bits of r10 (bottom half kept).
smultb r10, r4, r6
barrett_32 r10, r2, r3, r11
pkhbt r4, r4, r10, lsl #16

// Negate r6 so the next product uses the negated multiplier.
neg r6, r6

// Same top-half update for r5, using the negated r6.
smultb r10, r5, r6
barrett_32 r10, r2, r3, r11
pkhbt r5, r5, r10, lsl #16

// Store r5 at [r0 + width] and r4 at [r0], then advance r0 by two words.
str.w r5, [r0, #1*width]
str.w r4, [r0], #2*width

// Second pair: r7 with r9, then r8 with the negated r9.
smultb r10, r7, r9
barrett_32 r10, r2, r3, r11
pkhbt r7, r7, r10, lsl #16

neg r9, r9

smultb r10, r8, r9
barrett_32 r10, r2, r3, r11
pkhbt r8, r8, r10, lsl #16

str.w r8, [r0, #1*width]
str.w r7, [r0], #2*width

// Loop until the r14 load pointer reaches the end address.
cmp.w r14, r12
bne.w _point_mul_16_loop

pop.w {r4-r11, pc}

.size __asm_point_mul_257_16, .-__asm_point_mul_257_16
52 changes: 52 additions & 0 deletions examples/naive/armv7m/caddq_dilithium.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
.syntax unified
.thumb

// caddq: conditionally add "q" to "a", in place.
//   tmp = q AND (a arithmetically shifted right by 31)
//         -> q when a is negative (shift yields all ones), 0 otherwise
//   a   = a + tmp
// "tmp" is clobbered.
.macro caddq a, tmp, q
and \tmp, \q, \a, asr #31
add \a, \a, \tmp
.endm

// void asm_caddq(int32_t a[N]);
//
// Applies the caddq macro in place to the words addressed by r0.
// Register usage as visible in the code below:
//   r0      pointer into the array, advanced by 8 words per iteration
//   r1-r8   the eight words processed in one iteration
//   r9      scratch register for caddq
//   r10     loop counter, initialised to 32 (32 iterations * 8 words = 256 words)
//   r12     the constant 8380417, passed to caddq as q
.global pqcrystals_dilithium_asm_caddq
.type pqcrystals_dilithium_asm_caddq, %function
.align 2
pqcrystals_dilithium_asm_caddq:
// Save r4-r11 and r14; the matching pop at the end returns by loading pc.
push {r4-r11, r14}

// Build the 32-bit constant 8380417 in r12 from its two 16-bit halves.
movw r12,#:lower16:8380417
movt r12,#:upper16:8380417

// Iteration count.
movw r10, #32
1:
// Load eight consecutive words starting at r0 (pointer not yet advanced).
ldr.w r1, [r0]
ldr.w r2, [r0, #1*4]
ldr.w r3, [r0, #2*4]
ldr.w r4, [r0, #3*4]
ldr.w r5, [r0, #4*4]
ldr.w r6, [r0, #5*4]
ldr.w r7, [r0, #6*4]
ldr.w r8, [r0, #7*4]

// Add r12 to every word that is negative.
caddq r1, r9, r12
caddq r2, r9, r12
caddq r3, r9, r12
caddq r4, r9, r12
caddq r5, r9, r12
caddq r6, r9, r12
caddq r7, r9, r12
caddq r8, r9, r12

// Write words 1..7 back at their offsets; word 0 is stored last with a
// post-increment that advances r0 to the next group of eight.
str.w r2, [r0, #1*4]
str.w r3, [r0, #2*4]
str.w r4, [r0, #3*4]
str.w r5, [r0, #4*4]
str.w r6, [r0, #5*4]
str.w r7, [r0, #6*4]
str.w r8, [r0, #7*4]
str r1, [r0], #8*4
// Decrement the counter (setting flags) and loop while it is non-zero.
subs r10, #1
bne.w 1b

pop {r4-r11, pc}

.size pqcrystals_dilithium_asm_caddq, .-pqcrystals_dilithium_asm_caddq
Loading

0 comments on commit 601e6a9

Please sign in to comment.