Skip to content

Commit

Permalink
Add dilithium 257 NTT, iNTT, basemul
Browse files Browse the repository at this point in the history
* Includes addition to the parser that allows register ranges (e.g., for vldm)
  • Loading branch information
dop-amin committed Oct 7, 2024
1 parent c2179c6 commit ce534fa
Show file tree
Hide file tree
Showing 6 changed files with 961 additions and 1 deletion.
85 changes: 84 additions & 1 deletion example.py
Original file line number Diff line number Diff line change
Expand Up @@ -1547,6 +1547,85 @@ def core(self, slothy):

slothy.optimize(start="pointwise_montgomery_acc_start", end="pointwise_montgomery_acc_end")

class fnt_257_dilithium(Example):
    """Example wrapper for the ``fnt_257_dilithium`` assembly source.

    The constructor derives the input-file and example names and hands them
    to ``Example``; ``core`` optimizes three labelled regions of the file.
    """

    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
        """Register the example with the ``Example`` base class.

        Args:
            var: Optional variant tag. When it is not the empty string,
                ``_<var>`` is appended to both the input file name and the
                example name.
            arch: Architecture model forwarded to ``Example``.
            target: Target model forwarded to ``Example``; its entry in
                ``target_label_dict`` becomes the suffix of the example name.
            timeout: Timeout value forwarded to ``Example`` unchanged.
        """
        base = "fnt_257_dilithium"
        variant_suffix = f"_{var}" if var != "" else ""

        infile = base + variant_suffix
        # The example name is the input file name plus the target label.
        name = f"{infile}_{target_label_dict[target]}"

        super().__init__(
            infile,
            name,
            rename=True,
            arch=arch,
            target=target,
            timeout=timeout,
            funcname="__asm_fnt_257",
        )

    def core(self, slothy):
        """Configure ``slothy`` and optimize each labelled region in order."""
        cfg = slothy.config
        cfg.outputs = ["r14", "r12"]
        cfg.inputs_are_outputs = True

        regions = (
            ("_fnt_0_1_2_start", "_fnt_0_1_2_end"),
            ("_fnt_3_4_5_6_start", "_fnt_3_4_5_6_end"),
            ("_fnt_to_16_bit_start", "_fnt_to_16_bit_end"),
        )
        for first_label, last_label in regions:
            slothy.optimize(start=first_label, end=last_label)

class ifnt_257_dilithium(Example):
    """Example wrapper for the ``ifnt_257_dilithium`` assembly source.

    The constructor derives the input-file and example names and hands them
    to ``Example``; ``core`` optimizes two labelled regions of the file.
    """

    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
        """Register the example with the ``Example`` base class.

        Args:
            var: Optional variant tag. When it is not the empty string,
                ``_<var>`` is appended to both the input file name and the
                example name.
            arch: Architecture model forwarded to ``Example``.
            target: Target model forwarded to ``Example``; its entry in
                ``target_label_dict`` becomes the suffix of the example name.
            timeout: Timeout value forwarded to ``Example`` unchanged.
        """
        base = "ifnt_257_dilithium"
        variant_suffix = f"_{var}" if var != "" else ""

        infile = base + variant_suffix
        # The example name is the input file name plus the target label.
        name = f"{infile}_{target_label_dict[target]}"

        super().__init__(
            infile,
            name,
            rename=True,
            arch=arch,
            target=target,
            timeout=timeout,
            funcname="__asm_ifnt_257",
        )

    def core(self, slothy):
        """Configure ``slothy`` and optimize each labelled region in order."""
        cfg = slothy.config
        # Unlike the sibling examples, this output list also names "s1".
        cfg.outputs = ["r14", "s1", "r12"]
        cfg.inputs_are_outputs = True

        regions = (
            ("_ifnt_7_6_5_4_start", "_ifnt_7_6_5_4_end"),
            ("_ifnt_0_1_2_start", "_ifnt_0_1_2_end"),
        )
        for first_label, last_label in regions:
            slothy.optimize(start=first_label, end=last_label)

class basemul_257_dilithium(Example):
    """Example wrapper for the ``basemul_257_dilithium`` assembly source.

    The constructor derives the input-file and example names and hands them
    to ``Example``; ``core`` optimizes the single labelled loop body.
    """

    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
        """Register the example with the ``Example`` base class.

        Args:
            var: Optional variant tag. When it is not the empty string,
                ``_<var>`` is appended to both the input file name and the
                example name.
            arch: Architecture model forwarded to ``Example``.
            target: Target model forwarded to ``Example``; its entry in
                ``target_label_dict`` becomes the suffix of the example name.
            timeout: Timeout value forwarded to ``Example`` unchanged.
        """
        base = "basemul_257_dilithium"
        variant_suffix = f"_{var}" if var != "" else ""

        infile = base + variant_suffix
        # The example name is the input file name plus the target label.
        name = f"{infile}_{target_label_dict[target]}"

        super().__init__(
            infile,
            name,
            rename=True,
            arch=arch,
            target=target,
            timeout=timeout,
            funcname="__asm_point_mul_257_16",
        )

    def core(self, slothy):
        """Configure ``slothy`` and optimize the region between the loop labels."""
        cfg = slothy.config
        cfg.outputs = ["r14", "r12"]
        cfg.inputs_are_outputs = True

        slothy.optimize(
            start="_point_mul_16_loop_start",
            end="_point_mul_16_loop_end",
        )

class basemul_257_asymmetric_dilithium(Example):
    """Example wrapper for the ``basemul_257_asymmetric_dilithium`` assembly source.

    The constructor derives the input-file and example names and hands them
    to ``Example``; ``core`` optimizes the single labelled loop body.
    """

    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
        """Register the example with the ``Example`` base class.

        Args:
            var: Optional variant tag. When it is not the empty string,
                ``_<var>`` is appended to both the input file name and the
                example name.
            arch: Architecture model forwarded to ``Example``.
            target: Target model forwarded to ``Example``; its entry in
                ``target_label_dict`` becomes the suffix of the example name.
            timeout: Timeout value forwarded to ``Example`` unchanged.
        """
        base = "basemul_257_asymmetric_dilithium"
        variant_suffix = f"_{var}" if var != "" else ""

        infile = base + variant_suffix
        # The example name is the input file name plus the target label.
        name = f"{infile}_{target_label_dict[target]}"

        super().__init__(
            infile,
            name,
            rename=True,
            arch=arch,
            target=target,
            timeout=timeout,
            funcname="__asm_asymmetric_mul_257_16",
        )

    def core(self, slothy):
        """Configure ``slothy`` and optimize the region between the loop labels."""
        cfg = slothy.config
        cfg.outputs = ["r14", "r12"]
        cfg.inputs_are_outputs = True

        slothy.optimize(
            start="_asymmetric_mul_16_loop_start",
            end="_asymmetric_mul_16_loop_end",
        )

def main():
examples = [ ExampleDilithium(),
ExampleKyber(),
Expand Down Expand Up @@ -1697,7 +1776,11 @@ def main():

intt_dilithium_123_456_78(),
pointwise_montgomery_dilithium(),
pointwise_acc_montgomery_dilithium()
pointwise_acc_montgomery_dilithium(),
fnt_257_dilithium(),
ifnt_257_dilithium(),
basemul_257_dilithium(),
basemul_257_asymmetric_dilithium()
]

all_example_names = [e.name for e in examples]
Expand Down
46 changes: 46 additions & 0 deletions examples/naive/armv7m/basemul_257_asymmetric_dilithium.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// 2
// __asm_asymmetric_mul_257_16: for each pair of packed 16-bit inputs, store
// one SMUAD result and one SMUADX result as 32-bit words.
//
// Register usage visible in this file:
//   r0  - output pointer; four words are stored per loop iteration
//   r1  - pointer to the words used as first operand of SMUAD and SMUADX
//   r2  - pointer to the words used as second operand of SMUADX
//   r3  - pointer to the words used as second operand of SMUAD
//   r12 - end address of the output (r0 at entry + 256 words)
// NOTE(review): r0-r3 are presumably the four pointer arguments of the C
// prototype (AAPCS) - confirm against the caller.

// barrett_32: tmp = rounded most-significant word of a*Qbar; a = a - tmp*Q.
// NOTE(review): this macro is not invoked anywhere in this file.
.macro barrett_32 a, Qbar, Q, tmp
smmulr.w \tmp, \a, \Qbar
mls.w \a, \tmp, \Q, \a
.endm

.syntax unified
.cpu cortex-m4

.align 2
.global __asm_asymmetric_mul_257_16
.type __asm_asymmetric_mul_257_16, %function
__asm_asymmetric_mul_257_16:
// Save r4-r11 and the return address; the matching pop at the end returns.
push.w {r4-r11, lr}

// Size in bytes of one 32-bit word.
.equ width, 4

// r12 = address one past the last output word (256 words after r0).
add.w r12, r0, #256*width
_asymmetric_mul_16_loop:
// The *_start / *_end labels delimit the region that example.py passes to
// slothy.optimize(); the loop counter check below lies outside that region.
_asymmetric_mul_16_loop_start:

// Load two consecutive words from each of r1, r2 and r3; the post-indexed
// load advances each pointer by two words.
ldr.w r7, [r1, #width]
ldr.w r4, [r1], #2*width
ldr.w r8, [r2, #width]
ldr.w r5, [r2], #2*width
ldr.w r9, [r3, #width]
ldr.w r6, [r3], #2*width

// r10 = r4.lo*r6.lo + r4.hi*r6.hi (signed halfword products).
smuad r10, r4, r6
// r11 = r4.lo*r5.hi + r4.hi*r5.lo (halfwords of the second operand swapped).
smuadx r11, r4, r5

// Store r10 at [r0] and r11 at [r0 + 4], then advance r0 by two words.
str.w r11, [r0, #width]
str.w r10, [r0], #2*width

// Same computation for the second word of each input.
smuad r10, r7, r9
smuadx r11, r7, r8

str.w r11, [r0, #width]
str.w r10, [r0], #2*width

_asymmetric_mul_16_loop_end:

// Repeat until the output pointer reaches r12 (16 bytes are written per
// iteration, so 64 iterations cover the 256 output words).
cmp.w r0, r12
bne.w _asymmetric_mul_16_loop

// Restore r4-r11 and return by popping the saved lr into pc.
pop.w {r4-r11, pc}
61 changes: 61 additions & 0 deletions examples/naive/armv7m/basemul_257_dilithium.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// 2
// __asm_point_mul_257_16: for each input word from r1, multiply its top
// halfword by a (possibly negated) factor from the r14 stream, reduce the
// product with barrett_32 and write it back into the top halfword; the bottom
// halfword of the input word is kept. The updated words are stored to r0.
//
// Register usage visible in this file:
//   r0  - output pointer; four words are stored per loop iteration
//   r1  - input pointer; four words are loaded per loop iteration
//   r2  - passed as Qbar to barrett_32
//   r3  - passed as Q to barrett_32
//   r14 - factor pointer, loaded from the stack; two words per iteration
//   r12 - end address of the factor stream (r14 + 64 words)
// NOTE(review): r2/r3 presumably hold the Barrett constant and the modulus
// (257 according to the file name) - confirm against the caller.

// barrett_32: tmp = rounded most-significant word of a*Qbar; a = a - tmp*Q.
.macro barrett_32 a, Qbar, Q, tmp
smmulr.w \tmp, \a, \Qbar
mls.w \a, \tmp, \Q, \a
.endm

.syntax unified
.cpu cortex-m4

.align 2
.global __asm_point_mul_257_16
.type __asm_point_mul_257_16, %function
__asm_point_mul_257_16:
// Save r4-r11 and the return address (9 registers = 36 bytes).
push.w {r4-r11, lr}

// Load the word that sat at [sp] on entry (36 bytes above the new sp).
// NOTE(review): presumably the fifth argument, passed on the stack per AAPCS.
ldr.w r14, [sp, #36]

// Size in bytes of one 32-bit word.
.equ width, 4

// r12 = address one past the last factor word (64 words after r14).
add.w r12, r14, #64*width
_point_mul_16_loop:
// The *_start / *_end labels delimit the region that example.py passes to
// slothy.optimize(); the loop counter check below lies outside that region.
_point_mul_16_loop_start:

// Load four input words (r4, r5, r7, r8) and two factor words (r6, r9);
// the post-indexed loads advance r1 by four words and r14 by two words.
ldr.w r7, [r1, #2*width]
ldr.w r8, [r1, #3*width]
ldr.w r9, [r14, #1*width]
ldr.w r5, [r1, #1*width]
ldr.w r4, [r1], #4*width
ldr.w r6, [r14], #2*width

// r10 = (top halfword of r4) * (bottom halfword of r6), signed; reduce it,
// then pack: r4 = r4[15:0] | (r10[15:0] << 16).
smultb r10, r4, r6
barrett_32 r10, r2, r3, r11
pkhbt r4, r4, r10, lsl #16

// The second word of the pair is multiplied by the negated factor.
neg.w r6, r6

smultb r10, r5, r6
barrett_32 r10, r2, r3, r11
pkhbt r5, r5, r10, lsl #16

// Store r4 at [r0] and r5 at [r0 + 4], then advance r0 by two words.
str.w r5, [r0, #1*width]
str.w r4, [r0], #2*width

// Same sequence for the second pair of inputs (r7, r8) with factor r9.
smultb r10, r7, r9
barrett_32 r10, r2, r3, r11
pkhbt r7, r7, r10, lsl #16

neg.w r9, r9

smultb r10, r8, r9
barrett_32 r10, r2, r3, r11
pkhbt r8, r8, r10, lsl #16

str.w r8, [r0, #1*width]
str.w r7, [r0], #2*width

_point_mul_16_loop_end:
// Repeat until the factor pointer reaches r12 (two factor words are consumed
// per iteration, so 32 iterations cover the 64 factor words).
cmp.w r14, r12
bne.w _point_mul_16_loop

// Restore r4-r11 and return by popping the saved lr into pc.
pop.w {r4-r11, pc}
Loading

0 comments on commit ce534fa

Please sign in to comment.