Skip to content

Commit

Permalink
M7: pointwise_769_asymmetric_dilithium
Browse files Browse the repository at this point in the history
  • Loading branch information
mkannwischer committed Dec 18, 2024
1 parent 8fefec1 commit ffd541f
Show file tree
Hide file tree
Showing 3 changed files with 273 additions and 0 deletions.
20 changes: 20 additions & 0 deletions example.py
Original file line number Diff line number Diff line change
Expand Up @@ -1661,6 +1661,25 @@ def core(self, slothy):
slothy.optimize_loop("_point_mul_16_loop", forced_loop_type=Arch_Armv7M.CmpLoop)


class pointwise_769_asymmetric_dilithium(Example):
    """SLOTHY example for the `small_asymmetric_mul_asm_769` assembly routine.

    The input file name is the base name plus an optional `_{var}` suffix;
    the example name additionally carries the label of the chosen target.
    """

    def __init__(self, var="", arch=Arch_Armv7M, target=Target_CortexM7, timeout=None):
        base = "pointwise_769_asymmetric_dilithium"
        variant_suffix = f"_{var}" if var != "" else ""

        # Input file: base name (+ variant). Example name: same, plus target label.
        infile = base + variant_suffix
        name = f"{infile}_{target_label_dict[target]}"

        super().__init__(
            infile,
            name,
            rename=True,
            arch=arch,
            target=target,
            timeout=timeout,
            funcname="small_asymmetric_mul_asm_769",
        )

    def core(self, slothy):
        """Configure SLOTHY and optimize the `_asymmetric_mul_16_loop` loop."""
        cfg = slothy.config

        # r10 is declared as an output register; inputs are also treated as outputs.
        cfg.outputs = ["r10"]
        cfg.inputs_are_outputs = True

        # Software pipelining is switched on before the loop is optimized.
        cfg.sw_pipelining.enabled = True
        slothy.optimize_loop("_asymmetric_mul_16_loop")
def main():
examples = [ Example0(),
Example1(),
Expand Down Expand Up @@ -1822,6 +1841,7 @@ def main():
basemul_257_dilithium(),
basemul_257_asymmetric_dilithium(),
pointwise_769_dilithium(),
pointwise_769_asymmetric_dilithium(),
]

all_example_names = [e.name for e in examples]
Expand Down
69 changes: 69 additions & 0 deletions examples/naive/armv7m/pointwise_769_asymmetric_dilithium.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/**
* Copyright (c) 2023 Junhao Huang ([email protected])
*
* Licensed under the Apache License, Version 2.0(the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
.syntax unified
.cpu cortex-m4
.thumb

// plant_red q, qa, qinv, tmp
// Two-instruction modular reduction step (Plantard-style, going by the macro
// name -- TODO confirm against the reference implementation).
//   q    : register whose TOP halfword holds the modulus q
//   qa   : register holding the accumulator constant added by smlatt
//   qinv : register holding the 32-bit multiplier applied by mul
//   tmp  : value to reduce; overwritten, the result is left in its top halfword
// Only \tmp is written and neither instruction is a flag-setting form.
.macro plant_red q, qa, qinv, tmp
mul \tmp, \tmp, \qinv
// tmp = low 32 bits of tmp*qinv (tmp*qinv mod 2^2n); the part used next, / 2^n, is the high half
smlatt \tmp, \tmp, \q, \qa
// tmp = top16(tmp) * top16(q) + qa; the reduced result is in the high half
.endm

#### r0: out; r1: a; r2: b; r3: bprime
// small_asymmetric_mul_asm_769(out, a, b, bprime)
//
// Every loop iteration reads two 32-bit words from each of a, b and bprime
// and writes two 32-bit words to out. smuad/smuadx treat each word as a pair
// of signed 16-bit values. For each input word:
//   low  halfword of out word = top16(plant_red(smuad (a, bprime)))
//   high halfword of out word = top16(plant_red(smuadx(a, b)))
// The loop stops when r0 has advanced by 256*2 = 512 bytes, i.e. after
// 64 iterations of 8 bytes each.
//
// r4-r11 and lr are pushed on entry and popped (lr into pc) on exit;
// r12 is used as scratch and is not saved.
.align 2
.global small_asymmetric_mul_asm_769
.type small_asymmetric_mul_asm_769, %function
small_asymmetric_mul_asm_769:
push.w {r4-r11, lr}

// Reduction constants consumed by plant_red.
movw r14, #24608 // qa
// Only the top halfword of r12 is written; that is sufficient because
// smlatt reads only the top halfword of its q operand.
movt r12, #769 // q
// r11 = (58632 << 16) | 64769 = 0xE508FD01
movw r11, #64769
movt r11, #58632 // qinv
.equ width, 4
// r10 = end-of-output pointer used as the loop bound.
add.w r10, r0, #256*2
_asymmetric_mul_16_loop:
// Load two words per input: the second word first (plain offset), then the
// first word with a post-increment that advances the pointer by both words.
ldr.w r7, [r1, #width]
ldr.w r4, [r1], #2*width
ldr.w r8, [r2, #width]
ldr.w r5, [r2], #2*width
ldr.w r9, [r3, #width]
ldr.w r6, [r3], #2*width

// First word: r6 = reduce(a . bprime), r5 = reduce(a x b).
smuad r6, r4, r6
plant_red r12, r14, r11, r6
smuadx r5, r4, r5
plant_red r12, r14, r11, r5

// Pack: keep the top halfword of r5, put the top halfword of r6 in the bottom.
pkhtb r5, r5, r6, asr #16
str.w r5, [r0], #width

// Second word: same computation on r7 (a), r9 (bprime) and r8 (b).
smuad r6, r7, r9
plant_red r12, r14, r11, r6
smuadx r8, r7, r8
plant_red r12, r14, r11, r8

pkhtb r8, r8, r6, asr #16
str.w r8, [r0], #width
// The compare runs after both stores, so r0 already points past this block.
cmp.w r0, r10
bne.w _asymmetric_mul_16_loop

pop.w {r4-r11, pc}

.size small_asymmetric_mul_asm_769, .-small_asymmetric_mul_asm_769
184 changes: 184 additions & 0 deletions examples/opt/armv7m/pointwise_769_asymmetric_dilithium_opt_m7.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
/**
* Copyright (c) 2023 Junhao Huang ([email protected])
*
* Licensed under the Apache License, Version 2.0(the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
.syntax unified
.cpu cortex-m4
.thumb

// plant_red q, qa, qinv, tmp
// Two-instruction modular reduction step (Plantard-style, going by the macro
// name -- TODO confirm against the reference implementation).
//   q    : register whose TOP halfword holds the modulus q
//   qa   : register holding the accumulator constant added by smlatt
//   qinv : register holding the 32-bit multiplier applied by mul
//   tmp  : value to reduce; overwritten, the result is left in its top halfword
// Only \tmp is written and neither instruction is a flag-setting form.
.macro plant_red q, qa, qinv, tmp
mul \tmp, \tmp, \qinv
// tmp = low 32 bits of tmp*qinv (tmp*qinv mod 2^2n); the part used next, / 2^n, is the high half
smlatt \tmp, \tmp, \q, \qa
// tmp = top16(tmp) * top16(q) + qa; the reduced result is in the high half
.endm

#### r0: out; r1: a; r2: b; r3: bprime
// small_asymmetric_mul_asm_769_opt_m7(out, a, b, bprime)
//
// Software-pipelined version of small_asymmetric_mul_asm_769. The work is
// split into
//   preamble : loads the first word of a, b and bprime for block 0,
//   kernel   : finishes block k (second words via [rX, #-4]) while already
//              loading the first words of block k+1 (lines marked 'e'),
//   postamble: finishes the last block without loading past the inputs.
// One block = two 32-bit words (8 bytes) per input and 8 bytes of output;
// 512 bytes of output are produced in total (64 blocks).
//
// r4-r11 and lr are pushed on entry and popped (lr into pc) on exit;
// r12 is used as scratch and is not saved.
.align 2
.global small_asymmetric_mul_asm_769_opt_m7
.type small_asymmetric_mul_asm_769_opt_m7, %function
small_asymmetric_mul_asm_769_opt_m7:
        push.w {r4-r11, lr}

        // Reduction constants (see plant_red).
        movw r14, #24608 // qa
        // Only the top halfword of r12 is written; smlatt reads only that half.
        movt r12, #769 // q
        // r11 = (58632 << 16) | 64769 = 0xE508FD01
        movw r11, #64769
        movt r11, #58632 // qinv
        .equ width, 4
        // r10 = out + 512: end-of-output pointer of the un-pipelined loop.
        add.w r10, r0, #256*2
        // Instructions: 3
        // Expected cycles: 3
        // Expected IPC: 1.00
        //
        // Wall time: 0.01s
        // User time: 0.01s
        //
        // ----- cycle (expected) ------>
        // 0 25
        // |------------------------|----
        ldr.w r7, [r1], #2*4 // *.............................
        ldr.w r9, [r2], #2*4 // .*............................
        ldr.w r5, [r3], #2*4 // ..*...........................

        // ------ cycle (expected) ------>
        // 0 25
        // |------------------------|-----
        // ldr.w r7, [r1], #2*4 // *..............................
        // ldr.w r9, [r2], #2*4 // .*.............................
        // ldr.w r5, [r3], #2*4 // ..*............................

        // Loop-bound fixup.
        // The kernel evaluates `cmp.w r0, r10` BEFORE its two post-incrementing
        // stores, so in kernel iteration k the compare sees r0 = out + 8*k
        // (one block behind the value r0 has at the branch). In addition one
        // block is peeled off into the postamble. The end pointer therefore
        // has to be pulled back by two blocks (16 bytes): the kernel exits at
        // k = 62, i.e. runs 63 times, and the postamble handles block 64.
        // With a fixup of only 8 the kernel ran 64 times and the postamble
        // processed a 65th block, reading 8 bytes past a, b and bprime and
        // writing 8 bytes past out.
        sub r10, r10, #16
_asymmetric_mul_16_loop:
        // Instructions: 23
        // Expected cycles: 14
        // Expected IPC: 1.64
        //
        // Wall time: 1.17s
        // User time: 1.17s
        //
        // ----- cycle (expected) ------>
        // 0 25
        // |------------------------|----
        smuad r6, r7, r5 // *.............................
        ldr r5, [r1, #-4] // .*............................
        smuadx r8, r7, r9 // .*............................
        ldr.w r7, [r1], #2*4 // ..e...........................
        mul r6, r6, r11 // ..*...........................
        ldr r4, [r3, #-4] // ...*..........................
        mul r8, r8, r11 // ...*..........................
        smlatt r6, r6, r12, r14 // ....*.........................
        ldr r9, [r2, #-4] // ....*.........................
        smlatt r8, r8, r12, r14 // .....*........................
        smuad r4, r5, r4 // ......*.......................
        pkhtb r6, r8, r6, asr #16 // .......*......................
        smuadx r5, r5, r9 // .......*......................
        ldr.w r9, [r2], #2*4 // ........e.....................
        mul r4, r4, r11 // ........*.....................
        mul r8, r5, r11 // .........*....................
        ldr.w r5, [r3], #2*4 // .........e....................
        smlatt r4, r4, r12, r14 // ..........*...................
        smlatt r8, r8, r12, r14 // ...........*..................
        // NOTE: r0 has not yet been advanced for this block when it is compared.
        cmp.w r0, r10 // ...........*..................
        str.w r6, [r0], #4 // ............*.................
        pkhtb r4, r8, r4, asr #16 // .............*................
        str.w r4, [r0], #4 // .............*................

        // ------ cycle (expected) ------>
        // 0 25
        // |------------------------|-----
        // ldr.w r7, [r1, #4] // ............'*............'~...
        // ldr.w r4, [r1], #2*4 // e...........'.~...........'.~..
        // ldr.w r8, [r2, #4] // ..~.........'...*.........'....
        // ldr.w r5, [r2], #2*4 // ......e.....'.......~.....'....
        // ldr.w r9, [r3, #4] // .~..........'..*..........'..~.
        // ldr.w r6, [r3], #2*4 // .......e....'........~....'....
        // smuad r6, r4, r6 // ............*.............~....
        // mul r6, r6, r11 // ~...........'.*...........'.~..
        // smlatt r6, r6, r12, r14 // ..~.........'...*.........'....
        // smuadx r5, r4, r5 // ............'*............'~...
        // mul r5, r5, r11 // .~..........'..*..........'..~.
        // smlatt r5, r5, r12, r14 // ...~........'....*........'....
        // pkhtb r5, r5, r6, asr #16 // .....~......'......*......'....
        // str.w r5, [r0], #4 // ..........~.'...........*.'....
        // smuad r6, r7, r9 // ....~.......'.....*.......'....
        // mul r6, r6, r11 // ......~.....'.......*.....'....
        // smlatt r6, r6, r12, r14 // ........~...'.........*...'....
        // smuadx r8, r7, r8 // .....~......'......*......'....
        // mul r8, r8, r11 // .......~....'........*....'....
        // smlatt r8, r8, r12, r14 // .........~..'..........*..'....
        // pkhtb r8, r8, r6, asr #16 // ...........~'............*'....
        // str.w r8, [r0], #4 // ...........~'............*'....
        // cmp.w r0, r10 // .........~..'..........*..'....

        // Flags still come from the cmp above: str and pkhtb do not set flags.
        bne _asymmetric_mul_16_loop
        // Instructions: 20
        // Expected cycles: 14
        // Expected IPC: 1.43
        //
        // Wall time: 0.08s
        // User time: 0.08s
        //
        // ----- cycle (expected) ------>
        // 0 25
        // |------------------------|----
        ldr r8, [r1, #-4] // *.............................
        smuad r5, r7, r5 // *.............................
        smuadx r6, r7, r9 // .*............................
        ldr r9, [r3, #-4] // .*............................
        ldr r7, [r2, #-4] // ..*...........................
        mul r4, r5, r11 // ..*...........................
        smuad r9, r8, r9 // ...*..........................
        smuadx r7, r8, r7 // ....*.........................
        mul r9, r9, r11 // .....*........................
        mul r6, r6, r11 // ......*.......................
        smlatt r4, r4, r12, r14 // .......*......................
        smlatt r6, r6, r12, r14 // ........*.....................
        mul r7, r7, r11 // .........*....................
        pkhtb r6, r6, r4, asr #16 // ..........*...................
        smlatt r9, r9, r12, r14 // ..........*...................
        smlatt r7, r7, r12, r14 // ...........*..................
        cmp.w r0, r10 // ............*.................
        str.w r6, [r0], #4 // ............*.................
        pkhtb r7, r7, r9, asr #16 // .............*................
        str.w r7, [r0], #4 // .............*................

        // ------ cycle (expected) ------>
        // 0 25
        // |------------------------|-----
        // smuad r6, r7, r5 // *..............................
        // ldr r5, [r1, #-4] // *..............................
        // smuadx r8, r7, r9 // .*.............................
        // mul r6, r6, r11 // ..*............................
        // ldr r4, [r3, #-4] // .*.............................
        // mul r8, r8, r11 // ......*........................
        // smlatt r6, r6, r12, r14 // .......*.......................
        // ldr r9, [r2, #-4] // ..*............................
        // smlatt r8, r8, r12, r14 // ........*......................
        // smuad r4, r5, r4 // ...*...........................
        // pkhtb r6, r8, r6, asr #16 // ..........*....................
        // smuadx r5, r5, r9 // ....*..........................
        // mul r4, r4, r11 // .....*.........................
        // mul r8, r5, r11 // .........*.....................
        // smlatt r4, r4, r12, r14 // ..........*....................
        // smlatt r8, r8, r12, r14 // ...........*...................
        // cmp.w r0, r10 // ............*..................
        // str.w r6, [r0], #4 // ............*..................
        // pkhtb r4, r8, r4, asr #16 // .............*.................
        // str.w r4, [r0], #4 // .............*.................


        pop.w {r4-r11, pc}

.size small_asymmetric_mul_asm_769_opt_m7, .-small_asymmetric_mul_asm_769_opt_m7

0 comments on commit ffd541f

Please sign in to comment.