From 71cc3a33e08cd3d64372d2c90956c8663d4c28a8 Mon Sep 17 00:00:00 2001
From: "Matthias J. Kannwischer"
Date: Thu, 20 Jun 2024 13:07:31 +0800
Subject: [PATCH] Add PMU+PERF benchmarking code (#68)

* add benchmarks using PMU cycle counter

Signed-off-by: Matthias J. Kannwischer

* add benchmarking script

Signed-off-by: Matthias J. Kannwischer

* fix warnings on MacOS

Signed-off-by: Matthias J. Kannwischer

* add PERF cycle counting as well

Signed-off-by: Matthias J. Kannwischer

* format

Signed-off-by: Matthias J. Kannwischer

* only print output for benchmarks

Signed-off-by: Matthias J. Kannwischer

---------

Signed-off-by: Matthias J. Kannwischer
---
 Makefile           |  35 +++++++++++-
 scripts/tests      |  53 ++++++++++++++++--
 test/bench_kyber.c | 111 ++++++++++++++++++++++++++++++++++++++
 test/hal.c         | 131 +++++++++++++++++++++++++++++++++++++++++++++
 test/hal.h         |  33 ++++++++++++
 5 files changed, 357 insertions(+), 6 deletions(-)
 create mode 100644 test/bench_kyber.c
 create mode 100644 test/hal.c
 create mode 100644 test/hal.h

diff --git a/Makefile b/Makefile
index e5607c4c0..f24454db8 100644
--- a/Makefile
+++ b/Makefile
@@ -18,6 +18,16 @@ ifeq ($(HOST_PLATFORM),Linux-x86_64)
 	CFLAGS += -static
 endif
 
+CYCLES ?= NO
+
+ifeq ($(CYCLES),PMU)
+	CFLAGS += -DPMU_CYCLES
+endif
+
+ifeq ($(CYCLES),PERF)
+	CFLAGS += -DPERF_CYCLES
+endif
+
 CFLAGS_RANDOMBYTES = ${CFLAGS} ${INCLUDE_RANDOM}
 CFLAGS_NISTRANDOMBYTES = ${CFLAGS} ${INCLUDE_NISTRANDOM}
 NISTFLAGS += -Wno-unused-result -O3 -fomit-frame-pointer
@@ -27,21 +37,29 @@ SOURCES = mlkem/kem.c mlkem/indcpa.c mlkem/polyvec.c mlkem/poly.c mlkem/ntt.c ml
 SOURCESKECCAK = $(SOURCES) fips202/keccakf1600.c fips202/fips202.c mlkem/symmetric-shake.c
 SOURCESKECCAKRANDOM = $(SOURCESKECCAK) randombytes/randombytes.c
 SOURCESNISTKATS = $(SOURCESKECCAK) test/nistrng/aes.c test/nistrng/rng.c
+SOURCESBENCH = $(SOURCESKECCAKRANDOM) test/hal.c
 
 HEADERS = mlkem/params.h mlkem/kem.h mlkem/indcpa.h mlkem/polyvec.h mlkem/poly.h mlkem/ntt.h mlkem/cbd.h mlkem/reduce.h mlkem/verify.h mlkem/symmetric.h
 HEADERSKECCAK = $(HEADERS) fips202/keccakf1600.h fips202/fips202.h
 HEADERSKECCAKRANDOM = $(HEADERSKECCAK) randombytes/randombytes.h
 HEADERNISTKATS = $(HEADERSKECCAK) test/nistrng/aes.h test/nistrng/randombytes.h
+HEADERSBENCH = $(HEADERSKECCAKRANDOM) test/hal.h
+
 
 .PHONY: all mlkem kat nistkat clean
 
-all: mlkem kat nistkat
+all: mlkem bench kat nistkat
 
 mlkem: \
   test/bin/test_kyber512 \
   test/bin/test_kyber768 \
   test/bin/test_kyber1024
 
+bench: \
+  test/bin/bench_kyber512 \
+  test/bin/bench_kyber768 \
+  test/bin/bench_kyber1024
+
 nistkat: \
   test/bin/gen_NISTKAT512 \
   test/bin/gen_NISTKAT768 \
@@ -67,6 +85,21 @@ test/bin/test_kyber1024: test/test_kyber.c $(SOURCESKECCAKRANDOM) $(HEADERSKECCA
 	$(Q)[ -d $(@D) ] || mkdir -p $(@D)
 	$(CC) $(CFLAGS_RANDOMBYTES) -DKYBER_K=4 $(SOURCESKECCAKRANDOM) $< -o $@
 
+test/bin/bench_kyber512: test/bench_kyber.c $(SOURCESBENCH) $(HEADERSBENCH)
+	$(Q)echo " CC $@"
+	$(Q)[ -d $(@D) ] || mkdir -p $(@D)
+	$(CC) $(CFLAGS_RANDOMBYTES) -DKYBER_K=2 $(SOURCESBENCH) $< -o $@
+
+test/bin/bench_kyber768: test/bench_kyber.c $(SOURCESBENCH) $(HEADERSBENCH)
+	$(Q)echo " CC $@"
+	$(Q)[ -d $(@D) ] || mkdir -p $(@D)
+	$(CC) $(CFLAGS_RANDOMBYTES) -DKYBER_K=3 $(SOURCESBENCH) $< -o $@
+
+test/bin/bench_kyber1024: test/bench_kyber.c $(SOURCESBENCH) $(HEADERSBENCH)
+	$(Q)echo " CC $@"
+	$(Q)[ -d $(@D) ] || mkdir -p $(@D)
+	$(CC) $(CFLAGS_RANDOMBYTES) -DKYBER_K=4 $(SOURCESBENCH) $< -o $@
+
 test/bin/gen_KAT512: test/gen_KAT.c $(SOURCESKECCAKRANDOM) $(HEADERSKECCAKRANDOM)
 	$(Q)echo " CC $@"
 	$(Q)[ -d $(@D) ] || mkdir -p $(@D)
diff --git a/scripts/tests b/scripts/tests
index b3fb9220e..88eab2bd0 100755
--- a/scripts/tests
+++ b/scripts/tests
@@ -25,11 +25,16 @@ def sha256sum(result):
     return m.hexdigest()
 
 
-def base_run(bin, force_qemu, verbose):
+def base_run(bin, force_qemu, verbose, cycles="NO"):
     if force_qemu or (platform.system() == "Linux" and platform.machine() == "x86_64"):
         logging.debug(f"Emulating {bin} with QEMU")
 
-        args = ["make", "CROSS_PREFIX=aarch64-none-linux-gnu-", f"{bin}"]
+        args = [
+            "make",
+            "CROSS_PREFIX=aarch64-none-linux-gnu-",
+            f"CYCLES={cycles}",
+            f"{bin}",
+        ]
         logging.info(" ".join(args))
 
         p = subprocess.run(
@@ -49,7 +54,7 @@ def base_run(bin, force_qemu, verbose):
     else:
         logging.debug(f"Running {bin} natively")
 
-        args = ["make", f"{bin}"]
+        args = ["make", f"CYCLES={cycles}", f"{bin}"]
         logging.info(" ".join(args))
 
         p = subprocess.run(
@@ -97,7 +102,9 @@ def parse_meta(scheme, field):
     return result.stdout.strip()
 
 
-def test_schemes(title, scheme2file, actual_proc, expect_proc, force_qemu, verbose):
+def test_schemes(
+    title, scheme2file, actual_proc, expect_proc, force_qemu, verbose, cycles="NO"
+):
     logging.info(f"{title}")
 
     summary_file = os.environ.get("GITHUB_STEP_SUMMARY")
@@ -117,9 +124,11 @@ def test_schemes(title, scheme2file, actual_proc, expect_proc, force_qemu, verbo
         return (fail, summary)
 
     fail = False
+    results = {}
     for scheme in SCHEME:
         bin = scheme2file(scheme)
-        result = base_run(bin, force_qemu, verbose)
+        result = base_run(bin, force_qemu, verbose, cycles)
+        results[scheme] = result
 
         actual = actual_proc(result)
         expect = expect_proc(scheme)
@@ -135,6 +144,8 @@ def test_schemes(title, scheme2file, actual_proc, expect_proc, force_qemu, verbo
     if fail:
         sys.exit(1)
 
+    return results
+
 
 def validate_force_qemu(ctx, _, v):
     if platform.system() == "Darwin" and v:
@@ -254,6 +265,37 @@ def kat(force_qemu, verbose):
     )
 
 
+@click.command(
+    short_help="Run the benchmarks for all parameter sets",
+    context_settings={"show_default": True},
+)
+@add_options(_shared_options)
+@click.option(
+    "-c",
+    "--cycles",
+    nargs=1,
+    type=click.Choice(["NO", "PMU", "PERF"]),
+    show_default=True,
+    default="NO",
+    help="Method for counting clock cycles. PMU requires (user-space) access to the Arm Performance Monitor Unit (PMU). PERF requires a kernel with perf support.",
+)
+def bench(force_qemu, verbose, cycles):
+    config_logger(verbose)
+
+    results = test_schemes(
+        "benchmark",
+        lambda scheme: scheme.name.replace("MLKEM", "test/bin/bench_kyber"),
+        lambda _: True,
+        lambda _: True,
+        force_qemu,
+        verbose,
+        cycles=cycles,
+    )
+    for scheme, result in results.items():
+        print(scheme)
+        print(result.decode())
+
+
 @click.group(invoke_without_command=True)
 def cli():
     pass
@@ -263,6 +305,7 @@ cli.add_command(run)
 cli.add_command(func)
 cli.add_command(nistkat)
 cli.add_command(kat)
+cli.add_command(bench)
 
 if __name__ == "__main__":
     cli()
diff --git a/test/bench_kyber.c b/test/bench_kyber.c
new file mode 100644
index 000000000..ae0f4747c
--- /dev/null
+++ b/test/bench_kyber.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: Apache-2.0
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include "kem.h"
+#include "hal.h"
+#include "randombytes.h"
+
+#define NWARMUP 50
+#define NITERATIONS 300
+#define NTESTS 200
+
+static int cmp_uint64_t(const void *a, const void *b)
+{
+  return (*(const uint64_t *)a > *(const uint64_t *)b) - (*(const uint64_t *)a < *(const uint64_t *)b);
+}
+
+static int bench(void)
+{
+  uint8_t pk[CRYPTO_PUBLICKEYBYTES];
+  uint8_t sk[CRYPTO_SECRETKEYBYTES];
+  uint8_t ct[CRYPTO_CIPHERTEXTBYTES];
+  uint8_t key_a[CRYPTO_BYTES];
+  uint8_t key_b[CRYPTO_BYTES];
+  unsigned char kg_rand[2 * CRYPTO_BYTES], enc_rand[CRYPTO_BYTES];
+  uint64_t cycles_kg[NTESTS], cycles_enc[NTESTS], cycles_dec[NTESTS];
+
+  unsigned int i, j;
+  uint64_t t0, t1;
+
+
+  for (i = 0; i < NTESTS; i++)
+  {
+
+    randombytes(kg_rand, 2 * CRYPTO_BYTES);
+    randombytes(enc_rand, CRYPTO_BYTES);
+
+    // Key-pair generation
+    for (j = 0; j < NWARMUP; j++)
+    {
+      crypto_kem_keypair_derand(pk, sk, kg_rand);
+    }
+
+    t0 = get_cyclecounter();
+    for (j = 0; j < NITERATIONS; j++)
+    {
+      crypto_kem_keypair_derand(pk, sk, kg_rand);
+    }
+    t1 = get_cyclecounter();
+    cycles_kg[i] = t1 - t0;
+
+
+    // Encapsulation
+    for (j = 0; j < NWARMUP; j++)
+    {
+      crypto_kem_enc_derand(ct, key_a, pk, enc_rand);
+    }
+    t0 = get_cyclecounter();
+    for (j = 0; j < NITERATIONS; j++)
+    {
+      crypto_kem_enc_derand(ct, key_a, pk, enc_rand);
+    }
+    t1 = get_cyclecounter();
+    cycles_enc[i] = t1 - t0;
+
+    // Decapsulation
+    for (j = 0; j < NWARMUP; j++)
+    {
+      crypto_kem_dec(key_b, ct, sk);
+    }
+    t0 = get_cyclecounter();
+    for (j = 0; j < NITERATIONS; j++)
+    {
+      crypto_kem_dec(key_b, ct, sk);
+    }
+    t1 = get_cyclecounter();
+    cycles_dec[i] = t1 - t0;
+
+
+    if (memcmp(key_a, key_b, CRYPTO_BYTES))
+    {
+      printf("ERROR keys\n");
+      return 1;
+    }
+  }
+
+  qsort(cycles_kg, NTESTS, sizeof(uint64_t), cmp_uint64_t);
+  qsort(cycles_enc, NTESTS, sizeof(uint64_t), cmp_uint64_t);
+  qsort(cycles_dec, NTESTS, sizeof(uint64_t), cmp_uint64_t);
+
+  printf("keypair cycles=%"PRIu64"\n", cycles_kg[NTESTS >> 1]/NITERATIONS);
+  printf("encaps cycles=%"PRIu64"\n", cycles_enc[NTESTS >> 1]/NITERATIONS);
+  printf("decaps cycles=%"PRIu64"\n", cycles_dec[NTESTS >> 1]/NITERATIONS);
+
+  return 0;
+}
+
+int main(void)
+{
+  enable_cyclecounter();
+  const int rc = bench();
+  disable_cyclecounter();
+
+  printf("CRYPTO_SECRETKEYBYTES: %d\n", CRYPTO_SECRETKEYBYTES);
+  printf("CRYPTO_PUBLICKEYBYTES: %d\n", CRYPTO_PUBLICKEYBYTES);
+  printf("CRYPTO_CIPHERTEXTBYTES: %d\n", CRYPTO_CIPHERTEXTBYTES);
+
+  return rc;
+}
diff --git a/test/hal.c b/test/hal.c
new file mode 100644
index 000000000..c5406bd17
--- /dev/null
+++ b/test/hal.c
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2022 Arm Limited
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include "hal.h"
+
+#if defined(PMU_CYCLES)
+void enable_cyclecounter(void)
+{
+  uint64_t tmp;
+  __asm __volatile (
+    "mrs %[tmp], pmcr_el0\n"
+    "orr %[tmp], %[tmp], #1\n"
+    "msr pmcr_el0, %[tmp]\n"
+    "mrs %[tmp], pmcntenset_el0\n"
+    "orr %[tmp], %[tmp], #1<<31\n"
+    "msr pmcntenset_el0, %[tmp]\n"
+    : [tmp] "=r" (tmp)
+  );
+}
+
+void disable_cyclecounter(void)
+{
+  uint64_t tmp;
+  __asm __volatile (
+    "mov %[tmp], #0x3f\n"
+    "orr %[tmp], %[tmp], #1<<31\n"
+    "msr pmcntenclr_el0, %[tmp]\n"
+    : [tmp] "=r" (tmp)
+  );
+}
+
+uint64_t get_cyclecounter(void)
+{
+  uint64_t retval;
+  __asm __volatile (
+    "mrs %[retval], pmccntr_el0\n"
+    : [retval] "=r" (retval));
+  return retval;
+}
+
+#elif defined(PERF_CYCLES)
+
+#include <asm/unistd.h>
+#include <linux/perf_event.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+
+static int perf_fd = 0;
+void enable_cyclecounter(void)
+{
+  struct perf_event_attr pe;
+  memset(&pe, 0, sizeof(struct perf_event_attr));
+  pe.type = PERF_TYPE_HARDWARE;
+  pe.size = sizeof(struct perf_event_attr);
+  pe.config = PERF_COUNT_HW_CPU_CYCLES;
+  pe.disabled = 1;
+  pe.exclude_kernel = 1;
+  pe.exclude_hv = 1;
+
+  perf_fd = syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0);
+
+  ioctl(perf_fd, PERF_EVENT_IOC_RESET, 0);
+  ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, 0);
+}
+
+void disable_cyclecounter(void)
+{
+  ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, 0);
+  close(perf_fd);
+}
+
+uint64_t get_cyclecounter(void)
+{
+  long long cpu_cycles;
+  ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, 0);
+  ssize_t read_count = read(perf_fd, &cpu_cycles, sizeof(cpu_cycles));
+  if (read_count < 0)
+  {
+    perror("read");
+    exit(EXIT_FAILURE);
+  }
+  else if (read_count != (ssize_t)sizeof(cpu_cycles))
+  {
+    /* Empty or short read leaves cpu_cycles incomplete; should not happen */
+    printf("perf counter empty\n");
+    exit(EXIT_FAILURE);
+  }
+  ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, 0);
+  return (uint64_t)cpu_cycles;
+}
+
+#else
+
+void enable_cyclecounter(void)
+{
+  return;
+}
+void disable_cyclecounter(void)
+{
+  return;
+}
+uint64_t get_cyclecounter(void)
+{
+  return (0);
+}
+
+#endif
diff --git a/test/hal.h b/test/hal.h
new file mode 100644
index 000000000..754899069
--- /dev/null
+++ b/test/hal.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2022 Arm Limited
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#ifndef HAL_H
+#define HAL_H
+
+#include <stdint.h>
+
+void enable_cyclecounter(void);
+void disable_cyclecounter(void);
+uint64_t get_cyclecounter(void);
+
+#endif