From 87f789c47610ac8459ccd042e2f0965d3628d9f1 Mon Sep 17 00:00:00 2001 From: Hanno Becker Date: Sun, 17 Nov 2024 19:09:39 +0000 Subject: [PATCH 1/2] Bench: Support PMU_CYCLES on x86_64 For benchmarking on AArch64, we offer the options PMU_CYCLES and PERF_CYCLES: PMU_CYCLES directly reads from PMU registers, while PERF_CYCLES uses the Perf kernel module. On x86_64, we so far only supported PERF_CYCLES. This commit adds support for PMU_CYCLES on x86_64, using the `rdtsc` instruction. The choice between x86_64 and AArch64 is automatic if PMU_CYCLES is set, based on compiler-specified architecture flags. Signed-off-by: Hanno Becker --- test/hal/hal.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/test/hal/hal.c b/test/hal/hal.c index 36357bdd9..8a1e99079 100644 --- a/test/hal/hal.c +++ b/test/hal/hal.c @@ -34,6 +34,25 @@ #if defined(PMU_CYCLES) +#if defined(__x86_64__) + +void enable_cyclecounter(void) {} + +void disable_cyclecounter(void) {} + +uint64_t get_cyclecounter(void) { + uint64_t result; + + __asm__ volatile("rdtsc; shlq $32,%%rdx; orq %%rdx,%%rax" + : "=a"(result) + : + : "%rdx"); + + return result; +} + +#elif defined(__AARCH64EL__) || defined(_M_ARM64) + void enable_cyclecounter(void) { uint64_t tmp; __asm __volatile( @@ -61,6 +80,10 @@ uint64_t get_cyclecounter(void) { return retval; } +#else +#error PMU_CYCLES option only supported on x86_64 and AArch64 +#endif + #elif defined(PERF_CYCLES) #include From 530a7954acad829367ba1ca457ad3e8bb2bd0487 Mon Sep 17 00:00:00 2001 From: Hanno Becker Date: Sun, 17 Nov 2024 19:15:49 +0000 Subject: [PATCH 2/2] CI: Benchmark x86_64-based EC2 instances using PMU instead of PERF Previous benchmarks on x86_64-based EC2 instances indicated some inaccuracy in the performance measurements using the `perf` kernel module. Performance measurements using `rdtsc`, in turn, seem to work more reliably. 
This commit changes the benchmarking CI for x86_64 instances to use `rdtsc` to obtain the cycle counter, and no longer `perf`. To allow this the choice between PMU and PERF first needs to be made an additional parameter to the EC2 benchmarking workflow. Signed-off-by: Hanno Becker --- .github/workflows/bench.yml | 8 ++++++++ .github/workflows/bench_ec2_reusable.yml | 8 ++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index c83f3cf15..407bb4afd 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -72,36 +72,43 @@ jobs: ec2_ami: ubuntu-latest (aarch64) archflags: -mcpu=cortex-a76 -march=armv8.2-a cflags: -DFORCE_AARCH64 + perf: PERF - name: Graviton3 ec2_instance_type: c7g.medium ec2_ami: ubuntu-latest (aarch64) archflags: -march=armv8.4-a+sha3 cflags: -DFORCE_AARCH64 + perf: PERF - name: Graviton4 ec2_instance_type: c8g.medium ec2_ami: ubuntu-latest (aarch64) archflags: -march=armv9-a+sha3 cflags: -DFORCE_AARCH64 + perf: PERF - name: AMD EPYC 4th gen (c7a) ec2_instance_type: c7a.medium ec2_ami: ubuntu-latest (x86_64) archflags: -mavx2 -mbmi2 -mpopcnt -maes cflags: -DFORCE_X86_64 + perf: PMU - name: Intel Xeon 4th gen (c7i) ec2_instance_type: c7i.large ec2_ami: ubuntu-latest (x86_64) archflags: -mavx2 -mbmi2 -mpopcnt -maes cflags: -DFORCE_X86_64 + perf: PMU - name: AMD EPYC 3rd gen (c6a) ec2_instance_type: c7a.medium ec2_ami: ubuntu-latest (x86_64) archflags: -mavx2 -mbmi2 -mpopcnt -maes cflags: -DFORCE_X86_64 + perf: PMU - name: Intel Xeon 3rd gen (c6i) ec2_instance_type: c7i.large ec2_ami: ubuntu-latest (x86_64) archflags: -mavx2 -mbmi2 -mpopcnt -maes cflags: -DFORCE_X86_64 + perf: PMU uses: ./.github/workflows/bench_ec2_reusable.yml if: github.repository_owner == 'pq-code-package' && (github.event.label.name == 'benchmark' || github.ref == 'refs/heads/main') with: @@ -112,4 +119,5 @@ jobs: opt: ${{ matrix.opt.value }} store_results: ${{ github.repository_owner 
== 'pq-code-package' && github.ref == 'refs/heads/main' }} # Only store optimized results name: "${{ matrix.target.name }}${{ (!matrix.opt.value && ' (no-opt)') || ''}}" + perf: ${{ matrix.target.perf }} secrets: inherit diff --git a/.github/workflows/bench_ec2_reusable.yml b/.github/workflows/bench_ec2_reusable.yml index e44b783a7..b822ab40a 100644 --- a/.github/workflows/bench_ec2_reusable.yml +++ b/.github/workflows/bench_ec2_reusable.yml @@ -34,6 +34,10 @@ on: type: boolean description: Runs with optimized code if enabled. default: true + perf: + type: string + description: Method by which clock cycles should be measured (PMU | PERF) + default: PERF store_results: type: boolean description: Indicates if results should be pushed to github pages @@ -121,7 +125,7 @@ jobs: cflags: ${{ inputs.cflags }} archflags: ${{ inputs.archflags }} opt: ${{ inputs.opt }} - perf: PERF + perf: ${{ inputs.perf }} store_results: ${{ inputs.store_results }} bench_extra_args: ${{ inputs.bench_extra_args }} gh_token: ${{ secrets.AWS_GITHUB_TOKEN }} @@ -148,7 +152,7 @@ jobs: cflags: ${{ inputs.cflags }} archflags: ${{ inputs.archflags }} opt: ${{ inputs.opt }} - perf: PERF + perf: ${{ inputs.perf }} store_results: ${{ inputs.store_results }} bench_extra_args: ${{ inputs.bench_extra_args }} gh_token: ${{ secrets.AWS_GITHUB_TOKEN }}