diff --git a/mlkem/native/aarch64/aarch64_zetas.c b/mlkem/native/aarch64/aarch64_zetas.c new file mode 100644 index 000000000..05899dff8 --- /dev/null +++ b/mlkem/native/aarch64/aarch64_zetas.c @@ -0,0 +1,130 @@ +// Copyright (c) 2024 The mlkem-native project authors +// SPDX-License-Identifier: Apache-2.0 + +// WARNING: This file is auto-generated from scripts/autogenerate_files.py +// Do not modify it directly. + +#include "arith_native_aarch64.h" + +#ifdef MLKEM_USE_NATIVE_AARCH64 + +// Table of zeta values used in the AArch64 forward NTT +// See autogenerate_files.py for details. +const int16_t aarch64_ntt_zetas_layer01234[] = { + -1600, -15749, -749, -7373, -40, -394, -687, -6762, 630, 6201, + -1432, -14095, 848, 8347, 0, 0, 1062, 10453, 296, 2914, + -882, -8682, 0, 0, -1410, -13879, 1339, 13180, 1476, 14529, + 0, 0, 193, 1900, -283, -2786, 56, 551, 0, 0, + 797, 7845, -1089, -10719, 1333, 13121, 0, 0, -543, -5345, + 1426, 14036, -1235, -12156, 0, 0, -69, -679, 535, 5266, + -447, -4400, 0, 0, 569, 5601, -936, -9213, -450, -4429, + 0, 0, -1583, -15582, -1355, -13338, 821, 8081, 0, 0, +}; + +const int16_t aarch64_ntt_zetas_layer56[] = { + 289, 289, 331, 331, -76, -76, -1573, -1573, 2845, + 2845, 3258, 3258, -748, -748, -15483, -15483, 17, 17, + 583, 583, 1637, 1637, -1041, -1041, 167, 167, 5739, + 5739, 16113, 16113, -10247, -10247, -568, -568, -680, -680, + 723, 723, 1100, 1100, -5591, -5591, -6693, -6693, 7117, + 7117, 10828, 10828, 1197, 1197, -1025, -1025, -1052, -1052, + -1274, -1274, 11782, 11782, -10089, -10089, -10355, -10355, -12540, + -12540, 1409, 1409, -48, -48, 756, 756, -314, -314, + 13869, 13869, -472, -472, 7441, 7441, -3091, -3091, -667, + -667, 233, 233, -1173, -1173, -279, -279, -6565, -6565, + 2293, 2293, -11546, -11546, -2746, -2746, 650, 650, -1352, + -1352, -816, -816, 632, 632, 6398, 6398, -13308, -13308, + -8032, -8032, 6221, 6221, -1626, -1626, -540, -540, -1482, + -1482, 1461, 1461, -16005, -16005, -5315, -5315, -14588, -14588, + 14381, 
14381, 1651, 1651, -1540, -1540, 952, 952, -642, + -642, 16251, 16251, -15159, -15159, 9371, 9371, -6319, -6319, + -464, -464, 33, 33, 1320, 1320, -1414, -1414, -4567, + -4567, 325, 325, 12993, 12993, -13918, -13918, 939, 939, + -892, -892, 733, 733, 268, 268, 9243, 9243, -8780, + -8780, 7215, 7215, 2638, 2638, -1021, -1021, -941, -941, + -992, -992, 641, 641, -10050, -10050, -9262, -9262, -9764, + -9764, 6309, 6309, -1010, -1010, 1435, 1435, 807, 807, + 452, 452, -9942, -9942, 14125, 14125, 7943, 7943, 4449, + 4449, 1584, 1584, -1292, -1292, 375, 375, -1239, -1239, + 15592, 15592, -12717, -12717, 3691, 3691, -12196, -12196, -1031, + -1031, -109, -109, -780, -780, 1645, 1645, -10148, -10148, + -1073, -1073, -7678, -7678, 16192, 16192, 1438, 1438, -461, + -461, 1534, 1534, -927, -927, 14155, 14155, -4538, -4538, + 15099, 15099, -9125, -9125, 1063, 1063, -556, -556, -1230, + -1230, -863, -863, 10463, 10463, -5473, -5473, -12107, -12107, + -8495, -8495, 319, 319, 757, 757, 561, 561, -735, + -735, 3140, 3140, 7451, 7451, 5522, 5522, -7235, -7235, + -682, -682, -712, -712, 1481, 1481, 648, 648, -6713, + -6713, -7008, -7008, 14578, 14578, 6378, 6378, -525, -525, + 403, 403, 1143, 1143, -554, -554, -5168, -5168, 3967, + 3967, 11251, 11251, -5453, -5453, 1092, 1092, 1026, 1026, + -1179, -1179, 886, 886, 10749, 10749, 10099, 10099, -11605, + -11605, 8721, 8721, -855, -855, -219, -219, 1227, 1227, + 910, 910, -8416, -8416, -2156, -2156, 12078, 12078, 8957, + 8957, -1607, -1607, -1455, -1455, -1219, -1219, 885, 885, + -15818, -15818, -14322, -14322, -11999, -11999, 8711, 8711, 1212, + 1212, 1029, 1029, -394, -394, -1175, -1175, 11930, 11930, + 10129, 10129, -3878, -3878, -11566, -11566, +}; + +const int16_t aarch64_invntt_zetas_layer01234[] = { + 1583, 15582, -821, -8081, 1355, 13338, 0, 0, -569, -5601, + 450, 4429, 936, 9213, 0, 0, 69, 679, 447, 4400, + -535, -5266, 0, 0, 543, 5345, 1235, 12156, -1426, -14036, + 0, 0, -797, -7845, -1333, -13121, 1089, 10719, 0, 0, + -193, 
-1900, -56, -551, 283, 2786, 0, 0, 1410, 13879, + -1476, -14529, -1339, -13180, 0, 0, -1062, -10453, 882, 8682, + -296, -2914, 0, 0, 1600, 15749, 40, 394, 749, 7373, + -848, -8347, 1432, 14095, -630, -6201, 687, 6762, 0, 0, +}; + +const int16_t aarch64_invntt_zetas_layer56[] = { + -910, -910, -1227, -1227, 219, 219, 855, 855, -8957, + -8957, -12078, -12078, 2156, 2156, 8416, 8416, 1175, 1175, + 394, 394, -1029, -1029, -1212, -1212, 11566, 11566, 3878, + 3878, -10129, -10129, -11930, -11930, -885, -885, 1219, 1219, + 1455, 1455, 1607, 1607, -8711, -8711, 11999, 11999, 14322, + 14322, 15818, 15818, -648, -648, -1481, -1481, 712, 712, + 682, 682, -6378, -6378, -14578, -14578, 7008, 7008, 6713, + 6713, -886, -886, 1179, 1179, -1026, -1026, -1092, -1092, + -8721, -8721, 11605, 11605, -10099, -10099, -10749, -10749, 554, + 554, -1143, -1143, -403, -403, 525, 525, 5453, 5453, + -11251, -11251, -3967, -3967, 5168, 5168, 927, 927, -1534, + -1534, 461, 461, -1438, -1438, 9125, 9125, -15099, -15099, + 4538, 4538, -14155, -14155, 735, 735, -561, -561, -757, + -757, -319, -319, 7235, 7235, -5522, -5522, -7451, -7451, + -3140, -3140, 863, 863, 1230, 1230, 556, 556, -1063, + -1063, 8495, 8495, 12107, 12107, 5473, 5473, -10463, -10463, + -452, -452, -807, -807, -1435, -1435, 1010, 1010, -4449, + -4449, -7943, -7943, -14125, -14125, 9942, 9942, -1645, -1645, + 780, 780, 109, 109, 1031, 1031, -16192, -16192, 7678, + 7678, 1073, 1073, 10148, 10148, 1239, 1239, -375, -375, + 1292, 1292, -1584, -1584, 12196, 12196, -3691, -3691, 12717, + 12717, -15592, -15592, 1414, 1414, -1320, -1320, -33, -33, + 464, 464, 13918, 13918, -12993, -12993, -325, -325, 4567, + 4567, -641, -641, 992, 992, 941, 941, 1021, 1021, + -6309, -6309, 9764, 9764, 9262, 9262, 10050, 10050, -268, + -268, -733, -733, 892, 892, -939, -939, -2638, -2638, + -7215, -7215, 8780, 8780, -9243, -9243, -632, -632, 816, + 816, 1352, 1352, -650, -650, -6221, -6221, 8032, 8032, + 13308, 13308, -6398, -6398, 642, 642, -952, -952, 
1540, + 1540, -1651, -1651, 6319, 6319, -9371, -9371, 15159, 15159, + -16251, -16251, -1461, -1461, 1482, 1482, 540, 540, 1626, + 1626, -14381, -14381, 14588, 14588, 5315, 5315, 16005, 16005, + 1274, 1274, 1052, 1052, 1025, 1025, -1197, -1197, 12540, + 12540, 10355, 10355, 10089, 10089, -11782, -11782, 279, 279, + 1173, 1173, -233, -233, 667, 667, 2746, 2746, 11546, + 11546, -2293, -2293, 6565, 6565, 314, 314, -756, -756, + 48, 48, -1409, -1409, 3091, 3091, -7441, -7441, 472, + 472, -13869, -13869, 1573, 1573, 76, 76, -331, -331, + -289, -289, 15483, 15483, 748, 748, -3258, -3258, -2845, + -2845, -1100, -1100, -723, -723, 680, 680, 568, 568, + -10828, -10828, -7117, -7117, 6693, 6693, 5591, 5591, 1041, + 1041, -1637, -1637, -583, -583, -17, -17, 10247, 10247, + -16113, -16113, -5739, -5739, -167, -167, +}; + +#else /* MLKEM_USE_NATIVE_AARCH64 */ +// Dummy declaration for compilers disliking empty compilation units +int empty_cu_aarch64_zetas; +#endif /* MLKEM_USE_NATIVE_AARCH64 */ diff --git a/mlkem/native/aarch64/arith_native_aarch64.h b/mlkem/native/aarch64/arith_native_aarch64.h index 8f0303a2c..f3f9d7f28 100644 --- a/mlkem/native/aarch64/arith_native_aarch64.h +++ b/mlkem/native/aarch64/arith_native_aarch64.h @@ -9,17 +9,30 @@ #ifdef MLKEM_USE_NATIVE_AARCH64 +#define aarch64_ntt_zetas_layer01234 \ + MLKEM_NAMESPACE(aarch64_ntt_zetas_layer01234) +#define aarch64_ntt_zetas_layer56 MLKEM_NAMESPACE(aarch64_ntt_zetas_layer56) +#define aarch64_invntt_zetas_layer01234 \ + MLKEM_NAMESPACE(aarch64_invntt_zetas_layer01234) +#define aarch64_invntt_zetas_layer56 \ + MLKEM_NAMESPACE(aarch64_invntt_zetas_layer56) + +extern const int16_t aarch64_ntt_zetas_layer01234[]; +extern const int16_t aarch64_ntt_zetas_layer56[]; +extern const int16_t aarch64_invntt_zetas_layer01234[]; +extern const int16_t aarch64_invntt_zetas_layer56[]; + #define ntt_asm_clean MLKEM_NAMESPACE(ntt_asm_clean) -void ntt_asm_clean(int16_t *); +void ntt_asm_clean(int16_t *, const int16_t *, const int16_t 
*); #define ntt_asm_opt MLKEM_NAMESPACE(ntt_asm_opt) -void ntt_asm_opt(int16_t *); +void ntt_asm_opt(int16_t *, const int16_t *, const int16_t *); #define intt_asm_clean MLKEM_NAMESPACE(intt_asm_clean) -void intt_asm_clean(int16_t *); +void intt_asm_clean(int16_t *, const int16_t *, const int16_t *); #define intt_asm_opt MLKEM_NAMESPACE(intt_asm_opt) -void intt_asm_opt(int16_t *); +void intt_asm_opt(int16_t *, const int16_t *, const int16_t *); #define rej_uniform_asm_clean MLKEM_NAMESPACE(rej_uniform_asm_clean) unsigned int rej_uniform_asm_clean(int16_t *r, const uint8_t *buf, diff --git a/mlkem/native/aarch64/intt_123_45_67_twiddles.S b/mlkem/native/aarch64/intt_123_45_67_twiddles.S deleted file mode 100644 index 4691ee0a3..000000000 --- a/mlkem/native/aarch64/intt_123_45_67_twiddles.S +++ /dev/null @@ -1,497 +0,0 @@ -/// Copyright (c) 2024 The mlkem-native project authors -/// Copyright (c) 2022 Arm Limited -/// Copyright (c) 2022 Hanno Becker -/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE. -/// - -#include "config.h" -#if defined(MLKEM_USE_NATIVE_AARCH64) - -roots_l56: -.short -910 -.short -910 -.short -1227 -.short -1227 -.short 219 -.short 219 -.short 855 -.short 855 -.short -8957 -.short -8957 -.short -12078 -.short -12078 -.short 2156 -.short 2156 -.short 8416 -.short 8416 -.short 1175 -.short 1175 -.short 394 -.short 394 -.short -1029 -.short -1029 -.short -1212 -.short -1212 -.short 11566 -.short 11566 -.short 3878 -.short 3878 -.short -10129 -.short -10129 -.short -11930 -.short -11930 -.short -885 -.short -885 -.short 1219 -.short 1219 -.short 1455 -.short 1455 -.short 1607 -.short 1607 -.short -8711 -.short -8711 -.short 11999 -.short 11999 -.short 14322 -.short 14322 -.short 15818 -.short 15818 -.short -648 -.short -648 -.short -1481 -.short -1481 -.short 712 -.short 712 -.short 682 -.short 682 -.short -6378 -.short -6378 -.short -14578 -.short -14578 -.short 7008 -.short 7008 -.short 6713 -.short 6713 -.short -886 -.short -886 -.short 1179 -.short 1179 -.short -1026 -.short -1026 -.short -1092 -.short -1092 -.short -8721 -.short -8721 -.short 11605 -.short 11605 -.short -10099 -.short -10099 -.short -10749 -.short -10749 -.short 554 -.short 554 -.short -1143 -.short -1143 -.short -403 -.short -403 -.short 525 -.short 525 -.short 5453 -.short 5453 -.short -11251 -.short -11251 -.short -3967 -.short -3967 -.short 5168 -.short 5168 -.short 927 -.short 927 -.short -1534 -.short -1534 -.short 461 -.short 461 -.short -1438 -.short -1438 -.short 9125 -.short 9125 -.short -15099 -.short -15099 -.short 4538 -.short 4538 -.short -14155 -.short -14155 -.short 735 -.short 735 -.short -561 -.short -561 -.short -757 -.short -757 -.short -319 -.short -319 -.short 7235 -.short 7235 
-.short -5522 -.short -5522 -.short -7451 -.short -7451 -.short -3140 -.short -3140 -.short 863 -.short 863 -.short 1230 -.short 1230 -.short 556 -.short 556 -.short -1063 -.short -1063 -.short 8495 -.short 8495 -.short 12107 -.short 12107 -.short 5473 -.short 5473 -.short -10463 -.short -10463 -.short -452 -.short -452 -.short -807 -.short -807 -.short -1435 -.short -1435 -.short 1010 -.short 1010 -.short -4449 -.short -4449 -.short -7943 -.short -7943 -.short -14125 -.short -14125 -.short 9942 -.short 9942 -.short -1645 -.short -1645 -.short 780 -.short 780 -.short 109 -.short 109 -.short 1031 -.short 1031 -.short -16192 -.short -16192 -.short 7678 -.short 7678 -.short 1073 -.short 1073 -.short 10148 -.short 10148 -.short 1239 -.short 1239 -.short -375 -.short -375 -.short 1292 -.short 1292 -.short -1584 -.short -1584 -.short 12196 -.short 12196 -.short -3691 -.short -3691 -.short 12717 -.short 12717 -.short -15592 -.short -15592 -.short 1414 -.short 1414 -.short -1320 -.short -1320 -.short -33 -.short -33 -.short 464 -.short 464 -.short 13918 -.short 13918 -.short -12993 -.short -12993 -.short -325 -.short -325 -.short 4567 -.short 4567 -.short -641 -.short -641 -.short 992 -.short 992 -.short 941 -.short 941 -.short 1021 -.short 1021 -.short -6309 -.short -6309 -.short 9764 -.short 9764 -.short 9262 -.short 9262 -.short 10050 -.short 10050 -.short -268 -.short -268 -.short -733 -.short -733 -.short 892 -.short 892 -.short -939 -.short -939 -.short -2638 -.short -2638 -.short -7215 -.short -7215 -.short 8780 -.short 8780 -.short -9243 -.short -9243 -.short -632 -.short -632 -.short 816 -.short 816 -.short 1352 -.short 1352 -.short -650 -.short -650 -.short -6221 -.short -6221 -.short 8032 -.short 8032 -.short 13308 -.short 13308 -.short -6398 -.short -6398 -.short 642 -.short 642 -.short -952 -.short -952 -.short 1540 -.short 1540 -.short -1651 -.short -1651 -.short 6319 -.short 6319 -.short -9371 -.short -9371 -.short 15159 -.short 15159 -.short -16251 -.short 
-16251 -.short -1461 -.short -1461 -.short 1482 -.short 1482 -.short 540 -.short 540 -.short 1626 -.short 1626 -.short -14381 -.short -14381 -.short 14588 -.short 14588 -.short 5315 -.short 5315 -.short 16005 -.short 16005 -.short 1274 -.short 1274 -.short 1052 -.short 1052 -.short 1025 -.short 1025 -.short -1197 -.short -1197 -.short 12540 -.short 12540 -.short 10355 -.short 10355 -.short 10089 -.short 10089 -.short -11782 -.short -11782 -.short 279 -.short 279 -.short 1173 -.short 1173 -.short -233 -.short -233 -.short 667 -.short 667 -.short 2746 -.short 2746 -.short 11546 -.short 11546 -.short -2293 -.short -2293 -.short 6565 -.short 6565 -.short 314 -.short 314 -.short -756 -.short -756 -.short 48 -.short 48 -.short -1409 -.short -1409 -.short 3091 -.short 3091 -.short -7441 -.short -7441 -.short 472 -.short 472 -.short -13869 -.short -13869 -.short 1573 -.short 1573 -.short 76 -.short 76 -.short -331 -.short -331 -.short -289 -.short -289 -.short 15483 -.short 15483 -.short 748 -.short 748 -.short -3258 -.short -3258 -.short -2845 -.short -2845 -.short -1100 -.short -1100 -.short -723 -.short -723 -.short 680 -.short 680 -.short 568 -.short 568 -.short -10828 -.short -10828 -.short -7117 -.short -7117 -.short 6693 -.short 6693 -.short 5591 -.short 5591 -.short 1041 -.short 1041 -.short -1637 -.short -1637 -.short -583 -.short -583 -.short -17 -.short -17 -.short 10247 -.short 10247 -.short -16113 -.short -16113 -.short -5739 -.short -5739 -.short -167 -.short -167 -roots_l34: -.short 1583 -.short 15582 -.short -821 -.short -8081 -.short 1355 -.short 13338 -.short 0 -.short 0 -.short -569 -.short -5601 -.short 450 -.short 4429 -.short 936 -.short 9213 -.short 0 -.short 0 -.short 69 -.short 679 -.short 447 -.short 4400 -.short -535 -.short -5266 -.short 0 -.short 0 -.short 543 -.short 5345 -.short 1235 -.short 12156 -.short -1426 -.short -14036 -.short 0 -.short 0 -.short -797 -.short -7845 -.short -1333 -.short -13121 -.short 1089 -.short 10719 -.short 0 
-.short 0 -.short -193 -.short -1900 -.short -56 -.short -551 -.short 283 -.short 2786 -.short 0 -.short 0 -.short 1410 -.short 13879 -.short -1476 -.short -14529 -.short -1339 -.short -13180 -.short 0 -.short 0 -.short -1062 -.short -10453 -.short 882 -.short 8682 -.short -296 -.short -2914 -.short 0 -.short 0 -roots_l012: -.short 1600 -.short 15749 -.short 40 -.short 394 -.short 749 -.short 7373 -.short -848 -.short -8347 -.short 1432 -.short 14095 -.short -630 -.short -6201 -.short 687 -.short 6762 -.short 0 -.short 0 - -#endif /* MLKEM_USE_NATIVE_AARCH64 */ diff --git a/mlkem/native/aarch64/intt_clean.S b/mlkem/native/aarch64/intt_clean.S index cbb261100..2ac4e0f15 100644 --- a/mlkem/native/aarch64/intt_clean.S +++ b/mlkem/native/aarch64/intt_clean.S @@ -76,22 +76,22 @@ mls \a\().8h, t0.8h, consts.h[0] .endm -.macro load_roots_123 - ldr q_root0, [r_ptr0], #32 - ldr q_root1, [r_ptr0, #-16] +.macro load_roots_012 + ldr q_root0, [r01234_ptr], #32 + ldr q_root1, [r01234_ptr, #-16] .endm -.macro load_next_roots_45 - ldr q_root0, [r_ptr0], #16 +.macro load_next_roots_34 + ldr q_root0, [r01234_ptr], #16 .endm -.macro load_next_roots_67 - ldr q_root0, [r_ptr1], #(6*16) - ldr q_root0_tw, [r_ptr1, #(-6*16 + 1*16)] - ldr q_root1, [r_ptr1, #(-6*16 + 2*16)] - ldr q_root1_tw, [r_ptr1, #(-6*16 + 3*16)] - ldr q_root2, [r_ptr1, #(-6*16 + 4*16)] - ldr q_root2_tw, [r_ptr1, #(-6*16 + 5*16)] +.macro load_next_roots_56 + ldr q_root0, [r56_ptr], #(6*16) + ldr q_root0_tw, [r56_ptr, #(-6*16 + 1*16)] + ldr q_root1, [r56_ptr, #(-6*16 + 2*16)] + ldr q_root1_tw, [r56_ptr, #(-6*16 + 3*16)] + ldr q_root2, [r56_ptr, #(-6*16 + 4*16)] + ldr q_root2_tw, [r56_ptr, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -143,10 +143,6 @@ // are NOT canonically reduced. The ordering of the coefficients is canonical, // also matching PQClean. 
-.data -.p2align 4 -roots: -#include "intt_123_45_67_twiddles.S" .text .global MLKEM_NAMESPACE(intt_asm_clean) @@ -182,11 +178,12 @@ MLKEM_NAMESPACE(intt_asm_clean): _MLKEM_NAMESPACE(intt_asm_clean): push_stack - in .req x0 - inp .req x1 - count .req x2 - r_ptr0 .req x3 - r_ptr1 .req x4 + in .req x0 + r01234_ptr .req x1 + r56_ptr .req x2 + + inp .req x3 + count .req x4 xtmp .req x5 data0 .req v8 @@ -230,9 +227,6 @@ _MLKEM_NAMESPACE(intt_asm_clean): t2 .req v27 t3 .req v28 - ASM_LOAD(r_ptr0, roots_l34) - ASM_LOAD(r_ptr1, roots_l56) - ASM_LOAD(xtmp, const_addr) ld1 {consts.8h}, [xtmp] @@ -269,7 +263,7 @@ scale_start: mov count, #8 .p2align 2 -layer4567_start: +layer3456_start: ldr q_data0, [inp, #(16*0)] ldr q_data1, [inp, #(16*1)] @@ -278,7 +272,7 @@ layer4567_start: transpose4 data // manual ld4 - load_next_roots_67 + load_next_roots_56 // Layer 7 gs_butterfly_v data0, data1, root1, root1_tw @@ -297,7 +291,7 @@ layer4567_start: transpose4 data - load_next_roots_45 + load_next_roots_34 // Layer 5 gs_butterfly data0, data1, root0, 2, 3 @@ -324,17 +318,16 @@ layer4567_start: str q_data3, [inp, #(-64 + 16*3)] subs count, count, #1 - cbnz count, layer4567_start + cbnz count, layer3456_start // --------------------------------------------------------------------- mov count, #4 - ASM_LOAD(r_ptr0, roots_l012) - load_roots_123 + load_roots_012 .p2align 2 -layer123_start: +layer012_start: ldr q_data0, [in, #0] ldr q_data1, [in, #(1*(512/8))] @@ -373,7 +366,7 @@ layer123_start: str q_data3, [in, #(-16 + 3*(512/8))] subs count, count, #1 - cbnz count, layer123_start + cbnz count, layer012_start pop_stack ret diff --git a/mlkem/native/aarch64/intt_opt.S b/mlkem/native/aarch64/intt_opt.S index 59b11c5c8..48c412f36 100644 --- a/mlkem/native/aarch64/intt_opt.S +++ b/mlkem/native/aarch64/intt_opt.S @@ -35,6 +35,11 @@ // // See mlken/reduce.c and test/test_bounds.py for more details. 
.macro mulmodq dst, src, const, idx0, idx1 + // Signed barrett multiplication using + // round-to-nearest-even-integer approximation. + // Following https://eprint.iacr.org/2021/986.pdf, this + // is functionally the same as a signed Montgomery multiplication + // with a suitable constant of absolute value < q. sqrdmulh t2.8h, \src\().8h, \const\().h[\idx1\()] mul \dst\().8h, \src\().8h, \const\().h[\idx0\()] mls \dst\().8h, t2.8h, consts.h[0] @@ -71,22 +76,22 @@ mls \a\().8h, t0.8h, consts.h[0] .endm -.macro load_roots_123 - ldr q_root0, [r_ptr0], #32 - ldr q_root1, [r_ptr0, #-16] +.macro load_roots_012 + ldr q_root0, [r01234_ptr], #32 + ldr q_root1, [r01234_ptr, #-16] .endm -.macro load_next_roots_45 - ldr q_root0, [r_ptr0], #16 +.macro load_next_roots_34 + ldr q_root0, [r01234_ptr], #16 .endm -.macro load_next_roots_67 - ldr q_root0, [r_ptr1], #(6*16) - ldr q_root0_tw, [r_ptr1, #(-6*16 + 1*16)] - ldr q_root1, [r_ptr1, #(-6*16 + 2*16)] - ldr q_root1_tw, [r_ptr1, #(-6*16 + 3*16)] - ldr q_root2, [r_ptr1, #(-6*16 + 4*16)] - ldr q_root2_tw, [r_ptr1, #(-6*16 + 5*16)] +.macro load_next_roots_56 + ldr q_root0, [r56_ptr], #(6*16) + ldr q_root0_tw, [r56_ptr, #(-6*16 + 1*16)] + ldr q_root1, [r56_ptr, #(-6*16 + 2*16)] + ldr q_root1_tw, [r56_ptr, #(-6*16 + 3*16)] + ldr q_root2, [r56_ptr, #(-6*16 + 4*16)] + ldr q_root2_tw, [r56_ptr, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -138,11 +143,8 @@ // are NOT canonically reduced. The ordering of the coefficients is canonical, // also matching PQClean. 
-.data -.p2align 4 -roots: -#include "intt_123_45_67_twiddles.S" .text + .global MLKEM_NAMESPACE(intt_asm_opt) .global _MLKEM_NAMESPACE(intt_asm_opt) @@ -176,11 +178,12 @@ MLKEM_NAMESPACE(intt_asm_opt): _MLKEM_NAMESPACE(intt_asm_opt): push_stack - in .req x0 - inp .req x1 - count .req x2 - r_ptr0 .req x3 - r_ptr1 .req x4 + in .req x0 + r01234_ptr .req x1 + r56_ptr .req x2 + + inp .req x3 + count .req x4 xtmp .req x5 data0 .req v8 @@ -224,9 +227,6 @@ _MLKEM_NAMESPACE(intt_asm_opt): t2 .req v27 t3 .req v28 - ASM_LOAD(r_ptr0, roots_l34) - ASM_LOAD(r_ptr1, roots_l56) - ASM_LOAD(xtmp, const_addr) ld1 {consts.8h}, [xtmp] @@ -241,191 +241,70 @@ _MLKEM_NAMESPACE(intt_asm_opt): mov inp, in mov count, #8 - // Instructions: 3 - // Expected cycles: 5 - // Expected IPC: 0.60 - // - // Cycle bound: 5.0 - // IPC bound: 0.60 - // - // Wall time: 0.00s - // User time: 0.00s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr q12, [x1, #16] // *............................. - ldr q31, [x1, #32] // ..*........................... - ldr q6, [x1, #48] // ....*......................... - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr q12, [x1, #16] // *.............................. - // ldr q31, [x1, #32] // ..*............................ - // ldr q6, [x1, #48] // ....*.......................... - - sub count, count, #1 scale_start: - // Instructions: 20 - // Expected cycles: 24 - // Expected IPC: 0.83 - // - // Cycle bound: 24.0 - // IPC bound: 0.83 - // - // Wall time: 0.51s - // User time: 0.51s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr q10, [x1, #0] // *............................. - sqrdmulh v13.8H, v12.8H, v30.8H // ..*........................... - mul v21.8H, v12.8H, v29.8H // ...*.......................... - sqrdmulh v3.8H, v10.8H, v30.8H // ....*......................... - mul v10.8H, v10.8H, v29.8H // .....*........................ 
- sqrdmulh v12.8H, v31.8H, v30.8H // ......*....................... - mls v21.8H, v13.8H, v7.H[0] // .......*...................... - mul v13.8H, v31.8H, v29.8H // ........*..................... - mls v10.8H, v3.8H, v7.H[0] // .........*.................... - sqrdmulh v3.8H, v6.8H, v30.8H // ..........*................... - mul v6.8H, v6.8H, v29.8H // ...........*.................. - mls v13.8H, v12.8H, v7.H[0] // ............*................. - str q10, [x1], #64 // .............*................ - ldr q12, [x1, #16] // ..............e............... - str q21, [x1, #-48] // ................*............. - mls v6.8H, v3.8H, v7.H[0] // .................*............ - str q13, [x1, #-32] // ..................*........... - ldr q31, [x1, #32] // ...................e.......... - str q6, [x1, #-16] // .....................*........ - ldr q6, [x1, #48] // ......................e....... - // ------ cycle (expected) -------> - // 0 25 - // |------------------------|------ - // ldr q8, [x1, #(16*0)] // ..........*..................... - // ldr q9, [x1, #(16*1)] // e.........'.............~....... - // ldr q10, [x1, #(16*2)] // .....e....'..................~.. - // ldr q11, [x1, #(16*3)] // ........e.'..................... - // sqrdmulh v27.8h, v8.8h, v30.8h // ..........'...*................. - // mul v8.8h, v8.8h, v29.8h // ..........'....*................ - // mls v8.8h, v27.8h, v7.h[0] // ..........'........*............ - // sqrdmulh v27.8h, v9.8h, v30.8h // ..........'.*................... - // mul v9.8h, v9.8h, v29.8h // ..........'..*.................. - // mls v9.8h, v27.8h, v7.h[0] // ..........'......*.............. - // sqrdmulh v27.8h, v10.8h, v30.8h // ..........'.....*............... - // mul v10.8h, v10.8h, v29.8h // ..........'.......*............. - // mls v10.8h, v27.8h, v7.h[0] // ..........'...........*......... - // sqrdmulh v27.8h, v11.8h, v30.8h // ..........'.........*........... - // mul v11.8h, v11.8h, v29.8h // ..........'..........*.......... 
- // mls v11.8h, v27.8h, v7.h[0] // ...~......'................*.... - // str q8, [x1], #64 // ..........'............*........ - // str q9, [x1, #(-64 + 16*1)] // ..~.......'...............*..... - // str q10, [x1, #(-64 + 16*2)] // ....~.....'.................*... - // str q11, [x1, #(-64 + 16*3)] // .......~..'....................* + ldr q_data0, [inp, #(16*0)] + ldr q_data1, [inp, #(16*1)] + ldr q_data2, [inp, #(16*2)] + ldr q_data3, [inp, #(16*3)] - sub count, count, #1 - cbnz count, scale_start - // Instructions: 17 - // Expected cycles: 20 - // Expected IPC: 0.85 - // - // Cycle bound: 20.0 - // IPC bound: 0.85 - // - // Wall time: 0.16s - // User time: 0.16s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - sqrdmulh v0.8H, v6.8H, v30.8H // *............................. - mul v13.8H, v6.8H, v29.8H // .*............................ - ldr q10, [x1, #0] // ..*........................... - mul v23.8H, v31.8H, v29.8H // ....*......................... - mls v13.8H, v0.8H, v7.H[0] // .....*........................ - sqrdmulh v4.8H, v10.8H, v30.8H // ......*....................... - mul v18.8H, v10.8H, v29.8H // .......*...................... - sqrdmulh v10.8H, v12.8H, v30.8H // ........*..................... - mul v21.8H, v12.8H, v29.8H // .........*.................... - sqrdmulh v0.8H, v31.8H, v30.8H // ..........*................... - mls v18.8H, v4.8H, v7.H[0] // ...........*.................. - str q13, [x1, #48] // ............*................. - mls v21.8H, v10.8H, v7.H[0] // .............*................ - mls v23.8H, v0.8H, v7.H[0] // ..............*............... - str q18, [x1], #64 // ...............*.............. - str q21, [x1, #-48] // .................*............ - str q23, [x1, #-32] // ...................*.......... 
+ mul_ninv data0, data1, data2, data3, data0, data1, data2, data3 + // Bounds: Absolute value < q - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr q10, [x1, #0] // ..*............................ - // sqrdmulh v13.8H, v12.8H, v30.8H // ........*...................... - // mul v21.8H, v12.8H, v29.8H // .........*..................... - // sqrdmulh v3.8H, v10.8H, v30.8H // ......*........................ - // mul v10.8H, v10.8H, v29.8H // .......*....................... - // sqrdmulh v12.8H, v31.8H, v30.8H // ..........*.................... - // mls v21.8H, v13.8H, v7.H[0] // .............*................. - // mul v13.8H, v31.8H, v29.8H // ....*.......................... - // mls v10.8H, v3.8H, v7.H[0] // ...........*................... - // sqrdmulh v3.8H, v6.8H, v30.8H // *.............................. - // mul v6.8H, v6.8H, v29.8H // .*............................. - // mls v13.8H, v12.8H, v7.H[0] // ..............*................ - // str q10, [x1], #64 // ...............*............... - // str q21, [x1, #-48] // .................*............. - // mls v6.8H, v3.8H, v7.H[0] // .....*......................... - // str q13, [x1, #-32] // ...................*........... - // str q6, [x1, #-16] // ............*.................. + str q_data0, [inp], #64 + str q_data1, [inp, #(-64 + 16*1)] + str q_data2, [inp, #(-64 + 16*2)] + str q_data3, [inp, #(-64 + 16*3)] + subs count, count, #1 + cbnz count, scale_start mov inp, in mov count, #8 .p2align 2 - // Instructions: 11 - // Expected cycles: 20 - // Expected IPC: 0.55 - // - // Cycle bound: 20.0 - // IPC bound: 0.55 - // - // Wall time: 0.01s - // User time: 0.01s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr q10, [x1, #0] // *............................. - ldr q21, [x1, #16] // ..*........................... - ldr q31, [x1, #32] // ....*......................... - ldr q12, [x1, #48] // ......*....................... 
- ldr q5, [x4], #(6*16) // ........*..................... - trn1 v30.4S, v31.4S, v12.4S // ..........*................... - ldr q9, [x4, #-80] // ...........*.................. - ldr q15, [x4, #-64] // .............*................ - ldr q6, [x4, #-48] // ...............*.............. - ldr q25, [x4, #-32] // .................*............ - ldr q20, [x4, #-16] // ...................*.......... + // Instructions: 11 + // Expected cycles: 20 + // Expected IPC: 0.55 + // + // Cycle bound: 20.0 + // IPC bound: 0.55 + // + // Wall time: 0.01s + // User time: 0.01s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q26, [x3, #0] // *............................. + ldr q8, [x3, #16] // ..*........................... + ldr q24, [x3, #32] // ....*......................... + ldr q16, [x3, #48] // ......*....................... + ldr q9, [x2], #(6*16) // ........*..................... + trn1 v0.4S, v24.4S, v16.4S // ..........*................... + ldr q6, [x2, #-80] // ...........*.................. + ldr q3, [x2, #-64] // .............*................ + ldr q15, [x2, #-48] // ...............*.............. + ldr q4, [x2, #-32] // .................*............ + ldr q28, [x2, #-16] // ...................*.......... - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr q10, [x1, #0] // *.............................. - // ldr q21, [x1, #16] // ..*............................ - // ldr q31, [x1, #32] // ....*.......................... - // ldr q12, [x1, #48] // ......*........................ - // trn1 v30.4S, v31.4S, v12.4S // ..........*.................... - // ldr q5, [x4], #(6*16) // ........*...................... - // ldr q9, [x4, #-80] // ...........*................... - // ldr q15, [x4, #-64] // .............*................. - // ldr q6, [x4, #-48] // ...............*............... - // ldr q25, [x4, #-32] // .................*............. 
- // ldr q20, [x4, #-16] // ...................*........... + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q26, [x3, #0] // *.............................. + // ldr q8, [x3, #16] // ..*............................ + // ldr q24, [x3, #32] // ....*.......................... + // ldr q16, [x3, #48] // ......*........................ + // trn1 v0.4S, v24.4S, v16.4S // ..........*.................... + // ldr q9, [x2], #(6*16) // ........*...................... + // ldr q6, [x2, #-80] // ...........*................... + // ldr q3, [x2, #-64] // .............*................. + // ldr q15, [x2, #-48] // ...............*............... + // ldr q4, [x2, #-32] // .................*............. + // ldr q28, [x2, #-16] // ...................*........... sub count, count, #1 -layer4567_start: +layer3456_start: // Instructions: 83 // Expected cycles: 94 // Expected IPC: 0.88 @@ -439,97 +318,97 @@ layer4567_start: // ------------------------------------- cycle (expected) --------------------------------------> // 0 25 50 75 // |------------------------|------------------------|------------------------|------------------ - trn1 v13.4S, v10.4S, v21.4S // *............................................................................................. - trn2 v10.4S, v10.4S, v21.4S // .*............................................................................................ - trn2 v21.4S, v31.4S, v12.4S // ..*........................................................................................... - trn2 v3.2D, v13.2D, v30.2D // ...*.......................................................................................... - trn1 v13.2D, v13.2D, v30.2D // ....*......................................................................................... - trn2 v12.2D, v10.2D, v21.2D // .....*........................................................................................ 
- trn1 v10.2D, v10.2D, v21.2D // ......*....................................................................................... - sub v21.8H, v3.8H, v12.8H // .......*...................................................................................... - add v3.8H, v3.8H, v12.8H // ........*..................................................................................... - sub v12.8H, v13.8H, v10.8H // .........*.................................................................................... - add v13.8H, v13.8H, v10.8H // ..........*................................................................................... - sqrdmulh v10.8H, v21.8H, v20.8H // ...........*.................................................................................. - sqrdmulh v6.8H, v12.8H, v6.8H // ............*................................................................................. - mul v12.8H, v12.8H, v15.8H // .............*................................................................................ - mul v21.8H, v21.8H, v25.8H // ..............*............................................................................... - sub v30.8H, v13.8H, v3.8H // ...............*.............................................................................. - add v13.8H, v13.8H, v3.8H // ................*............................................................................. - mls v12.8H, v6.8H, v7.H[0] // .................*............................................................................ - mls v21.8H, v10.8H, v7.H[0] // ..................*........................................................................... - sqrdmulh v10.8H, v30.8H, v9.8H // ...................*.......................................................................... - mul v3.8H, v30.8H, v5.8H // ....................*......................................................................... 
- ldr q6, [x3], #16 // .....................*........................................................................ - sub v30.8H, v12.8H, v21.8H // .......................*...................................................................... - mls v3.8H, v10.8H, v7.H[0] // ........................*..................................................................... - add v10.8H, v12.8H, v21.8H // .........................*.................................................................... - sqrdmulh v21.8H, v30.8H, v9.8H // ..........................*................................................................... - mul v12.8H, v30.8H, v5.8H // ...........................*.................................................................. - trn1 v30.4S, v13.4S, v10.4S // ............................*................................................................. - trn2 v13.4S, v13.4S, v10.4S // .............................*................................................................ - ldr q10, [x1, #64] // ..............................e............................................................... - mls v12.8H, v21.8H, v7.H[0] // ................................*............................................................. - ldr q21, [x1, #80] // .................................e............................................................ - ldr q31, [x1, #96] // ...................................e.......................................................... - trn1 v5.4S, v3.4S, v12.4S // .....................................*........................................................ - trn2 v3.4S, v3.4S, v12.4S // ......................................*....................................................... - ldr q12, [x1, #112] // .......................................e...................................................... - trn2 v9.2D, v30.2D, v5.2D // .........................................*.................................................... 
- trn2 v15.2D, v13.2D, v3.2D // ..........................................*................................................... - trn1 v30.2D, v30.2D, v5.2D // ...........................................*.................................................. - trn1 v13.2D, v13.2D, v3.2D // ............................................*................................................. - sub v3.8H, v9.8H, v15.8H // .............................................*................................................ - sub v5.8H, v30.8H, v13.8H // ..............................................*............................................... - add v13.8H, v30.8H, v13.8H // ...............................................*.............................................. - sqrdmulh v30.8H, v3.8H, v6.H[5] // ................................................*............................................. - sqrdmulh v25.8H, v5.8H, v6.H[3] // .................................................*............................................ - mul v5.8H, v5.8H, v6.H[2] // ..................................................*........................................... - mul v3.8H, v3.8H, v6.H[4] // ...................................................*.......................................... - add v9.8H, v9.8H, v15.8H // ....................................................*......................................... - sqdmulh v15.8H, v13.8H, v7.H[1] // .....................................................*........................................ - mls v5.8H, v25.8H, v7.H[0] // ......................................................*....................................... - mls v3.8H, v30.8H, v7.H[0] // .......................................................*...................................... - sqdmulh v30.8H, v9.8H, v7.H[1] // ........................................................*..................................... 
- srshr v15.8H, v15.8H, #11 // .........................................................*.................................... - sqdmulh v25.8H, v5.8H, v7.H[1] // ..........................................................*................................... - sqdmulh v20.8H, v3.8H, v7.H[1] // ...........................................................*.................................. - mls v13.8H, v15.8H, v7.H[0] // ............................................................*................................. - srshr v30.8H, v30.8H, #11 // .............................................................*................................ - srshr v15.8H, v25.8H, #11 // ..............................................................*............................... - srshr v25.8H, v20.8H, #11 // ...............................................................*.............................. - mls v9.8H, v30.8H, v7.H[0] // ................................................................*............................. - mls v5.8H, v15.8H, v7.H[0] // .................................................................*............................ - mls v3.8H, v25.8H, v7.H[0] // ..................................................................*........................... - trn1 v30.4S, v31.4S, v12.4S // ...................................................................e.......................... - sub v15.8H, v13.8H, v9.8H // ....................................................................*......................... - add v13.8H, v13.8H, v9.8H // .....................................................................*........................ - sub v9.8H, v5.8H, v3.8H // ......................................................................*....................... - sqrdmulh v25.8H, v15.8H, v6.H[1] // .......................................................................*...................... 
- mul v15.8H, v15.8H, v6.H[0] // ........................................................................*..................... - sqrdmulh v20.8H, v9.8H, v6.H[1] // .........................................................................*.................... - mul v6.8H, v9.8H, v6.H[0] // ..........................................................................*................... - add v3.8H, v5.8H, v3.8H // ...........................................................................*.................. - mls v15.8H, v25.8H, v7.H[0] // ............................................................................*................. - str q13, [x1], #(64) // .............................................................................*................ - mls v6.8H, v20.8H, v7.H[0] // ..............................................................................*............... - str q3, [x1, #-48] // ...............................................................................*.............. - ldr q5, [x4], #(6*16) // ................................................................................e............. - str q15, [x1, #-32] // ..................................................................................*........... - ldr q9, [x4, #-80] // ...................................................................................e.......... - str q6, [x1, #-16] // .....................................................................................*........ - ldr q15, [x4, #-64] // ......................................................................................e....... - ldr q6, [x4, #-48] // ........................................................................................e..... - ldr q25, [x4, #-32] // ..........................................................................................e... - ldr q20, [x4, #-16] // ............................................................................................e. 
+ trn1 v12.4S, v26.4S, v8.4S // *............................................................................................. + trn2 v26.4S, v26.4S, v8.4S // .*............................................................................................ + trn2 v8.4S, v24.4S, v16.4S // ..*........................................................................................... + trn2 v11.2D, v12.2D, v0.2D // ...*.......................................................................................... + trn1 v12.2D, v12.2D, v0.2D // ....*......................................................................................... + trn2 v16.2D, v26.2D, v8.2D // .....*........................................................................................ + trn1 v26.2D, v26.2D, v8.2D // ......*....................................................................................... + sub v8.8H, v11.8H, v16.8H // .......*...................................................................................... + add v11.8H, v11.8H, v16.8H // ........*..................................................................................... + sub v16.8H, v12.8H, v26.8H // .........*.................................................................................... + add v12.8H, v12.8H, v26.8H // ..........*................................................................................... + sqrdmulh v26.8H, v8.8H, v28.8H // ...........*.................................................................................. + sqrdmulh v15.8H, v16.8H, v15.8H // ............*................................................................................. + mul v16.8H, v16.8H, v3.8H // .............*................................................................................ + mul v8.8H, v8.8H, v4.8H // ..............*............................................................................... 
+ sub v0.8H, v12.8H, v11.8H // ...............*.............................................................................. + add v12.8H, v12.8H, v11.8H // ................*............................................................................. + mls v16.8H, v15.8H, v7.H[0] // .................*............................................................................ + mls v8.8H, v26.8H, v7.H[0] // ..................*........................................................................... + sqrdmulh v26.8H, v0.8H, v6.8H // ...................*.......................................................................... + mul v11.8H, v0.8H, v9.8H // ....................*......................................................................... + ldr q15, [x1], #16 // .....................*........................................................................ + sub v0.8H, v16.8H, v8.8H // .......................*...................................................................... + mls v11.8H, v26.8H, v7.H[0] // ........................*..................................................................... + add v26.8H, v16.8H, v8.8H // .........................*.................................................................... + sqrdmulh v8.8H, v0.8H, v6.8H // ..........................*................................................................... + mul v16.8H, v0.8H, v9.8H // ...........................*.................................................................. + trn1 v0.4S, v12.4S, v26.4S // ............................*................................................................. + trn2 v12.4S, v12.4S, v26.4S // .............................*................................................................ + ldr q26, [x3, #64] // ..............................e............................................................... 
+ mls v16.8H, v8.8H, v7.H[0] // ................................*............................................................. + ldr q8, [x3, #80] // .................................e............................................................ + ldr q24, [x3, #96] // ...................................e.......................................................... + trn1 v9.4S, v11.4S, v16.4S // .....................................*........................................................ + trn2 v11.4S, v11.4S, v16.4S // ......................................*....................................................... + ldr q16, [x3, #112] // .......................................e...................................................... + trn2 v6.2D, v0.2D, v9.2D // .........................................*.................................................... + trn2 v3.2D, v12.2D, v11.2D // ..........................................*................................................... + trn1 v0.2D, v0.2D, v9.2D // ...........................................*.................................................. + trn1 v12.2D, v12.2D, v11.2D // ............................................*................................................. + sub v11.8H, v6.8H, v3.8H // .............................................*................................................ + sub v9.8H, v0.8H, v12.8H // ..............................................*............................................... + add v12.8H, v0.8H, v12.8H // ...............................................*.............................................. + sqrdmulh v0.8H, v11.8H, v15.H[5] // ................................................*............................................. + sqrdmulh v4.8H, v9.8H, v15.H[3] // .................................................*............................................ 
+ mul v9.8H, v9.8H, v15.H[2] // ..................................................*........................................... + mul v11.8H, v11.8H, v15.H[4] // ...................................................*.......................................... + add v6.8H, v6.8H, v3.8H // ....................................................*......................................... + sqdmulh v3.8H, v12.8H, v7.H[1] // .....................................................*........................................ + mls v9.8H, v4.8H, v7.H[0] // ......................................................*....................................... + mls v11.8H, v0.8H, v7.H[0] // .......................................................*...................................... + sqdmulh v0.8H, v6.8H, v7.H[1] // ........................................................*..................................... + srshr v3.8H, v3.8H, #11 // .........................................................*.................................... + sqdmulh v4.8H, v9.8H, v7.H[1] // ..........................................................*................................... + sqdmulh v28.8H, v11.8H, v7.H[1] // ...........................................................*.................................. + mls v12.8H, v3.8H, v7.H[0] // ............................................................*................................. + srshr v0.8H, v0.8H, #11 // .............................................................*................................ + srshr v3.8H, v4.8H, #11 // ..............................................................*............................... + srshr v4.8H, v28.8H, #11 // ...............................................................*.............................. + mls v6.8H, v0.8H, v7.H[0] // ................................................................*............................. 
+ mls v9.8H, v3.8H, v7.H[0] // .................................................................*............................ + mls v11.8H, v4.8H, v7.H[0] // ..................................................................*........................... + trn1 v0.4S, v24.4S, v16.4S // ...................................................................e.......................... + sub v3.8H, v12.8H, v6.8H // ....................................................................*......................... + add v12.8H, v12.8H, v6.8H // .....................................................................*........................ + sub v6.8H, v9.8H, v11.8H // ......................................................................*....................... + sqrdmulh v4.8H, v3.8H, v15.H[1] // .......................................................................*...................... + mul v3.8H, v3.8H, v15.H[0] // ........................................................................*..................... + sqrdmulh v28.8H, v6.8H, v15.H[1] // .........................................................................*.................... + mul v15.8H, v6.8H, v15.H[0] // ..........................................................................*................... + add v11.8H, v9.8H, v11.8H // ...........................................................................*.................. + mls v3.8H, v4.8H, v7.H[0] // ............................................................................*................. + str q12, [x3], #(64) // .............................................................................*................ + mls v15.8H, v28.8H, v7.H[0] // ..............................................................................*............... + str q11, [x3, #-48] // ...............................................................................*.............. 
+ ldr q9, [x2], #(6*16) // ................................................................................e............. + str q3, [x3, #-32] // ..................................................................................*........... + ldr q6, [x2, #-80] // ...................................................................................e.......... + str q15, [x3, #-16] // .....................................................................................*........ + ldr q3, [x2, #-64] // ......................................................................................e....... + ldr q15, [x2, #-48] // ........................................................................................e..... + ldr q4, [x2, #-32] // ..........................................................................................e... + ldr q28, [x2, #-16] // ............................................................................................e. // ----------------------------------------------------------------- cycle (expected) ------------------------------------------------------------------> // 0 25 50 75 100 125 // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------ - // ldr q8, [x1, #(16*0)] // e...............................................................'.............................~....................................................... - // ldr q9, [x1, #(16*1)] // ...e............................................................'................................~.................................................... - // ldr q10, [x1, #(16*2)] // .....e..........................................................'..................................~.................................................. 
- // ldr q11, [x1, #(16*3)] // .........e......................................................'......................................~.............................................. + // ldr q8, [x3, #(16*0)] // e...............................................................'.............................~....................................................... + // ldr q9, [x3, #(16*1)] // ...e............................................................'................................~.................................................... + // ldr q10, [x3, #(16*2)] // .....e..........................................................'..................................~.................................................. + // ldr q11, [x3, #(16*3)] // .........e......................................................'......................................~.............................................. // trn1 v25.4s, v8.4s, v9.4s // ................................................................*..................................................................................... // trn2 v26.4s, v8.4s, v9.4s // ................................................................'*.................................................................................... // trn1 v27.4s, v10.4s, v11.4s // .....................................e..........................'..................................................................~.................. @@ -538,12 +417,12 @@ layer4567_start: // trn2 v11.2d, v26.2d, v28.2d // ................................................................'....*................................................................................ // trn1 v8.2d, v25.2d, v27.2d // ................................................................'...*................................................................................. 
// trn1 v9.2d, v26.2d, v28.2d // ................................................................'.....*............................................................................... - // ldr q0, [x4], #(6*16) // ..................................................e.............'...............................................................................~..... - // ldr q4, [x4, #(-6*16 + 1*16)] // .....................................................e..........'..................................................................................~.. - // ldr q1, [x4, #(-6*16 + 2*16)] // ........................................................e.......'..................................................................................... - // ldr q5, [x4, #(-6*16 + 3*16)] // ..........................................................e.....'..................................................................................... - // ldr q2, [x4, #(-6*16 + 4*16)] // ............................................................e...'..................................................................................... - // ldr q6, [x4, #(-6*16 + 5*16)] // ..............................................................e.'..................................................................................... + // ldr q0, [x2], #(6*16) // ..................................................e.............'...............................................................................~..... + // ldr q4, [x2, #(-6*16 + 1*16)] // .....................................................e..........'..................................................................................~.. + // ldr q1, [x2, #(-6*16 + 2*16)] // ........................................................e.......'..................................................................................... 
+ // ldr q5, [x2, #(-6*16 + 3*16)] // ..........................................................e.....'..................................................................................... + // ldr q2, [x2, #(-6*16 + 4*16)] // ............................................................e...'..................................................................................... + // ldr q6, [x2, #(-6*16 + 5*16)] // ..............................................................e.'..................................................................................... // sub v24.8h, v8.8h, v9.8h // ................................................................'........*............................................................................ // add v8.8h, v8.8h, v9.8h // ................................................................'.........*........................................................................... // sqrdmulh v27.8h, v24.8h, v5.8h // ................................................................'...........*......................................................................... @@ -572,7 +451,7 @@ layer4567_start: // trn2 v11.2d, v26.2d, v28.2d // ............~...................................................'.........................................*........................................... // trn1 v8.2d, v25.2d, v27.2d // .............~..................................................'..........................................*.......................................... // trn1 v9.2d, v26.2d, v28.2d // ..............~.................................................'...........................................*......................................... - // ldr q0, [x3], #16 // ................................................................'....................*................................................................ 
+ // ldr q0, [x1], #16 // ................................................................'....................*................................................................ // sub v24.8h, v8.8h, v9.8h // ................~...............................................'.............................................*....................................... // add v8.8h, v8.8h, v9.8h // .................~..............................................'..............................................*...................................... // sqrdmulh v27.8h, v24.8h, v0.h[3] // ...................~............................................'................................................*.................................... @@ -605,181 +484,180 @@ layer4567_start: // sqrdmulh v27.8h, v24.8h, v0.h[1] // ...........................................~....................'........................................................................*............ // mul v11.8h, v24.8h, v0.h[0] // ............................................~...................'.........................................................................*........... // mls v11.8h, v27.8h, v7.h[0] // ................................................~...............'.............................................................................*....... - // str q8, [x1], #(64) // ...............................................~................'............................................................................*........ - // str q9, [x1, #(-64 + 16*1)] // .................................................~..............'..............................................................................*...... - // str q10, [x1, #(-64 + 16*2)] // ....................................................~...........'.................................................................................*... 
- // str q11, [x1, #(-64 + 16*3)] // .......................................................~........'....................................................................................* + // str q8, [x3], #(64) // ...............................................~................'............................................................................*........ + // str q9, [x3, #(-64 + 16*1)] // .................................................~..............'..............................................................................*...... + // str q10, [x3, #(-64 + 16*2)] // ....................................................~...........'.................................................................................*... + // str q11, [x3, #(-64 + 16*3)] // .......................................................~........'....................................................................................* sub count, count, #1 - cbnz count, layer4567_start - // Instructions: 72 - // Expected cycles: 79 - // Expected IPC: 0.91 - // - // Cycle bound: 79.0 - // IPC bound: 0.91 - // - // Wall time: 9.38s - // User time: 9.38s - // - // ------------------------------ cycle (expected) ------------------------------> - // 0 25 50 75 - // |------------------------|------------------------|------------------------|--- - trn1 v3.4S, v10.4S, v21.4S // *.............................................................................. - trn2 v12.4S, v31.4S, v12.4S // .*............................................................................. - trn2 v13.4S, v10.4S, v21.4S // ..*............................................................................ - trn1 v28.2D, v3.2D, v30.2D // ...*........................................................................... - trn2 v1.2D, v3.2D, v30.2D // ....*.......................................................................... 
- trn2 v31.2D, v13.2D, v12.2D // .....*......................................................................... - trn1 v12.2D, v13.2D, v12.2D // ......*........................................................................ - sub v10.8H, v1.8H, v31.8H // .......*....................................................................... - sub v13.8H, v28.8H, v12.8H // ........*...................................................................... - add v30.8H, v28.8H, v12.8H // .........*..................................................................... - mul v21.8H, v10.8H, v25.8H // ..........*.................................................................... - mul v3.8H, v13.8H, v15.8H // ...........*................................................................... - sqrdmulh v13.8H, v13.8H, v6.8H // ............*.................................................................. - sqrdmulh v10.8H, v10.8H, v20.8H // .............*................................................................. - add v31.8H, v1.8H, v31.8H // ..............*................................................................ - mls v3.8H, v13.8H, v7.H[0] // ................*.............................................................. - mls v21.8H, v10.8H, v7.H[0] // .................*............................................................. - sub v10.8H, v30.8H, v31.8H // ..................*............................................................ - ldr q15, [x3], #16 // ...................*........................................................... - sub v13.8H, v3.8H, v21.8H // .....................*......................................................... - sqrdmulh v6.8H, v10.8H, v9.8H // ......................*........................................................ - mul v12.8H, v10.8H, v5.8H // .......................*....................................................... 
- mul v10.8H, v13.8H, v5.8H // ........................*...................................................... - sqrdmulh v13.8H, v13.8H, v9.8H // .........................*..................................................... - add v3.8H, v3.8H, v21.8H // ..........................*.................................................... - add v31.8H, v30.8H, v31.8H // ...........................*................................................... - mls v12.8H, v6.8H, v7.H[0] // ............................*.................................................. - mls v10.8H, v13.8H, v7.H[0] // .............................*................................................. - trn1 v30.4S, v31.4S, v3.4S // ..............................*................................................ - trn2 v21.4S, v31.4S, v3.4S // ...............................*............................................... - trn2 v13.4S, v12.4S, v10.4S // .................................*............................................. - trn1 v10.4S, v12.4S, v10.4S // ..................................*............................................ - trn1 v6.2D, v21.2D, v13.2D // ....................................*.......................................... - trn2 v12.2D, v21.2D, v13.2D // .....................................*......................................... - trn1 v31.2D, v30.2D, v10.2D // ......................................*........................................ - trn2 v30.2D, v30.2D, v10.2D // .......................................*....................................... - sub v13.8H, v31.8H, v6.8H // ........................................*...................................... - add v31.8H, v31.8H, v6.8H // .........................................*..................................... - sub v21.8H, v30.8H, v12.8H // ..........................................*.................................... 
- sqrdmulh v26.8H, v13.8H, v15.H[3] // ...........................................*................................... - mul v2.8H, v13.8H, v15.H[2] // ............................................*.................................. - mul v9.8H, v21.8H, v15.H[4] // .............................................*................................. - sqrdmulh v10.8H, v21.8H, v15.H[5] // ..............................................*................................ - sqdmulh v3.8H, v31.8H, v7.H[1] // ...............................................*............................... - mls v2.8H, v26.8H, v7.H[0] // ................................................*.............................. - add v12.8H, v30.8H, v12.8H // .................................................*............................. - mls v9.8H, v10.8H, v7.H[0] // ..................................................*............................ - srshr v3.8H, v3.8H, #11 // ...................................................*........................... - sqdmulh v10.8H, v12.8H, v7.H[1] // ....................................................*.......................... - sqdmulh v21.8H, v2.8H, v7.H[1] // .....................................................*......................... - sqdmulh v13.8H, v9.8H, v7.H[1] // ......................................................*........................ - mls v31.8H, v3.8H, v7.H[0] // .......................................................*....................... - srshr v10.8H, v10.8H, #11 // ........................................................*...................... - srshr v16.8H, v21.8H, #11 // .........................................................*..................... - srshr v13.8H, v13.8H, #11 // ..........................................................*.................... - mls v12.8H, v10.8H, v7.H[0] // ...........................................................*................... 
- mls v2.8H, v16.8H, v7.H[0] // ............................................................*.................. - mls v9.8H, v13.8H, v7.H[0] // .............................................................*................. - sub v10.8H, v31.8H, v12.8H // ...............................................................*............... - add v6.8H, v31.8H, v12.8H // ................................................................*.............. - sub v13.8H, v2.8H, v9.8H // .................................................................*............. - mul v3.8H, v10.8H, v15.H[0] // ..................................................................*............ - sqrdmulh v12.8H, v10.8H, v15.H[1] // ...................................................................*........... - mul v10.8H, v13.8H, v15.H[0] // ....................................................................*.......... - sqrdmulh v21.8H, v13.8H, v15.H[1] // .....................................................................*......... - add v13.8H, v2.8H, v9.8H // ......................................................................*........ - mls v3.8H, v12.8H, v7.H[0] // .......................................................................*....... - str q6, [x1], #(64) // ........................................................................*...... - mls v10.8H, v21.8H, v7.H[0] // .........................................................................*..... - str q13, [x1, #-48] // ..........................................................................*.... - str q3, [x1, #-32] // ............................................................................*.. 
- str q10, [x1, #-16] // ..............................................................................* + cbnz count, layer3456_start + // Instructions: 72 + // Expected cycles: 79 + // Expected IPC: 0.91 + // + // Cycle bound: 79.0 + // IPC bound: 0.91 + // + // Wall time: 9.28s + // User time: 9.28s + // + // ------------------------------ cycle (expected) ------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|--- + trn1 v11.4S, v26.4S, v8.4S // *.............................................................................. + trn2 v24.4S, v24.4S, v16.4S // .*............................................................................. + trn2 v26.4S, v26.4S, v8.4S // ..*............................................................................ + trn1 v18.2D, v11.2D, v0.2D // ...*........................................................................... + trn2 v11.2D, v11.2D, v0.2D // ....*.......................................................................... + trn2 v12.2D, v26.2D, v24.2D // .....*......................................................................... + trn1 v8.2D, v26.2D, v24.2D // ......*........................................................................ + sub v26.8H, v11.8H, v12.8H // .......*....................................................................... + sub v13.8H, v18.8H, v8.8H // ........*...................................................................... + add v24.8H, v18.8H, v8.8H // .........*..................................................................... + mul v16.8H, v26.8H, v4.8H // ..........*.................................................................... + sqrdmulh v17.8H, v13.8H, v15.8H // ...........*................................................................... + mul v3.8H, v13.8H, v3.8H // ............*.................................................................. 
+ sqrdmulh v26.8H, v26.8H, v28.8H // .............*................................................................. + add v10.8H, v11.8H, v12.8H // ..............*................................................................ + mls v3.8H, v17.8H, v7.H[0] // ................*.............................................................. + mls v16.8H, v26.8H, v7.H[0] // .................*............................................................. + sub v26.8H, v24.8H, v10.8H // ..................*............................................................ + ldr q4, [x1], #16 // ...................*........................................................... + sub v12.8H, v3.8H, v16.8H // .....................*......................................................... + sqrdmulh v15.8H, v26.8H, v6.8H // ......................*........................................................ + mul v11.8H, v26.8H, v9.8H // .......................*....................................................... + mul v8.8H, v12.8H, v9.8H // ........................*...................................................... + sqrdmulh v12.8H, v12.8H, v6.8H // .........................*..................................................... + add v0.8H, v24.8H, v10.8H // ..........................*.................................................... + mls v11.8H, v15.8H, v7.H[0] // ...........................*................................................... + add v6.8H, v3.8H, v16.8H // ............................*.................................................. + mls v8.8H, v12.8H, v7.H[0] // .............................*................................................. + trn2 v26.4S, v0.4S, v6.4S // ...............................*............................................... + trn2 v12.4S, v11.4S, v8.4S // .................................*............................................. 
+ trn1 v3.4S, v11.4S, v8.4S // ..................................*............................................ + trn1 v17.4S, v0.4S, v6.4S // ...................................*........................................... + trn1 v8.2D, v26.2D, v12.2D // ....................................*.......................................... + trn2 v13.2D, v26.2D, v12.2D // .....................................*......................................... + trn1 v11.2D, v17.2D, v3.2D // ......................................*........................................ + trn2 v15.2D, v17.2D, v3.2D // .......................................*....................................... + sub v12.8H, v11.8H, v8.8H // ........................................*...................................... + add v16.8H, v15.8H, v13.8H // .........................................*..................................... + sub v26.8H, v15.8H, v13.8H // ..........................................*.................................... + mul v0.8H, v12.8H, v4.H[2] // ...........................................*................................... + sqrdmulh v9.8H, v12.8H, v4.H[3] // ............................................*.................................. + mul v13.8H, v26.8H, v4.H[4] // .............................................*................................. + sqrdmulh v26.8H, v26.8H, v4.H[5] // ..............................................*................................ + add v24.8H, v11.8H, v8.8H // ...............................................*............................... + mls v0.8H, v9.8H, v7.H[0] // ................................................*.............................. + sqdmulh v12.8H, v16.8H, v7.H[1] // .................................................*............................. + mls v13.8H, v26.8H, v7.H[0] // ..................................................*............................ 
+ sqdmulh v11.8H, v24.8H, v7.H[1] // ...................................................*........................... + sqdmulh v8.8H, v0.8H, v7.H[1] // ....................................................*.......................... + srshr v12.8H, v12.8H, #11 // .....................................................*......................... + sqdmulh v26.8H, v13.8H, v7.H[1] // ......................................................*........................ + srshr v11.8H, v11.8H, #11 // .......................................................*....................... + mls v16.8H, v12.8H, v7.H[0] // ........................................................*...................... + srshr v8.8H, v8.8H, #11 // .........................................................*..................... + srshr v26.8H, v26.8H, #11 // ..........................................................*.................... + mls v24.8H, v11.8H, v7.H[0] // ...........................................................*................... + mls v0.8H, v8.8H, v7.H[0] // ............................................................*.................. + mls v13.8H, v26.8H, v7.H[0] // .............................................................*................. + sub v26.8H, v24.8H, v16.8H // ...............................................................*............... + add v15.8H, v24.8H, v16.8H // ................................................................*.............. + sub v12.8H, v0.8H, v13.8H // .................................................................*............. + mul v11.8H, v26.8H, v4.H[0] // ..................................................................*............ + sqrdmulh v16.8H, v26.8H, v4.H[1] // ...................................................................*........... + mul v26.8H, v12.8H, v4.H[0] // ....................................................................*.......... 
+ sqrdmulh v8.8H, v12.8H, v4.H[1] // .....................................................................*......... + add v12.8H, v0.8H, v13.8H // ......................................................................*........ + mls v11.8H, v16.8H, v7.H[0] // .......................................................................*....... + str q15, [x3], #(64) // ........................................................................*...... + mls v26.8H, v8.8H, v7.H[0] // .........................................................................*..... + str q12, [x3, #-48] // ..........................................................................*.... + str q11, [x3, #-32] // ............................................................................*.. + str q26, [x3, #-16] // ..............................................................................* // ------------------------------ cycle (expected) ------------------------------> // 0 25 50 75 // |------------------------|------------------------|------------------------|--- - // trn1 v13.4S, v10.4S, v21.4S // *.............................................................................. - // trn2 v10.4S, v10.4S, v21.4S // ..*............................................................................ - // trn2 v21.4S, v31.4S, v12.4S // .*............................................................................. - // trn2 v3.2D, v13.2D, v30.2D // ....*.......................................................................... - // trn1 v13.2D, v13.2D, v30.2D // ...*........................................................................... - // trn2 v12.2D, v10.2D, v21.2D // .....*......................................................................... - // trn1 v10.2D, v10.2D, v21.2D // ......*........................................................................ - // sub v21.8H, v3.8H, v12.8H // .......*....................................................................... 
- // add v3.8H, v3.8H, v12.8H // ..............*................................................................ - // sub v12.8H, v13.8H, v10.8H // ........*...................................................................... - // add v13.8H, v13.8H, v10.8H // .........*..................................................................... - // sqrdmulh v10.8H, v21.8H, v20.8H // .............*................................................................. - // sqrdmulh v6.8H, v12.8H, v6.8H // ............*.................................................................. - // mul v12.8H, v12.8H, v15.8H // ...........*................................................................... - // mul v21.8H, v21.8H, v25.8H // ..........*.................................................................... - // sub v30.8H, v13.8H, v3.8H // ..................*............................................................ - // add v13.8H, v13.8H, v3.8H // ...........................*................................................... - // mls v12.8H, v6.8H, v7.H[0] // ................*.............................................................. - // mls v21.8H, v10.8H, v7.H[0] // .................*............................................................. - // sqrdmulh v10.8H, v30.8H, v9.8H // ......................*........................................................ - // mul v3.8H, v30.8H, v5.8H // .......................*....................................................... - // ldr q6, [x3], #16 // ...................*........................................................... - // sub v30.8H, v12.8H, v21.8H // .....................*......................................................... - // mls v3.8H, v10.8H, v7.H[0] // ............................*.................................................. - // add v10.8H, v12.8H, v21.8H // ..........................*.................................................... 
- // sqrdmulh v21.8H, v30.8H, v9.8H // .........................*..................................................... - // mul v12.8H, v30.8H, v5.8H // ........................*...................................................... - // trn1 v30.4S, v13.4S, v10.4S // ..............................*................................................ - // trn2 v13.4S, v13.4S, v10.4S // ...............................*............................................... - // mls v12.8H, v21.8H, v7.H[0] // .............................*................................................. - // trn1 v5.4S, v3.4S, v12.4S // ..................................*............................................ - // trn2 v3.4S, v3.4S, v12.4S // .................................*............................................. - // trn2 v9.2D, v30.2D, v5.2D // .......................................*....................................... - // trn2 v15.2D, v13.2D, v3.2D // .....................................*......................................... - // trn1 v30.2D, v30.2D, v5.2D // ......................................*........................................ - // trn1 v13.2D, v13.2D, v3.2D // ....................................*.......................................... - // sub v3.8H, v9.8H, v15.8H // ..........................................*.................................... - // sub v5.8H, v30.8H, v13.8H // ........................................*...................................... - // add v13.8H, v30.8H, v13.8H // .........................................*..................................... - // sqrdmulh v30.8H, v3.8H, v6.H[5] // ..............................................*................................ - // sqrdmulh v25.8H, v5.8H, v6.H[3] // ...........................................*................................... - // mul v5.8H, v5.8H, v6.H[2] // ............................................*.................................. 
- // mul v3.8H, v3.8H, v6.H[4] // .............................................*................................. - // add v9.8H, v9.8H, v15.8H // .................................................*............................. - // sqdmulh v15.8H, v13.8H, v7.H[1] // ...............................................*............................... - // mls v5.8H, v25.8H, v7.H[0] // ................................................*.............................. - // mls v3.8H, v30.8H, v7.H[0] // ..................................................*............................ - // sqdmulh v30.8H, v9.8H, v7.H[1] // ....................................................*.......................... - // srshr v15.8H, v15.8H, #11 // ...................................................*........................... - // sqdmulh v25.8H, v5.8H, v7.H[1] // .....................................................*......................... - // sqdmulh v20.8H, v3.8H, v7.H[1] // ......................................................*........................ - // mls v13.8H, v15.8H, v7.H[0] // .......................................................*....................... - // srshr v30.8H, v30.8H, #11 // ........................................................*...................... - // srshr v15.8H, v25.8H, #11 // .........................................................*..................... - // srshr v25.8H, v20.8H, #11 // ..........................................................*.................... - // mls v9.8H, v30.8H, v7.H[0] // ...........................................................*................... - // mls v5.8H, v15.8H, v7.H[0] // ............................................................*.................. - // mls v3.8H, v25.8H, v7.H[0] // .............................................................*................. - // sub v15.8H, v13.8H, v9.8H // ...............................................................*............... 
- // add v13.8H, v13.8H, v9.8H // ................................................................*.............. - // sub v9.8H, v5.8H, v3.8H // .................................................................*............. - // sqrdmulh v25.8H, v15.8H, v6.H[1] // ...................................................................*........... - // mul v15.8H, v15.8H, v6.H[0] // ..................................................................*............ - // sqrdmulh v20.8H, v9.8H, v6.H[1] // .....................................................................*......... - // mul v6.8H, v9.8H, v6.H[0] // ....................................................................*.......... - // add v3.8H, v5.8H, v3.8H // ......................................................................*........ - // mls v15.8H, v25.8H, v7.H[0] // .......................................................................*....... - // str q13, [x1], #(64) // ........................................................................*...... - // mls v6.8H, v20.8H, v7.H[0] // .........................................................................*..... - // str q3, [x1, #-48] // ..........................................................................*.... - // str q15, [x1, #-32] // ............................................................................*.. - // str q6, [x1, #-16] // ..............................................................................* + // trn1 v12.4S, v26.4S, v8.4S // *.............................................................................. + // trn2 v26.4S, v26.4S, v8.4S // ..*............................................................................ + // trn2 v8.4S, v24.4S, v16.4S // .*............................................................................. + // trn2 v11.2D, v12.2D, v0.2D // ....*.......................................................................... 
+ // trn1 v12.2D, v12.2D, v0.2D // ...*........................................................................... + // trn2 v16.2D, v26.2D, v8.2D // .....*......................................................................... + // trn1 v26.2D, v26.2D, v8.2D // ......*........................................................................ + // sub v8.8H, v11.8H, v16.8H // .......*....................................................................... + // add v11.8H, v11.8H, v16.8H // ..............*................................................................ + // sub v16.8H, v12.8H, v26.8H // ........*...................................................................... + // add v12.8H, v12.8H, v26.8H // .........*..................................................................... + // sqrdmulh v26.8H, v8.8H, v28.8H // .............*................................................................. + // sqrdmulh v15.8H, v16.8H, v15.8H // ...........*................................................................... + // mul v16.8H, v16.8H, v3.8H // ............*.................................................................. + // mul v8.8H, v8.8H, v4.8H // ..........*.................................................................... + // sub v0.8H, v12.8H, v11.8H // ..................*............................................................ + // add v12.8H, v12.8H, v11.8H // ..........................*.................................................... + // mls v16.8H, v15.8H, v7.H[0] // ................*.............................................................. + // mls v8.8H, v26.8H, v7.H[0] // .................*............................................................. + // sqrdmulh v26.8H, v0.8H, v6.8H // ......................*........................................................ + // mul v11.8H, v0.8H, v9.8H // .......................*....................................................... 
+ // ldr q15, [x1], #16 // ...................*........................................................... + // sub v0.8H, v16.8H, v8.8H // .....................*......................................................... + // mls v11.8H, v26.8H, v7.H[0] // ...........................*................................................... + // add v26.8H, v16.8H, v8.8H // ............................*.................................................. + // sqrdmulh v8.8H, v0.8H, v6.8H // .........................*..................................................... + // mul v16.8H, v0.8H, v9.8H // ........................*...................................................... + // trn1 v0.4S, v12.4S, v26.4S // ...................................*........................................... + // trn2 v12.4S, v12.4S, v26.4S // ...............................*............................................... + // mls v16.8H, v8.8H, v7.H[0] // .............................*................................................. + // trn1 v9.4S, v11.4S, v16.4S // ..................................*............................................ + // trn2 v11.4S, v11.4S, v16.4S // .................................*............................................. + // trn2 v6.2D, v0.2D, v9.2D // .......................................*....................................... + // trn2 v3.2D, v12.2D, v11.2D // .....................................*......................................... + // trn1 v0.2D, v0.2D, v9.2D // ......................................*........................................ + // trn1 v12.2D, v12.2D, v11.2D // ....................................*.......................................... + // sub v11.8H, v6.8H, v3.8H // ..........................................*.................................... + // sub v9.8H, v0.8H, v12.8H // ........................................*...................................... 
+ // add v12.8H, v0.8H, v12.8H // ...............................................*............................... + // sqrdmulh v0.8H, v11.8H, v15.H[5] // ..............................................*................................ + // sqrdmulh v4.8H, v9.8H, v15.H[3] // ............................................*.................................. + // mul v9.8H, v9.8H, v15.H[2] // ...........................................*................................... + // mul v11.8H, v11.8H, v15.H[4] // .............................................*................................. + // add v6.8H, v6.8H, v3.8H // .........................................*..................................... + // sqdmulh v3.8H, v12.8H, v7.H[1] // ...................................................*........................... + // mls v9.8H, v4.8H, v7.H[0] // ................................................*.............................. + // mls v11.8H, v0.8H, v7.H[0] // ..................................................*............................ + // sqdmulh v0.8H, v6.8H, v7.H[1] // .................................................*............................. + // srshr v3.8H, v3.8H, #11 // .......................................................*....................... + // sqdmulh v4.8H, v9.8H, v7.H[1] // ....................................................*.......................... + // sqdmulh v28.8H, v11.8H, v7.H[1] // ......................................................*........................ + // mls v12.8H, v3.8H, v7.H[0] // ...........................................................*................... + // srshr v0.8H, v0.8H, #11 // .....................................................*......................... + // srshr v3.8H, v4.8H, #11 // .........................................................*..................... + // srshr v4.8H, v28.8H, #11 // ..........................................................*.................... 
+ // mls v6.8H, v0.8H, v7.H[0] // ........................................................*...................... + // mls v9.8H, v3.8H, v7.H[0] // ............................................................*.................. + // mls v11.8H, v4.8H, v7.H[0] // .............................................................*................. + // sub v3.8H, v12.8H, v6.8H // ...............................................................*............... + // add v12.8H, v12.8H, v6.8H // ................................................................*.............. + // sub v6.8H, v9.8H, v11.8H // .................................................................*............. + // sqrdmulh v4.8H, v3.8H, v15.H[1] // ...................................................................*........... + // mul v3.8H, v3.8H, v15.H[0] // ..................................................................*............ + // sqrdmulh v28.8H, v6.8H, v15.H[1] // .....................................................................*......... + // mul v15.8H, v6.8H, v15.H[0] // ....................................................................*.......... + // add v11.8H, v9.8H, v11.8H // ......................................................................*........ + // mls v3.8H, v4.8H, v7.H[0] // .......................................................................*....... + // str q12, [x3], #(64) // ........................................................................*...... + // mls v15.8H, v28.8H, v7.H[0] // .........................................................................*..... + // str q11, [x3, #-48] // ..........................................................................*.... + // str q3, [x3, #-32] // ............................................................................*.. 
+ // str q15, [x3, #-16] // ..............................................................................* // --------------------------------------------------------------------- mov count, #4 - ASM_LOAD(r_ptr0, roots_l012) - load_roots_123 + load_roots_012 .p2align 2 @@ -796,37 +674,37 @@ layer4567_start: // ----- cycle (expected) ------> // 0 25 // |------------------------|---- - ldr q31, [x0, #256] // *............................. - ldr q5, [x0, #320] // ..*........................... - ldr q30, [x0, #128] // ....*......................... - add v25.8H, v31.8H, v5.8H // ......*....................... - ldr q9, [x0, #384] // .......*...................... - ldr q15, [x0, #448] // .........*.................... - ldr q12, [x0, #192] // ...........*.................. - add v20.8H, v9.8H, v15.8H // .............*................ - ldr q3, [x0, #0] // ..............*............... - add v27.8H, v30.8H, v12.8H // ................*............. - add v24.8H, v25.8H, v20.8H // .................*............ - ldr q6, [x0, #64] // ..................*........... + ldr q24, [x0, #128] // *............................. + ldr q16, [x0, #192] // ..*........................... + ldr q9, [x0, #256] // ....*......................... + ldr q6, [x0, #320] // ......*....................... + ldr q3, [x0, #384] // ........*..................... + ldr q4, [x0, #448] // ..........*................... + add v28.8H, v9.8H, v6.8H // ............*................. + add v19.8H, v24.8H, v16.8H // .............*................ + add v13.8H, v3.8H, v4.8H // ..............*............... + ldr q11, [x0, #0] // ...............*.............. + add v23.8H, v28.8H, v13.8H // .................*............ + ldr q15, [x0, #64] // ..................*........... // ------ cycle (expected) ------> // 0 25 // |------------------------|----- - // ldr q3, [x0, #0] // ..............*................ - // ldr q6, [x0, #64] // ..................*............ 
- // ldr q30, [x0, #128] // ....*.......................... - // ldr q12, [x0, #192] // ...........*................... - // ldr q31, [x0, #256] // *.............................. - // ldr q5, [x0, #320] // ..*............................ - // ldr q9, [x0, #384] // .......*....................... - // ldr q15, [x0, #448] // .........*..................... - // add v25.8H, v31.8H, v5.8H // ......*........................ - // add v20.8H, v9.8H, v15.8H // .............*................. - // add v27.8H, v30.8H, v12.8H // ................*.............. - // add v24.8H, v25.8H, v20.8H // .................*............. + // ldr q11, [x0, #0] // ...............*............... + // ldr q15, [x0, #64] // ..................*............ + // ldr q24, [x0, #128] // *.............................. + // ldr q16, [x0, #192] // ..*............................ + // ldr q9, [x0, #256] // ....*.......................... + // ldr q6, [x0, #320] // ......*........................ + // ldr q3, [x0, #384] // ........*...................... + // ldr q4, [x0, #448] // ..........*.................... + // add v28.8H, v9.8H, v6.8H // ............*.................. + // add v13.8H, v3.8H, v4.8H // ..............*................ + // add v19.8H, v24.8H, v16.8H // .............*................. + // add v23.8H, v28.8H, v13.8H // .................*............. sub count, count, #1 -layer123_start: +layer012_start: // Instructions: 76 // Expected cycles: 84 // Expected IPC: 0.90 @@ -834,88 +712,88 @@ layer123_start: // Cycle bound: 84.0 // IPC bound: 0.90 // - // Wall time: 2.64s - // User time: 2.64s + // Wall time: 2.81s + // User time: 2.81s // // -------------------------------- cycle (expected) ---------------------------------> // 0 25 50 75 // |------------------------|------------------------|------------------------|-------- - sub v13.8H, v3.8H, v6.8H // *................................................................................... 
- add v10.8H, v3.8H, v6.8H // .*.................................................................................. - sub v21.8H, v30.8H, v12.8H // ..*................................................................................. - sqrdmulh v3.8H, v13.8H, v0.H[7] // ...*................................................................................ - mul v13.8H, v13.8H, v0.H[6] // ....*............................................................................... - sub v12.8H, v10.8H, v27.8H // .....*.............................................................................. - add v10.8H, v10.8H, v27.8H // ......*............................................................................. - sqrdmulh v6.8H, v21.8H, v1.H[1] // .......*............................................................................ - mul v21.8H, v21.8H, v1.H[0] // ........*........................................................................... - mls v13.8H, v3.8H, v7.H[0] // .........*.......................................................................... - sub v3.8H, v31.8H, v5.8H // ..........*......................................................................... - sqrdmulh v30.8H, v12.8H, v0.H[3] // ...........*........................................................................ - mul v12.8H, v12.8H, v0.H[2] // ............*....................................................................... - sub v31.8H, v10.8H, v24.8H // .............*...................................................................... - add v10.8H, v10.8H, v24.8H // ..............*..................................................................... - mls v21.8H, v6.8H, v7.H[0] // ...............*.................................................................... - sqrdmulh v6.8H, v3.8H, v1.H[3] // ................*................................................................... 
- mul v3.8H, v3.8H, v1.H[2] // .................*.................................................................. - sub v5.8H, v9.8H, v15.8H // ..................*................................................................. - sub v9.8H, v13.8H, v21.8H // ...................*................................................................ - add v13.8H, v13.8H, v21.8H // ....................*............................................................... - mls v3.8H, v6.8H, v7.H[0] // .....................*.............................................................. - sqrdmulh v21.8H, v5.8H, v1.H[5] // ......................*............................................................. - mls v12.8H, v30.8H, v7.H[0] // .......................*............................................................ - mul v6.8H, v5.8H, v1.H[4] // ........................*........................................................... - sqrdmulh v30.8H, v9.8H, v0.H[3] // .........................*.......................................................... - mul v5.8H, v9.8H, v0.H[2] // ..........................*......................................................... - sqrdmulh v9.8H, v31.8H, v0.H[1] // ...........................*........................................................ - mul v31.8H, v31.8H, v0.H[0] // ............................*....................................................... - str q10, [x0], #(16) // .............................*...................................................... - mls v6.8H, v21.8H, v7.H[0] // ..............................*..................................................... - mls v5.8H, v30.8H, v7.H[0] // ...............................*.................................................... - sub v10.8H, v25.8H, v20.8H // ................................*................................................... - mls v31.8H, v9.8H, v7.H[0] // .................................*.................................................. 
- sub v21.8H, v3.8H, v6.8H // ..................................*................................................. - sqrdmulh v30.8H, v10.8H, v0.H[5] // ...................................*................................................ - mul v10.8H, v10.8H, v0.H[4] // ....................................*............................................... - add v3.8H, v3.8H, v6.8H // .....................................*.............................................. - sqrdmulh v6.8H, v21.8H, v0.H[5] // ......................................*............................................. - mul v21.8H, v21.8H, v0.H[4] // .......................................*............................................ - mls v10.8H, v30.8H, v7.H[0] // ........................................*........................................... - sub v30.8H, v13.8H, v3.8H // .........................................*.......................................... - add v13.8H, v13.8H, v3.8H // ..........................................*......................................... - mls v21.8H, v6.8H, v7.H[0] // ...........................................*........................................ - sqrdmulh v3.8H, v30.8H, v0.H[1] // ............................................*....................................... - mul v6.8H, v30.8H, v0.H[0] // .............................................*...................................... - sub v30.8H, v12.8H, v10.8H // ..............................................*..................................... - add v10.8H, v12.8H, v10.8H // ...............................................*.................................... - sub v12.8H, v5.8H, v21.8H // ................................................*................................... - mls v6.8H, v3.8H, v7.H[0] // .................................................*.................................. 
- sqrdmulh v3.8H, v30.8H, v0.H[1] // ..................................................*................................. - mul v30.8H, v30.8H, v0.H[0] // ...................................................*................................ - add v21.8H, v5.8H, v21.8H // ....................................................*............................... - sqrdmulh v5.8H, v12.8H, v0.H[1] // .....................................................*.............................. - mul v12.8H, v12.8H, v0.H[0] // ......................................................*............................. - mls v30.8H, v3.8H, v7.H[0] // .......................................................*............................ - str q31, [x0, #240] // ........................................................*........................... - ldr q3, [x0, #0] // .........................................................e.......................... - mls v12.8H, v5.8H, v7.H[0] // ...........................................................*........................ - str q6, [x0, #304] // ............................................................*....................... - ldr q6, [x0, #64] // .............................................................e...................... - str q30, [x0, #368] // ...............................................................*.................... - ldr q30, [x0, #128] // ................................................................e................... - str q12, [x0, #432] // ..................................................................*................. - ldr q12, [x0, #192] // ...................................................................e................ - str q13, [x0, #48] // .....................................................................*.............. - ldr q31, [x0, #256] // ......................................................................e............. 
- ldr q5, [x0, #320] // ........................................................................e........... - ldr q9, [x0, #384] // ..........................................................................e......... - ldr q15, [x0, #448] // ............................................................................e....... - str q10, [x0, #112] // ..............................................................................*..... - add v25.8H, v31.8H, v5.8H // ...............................................................................e.... - add v20.8H, v9.8H, v15.8H // ................................................................................e... - str q21, [x0, #176] // .................................................................................*.. - add v27.8H, v30.8H, v12.8H // ..................................................................................e. - add v24.8H, v25.8H, v20.8H // ...................................................................................e + sub v12.8H, v11.8H, v15.8H // *................................................................................... + add v26.8H, v11.8H, v15.8H // .*.................................................................................. + sub v8.8H, v24.8H, v16.8H // ..*................................................................................. + sqrdmulh v11.8H, v12.8H, v0.H[7] // ...*................................................................................ + mul v12.8H, v12.8H, v0.H[6] // ....*............................................................................... + sub v16.8H, v26.8H, v19.8H // .....*.............................................................................. + add v26.8H, v26.8H, v19.8H // ......*............................................................................. + sqrdmulh v15.8H, v8.8H, v1.H[1] // .......*............................................................................ 
+ mul v8.8H, v8.8H, v1.H[0] // ........*........................................................................... + mls v12.8H, v11.8H, v7.H[0] // .........*.......................................................................... + sub v11.8H, v9.8H, v6.8H // ..........*......................................................................... + sqrdmulh v24.8H, v16.8H, v0.H[3] // ...........*........................................................................ + mul v16.8H, v16.8H, v0.H[2] // ............*....................................................................... + sub v9.8H, v26.8H, v23.8H // .............*...................................................................... + add v26.8H, v26.8H, v23.8H // ..............*..................................................................... + mls v8.8H, v15.8H, v7.H[0] // ...............*.................................................................... + sqrdmulh v15.8H, v11.8H, v1.H[3] // ................*................................................................... + mul v11.8H, v11.8H, v1.H[2] // .................*.................................................................. + sub v6.8H, v3.8H, v4.8H // ..................*................................................................. + sub v3.8H, v12.8H, v8.8H // ...................*................................................................ + add v12.8H, v12.8H, v8.8H // ....................*............................................................... + mls v11.8H, v15.8H, v7.H[0] // .....................*.............................................................. + sqrdmulh v8.8H, v6.8H, v1.H[5] // ......................*............................................................. + mls v16.8H, v24.8H, v7.H[0] // .......................*............................................................ + mul v15.8H, v6.8H, v1.H[4] // ........................*........................................................... 
+ sqrdmulh v24.8H, v3.8H, v0.H[3] // .........................*.......................................................... + mul v6.8H, v3.8H, v0.H[2] // ..........................*......................................................... + sqrdmulh v3.8H, v9.8H, v0.H[1] // ...........................*........................................................ + mul v9.8H, v9.8H, v0.H[0] // ............................*....................................................... + str q26, [x0], #(16) // .............................*...................................................... + mls v15.8H, v8.8H, v7.H[0] // ..............................*..................................................... + mls v6.8H, v24.8H, v7.H[0] // ...............................*.................................................... + sub v26.8H, v28.8H, v13.8H // ................................*................................................... + mls v9.8H, v3.8H, v7.H[0] // .................................*.................................................. + sub v8.8H, v11.8H, v15.8H // ..................................*................................................. + sqrdmulh v24.8H, v26.8H, v0.H[5] // ...................................*................................................ + mul v26.8H, v26.8H, v0.H[4] // ....................................*............................................... + add v11.8H, v11.8H, v15.8H // .....................................*.............................................. + sqrdmulh v15.8H, v8.8H, v0.H[5] // ......................................*............................................. + mul v8.8H, v8.8H, v0.H[4] // .......................................*............................................ + mls v26.8H, v24.8H, v7.H[0] // ........................................*........................................... + sub v24.8H, v12.8H, v11.8H // .........................................*.......................................... 
+ add v12.8H, v12.8H, v11.8H // ..........................................*......................................... + mls v8.8H, v15.8H, v7.H[0] // ...........................................*........................................ + sqrdmulh v11.8H, v24.8H, v0.H[1] // ............................................*....................................... + mul v15.8H, v24.8H, v0.H[0] // .............................................*...................................... + sub v24.8H, v16.8H, v26.8H // ..............................................*..................................... + add v26.8H, v16.8H, v26.8H // ...............................................*.................................... + sub v16.8H, v6.8H, v8.8H // ................................................*................................... + mls v15.8H, v11.8H, v7.H[0] // .................................................*.................................. + sqrdmulh v11.8H, v24.8H, v0.H[1] // ..................................................*................................. + mul v24.8H, v24.8H, v0.H[0] // ...................................................*................................ + add v8.8H, v6.8H, v8.8H // ....................................................*............................... + sqrdmulh v6.8H, v16.8H, v0.H[1] // .....................................................*.............................. + mul v16.8H, v16.8H, v0.H[0] // ......................................................*............................. + mls v24.8H, v11.8H, v7.H[0] // .......................................................*............................ + str q9, [x0, #240] // ........................................................*........................... + ldr q11, [x0, #0] // .........................................................e.......................... + mls v16.8H, v6.8H, v7.H[0] // ...........................................................*........................ 
+ str q15, [x0, #304] // ............................................................*....................... + ldr q15, [x0, #64] // .............................................................e...................... + str q24, [x0, #368] // ...............................................................*.................... + ldr q24, [x0, #128] // ................................................................e................... + str q16, [x0, #432] // ..................................................................*................. + ldr q16, [x0, #192] // ...................................................................e................ + str q12, [x0, #48] // .....................................................................*.............. + ldr q9, [x0, #256] // ......................................................................e............. + ldr q6, [x0, #320] // ........................................................................e........... + ldr q3, [x0, #384] // ..........................................................................e......... + ldr q4, [x0, #448] // ............................................................................e....... + str q26, [x0, #112] // ..............................................................................*..... + add v28.8H, v9.8H, v6.8H // ...............................................................................e.... + add v13.8H, v3.8H, v4.8H // ................................................................................e... + str q8, [x0, #176] // .................................................................................*.. + add v19.8H, v24.8H, v16.8H // ..................................................................................e. 
+ add v23.8H, v28.8H, v13.8H // ...................................................................................e // --------------------------------------------- cycle (expected) ---------------------------------------------> // 0 25 50 75 100 @@ -998,7 +876,7 @@ layer123_start: // str q11, [x0, #(-16 + 3*(512/8))] // ........................~..'................................................................................* sub count, count, #1 - cbnz count, layer123_start + cbnz count, layer012_start // Instructions: 64 // Expected cycles: 66 // Expected IPC: 0.97 @@ -1006,144 +884,144 @@ layer123_start: // Cycle bound: 66.0 // IPC bound: 0.97 // - // Wall time: 8.20s - // User time: 8.20s + // Wall time: 8.33s + // User time: 8.33s // // ----------------------- cycle (expected) ------------------------> // 0 25 50 // |------------------------|------------------------|--------------- - add v14.8H, v3.8H, v6.8H // *................................................................. - sub v13.8H, v25.8H, v20.8H // .*................................................................ - sub v3.8H, v3.8H, v6.8H // ..*............................................................... - sub v29.8H, v14.8H, v27.8H // ...*.............................................................. - mul v28.8H, v13.8H, v0.H[4] // ....*............................................................. - sqrdmulh v10.8H, v13.8H, v0.H[5] // .....*............................................................ - sqrdmulh v13.8H, v29.8H, v0.H[3] // ......*........................................................... - mul v20.8H, v29.8H, v0.H[2] // .......*.......................................................... - sub v19.8H, v30.8H, v12.8H // ........*......................................................... - sqrdmulh v29.8H, v3.8H, v0.H[7] // .........*........................................................ 
- mls v28.8H, v10.8H, v7.H[0] // ..........*....................................................... - mls v20.8H, v13.8H, v7.H[0] // ...........*...................................................... - sqrdmulh v22.8H, v19.8H, v1.H[1] // ............*..................................................... - mul v23.8H, v19.8H, v1.H[0] // .............*.................................................... - mul v6.8H, v3.8H, v0.H[6] // ..............*................................................... - sub v13.8H, v20.8H, v28.8H // ...............*.................................................. - sub v15.8H, v9.8H, v15.8H // ................*................................................. - mls v23.8H, v22.8H, v7.H[0] // .................*................................................ - sqrdmulh v10.8H, v13.8H, v0.H[1] // ..................*............................................... - mul v13.8H, v13.8H, v0.H[0] // ...................*.............................................. - mls v6.8H, v29.8H, v7.H[0] // ....................*............................................. - sqrdmulh v21.8H, v15.8H, v1.H[5] // .....................*............................................ - mul v15.8H, v15.8H, v1.H[4] // ......................*........................................... - mls v13.8H, v10.8H, v7.H[0] // .......................*.......................................... - sub v16.8H, v6.8H, v23.8H // ........................*......................................... - sub v25.8H, v31.8H, v5.8H // .........................*........................................ - mls v15.8H, v21.8H, v7.H[0] // ..........................*....................................... - mul v30.8H, v16.8H, v0.H[2] // ...........................*...................................... - sqrdmulh v21.8H, v16.8H, v0.H[3] // ............................*..................................... 
- sqrdmulh v5.8H, v25.8H, v1.H[3] // .............................*.................................... - add v27.8H, v14.8H, v27.8H // ..............................*................................... - mul v25.8H, v25.8H, v1.H[2] // ...............................*.................................. - mls v30.8H, v21.8H, v7.H[0] // ................................*................................. - sub v3.8H, v27.8H, v24.8H // .................................*................................ - str q13, [x0, #384] // ..................................*............................... - mls v25.8H, v5.8H, v7.H[0] // ...................................*.............................. - sqrdmulh v12.8H, v3.8H, v0.H[1] // ....................................*............................. - mul v31.8H, v3.8H, v0.H[0] // .....................................*............................ - add v5.8H, v6.8H, v23.8H // ......................................*........................... - add v10.8H, v25.8H, v15.8H // .......................................*.......................... - sub v6.8H, v25.8H, v15.8H // ........................................*......................... - mls v31.8H, v12.8H, v7.H[0] // .........................................*........................ - add v9.8H, v5.8H, v10.8H // ..........................................*....................... - mul v21.8H, v6.8H, v0.H[4] // ...........................................*...................... - sqrdmulh v6.8H, v6.8H, v0.H[5] // ............................................*..................... - str q31, [x0, #256] // .............................................*.................... - sub v22.8H, v5.8H, v10.8H // ..............................................*................... - str q9, [x0, #64] // ...............................................*.................. - mls v21.8H, v6.8H, v7.H[0] // ................................................*................. 
- sqrdmulh v6.8H, v22.8H, v0.H[1] // .................................................*................ - mul v3.8H, v22.8H, v0.H[0] // ..................................................*............... - add v12.8H, v20.8H, v28.8H // ...................................................*.............. - sub v13.8H, v30.8H, v21.8H // ....................................................*............. - add v21.8H, v30.8H, v21.8H // .....................................................*............ - mls v3.8H, v6.8H, v7.H[0] // ......................................................*........... - sqrdmulh v10.8H, v13.8H, v0.H[1] // .......................................................*.......... - mul v13.8H, v13.8H, v0.H[0] // ........................................................*......... - str q21, [x0, #192] // .........................................................*........ - add v6.8H, v27.8H, v24.8H // ..........................................................*....... - str q3, [x0, #320] // ...........................................................*...... - mls v13.8H, v10.8H, v7.H[0] // ............................................................*..... - str q6, [x0], #(16) // .............................................................*.... - str q12, [x0, #112] // ...............................................................*.. - str q13, [x0, #432] // .................................................................* + add v10.8H, v11.8H, v15.8H // *................................................................. + sub v12.8H, v28.8H, v13.8H // .*................................................................ + sub v11.8H, v11.8H, v15.8H // ..*............................................................... + sub v22.8H, v10.8H, v19.8H // ...*.............................................................. + mul v18.8H, v12.8H, v0.H[4] // ....*............................................................. 
+ sqrdmulh v26.8H, v12.8H, v0.H[5] // .....*............................................................ + sqrdmulh v12.8H, v22.8H, v0.H[3] // ......*........................................................... + mul v13.8H, v22.8H, v0.H[2] // .......*.......................................................... + sub v31.8H, v24.8H, v16.8H // ........*......................................................... + sqrdmulh v22.8H, v11.8H, v0.H[7] // .........*........................................................ + mls v18.8H, v26.8H, v7.H[0] // ..........*....................................................... + mls v13.8H, v12.8H, v7.H[0] // ...........*...................................................... + sqrdmulh v2.8H, v31.8H, v1.H[1] // ............*..................................................... + mul v5.8H, v31.8H, v1.H[0] // .............*.................................................... + mul v15.8H, v11.8H, v0.H[6] // ..............*................................................... + sub v12.8H, v13.8H, v18.8H // ...............*.................................................. + sub v4.8H, v3.8H, v4.8H // ................*................................................. + mls v5.8H, v2.8H, v7.H[0] // .................*................................................ + sqrdmulh v26.8H, v12.8H, v0.H[1] // ..................*............................................... + mul v12.8H, v12.8H, v0.H[0] // ...................*.............................................. + mls v15.8H, v22.8H, v7.H[0] // ....................*............................................. + sqrdmulh v8.8H, v4.8H, v1.H[5] // .....................*............................................ + mul v4.8H, v4.8H, v1.H[4] // ......................*........................................... + mls v12.8H, v26.8H, v7.H[0] // .......................*.......................................... 
+ sub v21.8H, v15.8H, v5.8H // ........................*......................................... + sub v28.8H, v9.8H, v6.8H // .........................*........................................ + mls v4.8H, v8.8H, v7.H[0] // ..........................*....................................... + mul v24.8H, v21.8H, v0.H[2] // ...........................*...................................... + sqrdmulh v8.8H, v21.8H, v0.H[3] // ............................*..................................... + sqrdmulh v6.8H, v28.8H, v1.H[3] // .............................*.................................... + add v19.8H, v10.8H, v19.8H // ..............................*................................... + mul v28.8H, v28.8H, v1.H[2] // ...............................*.................................. + mls v24.8H, v8.8H, v7.H[0] // ................................*................................. + sub v11.8H, v19.8H, v23.8H // .................................*................................ + str q12, [x0, #384] // ..................................*............................... + mls v28.8H, v6.8H, v7.H[0] // ...................................*.............................. + sqrdmulh v16.8H, v11.8H, v0.H[1] // ....................................*............................. + mul v9.8H, v11.8H, v0.H[0] // .....................................*............................ + add v6.8H, v15.8H, v5.8H // ......................................*........................... + add v26.8H, v28.8H, v4.8H // .......................................*.......................... + sub v15.8H, v28.8H, v4.8H // ........................................*......................... + mls v9.8H, v16.8H, v7.H[0] // .........................................*........................ + add v3.8H, v6.8H, v26.8H // ..........................................*....................... + mul v8.8H, v15.8H, v0.H[4] // ...........................................*...................... 
+ sqrdmulh v15.8H, v15.8H, v0.H[5] // ............................................*..................... + str q9, [x0, #256] // .............................................*.................... + sub v2.8H, v6.8H, v26.8H // ..............................................*................... + str q3, [x0, #64] // ...............................................*.................. + mls v8.8H, v15.8H, v7.H[0] // ................................................*................. + sqrdmulh v15.8H, v2.8H, v0.H[1] // .................................................*................ + mul v11.8H, v2.8H, v0.H[0] // ..................................................*............... + add v16.8H, v13.8H, v18.8H // ...................................................*.............. + sub v12.8H, v24.8H, v8.8H // ....................................................*............. + add v8.8H, v24.8H, v8.8H // .....................................................*............ + mls v11.8H, v15.8H, v7.H[0] // ......................................................*........... + sqrdmulh v26.8H, v12.8H, v0.H[1] // .......................................................*.......... + mul v12.8H, v12.8H, v0.H[0] // ........................................................*......... + str q8, [x0, #192] // .........................................................*........ + add v15.8H, v19.8H, v23.8H // ..........................................................*....... + str q11, [x0, #320] // ...........................................................*...... + mls v12.8H, v26.8H, v7.H[0] // ............................................................*..... + str q15, [x0], #(16) // .............................................................*.... + str q16, [x0, #112] // ...............................................................*.. 
+ str q12, [x0, #432] // .................................................................* // ----------------------- cycle (expected) ------------------------> // 0 25 50 // |------------------------|------------------------|--------------- - // sub v13.8H, v3.8H, v6.8H // ..*............................................................... - // add v10.8H, v3.8H, v6.8H // *................................................................. - // sub v21.8H, v30.8H, v12.8H // ........*......................................................... - // sqrdmulh v3.8H, v13.8H, v0.H[7] // .........*........................................................ - // mul v13.8H, v13.8H, v0.H[6] // ..............*................................................... - // sub v12.8H, v10.8H, v27.8H // ...*.............................................................. - // add v10.8H, v10.8H, v27.8H // ..............................*................................... - // sqrdmulh v6.8H, v21.8H, v1.H[1] // ............*..................................................... - // mul v21.8H, v21.8H, v1.H[0] // .............*.................................................... - // mls v13.8H, v3.8H, v7.H[0] // ....................*............................................. - // sub v3.8H, v31.8H, v5.8H // .........................*........................................ - // sqrdmulh v30.8H, v12.8H, v0.H[3] // ......*........................................................... - // mul v12.8H, v12.8H, v0.H[2] // .......*.......................................................... - // sub v31.8H, v10.8H, v24.8H // .................................*................................ - // add v10.8H, v10.8H, v24.8H // ..........................................................*....... - // mls v21.8H, v6.8H, v7.H[0] // .................*................................................ - // sqrdmulh v6.8H, v3.8H, v1.H[3] // .............................*.................................... 
- // mul v3.8H, v3.8H, v1.H[2] // ...............................*.................................. - // sub v5.8H, v9.8H, v15.8H // ................*................................................. - // sub v9.8H, v13.8H, v21.8H // ........................*......................................... - // add v13.8H, v13.8H, v21.8H // ......................................*........................... - // mls v3.8H, v6.8H, v7.H[0] // ...................................*.............................. - // sqrdmulh v21.8H, v5.8H, v1.H[5] // .....................*............................................ - // mls v12.8H, v30.8H, v7.H[0] // ...........*...................................................... - // mul v6.8H, v5.8H, v1.H[4] // ......................*........................................... - // sqrdmulh v30.8H, v9.8H, v0.H[3] // ............................*..................................... - // mul v5.8H, v9.8H, v0.H[2] // ...........................*...................................... - // sqrdmulh v9.8H, v31.8H, v0.H[1] // ....................................*............................. - // mul v31.8H, v31.8H, v0.H[0] // .....................................*............................ - // str q10, [x0], #(16) // .............................................................*.... - // mls v6.8H, v21.8H, v7.H[0] // ..........................*....................................... - // mls v5.8H, v30.8H, v7.H[0] // ................................*................................. - // sub v10.8H, v25.8H, v20.8H // .*................................................................ - // mls v31.8H, v9.8H, v7.H[0] // .........................................*........................ - // sub v21.8H, v3.8H, v6.8H // ........................................*......................... - // sqrdmulh v30.8H, v10.8H, v0.H[5] // .....*............................................................ 
- // mul v10.8H, v10.8H, v0.H[4] // ....*............................................................. - // add v3.8H, v3.8H, v6.8H // .......................................*.......................... - // sqrdmulh v6.8H, v21.8H, v0.H[5] // ............................................*..................... - // mul v21.8H, v21.8H, v0.H[4] // ...........................................*...................... - // mls v10.8H, v30.8H, v7.H[0] // ..........*....................................................... - // sub v30.8H, v13.8H, v3.8H // ..............................................*................... - // add v13.8H, v13.8H, v3.8H // ..........................................*....................... - // mls v21.8H, v6.8H, v7.H[0] // ................................................*................. - // sqrdmulh v3.8H, v30.8H, v0.H[1] // .................................................*................ - // mul v6.8H, v30.8H, v0.H[0] // ..................................................*............... - // sub v30.8H, v12.8H, v10.8H // ...............*.................................................. - // add v10.8H, v12.8H, v10.8H // ...................................................*.............. - // sub v12.8H, v5.8H, v21.8H // ....................................................*............. - // mls v6.8H, v3.8H, v7.H[0] // ......................................................*........... - // sqrdmulh v3.8H, v30.8H, v0.H[1] // ..................*............................................... - // mul v30.8H, v30.8H, v0.H[0] // ...................*.............................................. - // add v21.8H, v5.8H, v21.8H // .....................................................*............ - // sqrdmulh v5.8H, v12.8H, v0.H[1] // .......................................................*.......... - // mul v12.8H, v12.8H, v0.H[0] // ........................................................*......... 
- // mls v30.8H, v3.8H, v7.H[0] // .......................*.......................................... - // str q31, [x0, #240] // .............................................*.................... - // mls v12.8H, v5.8H, v7.H[0] // ............................................................*..... - // str q6, [x0, #304] // ...........................................................*...... - // str q30, [x0, #368] // ..................................*............................... - // str q12, [x0, #432] // .................................................................* - // str q13, [x0, #48] // ...............................................*.................. - // str q10, [x0, #112] // ...............................................................*.. - // str q21, [x0, #176] // .........................................................*........ + // sub v12.8H, v11.8H, v15.8H // ..*............................................................... + // add v26.8H, v11.8H, v15.8H // *................................................................. + // sub v8.8H, v24.8H, v16.8H // ........*......................................................... + // sqrdmulh v11.8H, v12.8H, v0.H[7] // .........*........................................................ + // mul v12.8H, v12.8H, v0.H[6] // ..............*................................................... + // sub v16.8H, v26.8H, v19.8H // ...*.............................................................. + // add v26.8H, v26.8H, v19.8H // ..............................*................................... + // sqrdmulh v15.8H, v8.8H, v1.H[1] // ............*..................................................... + // mul v8.8H, v8.8H, v1.H[0] // .............*.................................................... + // mls v12.8H, v11.8H, v7.H[0] // ....................*............................................. + // sub v11.8H, v9.8H, v6.8H // .........................*........................................ 
+ // sqrdmulh v24.8H, v16.8H, v0.H[3] // ......*........................................................... + // mul v16.8H, v16.8H, v0.H[2] // .......*.......................................................... + // sub v9.8H, v26.8H, v23.8H // .................................*................................ + // add v26.8H, v26.8H, v23.8H // ..........................................................*....... + // mls v8.8H, v15.8H, v7.H[0] // .................*................................................ + // sqrdmulh v15.8H, v11.8H, v1.H[3] // .............................*.................................... + // mul v11.8H, v11.8H, v1.H[2] // ...............................*.................................. + // sub v6.8H, v3.8H, v4.8H // ................*................................................. + // sub v3.8H, v12.8H, v8.8H // ........................*......................................... + // add v12.8H, v12.8H, v8.8H // ......................................*........................... + // mls v11.8H, v15.8H, v7.H[0] // ...................................*.............................. + // sqrdmulh v8.8H, v6.8H, v1.H[5] // .....................*............................................ + // mls v16.8H, v24.8H, v7.H[0] // ...........*...................................................... + // mul v15.8H, v6.8H, v1.H[4] // ......................*........................................... + // sqrdmulh v24.8H, v3.8H, v0.H[3] // ............................*..................................... + // mul v6.8H, v3.8H, v0.H[2] // ...........................*...................................... + // sqrdmulh v3.8H, v9.8H, v0.H[1] // ....................................*............................. + // mul v9.8H, v9.8H, v0.H[0] // .....................................*............................ + // str q26, [x0], #(16) // .............................................................*.... 
+ // mls v15.8H, v8.8H, v7.H[0] // ..........................*....................................... + // mls v6.8H, v24.8H, v7.H[0] // ................................*................................. + // sub v26.8H, v28.8H, v13.8H // .*................................................................ + // mls v9.8H, v3.8H, v7.H[0] // .........................................*........................ + // sub v8.8H, v11.8H, v15.8H // ........................................*......................... + // sqrdmulh v24.8H, v26.8H, v0.H[5] // .....*............................................................ + // mul v26.8H, v26.8H, v0.H[4] // ....*............................................................. + // add v11.8H, v11.8H, v15.8H // .......................................*.......................... + // sqrdmulh v15.8H, v8.8H, v0.H[5] // ............................................*..................... + // mul v8.8H, v8.8H, v0.H[4] // ...........................................*...................... + // mls v26.8H, v24.8H, v7.H[0] // ..........*....................................................... + // sub v24.8H, v12.8H, v11.8H // ..............................................*................... + // add v12.8H, v12.8H, v11.8H // ..........................................*....................... + // mls v8.8H, v15.8H, v7.H[0] // ................................................*................. + // sqrdmulh v11.8H, v24.8H, v0.H[1] // .................................................*................ + // mul v15.8H, v24.8H, v0.H[0] // ..................................................*............... + // sub v24.8H, v16.8H, v26.8H // ...............*.................................................. + // add v26.8H, v16.8H, v26.8H // ...................................................*.............. + // sub v16.8H, v6.8H, v8.8H // ....................................................*............. 
+ // mls v15.8H, v11.8H, v7.H[0] // ......................................................*........... + // sqrdmulh v11.8H, v24.8H, v0.H[1] // ..................*............................................... + // mul v24.8H, v24.8H, v0.H[0] // ...................*.............................................. + // add v8.8H, v6.8H, v8.8H // .....................................................*............ + // sqrdmulh v6.8H, v16.8H, v0.H[1] // .......................................................*.......... + // mul v16.8H, v16.8H, v0.H[0] // ........................................................*......... + // mls v24.8H, v11.8H, v7.H[0] // .......................*.......................................... + // str q9, [x0, #240] // .............................................*.................... + // mls v16.8H, v6.8H, v7.H[0] // ............................................................*..... + // str q15, [x0, #304] // ...........................................................*...... + // str q24, [x0, #368] // ..................................*............................... + // str q16, [x0, #432] // .................................................................* + // str q12, [x0, #48] // ...............................................*.................. + // str q26, [x0, #112] // ...............................................................*.. + // str q8, [x0, #176] // .........................................................*........ 
pop_stack diff --git a/mlkem/native/aarch64/ntt_123_45_67_twiddles.S b/mlkem/native/aarch64/ntt_123_45_67_twiddles.S deleted file mode 100644 index c9efa333c..000000000 --- a/mlkem/native/aarch64/ntt_123_45_67_twiddles.S +++ /dev/null @@ -1,498 +0,0 @@ -/// Copyright (c) 2024 The mlkem-native project authors -/// Copyright (c) 2022 Arm Limited -/// Copyright (c) 2022 Hanno Becker -/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE. 
-/// - -#include "config.h" -#if defined(MLKEM_USE_NATIVE_AARCH64) - -.p2align 2 -roots_l012: -.short -1600 -.short -15749 -.short -749 -.short -7373 -.short -40 -.short -394 -.short -687 -.short -6762 -.short 630 -.short 6201 -.short -1432 -.short -14095 -.short 848 -.short 8347 -.short 0 -.short 0 -roots_l34: -.short 1062 -.short 10453 -.short 296 -.short 2914 -.short -882 -.short -8682 -.short 0 -.short 0 -.short -1410 -.short -13879 -.short 1339 -.short 13180 -.short 1476 -.short 14529 -.short 0 -.short 0 -.short 193 -.short 1900 -.short -283 -.short -2786 -.short 56 -.short 551 -.short 0 -.short 0 -.short 797 -.short 7845 -.short -1089 -.short -10719 -.short 1333 -.short 13121 -.short 0 -.short 0 -.short -543 -.short -5345 -.short 1426 -.short 14036 -.short -1235 -.short -12156 -.short 0 -.short 0 -.short -69 -.short -679 -.short 535 -.short 5266 -.short -447 -.short -4400 -.short 0 -.short 0 -.short 569 -.short 5601 -.short -936 -.short -9213 -.short -450 -.short -4429 -.short 0 -.short 0 -.short -1583 -.short -15582 -.short -1355 -.short -13338 -.short 821 -.short 8081 -.short 0 -.short 0 -roots_l56: -.short 289 -.short 289 -.short 331 -.short 331 -.short -76 -.short -76 -.short -1573 -.short -1573 -.short 2845 -.short 2845 -.short 3258 -.short 3258 -.short -748 -.short -748 -.short -15483 -.short -15483 -.short 17 -.short 17 -.short 583 -.short 583 -.short 1637 -.short 1637 -.short -1041 -.short -1041 -.short 167 -.short 167 -.short 5739 -.short 5739 -.short 16113 -.short 16113 -.short -10247 -.short -10247 -.short -568 -.short -568 -.short -680 -.short -680 -.short 723 -.short 723 -.short 1100 -.short 1100 -.short -5591 -.short -5591 -.short -6693 -.short -6693 -.short 7117 -.short 7117 -.short 10828 -.short 10828 -.short 1197 -.short 1197 -.short -1025 -.short -1025 -.short -1052 -.short -1052 -.short -1274 -.short -1274 -.short 11782 -.short 11782 -.short -10089 -.short -10089 -.short -10355 -.short -10355 -.short -12540 -.short -12540 -.short 1409 
-.short 1409 -.short -48 -.short -48 -.short 756 -.short 756 -.short -314 -.short -314 -.short 13869 -.short 13869 -.short -472 -.short -472 -.short 7441 -.short 7441 -.short -3091 -.short -3091 -.short -667 -.short -667 -.short 233 -.short 233 -.short -1173 -.short -1173 -.short -279 -.short -279 -.short -6565 -.short -6565 -.short 2293 -.short 2293 -.short -11546 -.short -11546 -.short -2746 -.short -2746 -.short 650 -.short 650 -.short -1352 -.short -1352 -.short -816 -.short -816 -.short 632 -.short 632 -.short 6398 -.short 6398 -.short -13308 -.short -13308 -.short -8032 -.short -8032 -.short 6221 -.short 6221 -.short -1626 -.short -1626 -.short -540 -.short -540 -.short -1482 -.short -1482 -.short 1461 -.short 1461 -.short -16005 -.short -16005 -.short -5315 -.short -5315 -.short -14588 -.short -14588 -.short 14381 -.short 14381 -.short 1651 -.short 1651 -.short -1540 -.short -1540 -.short 952 -.short 952 -.short -642 -.short -642 -.short 16251 -.short 16251 -.short -15159 -.short -15159 -.short 9371 -.short 9371 -.short -6319 -.short -6319 -.short -464 -.short -464 -.short 33 -.short 33 -.short 1320 -.short 1320 -.short -1414 -.short -1414 -.short -4567 -.short -4567 -.short 325 -.short 325 -.short 12993 -.short 12993 -.short -13918 -.short -13918 -.short 939 -.short 939 -.short -892 -.short -892 -.short 733 -.short 733 -.short 268 -.short 268 -.short 9243 -.short 9243 -.short -8780 -.short -8780 -.short 7215 -.short 7215 -.short 2638 -.short 2638 -.short -1021 -.short -1021 -.short -941 -.short -941 -.short -992 -.short -992 -.short 641 -.short 641 -.short -10050 -.short -10050 -.short -9262 -.short -9262 -.short -9764 -.short -9764 -.short 6309 -.short 6309 -.short -1010 -.short -1010 -.short 1435 -.short 1435 -.short 807 -.short 807 -.short 452 -.short 452 -.short -9942 -.short -9942 -.short 14125 -.short 14125 -.short 7943 -.short 7943 -.short 4449 -.short 4449 -.short 1584 -.short 1584 -.short -1292 -.short -1292 -.short 375 -.short 375 -.short -1239 
-.short -1239 -.short 15592 -.short 15592 -.short -12717 -.short -12717 -.short 3691 -.short 3691 -.short -12196 -.short -12196 -.short -1031 -.short -1031 -.short -109 -.short -109 -.short -780 -.short -780 -.short 1645 -.short 1645 -.short -10148 -.short -10148 -.short -1073 -.short -1073 -.short -7678 -.short -7678 -.short 16192 -.short 16192 -.short 1438 -.short 1438 -.short -461 -.short -461 -.short 1534 -.short 1534 -.short -927 -.short -927 -.short 14155 -.short 14155 -.short -4538 -.short -4538 -.short 15099 -.short 15099 -.short -9125 -.short -9125 -.short 1063 -.short 1063 -.short -556 -.short -556 -.short -1230 -.short -1230 -.short -863 -.short -863 -.short 10463 -.short 10463 -.short -5473 -.short -5473 -.short -12107 -.short -12107 -.short -8495 -.short -8495 -.short 319 -.short 319 -.short 757 -.short 757 -.short 561 -.short 561 -.short -735 -.short -735 -.short 3140 -.short 3140 -.short 7451 -.short 7451 -.short 5522 -.short 5522 -.short -7235 -.short -7235 -.short -682 -.short -682 -.short -712 -.short -712 -.short 1481 -.short 1481 -.short 648 -.short 648 -.short -6713 -.short -6713 -.short -7008 -.short -7008 -.short 14578 -.short 14578 -.short 6378 -.short 6378 -.short -525 -.short -525 -.short 403 -.short 403 -.short 1143 -.short 1143 -.short -554 -.short -554 -.short -5168 -.short -5168 -.short 3967 -.short 3967 -.short 11251 -.short 11251 -.short -5453 -.short -5453 -.short 1092 -.short 1092 -.short 1026 -.short 1026 -.short -1179 -.short -1179 -.short 886 -.short 886 -.short 10749 -.short 10749 -.short 10099 -.short 10099 -.short -11605 -.short -11605 -.short 8721 -.short 8721 -.short -855 -.short -855 -.short -219 -.short -219 -.short 1227 -.short 1227 -.short 910 -.short 910 -.short -8416 -.short -8416 -.short -2156 -.short -2156 -.short 12078 -.short 12078 -.short 8957 -.short 8957 -.short -1607 -.short -1607 -.short -1455 -.short -1455 -.short -1219 -.short -1219 -.short 885 -.short 885 -.short -15818 -.short -15818 -.short -14322 
-.short -14322 -.short -11999 -.short -11999 -.short 8711 -.short 8711 -.short 1212 -.short 1212 -.short 1029 -.short 1029 -.short -394 -.short -394 -.short -1175 -.short -1175 -.short 11930 -.short 11930 -.short 10129 -.short 10129 -.short -3878 -.short -3878 -.short -11566 -.short -11566 - -#endif /* MLKEM_USE_NATIVE_AARCH64 */ diff --git a/mlkem/native/aarch64/ntt_clean.S b/mlkem/native/aarch64/ntt_clean.S index 214579f60..4243b9431 100644 --- a/mlkem/native/aarch64/ntt_clean.S +++ b/mlkem/native/aarch64/ntt_clean.S @@ -70,22 +70,22 @@ mls \a\().8h, t0.8h, consts.h[0] .endm -.macro load_roots_123 - ldr q_root0, [r_ptr0], #32 - ldr q_root1, [r_ptr0, #-16] +.macro load_roots_012 + ldr q_root0, [r01234_ptr], #32 + ldr q_root1, [r01234_ptr, #-16] .endm -.macro load_next_roots_45 - ldr q_root0, [r_ptr0], #16 +.macro load_next_roots_34 + ldr q_root0, [r01234_ptr], #16 .endm -.macro load_next_roots_67 - ldr q_root0, [r_ptr1], #(6*16) - ldr q_root0_tw, [r_ptr1, #(-6*16 + 1*16)] - ldr q_root1, [r_ptr1, #(-6*16 + 2*16)] - ldr q_root1_tw, [r_ptr1, #(-6*16 + 3*16)] - ldr q_root2, [r_ptr1, #(-6*16 + 4*16)] - ldr q_root2_tw, [r_ptr1, #(-6*16 + 5*16)] +.macro load_next_roots_56 + ldr q_root0, [r56_ptr], #(6*16) + ldr q_root0_tw, [r56_ptr, #(-6*16 + 1*16)] + ldr q_root1, [r56_ptr, #(-6*16 + 2*16)] + ldr q_root1_tw, [r56_ptr, #(-6*16 + 3*16)] + ldr q_root2, [r56_ptr, #(-6*16 + 4*16)] + ldr q_root2_tw, [r56_ptr, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -131,16 +131,13 @@ restore_vregs .endm -.data -.p2align 4 -roots: - #include "ntt_123_45_67_twiddles.S" - - in .req x0 - inp .req x1 - count .req x2 - r_ptr0 .req x3 - r_ptr1 .req x4 + // Arguments + in .req x0 // Input/output buffer + r01234_ptr .req x1 // twiddles for layer 0,1,2,3,4 + r56_ptr .req x2 // twiddles for layer 5,6 + + inp .req x3 + count .req x4 xtmp .req x5 data0 .req v8 @@ -202,9 +199,6 @@ const_addr: MLKEM_NAMESPACE(ntt_asm_clean): _MLKEM_NAMESPACE(ntt_asm_clean): push_stack - - ASM_LOAD(r_ptr0, roots) - 
ASM_LOAD(r_ptr1, roots_l56) ASM_LOAD(xtmp, const_addr) ld1 {consts.8h}, [xtmp] @@ -212,7 +206,7 @@ _MLKEM_NAMESPACE(ntt_asm_clean): mov inp, in mov count, #4 - load_roots_123 + load_roots_012 .p2align 2 @@ -229,7 +223,7 @@ _MLKEM_NAMESPACE(ntt_asm_clean): // // See test/test_bounds.py for more details. -layer123_start: +layer012_start: ldr q_data0, [in, #0] ldr q_data1, [in, #(1*(512/8))] @@ -265,20 +259,20 @@ layer123_start: str q_data7, [in, #(-16 + 7*(512/8))] subs count, count, #1 - cbnz count, layer123_start + cbnz count, layer012_start mov in, inp mov count, #8 .p2align 2 -layer4567_start: +layer3456_start: ldr q_data0, [in, #(16*0)] ldr q_data1, [in, #(16*1)] ldr q_data2, [in, #(16*2)] ldr q_data3, [in, #(16*3)] - load_next_roots_45 + load_next_roots_34 ct_butterfly data0, data2, root0, 0, 1 ct_butterfly data1, data3, root0, 0, 1 @@ -286,7 +280,7 @@ layer4567_start: ct_butterfly data2, data3, root0, 4, 5 transpose4 data - load_next_roots_67 + load_next_roots_56 ct_butterfly_v data0, data2, root0, root0_tw ct_butterfly_v data1, data3, root0, root0_tw @@ -296,7 +290,7 @@ layer4567_start: st4 {data0.4S, data1.4S, data2.4S, data3.4S}, [in], #64 subs count, count, #1 - cbnz count, layer4567_start + cbnz count, layer3456_start pop_stack ret diff --git a/mlkem/native/aarch64/ntt_opt.S b/mlkem/native/aarch64/ntt_opt.S index 456c2ab82..71779afe9 100644 --- a/mlkem/native/aarch64/ntt_opt.S +++ b/mlkem/native/aarch64/ntt_opt.S @@ -1,8 +1,9 @@ -/// Copyright (c) 2024 The mlkem-native project authors +/// /// Copyright (c) 2022 Arm Limited /// Copyright (c) 2022 Hanno Becker /// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer -/// SPDX-License-Identifier: MIT +/// Copyright (c) 2024 The mlkem-native project authors +// SPDX-License-Identifier: MIT /// /// Permission is hereby granted, free of charge, to any person obtaining a copy /// of this software and associated documentation files (the "Software"), to deal @@ -32,7 +33,7 @@ // Bounds: // If C is chosen so 
that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2) -// + // See mlken/reduce.c and test/test_bounds.py for more details. .macro mulmodq dst, src, const, idx0, idx1 // Signed barrett multiplication using @@ -69,22 +70,22 @@ mls \a\().8h, t0.8h, consts.h[0] .endm -.macro load_roots_123 - ldr q_root0, [r_ptr0], #32 - ldr q_root1, [r_ptr0, #-16] +.macro load_roots_012 + ldr q_root0, [r01234_ptr], #32 + ldr q_root1, [r01234_ptr, #-16] .endm -.macro load_next_roots_45 - ldr q_root0, [r_ptr0], #16 +.macro load_next_roots_34 + ldr q_root0, [r01234_ptr], #16 .endm -.macro load_next_roots_67 - ldr q_root0, [r_ptr1], #(6*16) - ldr q_root0_tw, [r_ptr1, #(-6*16 + 1*16)] - ldr q_root1, [r_ptr1, #(-6*16 + 2*16)] - ldr q_root1_tw, [r_ptr1, #(-6*16 + 3*16)] - ldr q_root2, [r_ptr1, #(-6*16 + 4*16)] - ldr q_root2_tw, [r_ptr1, #(-6*16 + 5*16)] +.macro load_next_roots_56 + ldr q_root0, [r56_ptr], #(6*16) + ldr q_root0_tw, [r56_ptr, #(-6*16 + 1*16)] + ldr q_root1, [r56_ptr, #(-6*16 + 2*16)] + ldr q_root1_tw, [r56_ptr, #(-6*16 + 3*16)] + ldr q_root2, [r56_ptr, #(-6*16 + 4*16)] + ldr q_root2_tw, [r56_ptr, #(-6*16 + 5*16)] .endm .macro transpose4 data @@ -130,16 +131,13 @@ restore_vregs .endm -.data -.p2align 4 -roots: - #include "ntt_123_45_67_twiddles.S" - - in .req x0 - inp .req x1 - count .req x2 - r_ptr0 .req x3 - r_ptr1 .req x4 + // Arguments + in .req x0 // Input/output buffer + r01234_ptr .req x1 // twiddles for layer 0,1,2,3,4 + r56_ptr .req x2 // twiddles for layer 5,6 + + inp .req x3 + count .req x4 xtmp .req x5 data0 .req v8 @@ -201,9 +199,6 @@ const_addr: MLKEM_NAMESPACE(ntt_asm_opt): _MLKEM_NAMESPACE(ntt_asm_opt): push_stack - - ASM_LOAD(r_ptr0, roots) - ASM_LOAD(r_ptr1, roots_l56) ASM_LOAD(xtmp, const_addr) ld1 {consts.8h}, [xtmp] @@ -211,7 +206,7 @@ _MLKEM_NAMESPACE(ntt_asm_opt): mov inp, in mov count, #4 - load_roots_123 + load_roots_012 .p2align 2 @@ -226,46 +221,48 @@ _MLKEM_NAMESPACE(ntt_asm_opt): // where f(C) = 1/2 + 1.0508*C.
// For N=7, we get the bound of f^7(1) * q < 18295. - // Instructions: 10 - // Expected cycles: 17 - // Expected IPC: 0.59 - // - // Cycle bound: 17.0 - // IPC bound: 0.59 - // - // Wall time: 0.01s - // User time: 0.01s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr q20, [x0, #0] // *............................. - ldr q27, [x0, #64] // ..*........................... - ldr q15, [x0, #128] // ....*......................... - ldr q21, [x0, #192] // ......*....................... - ldr q19, [x0, #256] // ........*..................... - ldr q28, [x0, #448] // ..........*................... - mul v6.8H, v19.8H, v0.H[0] // ............*................. - ldr q29, [x0, #320] // .............*................ - mul v3.8H, v28.8H, v0.H[0] // ...............*.............. - ldr q2, [x0, #384] // ................*............. - - // ------ cycle (expected) ------> + // See test/test_bounds.py for more details. + + // Instructions: 10 + // Expected cycles: 17 + // Expected IPC: 0.59 + // + // Cycle bound: 17.0 + // IPC bound: 0.59 + // + // Wall time: 0.01s + // User time: 0.01s + // + // ----- cycle (expected) ------> // 0 25 - // |------------------------|----- - // ldr q20, [x0, #0] // *.............................. - // ldr q27, [x0, #64] // ..*............................ - // ldr q15, [x0, #128] // ....*.......................... - // ldr q21, [x0, #192] // ......*........................ - // ldr q19, [x0, #256] // ........*...................... - // ldr q29, [x0, #320] // .............*................. - // mul v6.8H, v19.8H, v0.H[0] // ............*.................. - // ldr q28, [x0, #448] // ..........*.................... - // ldr q2, [x0, #384] // ................*.............. - // mul v3.8H, v28.8H, v0.H[0] // ...............*............... + // |------------------------|---- + ldr q30, [x0, #0] // *............................. + ldr q2, [x0, #64] // ..*........................... 
+ ldr q21, [x0, #128] // ....*......................... + ldr q22, [x0, #192] // ......*....................... + ldr q29, [x0, #256] // ........*..................... + ldr q31, [x0, #448] // ..........*................... + mul v18.8H, v29.8H, v0.H[0] // ............*................. + ldr q5, [x0, #320] // .............*................ + mul v15.8H, v31.8H, v0.H[0] // ...............*.............. + ldr q20, [x0, #384] // ................*............. + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q30, [x0, #0] // *.............................. + // ldr q2, [x0, #64] // ..*............................ + // ldr q21, [x0, #128] // ....*.......................... + // ldr q22, [x0, #192] // ......*........................ + // ldr q29, [x0, #256] // ........*...................... + // ldr q5, [x0, #320] // .............*................. + // mul v18.8H, v29.8H, v0.H[0] // ............*.................. + // ldr q31, [x0, #448] // ..........*.................... + // ldr q20, [x0, #384] // ................*.............. + // mul v15.8H, v31.8H, v0.H[0] // ...............*............... sub count, count, #1 -layer123_start: +layer012_start: // Instructions: 76 // Expected cycles: 84 // Expected IPC: 0.90 @@ -273,88 +270,88 @@ layer123_start: // Cycle bound: 84.0 // IPC bound: 0.90 // - // Wall time: 2.35s - // User time: 2.35s + // Wall time: 2.34s + // User time: 2.34s // // -------------------------------- cycle (expected) ---------------------------------> // 0 25 50 75 // |------------------------|------------------------|------------------------|-------- - sqrdmulh v30.8H, v19.8H, v0.H[1] // *................................................................................... - sqrdmulh v19.8H, v29.8H, v0.H[1] // .*.................................................................................. 
- mul v29.8H, v29.8H, v0.H[0] // ..*................................................................................. - sqrdmulh v23.8H, v2.8H, v0.H[1] // ...*................................................................................ - mls v6.8H, v30.8H, v7.H[0] // ....*............................................................................... - mul v30.8H, v2.8H, v0.H[0] // .....*.............................................................................. - mls v29.8H, v19.8H, v7.H[0] // ......*............................................................................. - sqrdmulh v19.8H, v28.8H, v0.H[1] // .......*............................................................................ - sub v28.8H, v20.8H, v6.8H // ........*........................................................................... - mls v30.8H, v23.8H, v7.H[0] // .........*.......................................................................... - sub v23.8H, v27.8H, v29.8H // ..........*......................................................................... - add v29.8H, v27.8H, v29.8H // ...........*........................................................................ - add v6.8H, v20.8H, v6.8H // ............*....................................................................... - sub v2.8H, v15.8H, v30.8H // .............*...................................................................... - add v30.8H, v15.8H, v30.8H // ..............*..................................................................... - mls v3.8H, v19.8H, v7.H[0] // ...............*.................................................................... - sqrdmulh v19.8H, v2.8H, v0.H[5] // ................*................................................................... - mul v2.8H, v2.8H, v0.H[4] // .................*.................................................................. 
- sqrdmulh v20.8H, v30.8H, v0.H[3] // ..................*................................................................. - sub v27.8H, v21.8H, v3.8H // ...................*................................................................ - add v3.8H, v21.8H, v3.8H // ....................*............................................................... - mls v2.8H, v19.8H, v7.H[0] // .....................*.............................................................. - sqrdmulh v19.8H, v27.8H, v0.H[5] // ......................*............................................................. - mul v27.8H, v27.8H, v0.H[4] // .......................*............................................................ - mul v30.8H, v30.8H, v0.H[2] // ........................*........................................................... - sub v15.8H, v28.8H, v2.8H // .........................*.......................................................... - add v28.8H, v28.8H, v2.8H // ..........................*......................................................... - mls v27.8H, v19.8H, v7.H[0] // ...........................*........................................................ - sqrdmulh v19.8H, v3.8H, v0.H[3] // ............................*....................................................... - mul v2.8H, v3.8H, v0.H[2] // .............................*...................................................... - mls v30.8H, v20.8H, v7.H[0] // ..............................*..................................................... - sub v3.8H, v23.8H, v27.8H // ...............................*.................................................... - add v23.8H, v23.8H, v27.8H // ................................*................................................... - mls v2.8H, v19.8H, v7.H[0] // .................................*.................................................. 
- sub v19.8H, v6.8H, v30.8H // ..................................*................................................. - add v30.8H, v6.8H, v30.8H // ...................................*................................................ - sqrdmulh v6.8H, v23.8H, v1.H[3] // ....................................*............................................... - sub v20.8H, v29.8H, v2.8H // .....................................*.............................................. - add v29.8H, v29.8H, v2.8H // ......................................*............................................. - mul v23.8H, v23.8H, v1.H[2] // .......................................*............................................ - sqrdmulh v2.8H, v20.8H, v1.H[1] // ........................................*........................................... - sqrdmulh v27.8H, v29.8H, v0.H[7] // .........................................*.......................................... - mul v29.8H, v29.8H, v0.H[6] // ..........................................*......................................... - mul v20.8H, v20.8H, v1.H[0] // ...........................................*........................................ - mls v23.8H, v6.8H, v7.H[0] // ............................................*....................................... - sqrdmulh v6.8H, v3.8H, v1.H[5] // .............................................*...................................... - mls v29.8H, v27.8H, v7.H[0] // ..............................................*..................................... - mls v20.8H, v2.8H, v7.H[0] // ...............................................*.................................... - sub v2.8H, v28.8H, v23.8H // ................................................*................................... - add v23.8H, v28.8H, v23.8H // .................................................*.................................. 
- sub v28.8H, v30.8H, v29.8H // ..................................................*................................. - mul v3.8H, v3.8H, v1.H[4] // ...................................................*................................ - add v30.8H, v30.8H, v29.8H // ....................................................*............................... - sub v29.8H, v19.8H, v20.8H // .....................................................*.............................. - add v19.8H, v19.8H, v20.8H // ......................................................*............................. - mls v3.8H, v6.8H, v7.H[0] // .......................................................*............................ - str q30, [x0], #(16) // ........................................................*........................... - ldr q20, [x0, #0] // .........................................................e.......................... - sub v30.8H, v15.8H, v3.8H // ...........................................................*........................ - add v6.8H, v15.8H, v3.8H // ............................................................*....................... - str q28, [x0, #48] // .............................................................*...................... - ldr q27, [x0, #64] // ..............................................................e..................... - str q19, [x0, #112] // ................................................................*................... - ldr q15, [x0, #128] // .................................................................e.................. - str q29, [x0, #176] // ...................................................................*................ - ldr q21, [x0, #192] // ....................................................................e............... - str q23, [x0, #240] // ......................................................................*............. 
- ldr q19, [x0, #256] // .......................................................................e............ - str q2, [x0, #304] // .........................................................................*.......... - ldr q29, [x0, #320] // ..........................................................................e......... - str q6, [x0, #368] // ............................................................................*....... - mul v6.8H, v19.8H, v0.H[0] // .............................................................................e...... - str q30, [x0, #432] // ..............................................................................*..... - ldr q28, [x0, #448] // ...............................................................................e.... - ldr q2, [x0, #384] // .................................................................................e.. - mul v3.8H, v28.8H, v0.H[0] // ...................................................................................e + sqrdmulh v9.8H, v29.8H, v0.H[1] // *................................................................................... + sqrdmulh v29.8H, v5.8H, v0.H[1] // .*.................................................................................. + mul v5.8H, v5.8H, v0.H[0] // ..*................................................................................. + sqrdmulh v4.8H, v20.8H, v0.H[1] // ...*................................................................................ + mls v18.8H, v9.8H, v7.H[0] // ....*............................................................................... + mul v9.8H, v20.8H, v0.H[0] // .....*.............................................................................. + mls v5.8H, v29.8H, v7.H[0] // ......*............................................................................. + sqrdmulh v29.8H, v31.8H, v0.H[1] // .......*............................................................................ 
+ sub v31.8H, v30.8H, v18.8H // ........*........................................................................... + mls v9.8H, v4.8H, v7.H[0] // .........*.......................................................................... + sub v4.8H, v2.8H, v5.8H // ..........*......................................................................... + add v5.8H, v2.8H, v5.8H // ...........*........................................................................ + add v18.8H, v30.8H, v18.8H // ............*....................................................................... + sub v20.8H, v21.8H, v9.8H // .............*...................................................................... + add v9.8H, v21.8H, v9.8H // ..............*..................................................................... + mls v15.8H, v29.8H, v7.H[0] // ...............*.................................................................... + sqrdmulh v29.8H, v20.8H, v0.H[5] // ................*................................................................... + mul v20.8H, v20.8H, v0.H[4] // .................*.................................................................. + sqrdmulh v30.8H, v9.8H, v0.H[3] // ..................*................................................................. + sub v2.8H, v22.8H, v15.8H // ...................*................................................................ + add v15.8H, v22.8H, v15.8H // ....................*............................................................... + mls v20.8H, v29.8H, v7.H[0] // .....................*.............................................................. + sqrdmulh v29.8H, v2.8H, v0.H[5] // ......................*............................................................. + mul v2.8H, v2.8H, v0.H[4] // .......................*............................................................ + mul v9.8H, v9.8H, v0.H[2] // ........................*........................................................... 
+ sub v21.8H, v31.8H, v20.8H // .........................*.......................................................... + add v31.8H, v31.8H, v20.8H // ..........................*......................................................... + mls v2.8H, v29.8H, v7.H[0] // ...........................*........................................................ + sqrdmulh v29.8H, v15.8H, v0.H[3] // ............................*....................................................... + mul v20.8H, v15.8H, v0.H[2] // .............................*...................................................... + mls v9.8H, v30.8H, v7.H[0] // ..............................*..................................................... + sub v15.8H, v4.8H, v2.8H // ...............................*.................................................... + add v4.8H, v4.8H, v2.8H // ................................*................................................... + mls v20.8H, v29.8H, v7.H[0] // .................................*.................................................. + sub v29.8H, v18.8H, v9.8H // ..................................*................................................. + add v9.8H, v18.8H, v9.8H // ...................................*................................................ + sqrdmulh v18.8H, v4.8H, v1.H[3] // ....................................*............................................... + sub v30.8H, v5.8H, v20.8H // .....................................*.............................................. + add v5.8H, v5.8H, v20.8H // ......................................*............................................. + mul v4.8H, v4.8H, v1.H[2] // .......................................*............................................ + sqrdmulh v20.8H, v30.8H, v1.H[1] // ........................................*........................................... + sqrdmulh v2.8H, v5.8H, v0.H[7] // .........................................*.......................................... 
+ mul v5.8H, v5.8H, v0.H[6] // ..........................................*......................................... + mul v30.8H, v30.8H, v1.H[0] // ...........................................*........................................ + mls v4.8H, v18.8H, v7.H[0] // ............................................*....................................... + sqrdmulh v18.8H, v15.8H, v1.H[5] // .............................................*...................................... + mls v5.8H, v2.8H, v7.H[0] // ..............................................*..................................... + mls v30.8H, v20.8H, v7.H[0] // ...............................................*.................................... + sub v20.8H, v31.8H, v4.8H // ................................................*................................... + add v4.8H, v31.8H, v4.8H // .................................................*.................................. + sub v31.8H, v9.8H, v5.8H // ..................................................*................................. + mul v15.8H, v15.8H, v1.H[4] // ...................................................*................................ + add v9.8H, v9.8H, v5.8H // ....................................................*............................... + sub v5.8H, v29.8H, v30.8H // .....................................................*.............................. + add v29.8H, v29.8H, v30.8H // ......................................................*............................. + mls v15.8H, v18.8H, v7.H[0] // .......................................................*............................ + str q9, [x0], #(16) // ........................................................*........................... + ldr q30, [x0, #0] // .........................................................e.......................... + sub v9.8H, v21.8H, v15.8H // ...........................................................*........................ 
+ add v18.8H, v21.8H, v15.8H // ............................................................*....................... + str q31, [x0, #48] // .............................................................*...................... + ldr q2, [x0, #64] // ..............................................................e..................... + str q29, [x0, #112] // ................................................................*................... + ldr q21, [x0, #128] // .................................................................e.................. + str q5, [x0, #176] // ...................................................................*................ + ldr q22, [x0, #192] // ....................................................................e............... + str q4, [x0, #240] // ......................................................................*............. + ldr q29, [x0, #256] // .......................................................................e............ + str q20, [x0, #304] // .........................................................................*.......... + ldr q5, [x0, #320] // ..........................................................................e......... + str q18, [x0, #368] // ............................................................................*....... + mul v18.8H, v29.8H, v0.H[0] // .............................................................................e...... + str q9, [x0, #432] // ..............................................................................*..... + ldr q31, [x0, #448] // ...............................................................................e.... + ldr q20, [x0, #384] // .................................................................................e.. 
+ mul v15.8H, v31.8H, v0.H[0] // ...................................................................................e // ------------------------------------------- cycle (expected) --------------------------------------------> // 0 25 50 75 100 @@ -437,7 +434,7 @@ layer123_start: // str q15, [x0, #(-16 + 7*(512/8))] // .....................~.....'.............................................................................* sub count, count, #1 - cbnz count, layer123_start + cbnz count, layer012_start // Instructions: 66 // Expected cycles: 67 // Expected IPC: 0.99 @@ -445,451 +442,451 @@ layer123_start: // Cycle bound: 67.0 // IPC bound: 0.99 // - // Wall time: 7.16s - // User time: 7.16s + // Wall time: 7.31s + // User time: 7.31s // // ------------------------ cycle (expected) ------------------------> // 0 25 50 // |------------------------|------------------------|---------------- - sqrdmulh v23.8H, v28.8H, v0.H[1] // *.................................................................. - sqrdmulh v28.8H, v2.8H, v0.H[1] // .*................................................................. - mul v5.8H, v29.8H, v0.H[0] // ..*................................................................ - mul v2.8H, v2.8H, v0.H[0] // ...*............................................................... - mls v3.8H, v23.8H, v7.H[0] // ....*.............................................................. - sqrdmulh v13.8H, v29.8H, v0.H[1] // .....*............................................................. - sqrdmulh v30.8H, v19.8H, v0.H[1] // ......*............................................................ - mls v2.8H, v28.8H, v7.H[0] // .......*........................................................... - sub v23.8H, v21.8H, v3.8H // ........*.......................................................... - mls v5.8H, v13.8H, v7.H[0] // .........*......................................................... 
- mls v6.8H, v30.8H, v7.H[0] // ..........*........................................................ - mul v29.8H, v23.8H, v0.H[4] // ...........*....................................................... - sqrdmulh v30.8H, v23.8H, v0.H[5] // ............*...................................................... - add v8.8H, v15.8H, v2.8H // .............*..................................................... - sub v19.8H, v15.8H, v2.8H // ..............*.................................................... - sub v18.8H, v27.8H, v5.8H // ...............*................................................... - mls v29.8H, v30.8H, v7.H[0] // ................*.................................................. - mul v2.8H, v19.8H, v0.H[4] // .................*................................................. - sqrdmulh v13.8H, v19.8H, v0.H[5] // ..................*................................................ - add v31.8H, v20.8H, v6.8H // ...................*............................................... - sub v30.8H, v18.8H, v29.8H // ....................*.............................................. - sub v12.8H, v20.8H, v6.8H // .....................*............................................. - sqrdmulh v23.8H, v8.8H, v0.H[3] // ......................*............................................ - sqrdmulh v19.8H, v30.8H, v1.H[5] // .......................*........................................... - mul v28.8H, v30.8H, v1.H[4] // ........................*.......................................... - mls v2.8H, v13.8H, v7.H[0] // .........................*......................................... - mul v15.8H, v8.8H, v0.H[2] // ..........................*........................................ - add v24.8H, v21.8H, v3.8H // ...........................*....................................... - mls v28.8H, v19.8H, v7.H[0] // ............................*...................................... 
- sub v8.8H, v12.8H, v2.8H // .............................*..................................... - mls v15.8H, v23.8H, v7.H[0] // ..............................*.................................... - sqrdmulh v11.8H, v24.8H, v0.H[3] // ...............................*................................... - sub v30.8H, v8.8H, v28.8H // ................................*.................................. - mul v13.8H, v24.8H, v0.H[2] // .................................*................................. - sub v20.8H, v31.8H, v15.8H // ..................................*................................ - str q30, [x0, #448] // ...................................*............................... - add v24.8H, v8.8H, v28.8H // ....................................*.............................. - mls v13.8H, v11.8H, v7.H[0] // .....................................*............................. - add v3.8H, v27.8H, v5.8H // ......................................*............................ - str q24, [x0, #384] // .......................................*........................... - add v4.8H, v31.8H, v15.8H // ........................................*.......................... - sub v11.8H, v3.8H, v13.8H // .........................................*......................... - add v19.8H, v3.8H, v13.8H // ..........................................*........................ - add v10.8H, v18.8H, v29.8H // ...........................................*....................... - sqrdmulh v30.8H, v11.8H, v1.H[1] // ............................................*...................... - mul v13.8H, v11.8H, v1.H[0] // .............................................*..................... - mul v6.8H, v19.8H, v0.H[6] // ..............................................*.................... - sqrdmulh v26.8H, v19.8H, v0.H[7] // ...............................................*................... - mul v23.8H, v10.8H, v1.H[2] // ................................................*.................. 
- mls v13.8H, v30.8H, v7.H[0] // .................................................*................. - sqrdmulh v19.8H, v10.8H, v1.H[3] // ..................................................*................ - mls v6.8H, v26.8H, v7.H[0] // ...................................................*............... - add v21.8H, v12.8H, v2.8H // ....................................................*.............. - add v30.8H, v20.8H, v13.8H // .....................................................*............. - mls v23.8H, v19.8H, v7.H[0] // ......................................................*............ - sub v29.8H, v20.8H, v13.8H // .......................................................*........... - str q30, [x0, #128] // ........................................................*.......... - add v5.8H, v4.8H, v6.8H // .........................................................*......... - str q29, [x0, #192] // ..........................................................*........ - sub v19.8H, v4.8H, v6.8H // ...........................................................*....... - str q5, [x0], #(16) // ............................................................*...... - sub v30.8H, v21.8H, v23.8H // .............................................................*..... - str q19, [x0, #48] // ..............................................................*.... - add v2.8H, v21.8H, v23.8H // ...............................................................*... - str q30, [x0, #304] // ................................................................*.. - str q2, [x0, #240] // ..................................................................* + sqrdmulh v13.8H, v31.8H, v0.H[1] // *.................................................................. + mul v8.8H, v20.8H, v0.H[0] // .*................................................................. + sqrdmulh v26.8H, v20.8H, v0.H[1] // ..*................................................................ 
+ mul v31.8H, v5.8H, v0.H[0] // ...*............................................................... + mls v15.8H, v13.8H, v7.H[0] // ....*.............................................................. + sqrdmulh v4.8H, v5.8H, v0.H[1] // .....*............................................................. + mls v8.8H, v26.8H, v7.H[0] // ......*............................................................ + sqrdmulh v6.8H, v29.8H, v0.H[1] // .......*........................................................... + add v17.8H, v22.8H, v15.8H // ........*.......................................................... + mls v31.8H, v4.8H, v7.H[0] // .........*......................................................... + sub v3.8H, v21.8H, v8.8H // ..........*........................................................ + sqrdmulh v5.8H, v17.8H, v0.H[3] // ...........*....................................................... + mul v29.8H, v17.8H, v0.H[2] // ............*...................................................... + mul v20.8H, v3.8H, v0.H[4] // .............*..................................................... + sqrdmulh v4.8H, v3.8H, v0.H[5] // ..............*.................................................... + mls v18.8H, v6.8H, v7.H[0] // ...............*................................................... + mls v29.8H, v5.8H, v7.H[0] // ................*.................................................. + add v13.8H, v2.8H, v31.8H // .................*................................................. + mls v20.8H, v4.8H, v7.H[0] // ..................*................................................ + sub v16.8H, v30.8H, v18.8H // ...................*............................................... + add v25.8H, v21.8H, v8.8H // ....................*.............................................. + sub v9.8H, v13.8H, v29.8H // .....................*............................................. 
+ add v21.8H, v16.8H, v20.8H // ......................*............................................ + sub v10.8H, v16.8H, v20.8H // .......................*........................................... + mul v4.8H, v9.8H, v1.H[0] // ........................*.......................................... + sqrdmulh v16.8H, v25.8H, v0.H[3] // .........................*......................................... + mul v26.8H, v25.8H, v0.H[2] // ..........................*........................................ + sqrdmulh v5.8H, v9.8H, v1.H[1] // ...........................*....................................... + sub v9.8H, v22.8H, v15.8H // ............................*...................................... + add v27.8H, v30.8H, v18.8H // .............................*..................................... + mls v26.8H, v16.8H, v7.H[0] // ..............................*.................................... + sqrdmulh v16.8H, v9.8H, v0.H[5] // ...............................*................................... + mul v20.8H, v9.8H, v0.H[4] // ................................*.................................. + mls v4.8H, v5.8H, v7.H[0] // .................................*................................. + sub v6.8H, v27.8H, v26.8H // ..................................*................................ + sub v18.8H, v2.8H, v31.8H // ...................................*............................... + mls v20.8H, v16.8H, v7.H[0] // ....................................*.............................. + sub v5.8H, v6.8H, v4.8H // .....................................*............................. + add v9.8H, v6.8H, v4.8H // ......................................*............................ + add v4.8H, v13.8H, v29.8H // .......................................*........................... + str q5, [x0, #192] // ........................................*.......................... + add v5.8H, v18.8H, v20.8H // .........................................*......................... 
+ str q9, [x0, #128] // ..........................................*........................ + sub v20.8H, v18.8H, v20.8H // ...........................................*....................... + sqrdmulh v2.8H, v5.8H, v1.H[3] // ............................................*...................... + mul v15.8H, v5.8H, v1.H[2] // .............................................*..................... + add v6.8H, v27.8H, v26.8H // ..............................................*.................... + sqrdmulh v29.8H, v20.8H, v1.H[5] // ...............................................*................... + mul v20.8H, v20.8H, v1.H[4] // ................................................*.................. + mls v15.8H, v2.8H, v7.H[0] // .................................................*................. + sqrdmulh v9.8H, v4.8H, v0.H[7] // ..................................................*................ + mul v5.8H, v4.8H, v0.H[6] // ...................................................*............... + mls v20.8H, v29.8H, v7.H[0] // ....................................................*.............. + add v3.8H, v21.8H, v15.8H // .....................................................*............. + sub v4.8H, v21.8H, v15.8H // ......................................................*............ + mls v5.8H, v9.8H, v7.H[0] // .......................................................*........... + str q3, [x0, #256] // ........................................................*.......... + add v9.8H, v10.8H, v20.8H // .........................................................*......... + str q4, [x0, #320] // ..........................................................*........ + sub v29.8H, v10.8H, v20.8H // ...........................................................*....... + str q9, [x0, #384] // ............................................................*...... + add v24.8H, v6.8H, v5.8H // .............................................................*..... 
+ str q29, [x0, #448] // ..............................................................*.... + sub v4.8H, v6.8H, v5.8H // ...............................................................*... + str q24, [x0], #(16) // ................................................................*.. + str q4, [x0, #48] // ..................................................................* // ------------------------ cycle (expected) ------------------------> // 0 25 50 // |------------------------|------------------------|---------------- - // sqrdmulh v30.8H, v19.8H, v0.H[1] // ......*............................................................ - // sqrdmulh v19.8H, v29.8H, v0.H[1] // .....*............................................................. - // mul v29.8H, v29.8H, v0.H[0] // ..*................................................................ - // sqrdmulh v23.8H, v2.8H, v0.H[1] // .*................................................................. - // mls v6.8H, v30.8H, v7.H[0] // ..........*........................................................ - // mul v30.8H, v2.8H, v0.H[0] // ...*............................................................... - // mls v29.8H, v19.8H, v7.H[0] // .........*......................................................... - // sqrdmulh v19.8H, v28.8H, v0.H[1] // *.................................................................. - // sub v28.8H, v20.8H, v6.8H // .....................*............................................. - // mls v30.8H, v23.8H, v7.H[0] // .......*........................................................... - // sub v23.8H, v27.8H, v29.8H // ...............*................................................... - // add v29.8H, v27.8H, v29.8H // ......................................*............................ - // add v6.8H, v20.8H, v6.8H // ...................*............................................... - // sub v2.8H, v15.8H, v30.8H // ..............*.................................................... 
- // add v30.8H, v15.8H, v30.8H // .............*..................................................... - // mls v3.8H, v19.8H, v7.H[0] // ....*.............................................................. - // sqrdmulh v19.8H, v2.8H, v0.H[5] // ..................*................................................ - // mul v2.8H, v2.8H, v0.H[4] // .................*................................................. - // sqrdmulh v20.8H, v30.8H, v0.H[3] // ......................*............................................ - // sub v27.8H, v21.8H, v3.8H // ........*.......................................................... - // add v3.8H, v21.8H, v3.8H // ...........................*....................................... - // mls v2.8H, v19.8H, v7.H[0] // .........................*......................................... - // sqrdmulh v19.8H, v27.8H, v0.H[5] // ............*...................................................... - // mul v27.8H, v27.8H, v0.H[4] // ...........*....................................................... - // mul v30.8H, v30.8H, v0.H[2] // ..........................*........................................ - // sub v15.8H, v28.8H, v2.8H // .............................*..................................... - // add v28.8H, v28.8H, v2.8H // ....................................................*.............. - // mls v27.8H, v19.8H, v7.H[0] // ................*.................................................. - // sqrdmulh v19.8H, v3.8H, v0.H[3] // ...............................*................................... - // mul v2.8H, v3.8H, v0.H[2] // .................................*................................. - // mls v30.8H, v20.8H, v7.H[0] // ..............................*.................................... - // sub v3.8H, v23.8H, v27.8H // ....................*.............................................. - // add v23.8H, v23.8H, v27.8H // ...........................................*....................... 
- // mls v2.8H, v19.8H, v7.H[0] // .....................................*............................. - // sub v19.8H, v6.8H, v30.8H // ..................................*................................ - // add v30.8H, v6.8H, v30.8H // ........................................*.......................... - // sqrdmulh v6.8H, v23.8H, v1.H[3] // ..................................................*................ - // sub v20.8H, v29.8H, v2.8H // .........................................*......................... - // add v29.8H, v29.8H, v2.8H // ..........................................*........................ - // mul v23.8H, v23.8H, v1.H[2] // ................................................*.................. - // sqrdmulh v2.8H, v20.8H, v1.H[1] // ............................................*...................... - // sqrdmulh v27.8H, v29.8H, v0.H[7] // ...............................................*................... - // mul v29.8H, v29.8H, v0.H[6] // ..............................................*.................... - // mul v20.8H, v20.8H, v1.H[0] // .............................................*..................... - // mls v23.8H, v6.8H, v7.H[0] // ......................................................*............ - // sqrdmulh v6.8H, v3.8H, v1.H[5] // .......................*........................................... - // mls v29.8H, v27.8H, v7.H[0] // ...................................................*............... - // mls v20.8H, v2.8H, v7.H[0] // .................................................*................. - // sub v2.8H, v28.8H, v23.8H // .............................................................*..... - // add v23.8H, v28.8H, v23.8H // ...............................................................*... - // sub v28.8H, v30.8H, v29.8H // ...........................................................*....... - // mul v3.8H, v3.8H, v1.H[4] // ........................*.......................................... 
- // add v30.8H, v30.8H, v29.8H // .........................................................*......... - // sub v29.8H, v19.8H, v20.8H // .......................................................*........... - // add v19.8H, v19.8H, v20.8H // .....................................................*............. - // mls v3.8H, v6.8H, v7.H[0] // ............................*...................................... - // str q30, [x0], #(16) // ............................................................*...... - // sub v30.8H, v15.8H, v3.8H // ................................*.................................. - // add v6.8H, v15.8H, v3.8H // ....................................*.............................. - // str q28, [x0, #48] // ..............................................................*.... - // str q19, [x0, #112] // ........................................................*.......... - // str q29, [x0, #176] // ..........................................................*........ - // str q23, [x0, #240] // ..................................................................* - // str q2, [x0, #304] // ................................................................*.. - // str q6, [x0, #368] // .......................................*........................... - // str q30, [x0, #432] // ...................................*............................... + // sqrdmulh v9.8H, v29.8H, v0.H[1] // .......*........................................................... + // sqrdmulh v29.8H, v5.8H, v0.H[1] // .....*............................................................. + // mul v5.8H, v5.8H, v0.H[0] // ...*............................................................... + // sqrdmulh v4.8H, v20.8H, v0.H[1] // ..*................................................................ + // mls v18.8H, v9.8H, v7.H[0] // ...............*................................................... 
+ // mul v9.8H, v20.8H, v0.H[0] // .*................................................................. + // mls v5.8H, v29.8H, v7.H[0] // .........*......................................................... + // sqrdmulh v29.8H, v31.8H, v0.H[1] // *.................................................................. + // sub v31.8H, v30.8H, v18.8H // ...................*............................................... + // mls v9.8H, v4.8H, v7.H[0] // ......*............................................................ + // sub v4.8H, v2.8H, v5.8H // ...................................*............................... + // add v5.8H, v2.8H, v5.8H // .................*................................................. + // add v18.8H, v30.8H, v18.8H // .............................*..................................... + // sub v20.8H, v21.8H, v9.8H // ..........*........................................................ + // add v9.8H, v21.8H, v9.8H // ....................*.............................................. + // mls v15.8H, v29.8H, v7.H[0] // ....*.............................................................. + // sqrdmulh v29.8H, v20.8H, v0.H[5] // ..............*.................................................... + // mul v20.8H, v20.8H, v0.H[4] // .............*..................................................... + // sqrdmulh v30.8H, v9.8H, v0.H[3] // .........................*......................................... + // sub v2.8H, v22.8H, v15.8H // ............................*...................................... + // add v15.8H, v22.8H, v15.8H // ........*.......................................................... + // mls v20.8H, v29.8H, v7.H[0] // ..................*................................................ + // sqrdmulh v29.8H, v2.8H, v0.H[5] // ...............................*................................... + // mul v2.8H, v2.8H, v0.H[4] // ................................*.................................. 
+ // mul v9.8H, v9.8H, v0.H[2] // ..........................*........................................ + // sub v21.8H, v31.8H, v20.8H // .......................*........................................... + // add v31.8H, v31.8H, v20.8H // ......................*............................................ + // mls v2.8H, v29.8H, v7.H[0] // ....................................*.............................. + // sqrdmulh v29.8H, v15.8H, v0.H[3] // ...........*....................................................... + // mul v20.8H, v15.8H, v0.H[2] // ............*...................................................... + // mls v9.8H, v30.8H, v7.H[0] // ..............................*.................................... + // sub v15.8H, v4.8H, v2.8H // ...........................................*....................... + // add v4.8H, v4.8H, v2.8H // .........................................*......................... + // mls v20.8H, v29.8H, v7.H[0] // ................*.................................................. + // sub v29.8H, v18.8H, v9.8H // ..................................*................................ + // add v9.8H, v18.8H, v9.8H // ..............................................*.................... + // sqrdmulh v18.8H, v4.8H, v1.H[3] // ............................................*...................... + // sub v30.8H, v5.8H, v20.8H // .....................*............................................. + // add v5.8H, v5.8H, v20.8H // .......................................*........................... + // mul v4.8H, v4.8H, v1.H[2] // .............................................*..................... + // sqrdmulh v20.8H, v30.8H, v1.H[1] // ...........................*....................................... + // sqrdmulh v2.8H, v5.8H, v0.H[7] // ..................................................*................ + // mul v5.8H, v5.8H, v0.H[6] // ...................................................*............... 
+ // mul v30.8H, v30.8H, v1.H[0] // ........................*.......................................... + // mls v4.8H, v18.8H, v7.H[0] // .................................................*................. + // sqrdmulh v18.8H, v15.8H, v1.H[5] // ...............................................*................... + // mls v5.8H, v2.8H, v7.H[0] // .......................................................*........... + // mls v30.8H, v20.8H, v7.H[0] // .................................*................................. + // sub v20.8H, v31.8H, v4.8H // ......................................................*............ + // add v4.8H, v31.8H, v4.8H // .....................................................*............. + // sub v31.8H, v9.8H, v5.8H // ...............................................................*... + // mul v15.8H, v15.8H, v1.H[4] // ................................................*.................. + // add v9.8H, v9.8H, v5.8H // .............................................................*..... + // sub v5.8H, v29.8H, v30.8H // .....................................*............................. + // add v29.8H, v29.8H, v30.8H // ......................................*............................ + // mls v15.8H, v18.8H, v7.H[0] // ....................................................*.............. + // str q9, [x0], #(16) // ................................................................*.. + // sub v9.8H, v21.8H, v15.8H // ...........................................................*....... + // add v18.8H, v21.8H, v15.8H // .........................................................*......... + // str q31, [x0, #48] // ..................................................................* + // str q29, [x0, #112] // ..........................................*........................ + // str q5, [x0, #176] // ........................................*.......................... 
+ // str q4, [x0, #240] // ........................................................*.......... + // str q20, [x0, #304] // ..........................................................*........ + // str q18, [x0, #368] // ............................................................*...... + // str q9, [x0, #432] // ..............................................................*.... mov in, inp mov count, #8 .p2align 2 - // Instructions: 9 - // Expected cycles: 13 - // Expected IPC: 0.69 - // - // Cycle bound: 13.0 - // IPC bound: 0.69 - // - // Wall time: 0.01s - // User time: 0.01s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr q19, [x0, #48] // *............................. - ldr q9, [x3], #16 // ..*........................... - ldr q11, [x0, #32] // ....*......................... - sqrdmulh v30.8H, v19.8H, v9.H[1] // ......*....................... - mul v4.8H, v19.8H, v9.H[0] // .......*...................... - sqrdmulh v12.8H, v11.8H, v9.H[1] // ........*..................... - ldr q18, [x4, #32] // .........*.................... - mls v4.8H, v30.8H, v7.H[0] // ...........*.................. - ldr q0, [x4, #64] // ............*................. - - // ------ cycle (expected) ------> + // Instructions: 12 + // Expected cycles: 18 + // Expected IPC: 0.67 + // + // Cycle bound: 18.0 + // IPC bound: 0.67 + // + // Wall time: 0.02s + // User time: 0.02s + // + // ----- cycle (expected) ------> // 0 25 - // |------------------------|----- - // ldr q8, [x0, #48] // *.............................. - // ldr q9, [x3], #16 // ..*............................ - // sqrdmulh v31.8H, v8.8H, v9.H[1] // ......*........................ - // ldr q11, [x0, #32] // ....*.......................... - // sqrdmulh v12.8H, v11.8H, v9.H[1] // ........*...................... - // ldr q18, [x4, #32] // .........*..................... - // mul v4.8H, v8.8H, v9.H[0] // .......*....................... 
- // ldr q0, [x4, #64] // ............*.................. - // mls v4.8H, v31.8H, v7.H[0] // ...........*................... + // |------------------------|---- + ldr q29, [x0, #48] // *............................. + ldr q14, [x1], #16 // ..*........................... + ldr q5, [x0, #16] // ....*......................... + sqrdmulh v9.8H, v29.8H, v14.H[1] // ......*....................... + mul v29.8H, v29.8H, v14.H[0] // .......*...................... + ldr q13, [x0, #32] // ........*..................... + mls v29.8H, v9.8H, v7.H[0] // ...........*.................. + sqrdmulh v11.8H, v13.8H, v14.H[1] // ............*................. + ldr q16, [x2, #16] // .............*................ + sub v28.8H, v5.8H, v29.8H // ...............*.............. + add v12.8H, v5.8H, v29.8H // ................*............. + ldr q24, [x2, #80] // .................*............ + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q23, [x0, #48] // *.............................. + // ldr q21, [x0, #16] // ....*.......................... + // ldr q13, [x0, #32] // ........*...................... + // ldr q14, [x1], #16 // ..*............................ + // sqrdmulh v25.8H, v23.8H, v14.H[1] // ......*........................ + // mul v10.8H, v23.8H, v14.H[0] // .......*....................... + // mls v10.8H, v25.8H, v7.H[0] // ...........*................... + // sqrdmulh v11.8H, v13.8H, v14.H[1] // ............*.................. + // sub v28.8H, v21.8H, v10.8H // ...............*............... + // ldr q16, [x2, #16] // .............*................. + // add v12.8H, v21.8H, v10.8H // ................*.............. + // ldr q24, [x2, #80] // .................*............. 
sub count, count, #1 -layer4567_start: - // Instructions: 60 - // Expected cycles: 75 - // Expected IPC: 0.80 - // - // Cycle bound: 75.0 - // IPC bound: 0.80 - // - // Wall time: 112.33s - // User time: 112.33s - // - // ---------------------------- cycle (expected) ----------------------------> - // 0 25 50 - // |------------------------|------------------------| - ldr q20, [x4, #48] // *.......................................................................... - mul v24.8H, v11.8H, v9.H[0] // ..*........................................................................ - ldr q16, [x0, #16] // ...*....................................................................... - ldr q17, [x4, #80] // .....*..................................................................... - sub v1.8H, v16.8H, v4.8H // .......*................................................................... - add v23.8H, v16.8H, v4.8H // ........*.................................................................. - mls v24.8H, v12.8H, v7.H[0] // .........*................................................................. - sqrdmulh v4.8H, v1.8H, v9.H[5] // ..........*................................................................ - sqrdmulh v28.8H, v23.8H, v9.H[3] // ...........*............................................................... - mul v22.8H, v23.8H, v9.H[2] // ............*.............................................................. - mul v5.8H, v1.8H, v9.H[4] // .............*............................................................. - ldr q6, [x0, #0] // ..............*............................................................ - mls v22.8H, v28.8H, v7.H[0] // ................*.......................................................... - mls v5.8H, v4.8H, v7.H[0] // .................*......................................................... - ldr q4, [x4], #(6*16) // ..................*........................................................ 
- add v16.8H, v6.8H, v24.8H // ....................*...................................................... - sub v26.8H, v6.8H, v24.8H // .....................*..................................................... - ldr q10, [x4, #-80] // ......................*.................................................... - sub v19.8H, v16.8H, v22.8H // ........................*.................................................. - add v25.8H, v16.8H, v22.8H // .........................*................................................. - sub v14.8H, v26.8H, v5.8H // ..........................*................................................ - ldr q8, [x0, #112] // ...........................e............................................... - add v2.8H, v26.8H, v5.8H // .............................*............................................. - trn1 v12.4S, v25.4S, v19.4S // ..............................*............................................ - trn2 v26.4S, v25.4S, v19.4S // ...............................*........................................... - trn2 v5.4S, v2.4S, v14.4S // ................................*.......................................... - ldr q9, [x3], #16 // .................................e......................................... - trn2 v16.2D, v26.2D, v5.2D // ...................................*....................................... - trn1 v29.4S, v2.4S, v14.4S // ....................................*...................................... - trn1 v22.2D, v26.2D, v5.2D // .....................................*..................................... - mul v24.8H, v16.8H, v4.8H // ......................................*.................................... - sqrdmulh v5.8H, v16.8H, v10.8H // .......................................*................................... - trn2 v16.2D, v12.2D, v29.2D // ........................................*.................................. 
- sqrdmulh v31.8H, v8.8H, v9.H[1] // .........................................e................................. - mul v26.8H, v16.8H, v4.8H // ..........................................*................................ - sqrdmulh v4.8H, v16.8H, v10.8H // ...........................................*............................... - trn1 v16.2D, v12.2D, v29.2D // ............................................*.............................. - mls v24.8H, v5.8H, v7.H[0] // .............................................*............................. - ldr q11, [x0, #96] // ..............................................e............................ - mls v26.8H, v4.8H, v7.H[0] // ................................................*.......................... - add v5.8H, v22.8H, v24.8H // .................................................*......................... - sub v14.8H, v22.8H, v24.8H // ..................................................*........................ - sqrdmulh v12.8H, v11.8H, v9.H[1] // ...................................................e....................... - sqrdmulh v25.8H, v5.8H, v20.8H // ....................................................*...................... - mul v10.8H, v5.8H, v18.8H // .....................................................*..................... - add v22.8H, v16.8H, v26.8H // ......................................................*.................... - sqrdmulh v4.8H, v14.8H, v17.8H // .......................................................*................... - mul v5.8H, v14.8H, v0.8H // ........................................................*.................. - mls v10.8H, v25.8H, v7.H[0] // .........................................................*................. - ldr q18, [x4, #32] // ..........................................................e................ - sub v13.8H, v16.8H, v26.8H // ............................................................*.............. 
- mls v5.8H, v4.8H, v7.H[0] // .............................................................*............. - sub v25.8H, v22.8H, v10.8H // ..............................................................*............ - add v24.8H, v22.8H, v10.8H // ...............................................................*........... - mul v4.8H, v8.8H, v9.H[0] // ................................................................e.......... - sub v27.8H, v13.8H, v5.8H // .................................................................*......... - add v26.8H, v13.8H, v5.8H // ..................................................................*........ - ldr q0, [x4, #64] // ...................................................................e....... - st4 {v24.4S, v25.4S, v26.4S, v27.4S}, [x0], #64 // .....................................................................*..... - mls v4.8H, v31.8H, v7.H[0] // ..........................................................................e - - // ------------------------------------------------- cycle (expected) --------------------------------------------------> - // 0 25 50 75 100 - // |------------------------|------------------------|------------------------|------------------------|----------------- - // ldr q8, [x0, #(16*0)] // ................................................'.............*....................................................... - // ldr q9, [x0, #(16*1)] // ................................................'..*.................................................................. - // ldr q10, [x0, #(16*2)] // ...................e............................'.............................................~....................... - // ldr q11, [x0, #(16*3)] // e...............................................'..........................~.......................................... - // ldr q0, [x3], #16 // ......e.........................................'................................~.................................... 
- // sqrdmulh v27.8h, v10.8h, v0.h[1] // ........................e.......................'..................................................~.................. - // mul v24.8h, v10.8h, v0.h[0] // ................................................'.*................................................................... - // mls v24.8h, v27.8h, v7.h[0] // ................................................'........*............................................................ - // sub v10.8h, v8.8h, v24.8h // ................................................'....................*................................................ - // add v8.8h, v8.8h, v24.8h // ................................................'...................*................................................. - // sqrdmulh v27.8h, v11.8h, v0.h[1] // ..............e.................................'........................................~............................ - // mul v24.8h, v11.8h, v0.h[0] // .....................................e..........'...............................................................~..... - // mls v24.8h, v27.8h, v7.h[0] // ...............................................e'..................................................................... - // sub v11.8h, v9.8h, v24.8h // ................................................'......*.............................................................. - // add v9.8h, v9.8h, v24.8h // ................................................'.......*............................................................. - // sqrdmulh v27.8h, v9.8h, v0.h[3] // ................................................'..........*.......................................................... - // mul v24.8h, v9.8h, v0.h[2] // ................................................'...........*......................................................... 
- // mls v24.8h, v27.8h, v7.h[0] // ................................................'...............*..................................................... - // sub v9.8h, v8.8h, v24.8h // ................................................'.......................*............................................. - // add v8.8h, v8.8h, v24.8h // ................................................'........................*............................................ - // sqrdmulh v27.8h, v11.8h, v0.h[5] // ................................................'.........*........................................................... - // mul v24.8h, v11.8h, v0.h[4] // ................................................'............*........................................................ - // mls v24.8h, v27.8h, v7.h[0] // ................................................'................*.................................................... - // sub v11.8h, v10.8h, v24.8h // ................................................'.........................*........................................... - // add v10.8h, v10.8h, v24.8h // ..~.............................................'............................*........................................ - // trn1 v25.4s, v8.4s, v9.4s // ...~............................................'.............................*....................................... - // trn2 v26.4s, v8.4s, v9.4s // ....~...........................................'..............................*...................................... - // trn1 v27.4s, v10.4s, v11.4s // .........~......................................'...................................*................................. - // trn2 v28.4s, v10.4s, v11.4s // .....~..........................................'...............................*..................................... 
- // trn2 v10.2d, v25.2d, v27.2d // .............~..................................'.......................................*............................. - // trn2 v11.2d, v26.2d, v28.2d // ........~.......................................'..................................*.................................. - // trn1 v8.2d, v25.2d, v27.2d // .................~..............................'...........................................*......................... - // trn1 v9.2d, v26.2d, v28.2d // ..........~.....................................'....................................*................................ - // ldr q0, [x4], #(6*16) // ................................................'.................*................................................... - // ldr q4, [x4, #(-6*16 + 1*16)] // ................................................'.....................*............................................... - // ldr q1, [x4, #(-6*16 + 2*16)] // ...............................e................'.........................................................~........... - // ldr q5, [x4, #(-6*16 + 3*16)] // ................................................*..................................................................... - // ldr q2, [x4, #(-6*16 + 4*16)] // ........................................e.......'..................................................................~.. - // ldr q6, [x4, #(-6*16 + 5*16)] // ................................................'....*................................................................ - // sqrdmulh v27.8h, v10.8h, v4.8h // ................~...............................'..........................................*.......................... - // mul v24.8h, v10.8h, v0.8h // ...............~................................'.........................................*........................... 
- // mls v24.8h, v27.8h, v7.h[0] // .....................~..........................'...............................................*..................... - // sub v10.8h, v8.8h, v24.8h // .................................~..............'...........................................................*......... - // add v8.8h, v8.8h, v24.8h // ...........................~....................'.....................................................*............... - // sqrdmulh v27.8h, v11.8h, v4.8h // ............~...................................'......................................*.............................. - // mul v24.8h, v11.8h, v0.8h // ...........~....................................'.....................................*............................... - // mls v24.8h, v27.8h, v7.h[0] // ..................~.............................'............................................*........................ - // sub v11.8h, v9.8h, v24.8h // .......................~........................'.................................................*................... - // add v9.8h, v9.8h, v24.8h // ......................~.........................'................................................*.................... - // sqrdmulh v27.8h, v9.8h, v5.8h // .........................~......................'...................................................*................. - // mul v24.8h, v9.8h, v1.8h // ..........................~.....................'....................................................*................ - // mls v24.8h, v27.8h, v7.h[0] // ..............................~.................'........................................................*............ - // sub v9.8h, v8.8h, v24.8h // ...................................~............'.............................................................*....... - // add v8.8h, v8.8h, v24.8h // ....................................~...........'..............................................................*...... 
- // sqrdmulh v27.8h, v11.8h, v6.8h // ............................~...................'......................................................*.............. - // mul v24.8h, v11.8h, v2.8h // .............................~..................'.......................................................*............. - // mls v24.8h, v27.8h, v7.h[0] // ..................................~.............'............................................................*........ - // sub v11.8h, v10.8h, v24.8h // ......................................~.........'................................................................*.... - // add v10.8h, v10.8h, v24.8h // .......................................~........'.................................................................*... - // st4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x0], #64 // ..........................................~.....'....................................................................* +layer3456_start: + // Instructions: 60 + // Expected cycles: 75 + // Expected IPC: 0.80 + // + // Cycle bound: 75.0 + // IPC bound: 0.80 + // + // Wall time: 131.35s + // User time: 131.35s + // + // ---------------------------- cycle (expected) ----------------------------> + // 0 25 50 + // |------------------------|------------------------| + ldr q0, [x2], #(6*16) // *.......................................................................... + ldr q23, [x0, #112] // ..e........................................................................ + ldr q21, [x0, #80] // ....e...................................................................... + ldr q17, [x0, #0] // ......*.................................................................... + ldr q27, [x2, #-64] // ........*.................................................................. + sqrdmulh v26.8H, v12.8H, v14.H[3] // ..........*................................................................ 
+ sqrdmulh v2.8H, v28.8H, v14.H[5] // ...........*............................................................... + mul v30.8H, v28.8H, v14.H[4] // ............*.............................................................. + mul v22.8H, v13.8H, v14.H[0] // .............*............................................................. + ldr q13, [x0, #96] // ..............e............................................................ + mls v30.8H, v2.8H, v7.H[0] // ................*.......................................................... + mls v22.8H, v11.8H, v7.H[0] // .................*......................................................... + mul v3.8H, v12.8H, v14.H[2] // ..................*........................................................ + ldr q14, [x1], #16 // ...................e....................................................... + sub v2.8H, v17.8H, v22.8H // .....................*..................................................... + mls v3.8H, v26.8H, v7.H[0] // ......................*.................................................... + add v22.8H, v17.8H, v22.8H // .......................*................................................... + add v4.8H, v2.8H, v30.8H // ........................*.................................................. + sub v6.8H, v2.8H, v30.8H // .........................*................................................. + add v26.8H, v22.8H, v3.8H // ..........................*................................................ + sub v22.8H, v22.8H, v3.8H // ...........................*............................................... + trn2 v29.4S, v4.4S, v6.4S // ............................*.............................................. + sqrdmulh v25.8H, v23.8H, v14.H[1] // .............................e............................................. + trn2 v1.4S, v26.4S, v22.4S // ..............................*............................................ 
+ mul v10.8H, v23.8H, v14.H[0] // ...............................e........................................... + trn1 v18.4S, v26.4S, v22.4S // ................................*.......................................... + trn2 v17.2D, v1.2D, v29.2D // .................................*......................................... + trn1 v2.2D, v1.2D, v29.2D // ..................................*........................................ + trn1 v29.4S, v4.4S, v6.4S // ...................................*....................................... + sqrdmulh v4.8H, v17.8H, v16.8H // ....................................*...................................... + mls v10.8H, v25.8H, v7.H[0] // .....................................e..................................... + trn2 v30.2D, v18.2D, v29.2D // ......................................*.................................... + sqrdmulh v11.8H, v13.8H, v14.H[1] // .......................................e................................... + sqrdmulh v23.8H, v30.8H, v16.8H // ........................................*.................................. + mul v16.8H, v17.8H, v0.8H // .........................................*................................. + mul v1.8H, v30.8H, v0.8H // ..........................................*................................ + trn1 v30.2D, v18.2D, v29.2D // ...........................................*............................... + sub v28.8H, v21.8H, v10.8H // ............................................e.............................. + mls v16.8H, v4.8H, v7.H[0] // .............................................*............................. + mls v1.8H, v23.8H, v7.H[0] // ..............................................*............................ + ldr q8, [x2, #-32] // ...............................................*........................... + sub v22.8H, v2.8H, v16.8H // .................................................*......................... 
+ ldr q9, [x2, #-48] // ..................................................*........................ + add v25.8H, v30.8H, v1.8H // ....................................................*...................... + add v5.8H, v2.8H, v16.8H // .....................................................*..................... + sqrdmulh v16.8H, v22.8H, v24.8H // ......................................................*.................... + mul v2.8H, v22.8H, v8.8H // .......................................................*................... + mul v26.8H, v5.8H, v27.8H // ........................................................*.................. + sub v29.8H, v30.8H, v1.8H // .........................................................*................. + sqrdmulh v30.8H, v5.8H, v9.8H // ..........................................................*................ + mls v2.8H, v16.8H, v7.H[0] // ...........................................................*............... + ldr q16, [x2, #16] // ............................................................e.............. + mls v26.8H, v30.8H, v7.H[0] // ..............................................................*............ + add v12.8H, v21.8H, v10.8H // ...............................................................e........... + sub v3.8H, v29.8H, v2.8H // ................................................................*.......... + add v2.8H, v29.8H, v2.8H // .................................................................*......... + add v0.8H, v25.8H, v26.8H // ..................................................................*........ + sub v1.8H, v25.8H, v26.8H // ...................................................................*....... + ldr q24, [x2, #80] // ....................................................................e...... + st4 {v0.4S, v1.4S, v2.4S, v3.4S}, [x0], #64 // ......................................................................*.... 
+ + // -------------------------------------------------------------- cycle (expected) ---------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------ + // ldr q8, [x0, #(16*0)] // ....~....................................................................'.....*................................................................ + // ldr q9, [x0, #(16*1)] // ..e......................................................................'...~.................................................................. + // ldr q10, [x0, #(16*2)] // ............e............................................................'.............~........................................................ + // ldr q11, [x0, #(16*3)] // e........................................................................'.~.................................................................... + // ldr q0, [x1], #16 // .................e.......................................................'..................~................................................... + // sqrdmulh v27.8h, v10.8h, v0.h[1] // .....................................e...................................'......................................~............................... + // mul v24.8h, v10.8h, v0.h[0] // ...........~.............................................................'............*......................................................... + // mls v24.8h, v27.8h, v7.h[0] // ...............~.........................................................'................*..................................................... + // sub v10.8h, v8.8h, v24.8h // ...................~.....................................................'....................*................................................. 
+ // add v8.8h, v8.8h, v24.8h // .....................~...................................................'......................*............................................... + // sqrdmulh v27.8h, v11.8h, v0.h[1] // ...........................e.............................................'............................~......................................... + // mul v24.8h, v11.8h, v0.h[0] // .............................e...........................................'..............................~....................................... + // mls v24.8h, v27.8h, v7.h[0] // ...................................e.....................................'....................................~................................. + // sub v11.8h, v9.8h, v24.8h // ..........................................e..............................'...........................................~.......................... + // add v9.8h, v9.8h, v24.8h // .............................................................e...........'..............................................................~....... + // sqrdmulh v27.8h, v9.8h, v0.h[3] // ........~................................................................'.........*............................................................ + // mul v24.8h, v9.8h, v0.h[2] // ................~........................................................'.................*.................................................... + // mls v24.8h, v27.8h, v7.h[0] // ....................~....................................................'.....................*................................................ + // sub v9.8h, v8.8h, v24.8h // .........................~...............................................'..........................*........................................... + // add v8.8h, v8.8h, v24.8h // ........................~................................................'.........................*............................................ 
+ // sqrdmulh v27.8h, v11.8h, v0.h[5] // .........~...............................................................'..........*........................................................... + // mul v24.8h, v11.8h, v0.h[4] // ..........~..............................................................'...........*.......................................................... + // mls v24.8h, v27.8h, v7.h[0] // ..............~..........................................................'...............*...................................................... + // sub v11.8h, v10.8h, v24.8h // .......................~.................................................'........................*............................................. + // add v10.8h, v10.8h, v24.8h // ......................~..................................................'.......................*.............................................. + // trn1 v25.4s, v8.4s, v9.4s // ..............................~..........................................'...............................*...................................... + // trn2 v26.4s, v8.4s, v9.4s // ............................~............................................'.............................*........................................ + // trn1 v27.4s, v10.4s, v11.4s // .................................~.......................................'..................................*................................... + // trn2 v28.4s, v10.4s, v11.4s // ..........................~..............................................'...........................*.......................................... + // trn2 v10.2d, v25.2d, v27.2d // ....................................~....................................'.....................................*................................ + // trn2 v11.2d, v26.2d, v28.2d // ...............................~.........................................'................................*..................................... 
+ // trn1 v8.2d, v25.2d, v27.2d // .........................................~...............................'..........................................*........................... + // trn1 v9.2d, v26.2d, v28.2d // ................................~........................................'.................................*.................................... + // ldr q0, [x2], #(6*16) // .........................................................................*...................................................................... + // ldr q4, [x2, #(-6*16 + 1*16)] // ..........................................................e..............'...........................................................~.......... + // ldr q1, [x2, #(-6*16 + 2*16)] // ......~..................................................................'.......*.............................................................. + // ldr q5, [x2, #(-6*16 + 3*16)] // ................................................~........................'.................................................*.................... + // ldr q2, [x2, #(-6*16 + 4*16)] // .............................................~...........................'..............................................*....................... + // ldr q6, [x2, #(-6*16 + 5*16)] // ..................................................................e......'...................................................................~.. + // sqrdmulh v27.8h, v10.8h, v4.8h // ......................................~..................................'.......................................*.............................. + // mul v24.8h, v10.8h, v0.8h // ........................................~................................'.........................................*............................ + // mls v24.8h, v27.8h, v7.h[0] // ............................................~............................'.............................................*........................ 
+ // sub v10.8h, v8.8h, v24.8h // .......................................................~.................'........................................................*............. + // add v8.8h, v8.8h, v24.8h // ..................................................~......................'...................................................*.................. + // sqrdmulh v27.8h, v11.8h, v4.8h // ..................................~......................................'...................................*.................................. + // mul v24.8h, v11.8h, v0.8h // .......................................~.................................'........................................*............................. + // mls v24.8h, v27.8h, v7.h[0] // ...........................................~.............................'............................................*......................... + // sub v11.8h, v9.8h, v24.8h // ...............................................~.........................'................................................*..................... + // add v9.8h, v9.8h, v24.8h // ...................................................~.....................'....................................................*................. + // sqrdmulh v27.8h, v9.8h, v5.8h // ........................................................~................'.........................................................*............ + // mul v24.8h, v9.8h, v1.8h // ......................................................~..................'.......................................................*.............. + // mls v24.8h, v27.8h, v7.h[0] // ............................................................~............'.............................................................*........ + // sub v9.8h, v8.8h, v24.8h // .................................................................~.......'..................................................................*... 
+ // add v8.8h, v8.8h, v24.8h // ................................................................~........'.................................................................*.... + // sqrdmulh v27.8h, v11.8h, v6.8h // ....................................................~....................'.....................................................*................ + // mul v24.8h, v11.8h, v2.8h // .....................................................~...................'......................................................*............... + // mls v24.8h, v27.8h, v7.h[0] // .........................................................~...............'..........................................................*........... + // sub v11.8h, v10.8h, v24.8h // ..............................................................~..........'...............................................................*...... + // add v10.8h, v10.8h, v24.8h // ...............................................................~.........'................................................................*..... + // st4 {v8.4S, v9.4S, v10.4S, v11.4S}, [x0], #64 // ....................................................................~....'.....................................................................* sub count, count, #1 - cbnz count, layer4567_start - // Instructions: 51 - // Expected cycles: 59 - // Expected IPC: 0.86 + cbnz count, layer3456_start + // Instructions: 48 + // Expected cycles: 55 + // Expected IPC: 0.87 // - // Cycle bound: 59.0 - // IPC bound: 0.86 + // Cycle bound: 55.0 + // IPC bound: 0.87 // - // Wall time: 5.53s - // User time: 5.53s + // Wall time: 5.22s + // User time: 5.22s // - // -------------------- cycle (expected) --------------------> + // ------------------ cycle (expected) ------------------> // 0 25 50 - // |------------------------|------------------------|-------- - ldr q23, [x0, #16] // *.......................................................... 
- ldr q19, [x0, #0] // ..*........................................................ - sub v30.8H, v23.8H, v4.8H // ....*...................................................... - mul v28.8H, v11.8H, v9.H[0] // .....*..................................................... - add v23.8H, v23.8H, v4.8H // ......*.................................................... - mul v29.8H, v30.8H, v9.H[4] // .......*................................................... - sqrdmulh v30.8H, v30.8H, v9.H[5] // ........*.................................................. - mls v28.8H, v12.8H, v7.H[0] // .........*................................................. - mul v6.8H, v23.8H, v9.H[2] // ..........*................................................ - sqrdmulh v2.8H, v23.8H, v9.H[3] // ...........*............................................... - mls v29.8H, v30.8H, v7.H[0] // ............*.............................................. - sub v30.8H, v19.8H, v28.8H // .............*............................................. - add v28.8H, v19.8H, v28.8H // ..............*............................................ - mls v6.8H, v2.8H, v7.H[0] // ...............*........................................... - sub v19.8H, v30.8H, v29.8H // ................*.......................................... - add v30.8H, v30.8H, v29.8H // .................*......................................... - ldr q20, [x4], #(6*16) // ..................*........................................ - add v23.8H, v28.8H, v6.8H // ....................*...................................... - sub v29.8H, v28.8H, v6.8H // .....................*..................................... - trn1 v1.4S, v30.4S, v19.4S // ......................*.................................... - trn2 v6.4S, v30.4S, v19.4S // .......................*................................... - trn2 v28.4S, v23.4S, v29.4S // ........................*.................................. 
- ldr q2, [x4, #-80] // .........................*................................. - trn2 v30.2D, v28.2D, v6.2D // ...........................*............................... - trn1 v3.4S, v23.4S, v29.4S // ............................*.............................. - sqrdmulh v19.8H, v30.8H, v2.8H // .............................*............................. - mul v29.8H, v30.8H, v20.8H // ..............................*............................ - ldr q9, [x4, #-48] // ...............................*........................... - trn2 v30.2D, v3.2D, v1.2D // .................................*......................... - mls v29.8H, v19.8H, v7.H[0] // ..................................*........................ - sqrdmulh v23.8H, v30.8H, v2.8H // ...................................*....................... - trn1 v2.2D, v28.2D, v6.2D // ....................................*...................... - mul v28.8H, v30.8H, v20.8H // .....................................*..................... - sub v30.8H, v2.8H, v29.8H // ......................................*.................... - ldr q20, [x4, #-16] // .......................................*................... - mul v19.8H, v30.8H, v0.8H // .........................................*................. - add v29.8H, v2.8H, v29.8H // ..........................................*................ - sqrdmulh v30.8H, v30.8H, v20.8H // ...........................................*............... - mls v28.8H, v23.8H, v7.H[0] // ............................................*.............. - mul v23.8H, v29.8H, v18.8H // .............................................*............. - sqrdmulh v29.8H, v29.8H, v9.8H // ..............................................*............ - trn1 v6.2D, v3.2D, v1.2D // ...............................................*........... - mls v19.8H, v30.8H, v7.H[0] // ................................................*.......... 
- sub v30.8H, v6.8H, v28.8H // .................................................*......... - mls v23.8H, v29.8H, v7.H[0] // ..................................................*........ - add v29.8H, v6.8H, v28.8H // ...................................................*....... - add v20.8H, v30.8H, v19.8H // ....................................................*...... - sub v21.8H, v30.8H, v19.8H // .....................................................*..... - add v18.8H, v29.8H, v23.8H // ......................................................*.... - sub v19.8H, v29.8H, v23.8H // .......................................................*... - st4 {v18.4S, v19.4S, v20.4S, v21.4S}, [x0], #64 // ..........................................................* - - // -------------------- cycle (expected) --------------------> - // 0 25 50 - // |------------------------|------------------------|-------- - // ldr q20, [x4, #48] // ...............................*........................... - // mul v24.8H, v11.8H, v9.H[0] // .....*..................................................... - // ldr q16, [x0, #16] // *.......................................................... - // ldr q17, [x4, #80] // .......................................*................... - // sub v1.8H, v16.8H, v4.8H // ....*...................................................... - // add v23.8H, v16.8H, v4.8H // ......*.................................................... - // mls v24.8H, v12.8H, v7.H[0] // .........*................................................. - // sqrdmulh v4.8H, v1.8H, v9.H[5] // ........*.................................................. - // sqrdmulh v28.8H, v23.8H, v9.H[3] // ...........*............................................... - // mul v22.8H, v23.8H, v9.H[2] // ..........*................................................ - // mul v5.8H, v1.8H, v9.H[4] // .......*................................................... 
- // ldr q6, [x0, #0] // ..*........................................................ - // mls v22.8H, v28.8H, v7.H[0] // ...............*........................................... - // mls v5.8H, v4.8H, v7.H[0] // ............*.............................................. - // ldr q4, [x4], #(6*16) // ..................*........................................ - // add v16.8H, v6.8H, v24.8H // ..............*............................................ - // sub v26.8H, v6.8H, v24.8H // .............*............................................. - // ldr q10, [x4, #-80] // .........................*................................. - // sub v19.8H, v16.8H, v22.8H // .....................*..................................... - // add v25.8H, v16.8H, v22.8H // ....................*...................................... - // sub v14.8H, v26.8H, v5.8H // ................*.......................................... - // add v2.8H, v26.8H, v5.8H // .................*......................................... - // trn1 v12.4S, v25.4S, v19.4S // ............................*.............................. - // trn2 v26.4S, v25.4S, v19.4S // ........................*.................................. - // trn2 v5.4S, v2.4S, v14.4S // .......................*................................... - // trn2 v16.2D, v26.2D, v5.2D // ...........................*............................... - // trn1 v29.4S, v2.4S, v14.4S // ......................*.................................... - // trn1 v22.2D, v26.2D, v5.2D // ....................................*...................... - // mul v24.8H, v16.8H, v4.8H // ..............................*............................ - // sqrdmulh v5.8H, v16.8H, v10.8H // .............................*............................. - // trn2 v16.2D, v12.2D, v29.2D // .................................*......................... - // mul v26.8H, v16.8H, v4.8H // .....................................*..................... 
- // sqrdmulh v4.8H, v16.8H, v10.8H // ...................................*....................... - // trn1 v16.2D, v12.2D, v29.2D // ...............................................*........... - // mls v24.8H, v5.8H, v7.H[0] // ..................................*........................ - // mls v26.8H, v4.8H, v7.H[0] // ............................................*.............. - // add v5.8H, v22.8H, v24.8H // ..........................................*................ - // sub v14.8H, v22.8H, v24.8H // ......................................*.................... - // sqrdmulh v25.8H, v5.8H, v20.8H // ..............................................*............ - // mul v10.8H, v5.8H, v18.8H // .............................................*............. - // add v22.8H, v16.8H, v26.8H // ...................................................*....... - // sqrdmulh v4.8H, v14.8H, v17.8H // ...........................................*............... - // mul v5.8H, v14.8H, v0.8H // .........................................*................. - // mls v10.8H, v25.8H, v7.H[0] // ..................................................*........ - // sub v13.8H, v16.8H, v26.8H // .................................................*......... - // mls v5.8H, v4.8H, v7.H[0] // ................................................*.......... - // sub v25.8H, v22.8H, v10.8H // .......................................................*... - // add v24.8H, v22.8H, v10.8H // ......................................................*.... - // sub v27.8H, v13.8H, v5.8H // .....................................................*..... - // add v26.8H, v13.8H, v5.8H // ....................................................*...... - // st4 {v24.4S, v25.4S, v26.4S, v27.4S}, [x0], #64 // ..........................................................* + // |------------------------|------------------------|---- + mul v18.8H, v28.8H, v14.H[4] // *...................................................... 
+ sqrdmulh v15.8H, v28.8H, v14.H[5] // .*..................................................... + mul v5.8H, v13.8H, v14.H[0] // ..*.................................................... + ldr q9, [x0, #0] // ...*................................................... + mls v18.8H, v15.8H, v7.H[0] // .....*................................................. + mls v5.8H, v11.8H, v7.H[0] // ......*................................................ + mul v10.8H, v12.8H, v14.H[2] // .......*............................................... + sqrdmulh v29.8H, v12.8H, v14.H[3] // ........*.............................................. + ldr q2, [x2, #32] // .........*............................................. + sub v4.8H, v9.8H, v5.8H // ...........*........................................... + mls v10.8H, v29.8H, v7.H[0] // ............*.......................................... + add v29.8H, v9.8H, v5.8H // .............*......................................... + sub v21.8H, v4.8H, v18.8H // ..............*........................................ + add v9.8H, v4.8H, v18.8H // ...............*....................................... + add v18.8H, v29.8H, v10.8H // ................*...................................... + sub v0.8H, v29.8H, v10.8H // .................*..................................... + trn2 v20.4S, v9.4S, v21.4S // ..................*.................................... + trn1 v31.4S, v9.4S, v21.4S // ...................*................................... + trn2 v9.4S, v18.4S, v0.4S // ....................*.................................. + ldr q10, [x2], #(6*16) // .....................*................................. + trn2 v29.2D, v9.2D, v20.2D // .......................*............................... + trn1 v20.2D, v9.2D, v20.2D // ........................*.............................. + mul v4.8H, v29.8H, v10.8H // .........................*............................. 
+ sqrdmulh v5.8H, v29.8H, v16.8H // ..........................*............................ + ldr q15, [x2, #-32] // ...........................*........................... + trn1 v30.4S, v18.4S, v0.4S // .............................*......................... + mls v4.8H, v5.8H, v7.H[0] // ..............................*........................ + ldr q9, [x2, #-48] // ...............................*....................... + trn2 v3.2D, v30.2D, v31.2D // .................................*..................... + add v29.8H, v20.8H, v4.8H // ..................................*.................... + sub v5.8H, v20.8H, v4.8H // ...................................*................... + mul v18.8H, v3.8H, v10.8H // ....................................*.................. + sqrdmulh v26.8H, v3.8H, v16.8H // .....................................*................. + sqrdmulh v6.8H, v5.8H, v24.8H // ......................................*................ + mul v4.8H, v29.8H, v2.8H // .......................................*............... + mul v0.8H, v5.8H, v15.8H // ........................................*.............. + mls v18.8H, v26.8H, v7.H[0] // .........................................*............. + sqrdmulh v9.8H, v29.8H, v9.8H // ..........................................*............ + trn1 v31.2D, v30.2D, v31.2D // ...........................................*........... + mls v0.8H, v6.8H, v7.H[0] // ............................................*.......... + sub v5.8H, v31.8H, v18.8H // .............................................*......... + mls v4.8H, v9.8H, v7.H[0] // ..............................................*........ + add v20.8H, v31.8H, v18.8H // ...............................................*....... + sub v29.8H, v5.8H, v0.8H // ................................................*...... + add v28.8H, v5.8H, v0.8H // .................................................*..... 
+ sub v27.8H, v20.8H, v4.8H // ..................................................*.... + add v26.8H, v20.8H, v4.8H // ...................................................*... + st4 {v26.4S, v27.4S, v28.4S, v29.4S}, [x0], #64 // ......................................................* + + // ------------------ cycle (expected) ------------------> + // 0 25 50 + // |------------------------|------------------------|---- + // ldr q0, [x2], #(6*16) // .....................*................................. + // ldr q17, [x0, #0] // ...*................................................... + // ldr q27, [x2, #-64] // .........*............................................. + // sqrdmulh v26.8H, v12.8H, v14.H[3] // ........*.............................................. + // sqrdmulh v2.8H, v28.8H, v14.H[5] // .*..................................................... + // mul v30.8H, v28.8H, v14.H[4] // *...................................................... + // mul v22.8H, v13.8H, v14.H[0] // ..*.................................................... + // mls v30.8H, v2.8H, v7.H[0] // .....*................................................. + // mls v22.8H, v11.8H, v7.H[0] // ......*................................................ + // mul v3.8H, v12.8H, v14.H[2] // .......*............................................... + // sub v2.8H, v17.8H, v22.8H // ...........*........................................... + // mls v3.8H, v26.8H, v7.H[0] // ............*.......................................... + // add v22.8H, v17.8H, v22.8H // .............*......................................... + // add v4.8H, v2.8H, v30.8H // ...............*....................................... + // sub v6.8H, v2.8H, v30.8H // ..............*........................................ + // add v26.8H, v22.8H, v3.8H // ................*...................................... + // sub v22.8H, v22.8H, v3.8H // .................*..................................... 
+ // trn2 v29.4S, v4.4S, v6.4S // ..................*.................................... + // trn2 v1.4S, v26.4S, v22.4S // ....................*.................................. + // trn1 v18.4S, v26.4S, v22.4S // .............................*......................... + // trn2 v17.2D, v1.2D, v29.2D // .......................*............................... + // trn1 v2.2D, v1.2D, v29.2D // ........................*.............................. + // trn1 v29.4S, v4.4S, v6.4S // ...................*................................... + // sqrdmulh v4.8H, v17.8H, v16.8H // ..........................*............................ + // trn2 v30.2D, v18.2D, v29.2D // .................................*..................... + // sqrdmulh v23.8H, v30.8H, v16.8H // .....................................*................. + // mul v16.8H, v17.8H, v0.8H // .........................*............................. + // mul v1.8H, v30.8H, v0.8H // ....................................*.................. + // trn1 v30.2D, v18.2D, v29.2D // ...........................................*........... + // mls v16.8H, v4.8H, v7.H[0] // ..............................*........................ + // mls v1.8H, v23.8H, v7.H[0] // .........................................*............. + // ldr q8, [x2, #-32] // ...........................*........................... + // sub v22.8H, v2.8H, v16.8H // ...................................*................... + // ldr q9, [x2, #-48] // ...............................*....................... + // add v25.8H, v30.8H, v1.8H // ...............................................*....... + // add v5.8H, v2.8H, v16.8H // ..................................*.................... + // sqrdmulh v16.8H, v22.8H, v24.8H // ......................................*................ + // mul v2.8H, v22.8H, v8.8H // ........................................*.............. + // mul v26.8H, v5.8H, v27.8H // .......................................*............... 
+ // sub v29.8H, v30.8H, v1.8H // .............................................*......... + // sqrdmulh v30.8H, v5.8H, v9.8H // ..........................................*............ + // mls v2.8H, v16.8H, v7.H[0] // ............................................*.......... + // mls v26.8H, v30.8H, v7.H[0] // ..............................................*........ + // sub v3.8H, v29.8H, v2.8H // ................................................*...... + // add v2.8H, v29.8H, v2.8H // .................................................*..... + // add v0.8H, v25.8H, v26.8H // ...................................................*... + // sub v1.8H, v25.8H, v26.8H // ..................................................*.... + // st4 {v0.4S, v1.4S, v2.4S, v3.4S}, [x0], #64 // ......................................................* pop_stack diff --git a/mlkem/native/aarch64/profiles/clean.h b/mlkem/native/aarch64/profiles/clean.h index e76247fa7..ff86a258d 100644 --- a/mlkem/native/aarch64/profiles/clean.h +++ b/mlkem/native/aarch64/profiles/clean.h @@ -22,10 +22,16 @@ #define MLKEM_USE_NATIVE_REJ_UNIFORM #define NTT_BOUND_NATIVE (6 * MLKEM_Q) -static inline void ntt_native(poly *data) { ntt_asm_clean(data->coeffs); } +static inline void ntt_native(poly *data) { + ntt_asm_clean(data->coeffs, aarch64_ntt_zetas_layer01234, + aarch64_ntt_zetas_layer56); +} #define INVNTT_BOUND_NATIVE (8 * MLKEM_Q) -static inline void intt_native(poly *data) { intt_asm_clean(data->coeffs); } +static inline void intt_native(poly *data) { + intt_asm_clean(data->coeffs, aarch64_invntt_zetas_layer01234, + aarch64_invntt_zetas_layer56); +} static inline void poly_reduce_native(poly *data) { poly_reduce_asm_clean(data->coeffs); diff --git a/mlkem/native/aarch64/profiles/opt.h b/mlkem/native/aarch64/profiles/opt.h index 6b637de09..3d20e234c 100644 --- a/mlkem/native/aarch64/profiles/opt.h +++ b/mlkem/native/aarch64/profiles/opt.h @@ -22,10 +22,16 @@ #define MLKEM_USE_NATIVE_REJ_UNIFORM #define 
NTT_BOUND_NATIVE (6 * MLKEM_Q) -static inline void ntt_native(poly *data) { ntt_asm_opt(data->coeffs); } +static inline void ntt_native(poly *data) { + ntt_asm_opt(data->coeffs, aarch64_ntt_zetas_layer01234, + aarch64_ntt_zetas_layer56); +} #define INVNTT_BOUND_NATIVE (8 * MLKEM_Q) -static inline void intt_native(poly *data) { intt_asm_opt(data->coeffs); } +static inline void intt_native(poly *data) { + intt_asm_opt(data->coeffs, aarch64_invntt_zetas_layer01234, + aarch64_invntt_zetas_layer56); +} static inline void poly_reduce_native(poly *data) { poly_reduce_asm_opt(data->coeffs); diff --git a/scripts/autogenerate_files.py b/scripts/autogenerate_files.py index 6d701d89d..da174a596 100644 --- a/scripts/autogenerate_files.py +++ b/scripts/autogenerate_files.py @@ -6,18 +6,15 @@ import argparse import os +modulus = 3329 +root_of_unity = 17 +montgomery_factor = pow(2, 16, modulus) + # This file re-generated auto-generated source files in mlkem-native. # # It currently covers: # - zeta values for the reference NTT and invNTT -def bitreverse(i,n): - r = 0 - for _ in range(n): - r = 2*r + (i & 1) - i >>= 1 - return r - def gen_header(): yield "// Copyright (c) 2024 The mlkem-native project authors" yield "// SPDX-License-Identifier: Apache-2.0" @@ -26,45 +23,216 @@ def gen_header(): yield "// Do not modify it directly." yield "" +def update_file(filename, content, dry_run=False): + + # Format content + p = subprocess.run(["clang-format"], capture_output=True, input=content, text=True) + if p.returncode != 0: + print(f"Failed to auto-format autogenerated code (clang-format return code {p.returncode}") + exit(1) + content = p.stdout + + if dry_run is False: + with open(filename, "w+") as f: + f.write(content) + else: + if os.path.exists(filename) is False: + print(f"Autogenerated file {filename} does not exist") + exit(1) + with open(filename, "r") as f: + current_content = f.read() + if current_content != content: + print(f"Autogenerated file {filename} needs updating. 
Have you called scripts/autogenerated.py?") + exit(1) + +def bitreverse(i,n): + r = 0 + for _ in range(n): + r = 2*r + (i & 1) + i >>= 1 + return r + +def signed_reduce(a): + """Return signed canonical representative of a mod modulus""" + c = a % modulus + if c >= modulus / 2: + c -= modulus + return c -def gen_zetas(): +def gen_c_zetas(): """Generate source and header file for zeta values used in the reference NTT and invNTT""" # The zeta values are the powers of the chosen root of unity (17), # converted to Montgomery form. - modulus = 3329 - root_of_unity = 17 - montgomery_factor = pow(2, 16, modulus) - - def signed_reduce(a): - """Return signed canonical representative of a mod b""" - c = a % modulus - if c >= modulus / 2: - c -= modulus - return c - zeta = [] for i in range(128): zeta.append(signed_reduce(pow(root_of_unity, i, modulus) * montgomery_factor)) # The source code stores the zeta table in bit reversed form - zeta_bitrev = [ zeta[bitreverse(i,7)] for i in range(128) ] - - yield "// Table of zeta values used in the reference NTT and inverse NTT." - yield "// See autogenerate_files.py for details." - yield "const int16_t zetas[128] = {" - for i in range(0,128): - yield str(zeta_bitrev[i]) + "," - yield "};" - -def gen_zeta_file(): - yield from gen_header() - yield "#include \"ntt.h\"" - yield "" - yield from gen_zetas() - yield "" + yield from (zeta[bitreverse(i,7)] for i in range(128)) + +def gen_c_zeta_file(dry_run=False): + def gen(): + yield from gen_header() + yield "#include \"ntt.h\"" + yield "" + yield "// Table of zeta values used in the reference NTT and inverse NTT." + yield "// See autogenerate_files.py for details."
+ yield "const int16_t zetas[128] = {" + yield from map(lambda t: str(t) + ",", gen_c_zetas()) + yield "};" + yield "" + update_file("mlkem/zetas.c", '\n'.join(gen()), dry_run=dry_run) + +def prepare_root_for_barrett(root): + """Takes a constant that the code needs to Barrett-multiply with, + and returns the pair of (a) its signed canonical form, (b) the + twisted constant used in the high-mul part of the Barrett multiplication.""" + + # Signed canonical reduction + root = signed_reduce(root) + + def round_to_even(t): + rt = round(t) + if rt % 2 == 0: + return rt + # Make sure to pick a rounding target + # that's <= 1 away from t in absolute value. + if rt <= t: + return rt + 1 + return rt - 1 + + root_twisted = round_to_even((root * 2**16) / modulus) // 2 + return root, root_twisted + +def gen_aarch64_root_of_unity_for_block(layer, block, inv=False): + # We are computing a negacyclic NTT; the twiddles needed here are + # the second half of the twiddles for a cyclic NTT of twice the size.
+ log = bitreverse(pow(2,layer) + block, 7) + if inv is True: + log = -log + root, root_twisted = prepare_root_for_barrett(pow(root_of_unity, log, modulus)) + return root, root_twisted + +def gen_aarch64_fwd_ntt_zetas_layer01234(): + # Layers 0,1,2 are merged + yield from gen_aarch64_root_of_unity_for_block(0,0) + yield from gen_aarch64_root_of_unity_for_block(1,0) + yield from gen_aarch64_root_of_unity_for_block(1,1) + yield from gen_aarch64_root_of_unity_for_block(2,0) + yield from gen_aarch64_root_of_unity_for_block(2,1) + yield from gen_aarch64_root_of_unity_for_block(2,2) + yield from gen_aarch64_root_of_unity_for_block(2,3) + yield from (0,0) # Padding + + # Layers 3,4,5,6 are merged, but we emit roots for 3,4 + # in separate arrays from those for 5,6 + for block in range(8): # There are 8 blocks in Layer 3 + yield from gen_aarch64_root_of_unity_for_block(3,block) + yield from gen_aarch64_root_of_unity_for_block(4,2*block+0) + yield from gen_aarch64_root_of_unity_for_block(4,2*block+1) + yield from (0,0) # Padding + +def gen_aarch64_fwd_ntt_zetas_layer56(): + # Layers 3,4,5,6 are merged, but we emit roots for 3,4 + # in separate arrays from those for 5,6 + for block in range(8): + def double_ith(t, i): + yield from (t[i], t[i]) + # Ordering of blocks is adjusted to suit the transposed internal + # representation of the data + for i in range(2): + yield from double_ith(gen_aarch64_root_of_unity_for_block(5,4*block+0), i) + yield from double_ith(gen_aarch64_root_of_unity_for_block(5,4*block+1), i) + yield from double_ith(gen_aarch64_root_of_unity_for_block(5,4*block+2), i) + yield from double_ith(gen_aarch64_root_of_unity_for_block(5,4*block+3), i) + for i in range(2): + yield from double_ith(gen_aarch64_root_of_unity_for_block(6,8*block+0), i) + yield from double_ith(gen_aarch64_root_of_unity_for_block(6,8*block+2), i) + yield from double_ith(gen_aarch64_root_of_unity_for_block(6,8*block+4), i) + yield from
double_ith(gen_aarch64_root_of_unity_for_block(6,8*block+6), i) + for i in range(2): + yield from double_ith(gen_aarch64_root_of_unity_for_block(6,8*block+1), i) + yield from double_ith(gen_aarch64_root_of_unity_for_block(6,8*block+3), i) + yield from double_ith(gen_aarch64_root_of_unity_for_block(6,8*block+5), i) + yield from double_ith(gen_aarch64_root_of_unity_for_block(6,8*block+7), i) + +def gen_aarch64_inv_ntt_zetas_layer01234(): + # Layers 3,4,5,6 are merged, but we emit roots for 3,4 + # in separate arrays from those for 5,6 + for block in range(8): # There are 8 blocks in Layer 3 + yield from gen_aarch64_root_of_unity_for_block(3,block,inv=True) + yield from gen_aarch64_root_of_unity_for_block(4,2*block+0,inv=True) + yield from gen_aarch64_root_of_unity_for_block(4,2*block+1,inv=True) + yield from (0,0) # Padding + + # Layers 0,1,2 are merged + yield from gen_aarch64_root_of_unity_for_block(0,0,inv=True) + yield from gen_aarch64_root_of_unity_for_block(1,0,inv=True) + yield from gen_aarch64_root_of_unity_for_block(1,1,inv=True) + yield from gen_aarch64_root_of_unity_for_block(2,0,inv=True) + yield from gen_aarch64_root_of_unity_for_block(2,1,inv=True) + yield from gen_aarch64_root_of_unity_for_block(2,2,inv=True) + yield from gen_aarch64_root_of_unity_for_block(2,3,inv=True) + yield from (0,0) # Padding + +def gen_aarch64_inv_ntt_zetas_layer56(): + # Layers 3,4,5,6 are merged, but we emit roots for 3,4 + # in separate arrays from those for 5,6 + for block in range(8): + def double_ith(t, i): + yield from (t[i], t[i]) + # Ordering of blocks is adjusted to suit the transposed internal + # representation of the data + for i in range(2): + yield from double_ith(gen_aarch64_root_of_unity_for_block(5,4*block+0, inv=True), i) + yield from double_ith(gen_aarch64_root_of_unity_for_block(5,4*block+1, inv=True), i) + yield from double_ith(gen_aarch64_root_of_unity_for_block(5,4*block+2, inv=True), i) + yield from
double_ith(gen_aarch64_root_of_unity_for_block(5,4*block+3, inv=True), i) + for i in range(2): + yield from double_ith(gen_aarch64_root_of_unity_for_block(6,8*block+0, inv=True), i) + yield from double_ith(gen_aarch64_root_of_unity_for_block(6,8*block+2, inv=True), i) + yield from double_ith(gen_aarch64_root_of_unity_for_block(6,8*block+4, inv=True), i) + yield from double_ith(gen_aarch64_root_of_unity_for_block(6,8*block+6, inv=True), i) + for i in range(2): + yield from double_ith(gen_aarch64_root_of_unity_for_block(6,8*block+1, inv=True), i) + yield from double_ith(gen_aarch64_root_of_unity_for_block(6,8*block+3, inv=True), i) + yield from double_ith(gen_aarch64_root_of_unity_for_block(6,8*block+5, inv=True), i) + yield from double_ith(gen_aarch64_root_of_unity_for_block(6,8*block+7, inv=True), i) + +def gen_aarch64_fwd_ntt_zeta_file(dry_run=False): + def gen(): + yield from gen_header() + yield "#include \"arith_native_aarch64.h\"" + yield "" + yield "#ifdef MLKEM_USE_NATIVE_AARCH64" + yield "" + yield "// Table of zeta values used in the AArch64 forward NTT" + yield "// See autogenerate_files.py for details." 
+ yield "const int16_t aarch64_ntt_zetas_layer01234[] = {" + yield from map(lambda t: str(t) + ",", gen_aarch64_fwd_ntt_zetas_layer01234()) + yield "};" + yield "" + yield "const int16_t aarch64_ntt_zetas_layer56[] = {" + yield from map(lambda t: str(t) + ",", gen_aarch64_fwd_ntt_zetas_layer56()) + yield "};" + yield "" + yield "const int16_t aarch64_invntt_zetas_layer01234[] = {" + yield from map(lambda t: str(t) + ",", gen_aarch64_inv_ntt_zetas_layer01234()) + yield "};" + yield "" + yield "const int16_t aarch64_invntt_zetas_layer56[] = {" + yield from map(lambda t: str(t) + ",", gen_aarch64_inv_ntt_zetas_layer56()) + yield "};" + yield "" + yield "#else /* MLKEM_USE_NATIVE_AARCH64 */" + yield "// Dummy declaration for compilers disliking empty compilation units" + yield "int empty_cu_aarch64_zetas;" + yield "#endif /* MLKEM_USE_NATIVE_AARCH64 */" + yield "" + update_file("mlkem/native/aarch64/aarch64_zetas.c", '\n'.join(gen()), dry_run=dry_run) def _main(): parser = argparse.ArgumentParser( @@ -72,33 +240,8 @@ def _main(): parser.add_argument("--dry-run", default=False, action='store_true') args = parser.parse_args() - - base = "mlkem" - zeta_source = f"{base}/zetas.c" - - file_content = '\n'.join(gen_zeta_file()) - p = subprocess.run(["clang-format"], capture_output=True, input=file_content, text=True) - if p.returncode != 0: - print(f"Failed to auto-format autogenerated code (clang-format return code {p.returncode}") - exit(1) - file_content = p.stdout - - if args.dry_run is False: - with open(zeta_source, "w+") as f: - f.write('\n'.join(gen_zeta_file())) - else: - if os.path.exists(zeta_source) is False: - print(f"Autogenerated file {zeta_source} does not exist") - exit(1) - with open(zeta_source, "r") as f: - current_content = f.read() - if current_content != file_content: - print(f"Autogenerated file {zeta_source} needs updating. 
Have you called scripts/autogenerated.py?") - exit(1) - - # Auto-format using clang-format, so we don't need to - # bother about manual formatting above - subprocess.run(["clang-format", "-i", zeta_source]) + gen_c_zeta_file(args.dry_run) + gen_aarch64_fwd_ntt_zeta_file(args.dry_run) if __name__ == "__main__": _main()