From 0c296a019ab187016d5a08ee914c571c2f518bd7 Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Fri, 26 Jul 2024 17:12:28 +0800 Subject: [PATCH] ntt-neon test --- Makefile | 1 + tests/ntt-dilithium/ntt-dilithium.mk | 3 - tests/{ntt_neon => ntt-neon}/main.c | 1 + tests/ntt-neon/ntt-neon.mk | 96 + tests/{ntt_neon => ntt-neon}/ntt.c | 0 tests/{ntt_neon => ntt-neon}/ntt.h | 0 ...t_u32_full_33556993_28678040_var_4_4_0_0.s | 2422 ---------------- ..._u32_full_33556993_28678040_var_4_4_10_0.s | 2486 ---------------- ..._u32_full_33556993_28678040_var_4_4_11_0.s | 2422 ---------------- ..._u32_full_33556993_28678040_var_4_4_12_0.s | 2422 ---------------- ..._u32_full_33556993_28678040_var_4_4_13_0.s | 2422 ---------------- ..._u32_full_33556993_28678040_var_4_4_14_0.s | 2506 ----------------- ..._u32_full_33556993_28678040_var_4_4_15_0.s | 2506 ----------------- ..._u32_full_33556993_28678040_var_4_4_16_0.s | 2506 ----------------- ..._u32_full_33556993_28678040_var_4_4_17_0.s | 2486 ---------------- ..._u32_full_33556993_28678040_var_4_4_18_0.s | 2486 ---------------- ...t_u32_full_33556993_28678040_var_4_4_1_0.s | 2422 ---------------- ...t_u32_full_33556993_28678040_var_4_4_2_0.s | 2422 ---------------- ...t_u32_full_33556993_28678040_var_4_4_3_0.s | 2422 ---------------- ...32_full_33556993_28678040_var_4_4_3_z2_0.s | 2422 ---------------- ...32_full_33556993_28678040_var_4_4_3_z2_1.s | 2422 ---------------- ...32_full_33556993_28678040_var_4_4_3_z2_2.s | 2422 ---------------- ...32_full_33556993_28678040_var_4_4_3_z2_3.s | 2422 ---------------- ...32_full_33556993_28678040_var_4_4_3_z2_4.s | 2422 ---------------- ...32_full_33556993_28678040_var_4_4_3_z2_5.s | 2422 ---------------- ...32_full_33556993_28678040_var_4_4_3_z4_0.s | 2422 ---------------- ...32_full_33556993_28678040_var_4_4_3_z4_1.s | 2422 ---------------- ...32_full_33556993_28678040_var_4_4_3_z4_2.s | 2422 ---------------- ...32_full_33556993_28678040_var_4_4_3_z4_3.s | 2422 ---------------- ...32_full_33556993_28678040_var_4_4_3_z4_4.s | 2422 ---------------- ...t_u32_full_33556993_28678040_var_4_4_4_0.s | 2422 ---------------- ...t_u32_full_33556993_28678040_var_4_4_5_0.s | 2422 ---------------- ...t_u32_full_33556993_28678040_var_4_4_6_0.s | 2422 ---------------- ...t_u32_full_33556993_28678040_var_4_4_7_0.s | 2422 ---------------- ...t_u32_full_33556993_28678040_var_4_4_8_0.s | 2422 ---------------- ...t_u32_full_33556993_28678040_var_4_4_9_0.s | 2422 ---------------- ...2_incomplete_33556993_28678040_var_3_3_0.s | 1474 ---------- ...2_incomplete_33556993_28678040_var_3_3_1.s | 1474 ---------- ...2_incomplete_33556993_28678040_var_3_3_2.s | 1474 ---------- ...2_incomplete_33556993_28678040_var_3_3_3.s | 1474 ---------- ...2_incomplete_33556993_28678040_var_3_3_4.s | 1474 ---------- ...2_incomplete_33556993_28678040_var_3_3_5.s | 1474 ---------- ...incomplete_33556993_28678040_var_4_2_0_0.s | 1494 ---------- ...omplete_33556993_28678040_var_4_2_0_z4_0.s | 1494 ---------- ...mplete_33556993_28678040_var_4_2_0_z4_16.s | 1494 ---------- ...mplete_33556993_28678040_var_4_2_10_z4_7.s | 1558 ---------- ...mplete_33556993_28678040_var_4_2_11_z4_7.s | 1494 ---------- ...mplete_33556993_28678040_var_4_2_12_z4_7.s | 1494 ---------- ...mplete_33556993_28678040_var_4_2_13_z4_7.s | 1494 ---------- ...mplete_33556993_28678040_var_4_2_14_z4_7.s | 1578 ----------- ...mplete_33556993_28678040_var_4_2_15_z4_7.s | 1578 ----------- ...mplete_33556993_28678040_var_4_2_16_z4_7.s | 1578 ----------- ...mplete_33556993_28678040_var_4_2_17_z4_7.s | 1558 ---------- ...mplete_33556993_28678040_var_4_2_18_z4_7.s | 1558 ---------- ...mplete_33556993_28678040_var_4_2_19_z4_7.s | 1558 ---------- ...mplete_33556993_28678040_var_4_2_20_z4_7.s | 1558 ---------- ...mplete_33556993_28678040_var_4_2_21_z4_7.s | 1558 ---------- ...plete_33556993_28678040_var_4_2_22_z4_10.s | 1550 ---------- ...plete_33556993_28678040_var_4_2_22_z4_11.s | 1550 ---------- ...plete_33556993_28678040_var_4_2_22_z4_12.s | 1550 ---------- ...plete_33556993_28678040_var_4_2_22_z4_13.s | 1550 ---------- ...plete_33556993_28678040_var_4_2_22_z4_14.s | 1550 ---------- ...plete_33556993_28678040_var_4_2_22_z4_15.s | 1550 ---------- ...mplete_33556993_28678040_var_4_2_22_z4_7.s | 1550 ---------- ...mplete_33556993_28678040_var_4_2_22_z4_8.s | 1550 ---------- ...mplete_33556993_28678040_var_4_2_22_z4_9.s | 1558 ---------- ...mplete_33556993_28678040_var_4_2_24_z4_0.s | 1494 ---------- ...plete_33556993_28678040_var_4_2_24_z4_16.s | 1494 ---------- ...omplete_33556993_28678040_var_4_2_3_z4_0.s | 1494 ---------- ...omplete_33556993_28678040_var_4_2_3_z4_1.s | 1494 ---------- ...omplete_33556993_28678040_var_4_2_3_z4_2.s | 1494 ---------- ...omplete_33556993_28678040_var_4_2_3_z4_3.s | 1494 ---------- ...omplete_33556993_28678040_var_4_2_3_z4_4.s | 1494 ---------- ...omplete_33556993_28678040_var_4_2_3_z4_5.s | 1494 ---------- ...omplete_33556993_28678040_var_4_2_7_z4_0.s | 1494 ---------- ...omplete_33556993_28678040_var_4_2_7_z4_1.s | 1494 ---------- ...mplete_33556993_28678040_var_4_2_7_z4_10.s | 1494 ---------- ...omplete_33556993_28678040_var_4_2_7_z4_2.s | 1494 ---------- ...omplete_33556993_28678040_var_4_2_7_z4_3.s | 1494 ---------- ...omplete_33556993_28678040_var_4_2_7_z4_4.s | 1494 ---------- ...omplete_33556993_28678040_var_4_2_7_z4_5.s | 1494 ---------- ...omplete_33556993_28678040_var_4_2_7_z4_6.s | 1494 ---------- ...omplete_33556993_28678040_var_4_2_7_z4_7.s | 1494 ---------- ...omplete_33556993_28678040_var_4_2_7_z4_8.s | 1494 ---------- ...omplete_33556993_28678040_var_4_2_7_z4_9.s | 1502 ---------- ...omplete_33556993_28678040_var_4_2_8_z4_7.s | 1494 ---------- ...omplete_33556993_28678040_var_4_2_9_z4_7.s | 1494 ---------- 87 files changed, 98 insertions(+), 150337 deletions(-) rename tests/{ntt_neon => ntt-neon}/main.c (99%) create mode 100644 tests/ntt-neon/ntt-neon.mk rename tests/{ntt_neon => ntt-neon}/ntt.c (100%) rename tests/{ntt_neon => ntt-neon}/ntt.h (100%) delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_0_0.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_10_0.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_11_0.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_12_0.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_13_0.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_14_0.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_15_0.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_16_0.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_17_0.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_18_0.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_1_0.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_2_0.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_0.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_0.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_1.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_2.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_3.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_4.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_5.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_0.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_1.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_2.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_3.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_4.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_4_0.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_5_0.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_6_0.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_7_0.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_8_0.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_9_0.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_0.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_1.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_2.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_3.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_4.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_5.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_0_0.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_0.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_16.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_10_z4_7.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_11_z4_7.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_12_z4_7.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_13_z4_7.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_14_z4_7.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_15_z4_7.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_16_z4_7.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_17_z4_7.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_18_z4_7.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_19_z4_7.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_20_z4_7.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_21_z4_7.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_10.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_11.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_12.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_13.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_14.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_15.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_7.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_8.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_9.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_0.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_16.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_0.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_1.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_2.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_3.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_4.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_5.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_0.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_1.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_10.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_2.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_3.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_4.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_5.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_6.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_7.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_8.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_9.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_8_z4_7.s delete mode 100644 tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_9_z4_7.s diff --git a/Makefile b/Makefile index 842ab4b..a5988f2 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,7 @@ include tests/helloworld/helloworld.mk include tests/keccak-neon/keccak-neon.mk include tests/ntt-dilithium/ntt-dilithium.mk include tests/ntt-kyber/ntt-kyber.mk +include tests/ntt-neon/ntt-neon.mk testname = $(shell echo $(1) | tr '[a-z]' '[A-Z]' | tr '-' '_' | tr '/' '_') testdir = $(addprefix $(2),tests/$(firstword $(subst /, ,$1))/) diff --git a/tests/ntt-dilithium/ntt-dilithium.mk b/tests/ntt-dilithium/ntt-dilithium.mk index b43bc67..fde34f2 100644 --- a/tests/ntt-dilithium/ntt-dilithium.mk +++ b/tests/ntt-dilithium/ntt-dilithium.mk @@ -40,16 +40,13 @@ NTT_DILITHIUM_ASMS += $(NTT_DILITHIUM_ASM_DIR)/ntt_dilithium_123_45678_manual_st NTT_DILITHIUM_ASMS += $(NTT_DILITHIUM_ASM_DIR)/ntt_dilithium_123_45678_manual_st4_opt_a72.s NTT_DILITHIUM_ASMS += $(NTT_DILITHIUM_ASM_DIR)/ntt_dilithium_123_45678_manual_st4_opt_m1_firestorm.s NTT_DILITHIUM_ASMS += $(NTT_DILITHIUM_ASM_DIR)/ntt_dilithium_123_45678_manual_st4_opt_m1_icestorm.s -#NTT_DILITHIUM_ASMS += $(NTT_DILITHIUM_ASM_DIR)/ntt_dilithium_123_45678_manual_st4_opt_m1.s NTT_DILITHIUM_ASMS += $(NTT_DILITHIUM_ASM_DIR)/ntt_dilithium_123_45678_manual_st4.s NTT_DILITHIUM_ASMS += $(NTT_DILITHIUM_ASM_DIR)/ntt_dilithium_123_45678_opt_a55.s NTT_DILITHIUM_ASMS += $(NTT_DILITHIUM_ASM_DIR)/ntt_dilithium_123_45678_opt_a72.s NTT_DILITHIUM_ASMS += $(NTT_DILITHIUM_ASM_DIR)/ntt_dilithium_123_45678_opt_m1_firestorm.s NTT_DILITHIUM_ASMS += $(NTT_DILITHIUM_ASM_DIR)/ntt_dilithium_123_45678_opt_m1_icestorm.s -#NTT_DILITHIUM_ASMS += $(NTT_DILITHIUM_ASM_DIR)/ntt_dilithium_123_45678_opt_m1.s NTT_DILITHIUM_ASMS += $(NTT_DILITHIUM_ASM_DIR)/ntt_dilithium_123_45678_w_scalar_opt_a55.s NTT_DILITHIUM_ASMS += $(NTT_DILITHIUM_ASM_DIR)/ntt_dilithium_123_45678_w_scalar_opt_m1_icestorm.s -#NTT_DILITHIUM_ASMS += $(NTT_DILITHIUM_ASM_DIR)/ntt_dilithium_123_45678_w_scalar_opt_m1.s NTT_DILITHIUM_ASMS += $(NTT_DILITHIUM_ASM_DIR)/ntt_dilithium_123_45678_w_scalar.s NTT_DILITHIUM_ASMS += $(NTT_DILITHIUM_ASM_DIR)/ntt_dilithium_123_45678.s NTT_DILITHIUM_ASMS += $(NTT_DILITHIUM_ASM_DIR)/ntt_dilithium_1234_5678_manual_st4_opt_m1_firestorm.s diff --git a/tests/ntt_neon/main.c b/tests/ntt-neon/main.c similarity index 99% rename from tests/ntt_neon/main.c rename to tests/ntt-neon/main.c index c9e277a..a4c15a5 100755 --- a/tests/ntt_neon/main.c +++ b/tests/ntt-neon/main.c @@ -29,6 +29,7 @@ #include + #include "ntt.h" int main(void) diff --git a/tests/ntt-neon/ntt-neon.mk b/tests/ntt-neon/ntt-neon.mk new file mode 100644 index 0000000..01cef79 --- /dev/null +++ b/tests/ntt-neon/ntt-neon.mk @@ -0,0 +1,96 @@ +# Test name - needs to match the directory name +TESTS += ntt-neon + +# All further variables must be prefixed with the capitalized test name + +# Platforms this test should run on (matching the directory name in envs/) +NTT_NEON_PLATFORMS += cross-v8a +NTT_NEON_PLATFORMS += cross-v84a + +# C sources required for this test +NTT_NEON_SOURCES += main.c +NTT_NEON_SOURCES += ntt.c + +# Assembly sources required for this test +NTT_NEON_ASMDIR = ../../asm/auto/ntt_neon +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_0_0.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_1_0.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_2_0.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_3_0.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_3_z2_0.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_3_z2_1.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_3_z2_2.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_3_z2_3.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_3_z2_4.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_3_z2_5.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_3_z4_0.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_3_z4_1.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_3_z4_2.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_3_z4_3.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_3_z4_4.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_4_0.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_5_0.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_6_0.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_7_0.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_8_0.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_9_0.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_10_0.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_11_0.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_12_0.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_13_0.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_14_0.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_15_0.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_16_0.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_17_0.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_full_33556993_28678040_var_4_4_18_0.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_3_3_0.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_3_3_1.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_3_3_2.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_3_3_3.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_3_3_4.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_3_3_5.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_0_0.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_0.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_16.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_0.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_1.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_2.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_3.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_4.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_5.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_0.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_1.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_2.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_3.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_4.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_5.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_6.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_7.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_8.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_9.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_10.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_8_z4_7.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_9_z4_7.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_10_z4_7.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_11_z4_7.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_12_z4_7.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_13_z4_7.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_14_z4_7.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_15_z4_7.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_16_z4_7.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_17_z4_7.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_18_z4_7.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_19_z4_7.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_20_z4_7.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_21_z4_7.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_7.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_8.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_9.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_10.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_11.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_12.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_13.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_14.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_15.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_0.s +NTT_NEON_ASMS += $(NTT_NEON_ASMDIR)/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_16.s \ No newline at end of file diff --git a/tests/ntt_neon/ntt.c b/tests/ntt-neon/ntt.c similarity index 100% rename from tests/ntt_neon/ntt.c rename to tests/ntt-neon/ntt.c diff --git a/tests/ntt_neon/ntt.h b/tests/ntt-neon/ntt.h similarity index 100% rename from tests/ntt_neon/ntt.h rename to tests/ntt-neon/ntt.h diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_0_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_0_0.s deleted file mode 100644 index 85f29ad..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_0_0.s +++ /dev/null @@ -1,2422 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_0_0 -.global _ntt_u32_full_neon_asm_var_4_4_0_0 -ntt_u32_full_neon_asm_var_4_4_0_0: -_ntt_u32_full_neon_asm_var_4_4_0_0: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #800] -ldr q21, [x0, #864] -ldr q20, [x0, #928] -ldr q19, [x0, #992] -ldr q18, [x0, #288] -ldr q17, [x0, #352] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -ldr q2, [x0, #544] -ldr q1, [x0, #608] -ldr q0, [x0, #672] -ldr q15, [x0, #736] -ldr q14, [x0, #32] -ldr q13, [x0, #96] -ldr q12, [x0, #160] -ldr q11, [x0, #224] -sqrdmulh v10.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -mla v22.4S, v10.4S, v31.s[0] -sub v10.4s, v18.4s, v22.4s -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -mla v20.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -mla v19.4S, v20.4S, v31.s[0] -sub v20.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -mla v2.4S, v19.4S, v31.s[0] -sub v19.4s, v14.4s, v2.4s -add v14.4s, v14.4s, v2.4s -sqrdmulh v2.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -mla v1.4S, v2.4S, v31.s[0] -sub v2.4s, v13.4s, v1.4s -add v13.4s, v13.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v29.s[0] -mul v0.4S, v0.4S,v30.s[0] -mla v0.4S, v1.4S, v31.s[0] -sub v1.4s, v12.4s, v0.4s -add v12.4s, v12.4s, v0.4s -sqrdmulh v0.4S, v15.4S, v29.s[0] -mul v15.4S, v15.4S,v30.s[0] -mla v15.4S, v0.4S, v31.s[0] -sub v0.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v12.4s, v16.4s -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -mla v3.4S, v16.4S, v31.s[0] -sub v16.4s, v11.4s, v3.4s -add v11.4s, v11.4s, v3.4s -sqrdmulh v3.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -mla v18.4S, v3.4S, v31.s[0] -sub v3.4s, v14.4s, v18.4s -add v14.4s, v14.4s, v18.4s -sqrdmulh v18.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -mla v17.4S, v18.4S, v31.s[0] -sub v18.4s, v13.4s, v17.4s -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -mla v21.4S, v17.4S, v31.s[0] -sub v17.4s, v1.4s, v21.4s -add v1.4s, v1.4s, v21.4s -sqrdmulh v21.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -mla v20.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v20.4s -add v0.4s, v0.4s, v20.4s -sqrdmulh v20.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -mla v10.4S, v20.4S, v31.s[0] -sub v20.4s, v19.4s, v10.4s -add v19.4s, v19.4s, v10.4s -sqrdmulh v10.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -mla v22.4S, v10.4S, v31.s[0] -sub v10.4s, v2.4s, v22.4s -add v2.4s, v2.4s, v22.4s -sqrdmulh v22.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -mla v12.4S, v22.4S, v31.s[0] -sub v22.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -sqrdmulh v12.4S, v11.4S, v27.s[0] -mul v11.4S, v11.4S,v28.s[0] -mla v11.4S, v12.4S, v31.s[0] -sub v12.4s, v13.4s, v11.4s -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -mla v15.4S, v11.4S, v31.s[0] -sub v11.4s, v3.4s, v15.4s -add v3.4s, v3.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v27.s[1] -mul v16.4S, v16.4S,v28.s[1] -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v18.4s, v16.4s -add v18.4s, v18.4s, v16.4s -sqrdmulh v16.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -mla v1.4S, v16.4S, v31.s[0] -sub v16.4s, v19.4s, v1.4s -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v27.s[2] -mul v0.4S, v0.4S,v28.s[2] -mla v0.4S, v1.4S, v31.s[0] -sub v1.4s, v2.4s, v0.4s -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v17.4S, v27.s[3] -mul v17.4S, v17.4S,v28.s[3] -mla v17.4S, v0.4S, v31.s[0] -sub v0.4s, v20.4s, v17.4s -add v20.4s, v20.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -mla v21.4S, v17.4S, v31.s[0] -sub v17.4s, v10.4s, v21.4s -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v13.4S, v25.s[0] -mul v13.4S, v13.4S,v26.s[0] -mla v13.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v13.4s -add v14.4s, v14.4s, v13.4s -sqrdmulh v13.4S, v12.4S, v25.s[1] -mul v12.4S, v12.4S,v26.s[1] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v18.4S, v25.s[2] -mul v18.4S, v18.4S,v26.s[2] -mla v18.4S, v12.4S, v31.s[0] -sub v12.4s, v3.4s, v18.4s -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v15.4S, v25.s[3] -mul v15.4S, v15.4S,v26.s[3] -mla v15.4S, v18.4S, v31.s[0] -sub v18.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v23.s[0] -mul v2.4S, v2.4S,v24.s[0] -mla v2.4S, v15.4S, v31.s[0] -sub v15.4s, v19.4s, v2.4s -add v19.4s, v19.4s, v2.4s -sqrdmulh v2.4S, v1.4S, v23.s[1] -mul v1.4S, v1.4S,v24.s[1] -mla v1.4S, v2.4S, v31.s[0] -sub v2.4s, v16.4s, v1.4s -add v16.4s, v16.4s, v1.4s -sqrdmulh v1.4S, v10.4S, v23.s[2] -mul v10.4S, v10.4S,v24.s[2] -mla v10.4S, v1.4S, v31.s[0] -sub v1.4s, v20.4s, v10.4s -add v20.4s, v20.4s, v10.4s -sqrdmulh v10.4S, v17.4S, v23.s[3] -mul v17.4S, v17.4S,v24.s[3] -mla v17.4S, v10.4S, v31.s[0] -sub v10.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -str q14, [x0, #32] -str q21, [x0, #96] -str q22, [x0, #160] -str q13, [x0, #224] -str q3, [x0, #288] -str q12, [x0, #352] -str q11, [x0, #416] -str q18, [x0, #480] -str q19, [x0, #544] -str q15, [x0, #608] -str q16, [x0, #672] -str q2, [x0, #736] -str q20, [x0, #800] -str q1, [x0, #864] -str q0, [x0, #928] -str q10, [x0, #992] -ldr q10, [x0, #816] -ldr q0, [x0, #880] -ldr q1, [x0, #944] -ldr q20, [x0, #1008] -ldr q2, [x0, #304] -ldr q16, [x0, #368] -ldr q15, [x0, #432] -ldr q19, [x0, #496] -ldr q18, [x0, #560] -ldr q11, [x0, #624] -ldr q12, [x0, #688] -ldr q3, [x0, #752] -ldr q13, [x0, #48] -ldr q22, [x0, #112] -ldr q21, [x0, #176] -ldr q14, [x0, #240] -sqrdmulh v17.4S, v10.4S, v29.s[0] -mul v10.4S, v10.4S,v30.s[0] -mla v10.4S, v17.4S, v31.s[0] -sub v17.4s, v2.4s, v10.4s -add v2.4s, v2.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v29.s[0] -mul v0.4S, v0.4S,v30.s[0] -mla v0.4S, v10.4S, v31.s[0] -sub v10.4s, v16.4s, v0.4s -add v16.4s, v16.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -mla v1.4S, v0.4S, v31.s[0] -sub v0.4s, v15.4s, v1.4s -add v15.4s, v15.4s, v1.4s -sqrdmulh v1.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -mla v20.4S, v1.4S, v31.s[0] -sub v1.4s, v19.4s, v20.4s -add v19.4s, v19.4s, v20.4s -sqrdmulh v20.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -mla v18.4S, v20.4S, v31.s[0] -sub v20.4s, v13.4s, v18.4s -add v13.4s, v13.4s, v18.4s -sqrdmulh v18.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v22.4s, v11.4s -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -mla v12.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -mla v3.4S, v12.4S, v31.s[0] -sub v12.4s, v14.4s, v3.4s -add v14.4s, v14.4s, v3.4s -sqrdmulh v3.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -mla v15.4S, v3.4S, v31.s[0] -sub v3.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -sqrdmulh v15.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -mla v19.4S, v15.4S, v31.s[0] -sub v15.4s, v14.4s, v19.4s -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v2.4S, v29.s[1] -mul v2.4S, v2.4S,v30.s[1] -mla v2.4S, v19.4S, v31.s[0] -sub v19.4s, v13.4s, v2.4s -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -mla v16.4S, v2.4S, v31.s[0] -sub v2.4s, v22.4s, v16.4s -add v22.4s, v22.4s, v16.4s -sqrdmulh v16.4S, v0.4S, v29.s[2] -mul v0.4S, v0.4S,v30.s[2] -mla v0.4S, v16.4S, v31.s[0] -sub v16.4s, v11.4s, v0.4s -add v11.4s, v11.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -mla v1.4S, v0.4S, v31.s[0] -sub v0.4s, v12.4s, v1.4s -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -mla v17.4S, v1.4S, v31.s[0] -sub v1.4s, v20.4s, v17.4s -add v20.4s, v20.4s, v17.4s -sqrdmulh v17.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -mla v10.4S, v17.4S, v31.s[0] -sub v17.4s, v18.4s, v10.4s -add v18.4s, v18.4s, v10.4s -sqrdmulh v10.4S, v21.4S, v27.s[0] -mul v21.4S, v21.4S,v28.s[0] -mla v21.4S, v10.4S, v31.s[0] -sub v10.4s, v13.4s, v21.4s -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -mla v14.4S, v21.4S, v31.s[0] -sub v21.4s, v22.4s, v14.4s -add v22.4s, v22.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -mla v3.4S, v14.4S, v31.s[0] -sub v14.4s, v19.4s, v3.4s -add v19.4s, v19.4s, v3.4s -sqrdmulh v3.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -mla v15.4S, v3.4S, v31.s[0] -sub v3.4s, v2.4s, v15.4s -add v2.4s, v2.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v27.s[2] -mul v11.4S, v11.4S,v28.s[2] -mla v11.4S, v15.4S, v31.s[0] -sub v15.4s, v20.4s, v11.4s -add v20.4s, v20.4s, v11.4s -sqrdmulh v11.4S, v12.4S, v27.s[2] -mul v12.4S, v12.4S,v28.s[2] -mla v12.4S, v11.4S, v31.s[0] -sub v11.4s, v18.4s, v12.4s -add v18.4s, v18.4s, v12.4s -sqrdmulh v12.4S, v16.4S, v27.s[3] -mul v16.4S, v16.4S,v28.s[3] -mla v16.4S, v12.4S, v31.s[0] -sub v12.4s, v1.4s, v16.4s -add v1.4s, v1.4s, v16.4s -sqrdmulh v16.4S, v0.4S, v27.s[3] -mul v0.4S, v0.4S,v28.s[3] -mla v0.4S, v16.4S, v31.s[0] -sub v16.4s, v17.4s, v0.4s -add v17.4s, v17.4s, v0.4s -sqrdmulh v0.4S, v22.4S, v25.s[0] -mul v22.4S, v22.4S,v26.s[0] -mla v22.4S, v0.4S, v31.s[0] -sub v0.4s, v13.4s, v22.4s -add v13.4s, v13.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v25.s[1] -mul v21.4S, v21.4S,v26.s[1] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v10.4s, v21.4s -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v2.4S, v25.s[2] -mul v2.4S, v2.4S,v26.s[2] -mla v2.4S, v21.4S, v31.s[0] -sub v21.4s, v19.4s, v2.4s -add v19.4s, v19.4s, v2.4s -sqrdmulh v2.4S, v3.4S, v25.s[3] -mul v3.4S, v3.4S,v26.s[3] -mla v3.4S, v2.4S, v31.s[0] -sub v2.4s, v14.4s, v3.4s -add v14.4s, v14.4s, v3.4s -sqrdmulh v3.4S, v18.4S, v23.s[0] -mul v18.4S, v18.4S,v24.s[0] -mla v18.4S, v3.4S, v31.s[0] -sub v3.4s, v20.4s, v18.4s -add v20.4s, v20.4s, v18.4s -sqrdmulh v18.4S, v11.4S, v23.s[1] -mul v11.4S, v11.4S,v24.s[1] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v15.4s, v11.4s -add v15.4s, v15.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v23.s[2] -mul v17.4S, v17.4S,v24.s[2] -mla v17.4S, v11.4S, v31.s[0] -sub v11.4s, v1.4s, v17.4s -add v1.4s, v1.4s, v17.4s -sqrdmulh v17.4S, v16.4S, v23.s[3] -mul v16.4S, v16.4S,v24.s[3] -mla v16.4S, v17.4S, v31.s[0] -sub v17.4s, v12.4s, v16.4s -add v12.4s, v12.4s, v16.4s -str q13, [x0, #48] -str q0, [x0, #112] -str q10, [x0, #176] -str q22, [x0, #240] -str q19, [x0, #304] -str q21, [x0, #368] -str q14, [x0, #432] -str q2, [x0, #496] -str q20, [x0, #560] -str q3, [x0, #624] -str q15, [x0, #688] -str q18, [x0, #752] -str q1, [x0, #816] -str q11, [x0, #880] -str q12, [x0, #944] -str q17, [x0, #1008] -ldr q17, [x0, #768] -ldr q12, [x0, #832] -ldr q11, [x0, #896] -ldr q1, [x0, #960] -ldr q18, [x0, #256] -ldr q15, [x0, #320] -ldr q3, [x0, #384] -ldr q20, [x0, #448] -ldr q2, [x0, #512] -ldr q14, [x0, #576] -ldr q21, [x0, #640] -ldr q19, [x0, #704] -ldr q22, [x0, #0] -ldr q10, [x0, #64] -ldr q0, [x0, #128] -ldr q13, [x0, #192] -sqrdmulh v16.4S, v17.4S, v29.s[0] -mul v17.4S, v17.4S,v30.s[0] -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v18.4s, v17.4s -add v18.4s, v18.4s, v17.4s -sqrdmulh v17.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -mla v12.4S, v17.4S, v31.s[0] -sub v17.4s, v15.4s, v12.4s -add v15.4s, v15.4s, v12.4s -sqrdmulh v12.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -mla v11.4S, v12.4S, v31.s[0] -sub v12.4s, v3.4s, v11.4s -add v3.4s, v3.4s, v11.4s -sqrdmulh v11.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -mla v1.4S, v11.4S, v31.s[0] -sub v11.4s, v20.4s, v1.4s -add v20.4s, v20.4s, v1.4s -sqrdmulh v1.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -mla v2.4S, v1.4S, v31.s[0] -sub v1.4s, v22.4s, v2.4s -add v22.4s, v22.4s, v2.4s -sqrdmulh v2.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -mla v14.4S, v2.4S, v31.s[0] -sub v2.4s, v10.4s, v14.4s -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -mla v21.4S, v14.4S, v31.s[0] -sub v14.4s, v0.4s, v21.4s -add v0.4s, v0.4s, v21.4s -sqrdmulh v21.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v13.4s, v19.4s -add v13.4s, v13.4s, v19.4s -sqrdmulh v19.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -mla v3.4S, v19.4S, v31.s[0] -sub v19.4s, v0.4s, v3.4s -add v0.4s, v0.4s, v3.4s -sqrdmulh v3.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -mla v20.4S, v3.4S, v31.s[0] -sub v3.4s, v13.4s, v20.4s -add v13.4s, v13.4s, v20.4s -sqrdmulh v20.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -mla v18.4S, v20.4S, v31.s[0] -sub v20.4s, v22.4s, v18.4s -add v22.4s, v22.4s, v18.4s -sqrdmulh v18.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -mla v15.4S, v18.4S, v31.s[0] -sub v18.4s, v10.4s, v15.4s -add v10.4s, v10.4s, v15.4s -sqrdmulh v15.4S, v12.4S, v29.s[2] -mul v12.4S, v12.4S,v30.s[2] -mla v12.4S, v15.4S, v31.s[0] -sub v15.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -sqrdmulh v12.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -mla v11.4S, v12.4S, v31.s[0] -sub v12.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -mla v16.4S, v11.4S, v31.s[0] -sub v11.4s, v1.4s, v16.4s -add v1.4s, v1.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v2.4s, v17.4s -add v2.4s, v2.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v27.s[0] -mul v0.4S, v0.4S,v28.s[0] -mla v0.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v0.4s -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v13.4S, v27.s[0] -mul v13.4S, v13.4S,v28.s[0] -mla v13.4S, v0.4S, v31.s[0] -sub v0.4s, v10.4s, v13.4s -add v10.4s, v10.4s, v13.4s -sqrdmulh v13.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -mla v19.4S, v13.4S, v31.s[0] -sub v13.4s, v20.4s, v19.4s -add v20.4s, v20.4s, v19.4s -sqrdmulh v19.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -mla v3.4S, v19.4S, v31.s[0] -sub v19.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v27.s[2] -mul v14.4S, v14.4S,v28.s[2] -mla v14.4S, v3.4S, v31.s[0] -sub v3.4s, v1.4s, v14.4s -add v1.4s, v1.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v27.s[2] -mul v21.4S, v21.4S,v28.s[2] -mla v21.4S, v14.4S, v31.s[0] -sub v14.4s, v2.4s, v21.4s -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -mla v15.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v12.4S, v27.s[3] -mul v12.4S, v12.4S,v28.s[3] -mla v12.4S, v15.4S, v31.s[0] -sub v15.4s, v16.4s, v12.4s -add v16.4s, v16.4s, v12.4s -sqrdmulh v12.4S, v10.4S, v25.s[0] -mul v10.4S, v10.4S,v26.s[0] -mla v10.4S, v12.4S, v31.s[0] -sub v12.4s, v22.4s, v10.4s -add v22.4s, v22.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v25.s[1] -mul v0.4S, v0.4S,v26.s[1] -mla v0.4S, v10.4S, v31.s[0] -sub v10.4s, v17.4s, v0.4s -add v17.4s, v17.4s, v0.4s -sqrdmulh v0.4S, v18.4S, v25.s[2] -mul v18.4S, v18.4S,v26.s[2] -mla v18.4S, v0.4S, v31.s[0] -sub v0.4s, v20.4s, v18.4s -add v20.4s, v20.4s, v18.4s -sqrdmulh v18.4S, v19.4S, v25.s[3] -mul v19.4S, v19.4S,v26.s[3] -mla v19.4S, v18.4S, v31.s[0] -sub v18.4s, v13.4s, v19.4s -add v13.4s, v13.4s, v19.4s -sqrdmulh v19.4S, v2.4S, v23.s[0] -mul v2.4S, v2.4S,v24.s[0] -mla v2.4S, v19.4S, v31.s[0] -sub v19.4s, v1.4s, v2.4s -add v1.4s, v1.4s, v2.4s -sqrdmulh v2.4S, v14.4S, v23.s[1] -mul v14.4S, v14.4S,v24.s[1] -mla v14.4S, v2.4S, v31.s[0] -sub v2.4s, v3.4s, v14.4s -add v3.4s, v3.4s, v14.4s -sqrdmulh v14.4S, v16.4S, v23.s[2] -mul v16.4S, v16.4S,v24.s[2] -mla v16.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v16.4s -add v11.4s, v11.4s, v16.4s -sqrdmulh v16.4S, v15.4S, v23.s[3] -mul v15.4S, v15.4S,v24.s[3] -mla v15.4S, v16.4S, v31.s[0] -sub v16.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -str q22, [x0, #0] -str q12, [x0, #64] -str q17, [x0, #128] -str q10, [x0, #192] -str q20, [x0, #256] -str q0, [x0, #320] -str q13, [x0, #384] -str q18, [x0, #448] -str q1, [x0, #512] -str q19, [x0, #576] -str q3, [x0, #640] -str q2, [x0, #704] -str q11, [x0, #768] -str q14, [x0, #832] -str q21, [x0, #896] -str q16, [x0, #960] -ldr q16, [x0, #784] -ldr q21, [x0, #848] -ldr q14, [x0, #912] -ldr q11, [x0, #976] -ldr q2, [x0, #272] -ldr q3, [x0, #336] -ldr q19, [x0, #400] -ldr q1, [x0, #464] -ldr q18, [x0, #528] -ldr q13, [x0, #592] -ldr q0, [x0, #656] -ldr q20, [x0, #720] -ldr q10, [x0, #16] -ldr q17, [x0, #80] -ldr q12, [x0, #144] -ldr q22, [x0, #208] -sqrdmulh v15.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -mla v21.4S, v16.4S, v31.s[0] -sub v16.4s, v3.4s, v21.4s -add v3.4s, v3.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -mla v14.4S, v21.4S, v31.s[0] -sub v21.4s, v19.4s, v14.4s -add v19.4s, v19.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -mla v11.4S, v14.4S, v31.s[0] -sub v14.4s, v1.4s, v11.4s -add v1.4s, v1.4s, v11.4s -sqrdmulh v11.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -mla v18.4S, v11.4S, v31.s[0] -sub v11.4s, v10.4s, v18.4s -add v10.4s, v10.4s, v18.4s -sqrdmulh v18.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -mla v13.4S, v18.4S, v31.s[0] -sub v18.4s, v17.4s, v13.4s -add v17.4s, v17.4s, v13.4s -sqrdmulh v13.4S, v0.4S, v29.s[0] -mul v0.4S, v0.4S,v30.s[0] -mla v0.4S, v13.4S, v31.s[0] -sub v13.4s, v12.4s, v0.4s -add v12.4s, v12.4s, v0.4s -sqrdmulh v0.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -mla v20.4S, v0.4S, v31.s[0] -sub v0.4s, v22.4s, v20.4s -add v22.4s, v22.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -mla v19.4S, v20.4S, v31.s[0] -sub v20.4s, v12.4s, v19.4s -add v12.4s, v12.4s, v19.4s -sqrdmulh v19.4S, v1.4S, v29.s[1] -mul v1.4S, v1.4S,v30.s[1] -mla v1.4S, v19.4S, v31.s[0] -sub v19.4s, v22.4s, v1.4s -add v22.4s, v22.4s, v1.4s -sqrdmulh v1.4S, v2.4S, v29.s[1] -mul v2.4S, v2.4S,v30.s[1] -mla v2.4S, v1.4S, v31.s[0] -sub v1.4s, v10.4s, v2.4s -add v10.4s, v10.4s, v2.4s -sqrdmulh v2.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -mla v3.4S, v2.4S, v31.s[0] -sub v2.4s, v17.4s, v3.4s -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -mla v21.4S, v3.4S, v31.s[0] -sub v3.4s, v13.4s, v21.4s -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -mla v14.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v14.4s -add v0.4s, v0.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -mla v15.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v18.4s, v16.4s -add v18.4s, v18.4s, v16.4s -sqrdmulh v16.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -mla v12.4S, v16.4S, v31.s[0] -sub v16.4s, v10.4s, v12.4s -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v22.4S, v27.s[0] -mul v22.4S, v22.4S,v28.s[0] -mla v22.4S, v12.4S, v31.s[0] -sub v12.4s, v17.4s, v22.4s -add v17.4s, v17.4s, v22.4s -sqrdmulh v22.4S, v20.4S, v27.s[1] -mul v20.4S, v20.4S,v28.s[1] -mla v20.4S, v22.4S, v31.s[0] -sub v22.4s, v1.4s, v20.4s -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -mla v19.4S, v20.4S, v31.s[0] -sub v20.4s, v2.4s, v19.4s -add v2.4s, v2.4s, v19.4s -sqrdmulh v19.4S, v13.4S, v27.s[2] -mul v13.4S, v13.4S,v28.s[2] -mla v13.4S, v19.4S, v31.s[0] -sub v19.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v0.4S, v27.s[2] -mul v0.4S, v0.4S,v28.s[2] -mla v0.4S, v13.4S, v31.s[0] -sub v13.4s, v18.4s, v0.4s -add v18.4s, v18.4s, v0.4s -sqrdmulh v0.4S, v3.4S, v27.s[3] -mul v3.4S, v3.4S,v28.s[3] -mla v3.4S, v0.4S, v31.s[0] -sub v0.4s, v14.4s, v3.4s -add v14.4s, v14.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -mla v21.4S, v3.4S, v31.s[0] -sub v3.4s, v15.4s, v21.4s -add v15.4s, v15.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v25.s[0] -mul v17.4S, v17.4S,v26.s[0] -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -sqrdmulh v17.4S, v12.4S, v25.s[1] -mul v12.4S, v12.4S,v26.s[1] -mla v12.4S, v17.4S, v31.s[0] -sub v17.4s, v16.4s, v12.4s -add v16.4s, v16.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v25.s[2] -mul v2.4S, v2.4S,v26.s[2] -mla v2.4S, v12.4S, v31.s[0] -sub v12.4s, v1.4s, v2.4s -add v1.4s, v1.4s, v2.4s -sqrdmulh v2.4S, v20.4S, v25.s[3] -mul v20.4S, v20.4S,v26.s[3] -mla v20.4S, v2.4S, v31.s[0] -sub v2.4s, v22.4s, v20.4s -add v22.4s, v22.4s, v20.4s -sqrdmulh v20.4S, v18.4S, v23.s[0] -mul v18.4S, v18.4S,v24.s[0] -mla v18.4S, v20.4S, v31.s[0] -sub v20.4s, v11.4s, v18.4s -add v11.4s, v11.4s, v18.4s -sqrdmulh v18.4S, v13.4S, v23.s[1] -mul v13.4S, v13.4S,v24.s[1] -mla v13.4S, v18.4S, v31.s[0] -sub v18.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -sqrdmulh v13.4S, v15.4S, v23.s[2] -mul v15.4S, v15.4S,v24.s[2] -mla v15.4S, v13.4S, v31.s[0] -sub v13.4s, v14.4s, v15.4s -add v14.4s, v14.4s, v15.4s -sqrdmulh v15.4S, v3.4S, v23.s[3] -mul v3.4S, v3.4S,v24.s[3] -mla v3.4S, v15.4S, v31.s[0] -sub v15.4s, v0.4s, v3.4s -add v0.4s, v0.4s, v3.4s -str q10, [x0, #16] -str q21, [x0, #80] -str q16, [x0, #144] -str q17, [x0, #208] -str q1, [x0, #272] -str q12, [x0, #336] -str q22, [x0, #400] -str q2, [x0, #464] -str q11, [x0, #528] -str q20, [x0, #592] -str q19, [x0, #656] -str q18, [x0, #720] -str q14, [x0, #784] -str q13, [x0, #848] -str q0, [x0, #912] -str q15, [x0, #976] -ldr q4, [x17, #+128] -ldr q5, [x17, #+144] -ldr q6, [x17, #+160] -ldr q7, [x17, #+176] -ldr q8, [x17, #+192] -ldr q9, [x17, #+208] -ldr q3, [x17, #+224] -ldr q10, [x17, #+240] -ldr q21, [x0, #32] -ldr q16, [x0, #48] -ldr q17, [x0, #0] -ldr q1, [x0, #16] -sqrdmulh v12.4S, v21.4S, v5.s[0] -mul v21.4S, v21.4S,v4.s[0] -mla v21.4S, v12.4S, v31.s[0] -sub v12.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v16.4S, v5.s[0] -mul v16.4S, v16.4S,v4.s[0] -mla v16.4S, v21.4S, v31.s[0] -sub v21.4s, v1.4s, v16.4s -add v1.4s, v1.4s, v16.4s -sqrdmulh v16.4S, v1.4S, v5.s[1] -mul v1.4S, v1.4S,v4.s[1] -mla v1.4S, v16.4S, v31.s[0] -sub v16.4s, v17.4s, v1.4s -add v17.4s, v17.4s, v1.4s -sqrdmulh v1.4S, v21.4S, v5.s[2] -mul v21.4S, v21.4S,v4.s[2] -mla v21.4S, v1.4S, v31.s[0] -sub v1.4s, v12.4s, v21.4s -add v12.4s, v12.4s, v21.4s -trn1 v21.4S, v17.4S, v16.4S -trn2 v22.4S, v17.4S, v16.4S -trn1 v2.4S, v12.4S, v1.4S -trn2 v11.4S, v12.4S, v1.4S -trn2 v12.2D, v21.2D, v2.2D -trn2 v1.2D, v22.2D, v11.2D -trn1 v17.2D, v21.2D, v2.2D -trn1 v16.2D, v22.2D, v11.2D -sqrdmulh v11.4S, v12.4S, v7.4S -mul v12.4S, v12.4S,v6.4S -mla v12.4S, v11.4S, v31.s[0] -sub v11.4s, v17.4s, v12.4s -add v17.4s, v17.4s, v12.4s -sqrdmulh v12.4S, v1.4S, v7.4S -mul v1.4S, v1.4S,v6.4S -mla v1.4S, v12.4S, v31.s[0] -sub v12.4s, v16.4s, v1.4s -add v16.4s, v16.4s, v1.4s -sqrdmulh v1.4S, v16.4S, v9.4S -mul v16.4S, v16.4S,v8.4S -mla v16.4S, v1.4S, v31.s[0] -sub v1.4s, v17.4s, v16.4s -add v17.4s, v17.4s, v16.4s -sqrdmulh v16.4S, v12.4S, v10.4S -mul v12.4S, v12.4S,v3.4S -mla v12.4S, v16.4S, v31.s[0] -sub v16.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -str q17, [x0, #0] -str q1, [x0, #16] -str q11, [x0, #32] -str q16, [x0, #48] -ldr q16, [x17, #+256] -ldr q11, [x17, #+272] -ldr q1, [x17, #+288] -ldr q17, [x17, #+304] -ldr q12, [x17, #+320] -ldr q22, [x17, #+336] -ldr q2, [x17, #+352] -ldr q21, [x17, #+368] -ldr q10, [x0, #96] -ldr q3, [x0, #112] -ldr q9, [x0, #64] -ldr q8, [x0, #80] -sqrdmulh v7.4S, v10.4S, v11.s[0] -mul v10.4S, v10.4S,v16.s[0] -mla v10.4S, v7.4S, v31.s[0] -sub v7.4s, v9.4s, v10.4s -add v9.4s, v9.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v11.s[0] -mul v3.4S, v3.4S,v16.s[0] -mla v3.4S, v10.4S, v31.s[0] -sub v10.4s, v8.4s, v3.4s -add v8.4s, v8.4s, v3.4s -sqrdmulh v3.4S, v8.4S, v11.s[1] -mul v8.4S, v8.4S,v16.s[1] -mla v8.4S, v3.4S, v31.s[0] -sub v3.4s, v9.4s, v8.4s -add v9.4s, v9.4s, v8.4s -sqrdmulh v8.4S, v10.4S, v11.s[2] -mul v10.4S, v10.4S,v16.s[2] -mla v10.4S, v8.4S, v31.s[0] -sub v8.4s, v7.4s, v10.4s -add v7.4s, v7.4s, v10.4s -trn1 v10.4S, v9.4S, v3.4S -trn2 v6.4S, v9.4S, v3.4S -trn1 v5.4S, v7.4S, v8.4S -trn2 v4.4S, v7.4S, v8.4S -trn2 v7.2D, v10.2D, v5.2D -trn2 v8.2D, v6.2D, v4.2D -trn1 v9.2D, v10.2D, v5.2D -trn1 v3.2D, v6.2D, v4.2D -sqrdmulh v4.4S, v7.4S, v17.4S -mul v7.4S, v7.4S,v1.4S -mla v7.4S, v4.4S, v31.s[0] -sub v4.4s, v9.4s, v7.4s -add v9.4s, v9.4s, v7.4s -sqrdmulh v7.4S, v8.4S, v17.4S -mul v8.4S, v8.4S,v1.4S -mla v8.4S, v7.4S, v31.s[0] -sub v7.4s, v3.4s, v8.4s -add v3.4s, v3.4s, v8.4s -sqrdmulh v8.4S, v3.4S, v22.4S -mul v3.4S, v3.4S,v12.4S -mla v3.4S, v8.4S, v31.s[0] -sub v8.4s, v9.4s, v3.4s -add v9.4s, v9.4s, v3.4s -sqrdmulh v3.4S, v7.4S, v21.4S -mul v7.4S, v7.4S,v2.4S -mla v7.4S, v3.4S, v31.s[0] -sub v3.4s, v4.4s, v7.4s -add v4.4s, v4.4s, v7.4s -str q9, [x0, #64] -str q8, [x0, #80] -str q4, [x0, #96] -str q3, [x0, #112] -ldr q3, [x17, #+384] -ldr q4, [x17, #+400] -ldr q8, [x17, #+416] -ldr q9, [x17, #+432] -ldr q7, [x17, #+448] -ldr q6, [x17, #+464] -ldr q5, [x17, #+480] -ldr q10, [x17, #+496] -ldr q21, [x0, #160] -ldr q2, [x0, #176] -ldr q22, [x0, #128] -ldr q12, [x0, #144] -sqrdmulh v17.4S, v21.4S, v4.s[0] -mul v21.4S, v21.4S,v3.s[0] -mla v21.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v21.4s -add v22.4s, v22.4s, v21.4s -sqrdmulh v21.4S, v2.4S, v4.s[0] -mul v2.4S, v2.4S,v3.s[0] -mla v2.4S, v21.4S, v31.s[0] -sub v21.4s, v12.4s, v2.4s -add v12.4s, v12.4s, v2.4s -sqrdmulh v2.4S, v12.4S, v4.s[1] -mul v12.4S, v12.4S,v3.s[1] -mla v12.4S, v2.4S, v31.s[0] -sub v2.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v21.4S, v4.s[2] -mul v21.4S, v21.4S,v3.s[2] -mla v21.4S, v12.4S, v31.s[0] -sub v12.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -trn1 v21.4S, v22.4S, v2.4S -trn2 v1.4S, v22.4S, v2.4S -trn1 v11.4S, v17.4S, v12.4S -trn2 v16.4S, v17.4S, v12.4S -trn2 v17.2D, v21.2D, v11.2D -trn2 v12.2D, v1.2D, v16.2D -trn1 v22.2D, v21.2D, v11.2D -trn1 v2.2D, v1.2D, v16.2D -sqrdmulh v16.4S, v17.4S, v9.4S -mul v17.4S, v17.4S,v8.4S -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v22.4s, v17.4s -add v22.4s, v22.4s, v17.4s -sqrdmulh v17.4S, v12.4S, v9.4S -mul v12.4S, v12.4S,v8.4S -mla v12.4S, v17.4S, v31.s[0] -sub v17.4s, v2.4s, v12.4s -add v2.4s, v2.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v6.4S -mul v2.4S, v2.4S,v7.4S -mla v2.4S, v12.4S, v31.s[0] -sub v12.4s, v22.4s, v2.4s -add v22.4s, v22.4s, v2.4s -sqrdmulh v2.4S, v17.4S, v10.4S -mul v17.4S, v17.4S,v5.4S -mla v17.4S, v2.4S, v31.s[0] -sub v2.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -str q22, [x0, #128] -str q12, [x0, #144] -str q16, [x0, #160] -str q2, [x0, #176] -ldr q2, [x17, #+512] -ldr q16, [x17, #+528] -ldr q12, [x17, #+544] -ldr q22, [x17, #+560] -ldr q17, [x17, #+576] -ldr q1, [x17, #+592] -ldr q11, [x17, #+608] -ldr q21, [x17, #+624] -ldr q10, [x0, #224] -ldr q5, [x0, #240] -ldr q6, [x0, #192] -ldr q7, [x0, #208] -sqrdmulh v9.4S, v10.4S, v16.s[0] -mul v10.4S, v10.4S,v2.s[0] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v6.4s, v10.4s -add v6.4s, v6.4s, v10.4s -sqrdmulh v10.4S, v5.4S, v16.s[0] -mul v5.4S, v5.4S,v2.s[0] -mla v5.4S, v10.4S, v31.s[0] -sub v10.4s, v7.4s, v5.4s -add v7.4s, v7.4s, v5.4s -sqrdmulh v5.4S, v7.4S, v16.s[1] -mul v7.4S, v7.4S,v2.s[1] -mla v7.4S, v5.4S, v31.s[0] -sub v5.4s, v6.4s, v7.4s -add v6.4s, v6.4s, v7.4s -sqrdmulh v7.4S, v10.4S, v16.s[2] -mul v10.4S, v10.4S,v2.s[2] -mla v10.4S, v7.4S, v31.s[0] -sub v7.4s, v9.4s, v10.4s -add v9.4s, v9.4s, v10.4s -trn1 v10.4S, v6.4S, v5.4S -trn2 v8.4S, v6.4S, v5.4S -trn1 v4.4S, v9.4S, v7.4S -trn2 v3.4S, v9.4S, v7.4S -trn2 v9.2D, v10.2D, v4.2D -trn2 v7.2D, v8.2D, v3.2D -trn1 v6.2D, v10.2D, v4.2D -trn1 v5.2D, v8.2D, v3.2D -sqrdmulh v3.4S, v9.4S, v22.4S -mul v9.4S, v9.4S,v12.4S -mla v9.4S, v3.4S, v31.s[0] -sub v3.4s, v6.4s, v9.4s -add v6.4s, v6.4s, v9.4s -sqrdmulh v9.4S, v7.4S, v22.4S -mul v7.4S, v7.4S,v12.4S -mla v7.4S, v9.4S, v31.s[0] -sub v9.4s, v5.4s, v7.4s -add v5.4s, v5.4s, v7.4s -sqrdmulh v7.4S, v5.4S, v1.4S -mul v5.4S, v5.4S,v17.4S -mla v5.4S, v7.4S, v31.s[0] -sub v7.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v9.4S, v21.4S -mul v9.4S, v9.4S,v11.4S -mla v9.4S, v5.4S, v31.s[0] -sub v5.4s, v3.4s, v9.4s -add v3.4s, v3.4s, v9.4s -str q6, [x0, #192] -str q7, [x0, #208] -str q3, [x0, #224] -str q5, [x0, #240] -ldr q5, [x17, #+640] -ldr q3, [x17, #+656] -ldr q7, [x17, #+672] -ldr q6, [x17, #+688] -ldr q9, [x17, #+704] -ldr q8, [x17, #+720] -ldr q4, [x17, #+736] -ldr q10, [x17, #+752] -ldr q21, [x0, #288] -ldr q11, [x0, #304] -ldr q1, [x0, #256] -ldr q17, [x0, #272] -sqrdmulh v22.4S, v21.4S, v3.s[0] -mul v21.4S, v21.4S,v5.s[0] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v1.4s, v21.4s -add v1.4s, v1.4s, v21.4s -sqrdmulh v21.4S, v11.4S, v3.s[0] -mul v11.4S, v11.4S,v5.s[0] -mla v11.4S, v21.4S, v31.s[0] -sub v21.4s, v17.4s, v11.4s -add v17.4s, v17.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v3.s[1] -mul v17.4S, v17.4S,v5.s[1] -mla v17.4S, v11.4S, v31.s[0] -sub v11.4s, v1.4s, v17.4s -add v1.4s, v1.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v3.s[2] -mul v21.4S, v21.4S,v5.s[2] -mla v21.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v21.4s -add v22.4s, v22.4s, v21.4s -trn1 v21.4S, v1.4S, v11.4S -trn2 v12.4S, v1.4S, v11.4S -trn1 v16.4S, v22.4S, v17.4S -trn2 v2.4S, v22.4S, v17.4S -trn2 v22.2D, v21.2D, v16.2D -trn2 v17.2D, v12.2D, v2.2D -trn1 v1.2D, v21.2D, v16.2D -trn1 v11.2D, v12.2D, v2.2D -sqrdmulh v2.4S, v22.4S, v6.4S -mul v22.4S, v22.4S,v7.4S -mla v22.4S, v2.4S, v31.s[0] -sub v2.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v17.4S, v6.4S -mul v17.4S, v17.4S,v7.4S -mla v17.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v17.4s -add v11.4s, v11.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v8.4S -mul v11.4S, v11.4S,v9.4S -mla v11.4S, v17.4S, v31.s[0] -sub v17.4s, v1.4s, v11.4s -add v1.4s, v1.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v10.4S -mul v22.4S, v22.4S,v4.4S -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v2.4s, v22.4s -add v2.4s, v2.4s, v22.4s -str q1, [x0, #256] -str q17, [x0, #272] -str q2, [x0, #288] -str q11, [x0, #304] -ldr q11, [x17, #+768] -ldr q2, [x17, #+784] -ldr q17, [x17, #+800] -ldr q1, [x17, #+816] -ldr q22, [x17, #+832] -ldr q12, [x17, #+848] -ldr q16, [x17, #+864] -ldr q21, [x17, #+880] -ldr q10, [x0, #352] -ldr q4, [x0, #368] -ldr q8, [x0, #320] -ldr q9, [x0, #336] -sqrdmulh v6.4S, v10.4S, v2.s[0] -mul v10.4S, v10.4S,v11.s[0] -mla v10.4S, v6.4S, v31.s[0] -sub v6.4s, v8.4s, v10.4s -add v8.4s, v8.4s, v10.4s -sqrdmulh v10.4S, v4.4S, v2.s[0] -mul v4.4S, v4.4S,v11.s[0] -mla v4.4S, v10.4S, v31.s[0] -sub v10.4s, v9.4s, v4.4s -add v9.4s, v9.4s, v4.4s -sqrdmulh v4.4S, v9.4S, v2.s[1] -mul v9.4S, v9.4S,v11.s[1] -mla v9.4S, v4.4S, v31.s[0] -sub v4.4s, v8.4s, v9.4s -add v8.4s, v8.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v2.s[2] -mul v10.4S, v10.4S,v11.s[2] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v6.4s, v10.4s -add v6.4s, v6.4s, v10.4s -trn1 v10.4S, v8.4S, v4.4S -trn2 v7.4S, v8.4S, v4.4S -trn1 v3.4S, v6.4S, v9.4S -trn2 v5.4S, v6.4S, v9.4S -trn2 v6.2D, v10.2D, v3.2D -trn2 v9.2D, v7.2D, v5.2D -trn1 v8.2D, v10.2D, v3.2D -trn1 v4.2D, v7.2D, v5.2D -sqrdmulh v5.4S, v6.4S, v1.4S -mul v6.4S, v6.4S,v17.4S -mla v6.4S, v5.4S, v31.s[0] -sub v5.4s, v8.4s, v6.4s -add v8.4s, v8.4s, v6.4s -sqrdmulh v6.4S, v9.4S, v1.4S -mul v9.4S, v9.4S,v17.4S -mla v9.4S, v6.4S, v31.s[0] -sub v6.4s, v4.4s, v9.4s -add v4.4s, v4.4s, v9.4s -sqrdmulh v9.4S, v4.4S, v12.4S -mul v4.4S, v4.4S,v22.4S -mla v4.4S, v9.4S, v31.s[0] -sub v9.4s, v8.4s, v4.4s -add v8.4s, v8.4s, v4.4s -sqrdmulh v4.4S, v6.4S, v21.4S -mul v6.4S, v6.4S,v16.4S -mla v6.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v6.4s -add v5.4s, v5.4s, v6.4s -str q8, [x0, #320] -str q9, [x0, #336] -str q5, [x0, #352] -str q4, [x0, #368] -ldr q4, [x17, #+896] -ldr q5, [x17, #+912] -ldr q9, [x17, #+928] -ldr q8, [x17, #+944] -ldr q6, [x17, #+960] -ldr q7, [x17, #+976] -ldr q3, [x17, #+992] -ldr q10, [x17, #+1008] -ldr q21, [x0, #416] -ldr q16, [x0, #432] -ldr q12, [x0, #384] -ldr q22, [x0, #400] -sqrdmulh v1.4S, v21.4S, v5.s[0] -mul v21.4S, v21.4S,v4.s[0] -mla v21.4S, v1.4S, v31.s[0] -sub v1.4s, v12.4s, v21.4s -add v12.4s, v12.4s, v21.4s -sqrdmulh v21.4S, v16.4S, v5.s[0] -mul v16.4S, v16.4S,v4.s[0] -mla v16.4S, v21.4S, v31.s[0] -sub v21.4s, v22.4s, v16.4s -add v22.4s, v22.4s, v16.4s -sqrdmulh v16.4S, v22.4S, v5.s[1] -mul v22.4S, v22.4S,v4.s[1] -mla v22.4S, v16.4S, v31.s[0] -sub v16.4s, v12.4s, v22.4s -add v12.4s, v12.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v5.s[2] -mul v21.4S, v21.4S,v4.s[2] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v1.4s, v21.4s -add v1.4s, v1.4s, v21.4s -trn1 v21.4S, v12.4S, v16.4S -trn2 v17.4S, v12.4S, v16.4S -trn1 v2.4S, v1.4S, v22.4S -trn2 v11.4S, v1.4S, v22.4S -trn2 v1.2D, v21.2D, v2.2D -trn2 v22.2D, v17.2D, v11.2D -trn1 v12.2D, v21.2D, v2.2D -trn1 v16.2D, v17.2D, v11.2D -sqrdmulh v11.4S, v1.4S, v8.4S -mul v1.4S, v1.4S,v9.4S -mla v1.4S, v11.4S, v31.s[0] -sub v11.4s, v12.4s, v1.4s -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v22.4S, v8.4S -mul v22.4S, v22.4S,v9.4S -mla v22.4S, v1.4S, v31.s[0] -sub v1.4s, v16.4s, v22.4s -add v16.4s, v16.4s, v22.4s -sqrdmulh v22.4S, v16.4S, v7.4S -mul v16.4S, v16.4S,v6.4S -mla v16.4S, v22.4S, v31.s[0] -sub v22.4s, v12.4s, v16.4s -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v1.4S, v10.4S -mul v1.4S, v1.4S,v3.4S -mla v1.4S, v16.4S, v31.s[0] -sub v16.4s, v11.4s, v1.4s -add v11.4s, v11.4s, v1.4s -str q12, [x0, #384] -str q22, [x0, #400] -str q11, [x0, #416] -str q16, [x0, #432] -ldr q16, [x17, #+1024] -ldr q11, [x17, #+1040] -ldr q22, [x17, #+1056] -ldr q12, [x17, #+1072] -ldr q1, [x17, #+1088] -ldr q17, [x17, #+1104] -ldr q2, [x17, #+1120] -ldr q21, [x17, #+1136] -ldr q10, [x0, #480] -ldr q3, [x0, #496] -ldr q7, [x0, #448] -ldr q6, [x0, #464] -sqrdmulh v8.4S, v10.4S, v11.s[0] -mul v10.4S, v10.4S,v16.s[0] -mla v10.4S, v8.4S, v31.s[0] -sub v8.4s, v7.4s, v10.4s -add v7.4s, v7.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v11.s[0] -mul v3.4S, v3.4S,v16.s[0] -mla v3.4S, v10.4S, v31.s[0] -sub v10.4s, v6.4s, v3.4s -add v6.4s, v6.4s, v3.4s -sqrdmulh v3.4S, v6.4S, v11.s[1] -mul v6.4S, v6.4S,v16.s[1] -mla v6.4S, v3.4S, v31.s[0] -sub v3.4s, v7.4s, v6.4s -add v7.4s, v7.4s, v6.4s -sqrdmulh v6.4S, v10.4S, v11.s[2] -mul v10.4S, v10.4S,v16.s[2] -mla v10.4S, v6.4S, v31.s[0] -sub v6.4s, v8.4s, v10.4s -add v8.4s, v8.4s, v10.4s -trn1 v10.4S, v7.4S, v3.4S -trn2 v9.4S, v7.4S, v3.4S -trn1 v5.4S, v8.4S, v6.4S -trn2 v4.4S, v8.4S, v6.4S -trn2 v8.2D, v10.2D, v5.2D -trn2 v6.2D, v9.2D, v4.2D -trn1 v7.2D, v10.2D, v5.2D -trn1 v3.2D, v9.2D, v4.2D -sqrdmulh v4.4S, v8.4S, v12.4S -mul v8.4S, v8.4S,v22.4S -mla v8.4S, v4.4S, v31.s[0] -sub v4.4s, v7.4s, v8.4s -add v7.4s, v7.4s, v8.4s -sqrdmulh v8.4S, v6.4S, v12.4S -mul v6.4S, v6.4S,v22.4S -mla v6.4S, v8.4S, v31.s[0] -sub v8.4s, v3.4s, v6.4s -add v3.4s, v3.4s, v6.4s -sqrdmulh v6.4S, v3.4S, v17.4S -mul v3.4S, v3.4S,v1.4S -mla v3.4S, v6.4S, v31.s[0] -sub v6.4s, v7.4s, v3.4s -add v7.4s, v7.4s, v3.4s -sqrdmulh v3.4S, v8.4S, v21.4S -mul v8.4S, v8.4S,v2.4S -mla v8.4S, v3.4S, v31.s[0] -sub v3.4s, v4.4s, v8.4s -add v4.4s, v4.4s, v8.4s -str q7, [x0, #448] -str q6, [x0, #464] -str q4, [x0, #480] -str q3, [x0, #496] -ldr q3, [x17, #+1152] -ldr q4, [x17, #+1168] -ldr q6, [x17, #+1184] -ldr q7, [x17, #+1200] -ldr q8, [x17, #+1216] -ldr q9, [x17, #+1232] -ldr q5, [x17, #+1248] -ldr q10, [x17, #+1264] -ldr q21, [x0, #544] -ldr q2, [x0, #560] -ldr q17, [x0, #512] -ldr q1, [x0, #528] -sqrdmulh v12.4S, v21.4S, v4.s[0] -mul v21.4S, v21.4S,v3.s[0] -mla v21.4S, v12.4S, v31.s[0] -sub v12.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v2.4S, v4.s[0] -mul v2.4S, v2.4S,v3.s[0] -mla v2.4S, v21.4S, v31.s[0] -sub v21.4s, v1.4s, v2.4s -add v1.4s, v1.4s, v2.4s -sqrdmulh v2.4S, v1.4S, v4.s[1] -mul v1.4S, v1.4S,v3.s[1] -mla v1.4S, v2.4S, v31.s[0] -sub v2.4s, v17.4s, v1.4s -add v17.4s, v17.4s, v1.4s -sqrdmulh v1.4S, v21.4S, v4.s[2] -mul v21.4S, v21.4S,v3.s[2] -mla v21.4S, v1.4S, v31.s[0] -sub v1.4s, v12.4s, v21.4s -add v12.4s, v12.4s, v21.4s -trn1 v21.4S, v17.4S, v2.4S -trn2 v22.4S, v17.4S, v2.4S -trn1 v11.4S, v12.4S, v1.4S -trn2 v16.4S, v12.4S, v1.4S -trn2 v12.2D, v21.2D, v11.2D -trn2 v1.2D, v22.2D, v16.2D -trn1 v17.2D, v21.2D, v11.2D -trn1 v2.2D, v22.2D, v16.2D -sqrdmulh v16.4S, v12.4S, v7.4S -mul v12.4S, v12.4S,v6.4S -mla v12.4S, v16.4S, v31.s[0] -sub v16.4s, v17.4s, v12.4s -add v17.4s, v17.4s, v12.4s -sqrdmulh v12.4S, v1.4S, v7.4S -mul v1.4S, v1.4S,v6.4S -mla v1.4S, v12.4S, v31.s[0] -sub v12.4s, v2.4s, v1.4s -add v2.4s, v2.4s, v1.4s -sqrdmulh v1.4S, v2.4S, v9.4S -mul v2.4S, v2.4S,v8.4S -mla v2.4S, v1.4S, v31.s[0] -sub v1.4s, v17.4s, v2.4s -add v17.4s, v17.4s, v2.4s -sqrdmulh v2.4S, v12.4S, v10.4S -mul v12.4S, v12.4S,v5.4S -mla v12.4S, v2.4S, v31.s[0] -sub v2.4s, v16.4s, v12.4s -add v16.4s, v16.4s, v12.4s -str q17, [x0, #512] -str q1, [x0, #528] -str q16, [x0, #544] -str q2, [x0, #560] -ldr q2, [x17, #+1280] -ldr q16, [x17, #+1296] -ldr q1, [x17, #+1312] -ldr q17, [x17, #+1328] -ldr q12, [x17, #+1344] -ldr q22, [x17, #+1360] -ldr q11, [x17, #+1376] -ldr q21, [x17, #+1392] -ldr q10, [x0, #608] -ldr q5, [x0, #624] -ldr q9, [x0, #576] -ldr q8, [x0, #592] -sqrdmulh v7.4S, v10.4S, v16.s[0] -mul v10.4S, v10.4S,v2.s[0] -mla v10.4S, v7.4S, v31.s[0] -sub v7.4s, v9.4s, v10.4s -add v9.4s, v9.4s, v10.4s -sqrdmulh v10.4S, v5.4S, v16.s[0] -mul v5.4S, v5.4S,v2.s[0] -mla v5.4S, v10.4S, v31.s[0] -sub v10.4s, v8.4s, v5.4s -add v8.4s, v8.4s, v5.4s -sqrdmulh v5.4S, v8.4S, v16.s[1] -mul v8.4S, v8.4S,v2.s[1] -mla v8.4S, v5.4S, v31.s[0] -sub v5.4s, v9.4s, v8.4s -add v9.4s, v9.4s, v8.4s -sqrdmulh v8.4S, v10.4S, v16.s[2] -mul v10.4S, v10.4S,v2.s[2] -mla v10.4S, v8.4S, v31.s[0] -sub v8.4s, v7.4s, v10.4s -add v7.4s, v7.4s, v10.4s -trn1 v10.4S, v9.4S, v5.4S -trn2 v6.4S, v9.4S, v5.4S -trn1 v4.4S, v7.4S, v8.4S -trn2 v3.4S, v7.4S, v8.4S -trn2 v7.2D, v10.2D, v4.2D -trn2 v8.2D, v6.2D, v3.2D -trn1 v9.2D, v10.2D, v4.2D -trn1 v5.2D, v6.2D, v3.2D -sqrdmulh v3.4S, v7.4S, v17.4S -mul v7.4S, v7.4S,v1.4S -mla v7.4S, v3.4S, v31.s[0] -sub v3.4s, v9.4s, v7.4s -add v9.4s, v9.4s, v7.4s -sqrdmulh v7.4S, v8.4S, v17.4S -mul v8.4S, v8.4S,v1.4S -mla v8.4S, v7.4S, v31.s[0] -sub v7.4s, v5.4s, v8.4s -add v5.4s, v5.4s, v8.4s -sqrdmulh v8.4S, v5.4S, v22.4S -mul v5.4S, v5.4S,v12.4S -mla v5.4S, v8.4S, v31.s[0] -sub v8.4s, v9.4s, v5.4s -add v9.4s, v9.4s, v5.4s -sqrdmulh v5.4S, v7.4S, v21.4S -mul v7.4S, v7.4S,v11.4S -mla v7.4S, v5.4S, v31.s[0] -sub v5.4s, v3.4s, v7.4s -add v3.4s, v3.4s, v7.4s -str q9, [x0, #576] -str q8, [x0, #592] -str q3, [x0, #608] -str q5, [x0, #624] -ldr q5, [x17, #+1408] -ldr q3, [x17, #+1424] -ldr q8, [x17, #+1440] -ldr q9, [x17, #+1456] -ldr q7, [x17, #+1472] -ldr q6, [x17, #+1488] -ldr q4, [x17, #+1504] -ldr q10, [x17, #+1520] -ldr q21, [x0, #672] -ldr q11, [x0, #688] -ldr q22, [x0, #640] -ldr q12, [x0, #656] -sqrdmulh v17.4S, v21.4S, v3.s[0] -mul v21.4S, v21.4S,v5.s[0] -mla v21.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v21.4s -add v22.4s, v22.4s, v21.4s -sqrdmulh v21.4S, v11.4S, v3.s[0] -mul v11.4S, v11.4S,v5.s[0] -mla v11.4S, v21.4S, v31.s[0] -sub v21.4s, v12.4s, v11.4s -add v12.4s, v12.4s, v11.4s -sqrdmulh v11.4S, v12.4S, v3.s[1] -mul v12.4S, v12.4S,v5.s[1] -mla v12.4S, v11.4S, v31.s[0] -sub v11.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v21.4S, v3.s[2] -mul v21.4S, v21.4S,v5.s[2] -mla v21.4S, v12.4S, v31.s[0] -sub v12.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -trn1 v21.4S, v22.4S, v11.4S -trn2 v1.4S, v22.4S, v11.4S -trn1 v16.4S, v17.4S, v12.4S -trn2 v2.4S, v17.4S, v12.4S -trn2 v17.2D, v21.2D, v16.2D -trn2 v12.2D, v1.2D, v2.2D -trn1 v22.2D, v21.2D, v16.2D -trn1 v11.2D, v1.2D, v2.2D -sqrdmulh v2.4S, v17.4S, v9.4S -mul v17.4S, v17.4S,v8.4S -mla v17.4S, v2.4S, v31.s[0] -sub v2.4s, v22.4s, v17.4s -add v22.4s, v22.4s, v17.4s -sqrdmulh v17.4S, v12.4S, v9.4S -mul v12.4S, v12.4S,v8.4S -mla v12.4S, v17.4S, v31.s[0] -sub v17.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -sqrdmulh v12.4S, v11.4S, v6.4S -mul v11.4S, v11.4S,v7.4S -mla v11.4S, v12.4S, v31.s[0] -sub v12.4s, v22.4s, v11.4s -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v10.4S -mul v17.4S, v17.4S,v4.4S -mla v17.4S, v11.4S, v31.s[0] -sub v11.4s, v2.4s, v17.4s -add v2.4s, v2.4s, v17.4s -str q22, [x0, #640] -str q12, [x0, #656] -str q2, [x0, #672] -str q11, [x0, #688] -ldr q11, [x17, #+1536] -ldr q2, [x17, #+1552] -ldr q12, [x17, #+1568] -ldr q22, [x17, #+1584] -ldr q17, [x17, #+1600] -ldr q1, [x17, #+1616] -ldr q16, [x17, #+1632] -ldr q21, [x17, #+1648] -ldr q10, [x0, #736] -ldr q4, [x0, #752] -ldr q6, [x0, #704] -ldr q7, [x0, #720] -sqrdmulh v9.4S, v10.4S, v2.s[0] -mul v10.4S, v10.4S,v11.s[0] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v6.4s, v10.4s -add v6.4s, v6.4s, v10.4s -sqrdmulh v10.4S, v4.4S, v2.s[0] -mul v4.4S, v4.4S,v11.s[0] -mla v4.4S, v10.4S, v31.s[0] -sub v10.4s, v7.4s, v4.4s -add v7.4s, v7.4s, v4.4s -sqrdmulh v4.4S, v7.4S, v2.s[1] -mul v7.4S, v7.4S,v11.s[1] -mla v7.4S, v4.4S, v31.s[0] -sub v4.4s, v6.4s, v7.4s -add v6.4s, v6.4s, v7.4s -sqrdmulh v7.4S, v10.4S, v2.s[2] -mul v10.4S, v10.4S,v11.s[2] -mla v10.4S, v7.4S, v31.s[0] -sub v7.4s, v9.4s, v10.4s -add v9.4s, v9.4s, v10.4s -trn1 v10.4S, v6.4S, v4.4S -trn2 v8.4S, v6.4S, v4.4S -trn1 v3.4S, v9.4S, v7.4S -trn2 v5.4S, v9.4S, v7.4S -trn2 v9.2D, v10.2D, v3.2D -trn2 v7.2D, v8.2D, v5.2D -trn1 v6.2D, v10.2D, v3.2D -trn1 v4.2D, v8.2D, v5.2D -sqrdmulh v5.4S, v9.4S, v22.4S -mul v9.4S, v9.4S,v12.4S -mla v9.4S, v5.4S, v31.s[0] -sub v5.4s, v6.4s, v9.4s -add v6.4s, v6.4s, v9.4s -sqrdmulh v9.4S, v7.4S, v22.4S -mul v7.4S, v7.4S,v12.4S -mla v7.4S, v9.4S, v31.s[0] -sub v9.4s, v4.4s, v7.4s -add v4.4s, v4.4s, v7.4s -sqrdmulh v7.4S, v4.4S, v1.4S -mul v4.4S, v4.4S,v17.4S -mla v4.4S, v7.4S, v31.s[0] -sub v7.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -sqrdmulh v4.4S, v9.4S, v21.4S -mul v9.4S, v9.4S,v16.4S -mla v9.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v9.4s -add v5.4s, v5.4s, v9.4s -str q6, [x0, #704] -str q7, [x0, #720] -str q5, [x0, #736] -str q4, [x0, #752] -ldr q4, [x17, #+1664] -ldr q5, [x17, #+1680] -ldr q7, [x17, #+1696] -ldr q6, [x17, #+1712] -ldr q9, [x17, #+1728] -ldr q8, [x17, #+1744] -ldr q3, [x17, #+1760] -ldr q10, [x17, #+1776] -ldr q21, [x0, #800] -ldr q16, [x0, #816] -ldr q1, [x0, #768] -ldr q17, [x0, #784] -sqrdmulh v22.4S, v21.4S, v5.s[0] -mul v21.4S, v21.4S,v4.s[0] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v1.4s, v21.4s -add v1.4s, v1.4s, v21.4s -sqrdmulh v21.4S, v16.4S, v5.s[0] -mul v16.4S, v16.4S,v4.s[0] -mla v16.4S, v21.4S, v31.s[0] -sub v21.4s, v17.4s, v16.4s -add v17.4s, v17.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v5.s[1] -mul v17.4S, v17.4S,v4.s[1] -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v1.4s, v17.4s -add v1.4s, v1.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v5.s[2] -mul v21.4S, v21.4S,v4.s[2] -mla v21.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v21.4s -add v22.4s, v22.4s, v21.4s -trn1 v21.4S, v1.4S, v16.4S -trn2 v12.4S, v1.4S, v16.4S -trn1 v2.4S, v22.4S, v17.4S -trn2 v11.4S, v22.4S, v17.4S -trn2 v22.2D, v21.2D, v2.2D -trn2 v17.2D, v12.2D, v11.2D -trn1 v1.2D, v21.2D, v2.2D -trn1 v16.2D, v12.2D, v11.2D -sqrdmulh v11.4S, v22.4S, v6.4S -mul v22.4S, v22.4S,v7.4S -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v17.4S, v6.4S -mul v17.4S, v17.4S,v7.4S -mla v17.4S, v22.4S, v31.s[0] -sub v22.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -sqrdmulh v17.4S, v16.4S, v8.4S -mul v16.4S, v16.4S,v9.4S -mla v16.4S, v17.4S, v31.s[0] -sub v17.4s, v1.4s, v16.4s -add v1.4s, v1.4s, v16.4s -sqrdmulh v16.4S, v22.4S, v10.4S -mul v22.4S, v22.4S,v3.4S -mla v22.4S, v16.4S, v31.s[0] -sub v16.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -str q1, [x0, #768] -str q17, [x0, #784] -str q11, [x0, #800] -str q16, [x0, #816] -ldr q16, [x17, #+1792] -ldr q11, [x17, #+1808] -ldr q17, [x17, #+1824] -ldr q1, [x17, #+1840] -ldr q22, [x17, #+1856] -ldr q12, [x17, #+1872] -ldr q2, [x17, #+1888] -ldr q21, [x17, #+1904] -ldr q10, [x0, #864] -ldr q3, [x0, #880] -ldr q8, [x0, #832] -ldr q9, [x0, #848] -sqrdmulh v6.4S, v10.4S, v11.s[0] -mul v10.4S, v10.4S,v16.s[0] -mla v10.4S, v6.4S, v31.s[0] -sub v6.4s, v8.4s, v10.4s -add v8.4s, v8.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v11.s[0] -mul v3.4S, v3.4S,v16.s[0] -mla v3.4S, v10.4S, v31.s[0] -sub v10.4s, v9.4s, v3.4s -add v9.4s, v9.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v11.s[1] -mul v9.4S, v9.4S,v16.s[1] -mla v9.4S, v3.4S, v31.s[0] -sub v3.4s, v8.4s, v9.4s -add v8.4s, v8.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v11.s[2] -mul v10.4S, v10.4S,v16.s[2] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v6.4s, v10.4s -add v6.4s, v6.4s, v10.4s -trn1 v10.4S, v8.4S, v3.4S -trn2 v7.4S, v8.4S, v3.4S -trn1 v5.4S, v6.4S, v9.4S -trn2 v4.4S, v6.4S, v9.4S -trn2 v6.2D, v10.2D, v5.2D -trn2 v9.2D, v7.2D, v4.2D -trn1 v8.2D, v10.2D, v5.2D -trn1 v3.2D, v7.2D, v4.2D -sqrdmulh v4.4S, v6.4S, v1.4S -mul v6.4S, v6.4S,v17.4S -mla v6.4S, v4.4S, v31.s[0] -sub v4.4s, v8.4s, v6.4s -add v8.4s, v8.4s, v6.4s -sqrdmulh v6.4S, v9.4S, v1.4S -mul v9.4S, v9.4S,v17.4S -mla v9.4S, v6.4S, v31.s[0] -sub v6.4s, v3.4s, v9.4s -add v3.4s, v3.4s, v9.4s -sqrdmulh v9.4S, v3.4S, v12.4S -mul v3.4S, v3.4S,v22.4S -mla v3.4S, v9.4S, v31.s[0] -sub v9.4s, v8.4s, v3.4s -add v8.4s, v8.4s, v3.4s -sqrdmulh v3.4S, v6.4S, v21.4S -mul v6.4S, v6.4S,v2.4S -mla v6.4S, v3.4S, v31.s[0] -sub v3.4s, v4.4s, v6.4s -add v4.4s, v4.4s, v6.4s -str q8, [x0, #832] -str q9, [x0, #848] -str q4, [x0, #864] -str q3, [x0, #880] -ldr q3, [x17, #+1920] -ldr q4, [x17, #+1936] -ldr q9, [x17, #+1952] -ldr q8, [x17, #+1968] -ldr q6, [x17, #+1984] -ldr q7, [x17, #+2000] -ldr q5, [x17, #+2016] -ldr q10, [x17, #+2032] -ldr q21, [x0, #928] -ldr q2, [x0, #944] -ldr q12, [x0, #896] -ldr q22, [x0, #912] -sqrdmulh v1.4S, v21.4S, v4.s[0] -mul v21.4S, v21.4S,v3.s[0] -mla v21.4S, v1.4S, v31.s[0] -sub v1.4s, v12.4s, v21.4s -add v12.4s, v12.4s, v21.4s -sqrdmulh v21.4S, v2.4S, v4.s[0] -mul v2.4S, v2.4S,v3.s[0] -mla v2.4S, v21.4S, v31.s[0] -sub v21.4s, v22.4s, v2.4s -add v22.4s, v22.4s, v2.4s -sqrdmulh v2.4S, v22.4S, v4.s[1] -mul v22.4S, v22.4S,v3.s[1] -mla v22.4S, v2.4S, v31.s[0] -sub v2.4s, v12.4s, v22.4s -add v12.4s, v12.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v4.s[2] -mul v21.4S, v21.4S,v3.s[2] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v1.4s, v21.4s -add v1.4s, v1.4s, v21.4s -trn1 v21.4S, v12.4S, v2.4S -trn2 v17.4S, v12.4S, v2.4S -trn1 v11.4S, v1.4S, v22.4S -trn2 v16.4S, v1.4S, v22.4S -trn2 v1.2D, v21.2D, v11.2D -trn2 v22.2D, v17.2D, v16.2D -trn1 v12.2D, v21.2D, v11.2D -trn1 v2.2D, v17.2D, v16.2D -sqrdmulh v16.4S, v1.4S, v8.4S -mul v1.4S, v1.4S,v9.4S -mla v1.4S, v16.4S, v31.s[0] -sub v16.4s, v12.4s, v1.4s -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v22.4S, v8.4S -mul v22.4S, v22.4S,v9.4S -mla v22.4S, v1.4S, v31.s[0] -sub v1.4s, v2.4s, v22.4s -add v2.4s, v2.4s, v22.4s -sqrdmulh v22.4S, v2.4S, v7.4S -mul v2.4S, v2.4S,v6.4S -mla v2.4S, v22.4S, v31.s[0] -sub v22.4s, v12.4s, v2.4s -add v12.4s, v12.4s, v2.4s -sqrdmulh v2.4S, v1.4S, v10.4S -mul v1.4S, v1.4S,v5.4S -mla v1.4S, v2.4S, v31.s[0] -sub v2.4s, v16.4s, v1.4s -add v16.4s, v16.4s, v1.4s -str q12, [x0, #896] -str q22, [x0, #912] -str q16, [x0, #928] -str q2, [x0, #944] -ldr q2, [x17, #+2048] -ldr q16, [x17, #+2064] -ldr q22, [x17, #+2080] -ldr q12, [x17, #+2096] -ldr q1, [x17, #+2112] -ldr q17, [x17, #+2128] -ldr q11, [x17, #+2144] -ldr q21, [x17, #+2160] -ldr q10, [x0, #992] -ldr q5, [x0, #1008] -ldr q7, [x0, #960] -ldr q6, [x0, #976] -sqrdmulh v8.4S, v10.4S, v16.s[0] -mul v10.4S, v10.4S,v2.s[0] -mla v10.4S, v8.4S, v31.s[0] -sub v8.4s, v7.4s, v10.4s -add v7.4s, v7.4s, v10.4s -sqrdmulh v10.4S, v5.4S, v16.s[0] -mul v5.4S, v5.4S,v2.s[0] -mla v5.4S, v10.4S, v31.s[0] -sub v10.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v6.4S, v16.s[1] -mul v6.4S, v6.4S,v2.s[1] -mla v6.4S, v5.4S, v31.s[0] -sub v5.4s, v7.4s, v6.4s -add v7.4s, v7.4s, v6.4s -sqrdmulh v6.4S, v10.4S, v16.s[2] -mul v10.4S, v10.4S,v2.s[2] -mla v10.4S, v6.4S, v31.s[0] -sub v6.4s, v8.4s, v10.4s -add v8.4s, v8.4s, v10.4s -trn1 v10.4S, v7.4S, v5.4S -trn2 v9.4S, v7.4S, v5.4S -trn1 v4.4S, v8.4S, v6.4S -trn2 v3.4S, v8.4S, v6.4S -trn2 v8.2D, v10.2D, v4.2D -trn2 v6.2D, v9.2D, v3.2D -trn1 v7.2D, v10.2D, v4.2D -trn1 v5.2D, v9.2D, v3.2D -sqrdmulh v3.4S, v8.4S, v12.4S -mul v8.4S, v8.4S,v22.4S -mla v8.4S, v3.4S, v31.s[0] -sub v3.4s, v7.4s, v8.4s -add v7.4s, v7.4s, v8.4s -sqrdmulh v8.4S, v6.4S, v12.4S -mul v6.4S, v6.4S,v22.4S -mla v6.4S, v8.4S, v31.s[0] -sub v8.4s, v5.4s, v6.4s -add v5.4s, v5.4s, v6.4s -sqrdmulh v6.4S, v5.4S, v17.4S -mul v5.4S, v5.4S,v1.4S -mla v5.4S, v6.4S, v31.s[0] -sub v6.4s, v7.4s, v5.4s -add v7.4s, v7.4s, v5.4s -sqrdmulh v5.4S, v8.4S, v21.4S -mul v8.4S, v8.4S,v11.4S -mla v8.4S, v5.4S, v31.s[0] -sub v5.4s, v3.4s, v8.4s -add v3.4s, v3.4s, v8.4s -str q7, [x0, #960] -str q6, [x0, #976] -str q3, [x0, #992] -str q5, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2392 -// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_10_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_10_0.s deleted file mode 100644 index c97d115..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_10_0.s +++ /dev/null @@ -1,2486 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_10_0 -.global _ntt_u32_full_neon_asm_var_4_4_10_0 -ntt_u32_full_neon_asm_var_4_4_10_0: -_ntt_u32_full_neon_asm_var_4_4_10_0: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #928] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #992] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q18, [x0, #800] -sqrdmulh v17.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -ldr q16, [x0, #864] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -mla v22.4S, v21.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -mla v18.4S, v17.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -ldr q3, [x0, #544] -sqrdmulh v17.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -ldr q19, [x0, #608] -sqrdmulh v21.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q2, [x0, #672] -ldr q1, [x0, #416] -sqrdmulh v0.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v15.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -ldr q22, [x0, #736] -ldr q14, [x0, #480] -sqrdmulh v13.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v12.4s, v14.4s, v20.4s -add v14.4s, v14.4s, v20.4s -ldr q20, [x0, #288] -mla v3.4S, v17.4S, v31.s[0] -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v20.4s, v18.4s -mla v2.4S, v0.4S, v31.s[0] -mla v22.4S, v13.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -ldr q18, [x0, #352] -sqrdmulh v13.4S, v1.4S, v29.s[1] -mul v1.4S, v1.4S,v30.s[1] -sub v0.4s, v18.4s, v16.4s -sqrdmulh v17.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -add v18.4s, v18.4s, v16.4s -ldr q16, [x0, #32] -sqrdmulh v11.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v10.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -ldr q3, [x0, #96] -sqrdmulh v9.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v8.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -ldr q19, [x0, #160] -mla v1.4S, v13.4S, v31.s[0] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v19.4s, v2.4s -mla v20.4S, v11.4S, v31.s[0] -mla v18.4S, v9.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -ldr q2, [x0, #224] -sqrdmulh v9.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v11.4s, v2.4s, v22.4s -sqrdmulh v13.4S, v12.4S, v29.s[2] -mul v12.4S, v12.4S,v30.s[2] -add v2.4s, v2.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v7.4s, v19.4s, v1.4s -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v29.s[2] -mul v0.4S, v0.4S,v30.s[2] -sub v6.4s, v2.4s, v14.4s -add v2.4s, v2.4s, v14.4s -mla v15.4S, v9.4S, v31.s[0] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v16.4s, v20.4s -nop -mla v21.4S, v22.4S, v31.s[0] -mla v0.4S, v1.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -nop -sqrdmulh v20.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -sub v1.4s, v3.4s, v18.4s -nop -sqrdmulh v22.4S, v6.4S, v27.s[1] -mul v6.4S, v6.4S,v28.s[1] -add v3.4s, v3.4s, v18.4s -nop -sqrdmulh v18.4S, v19.4S, v27.s[0] -mul v19.4S, v19.4S,v28.s[0] -sub v9.4s, v17.4s, v15.4s -add v17.4s, v17.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v14.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -mla v7.4S, v20.4S, v31.s[0] -mla v6.4S, v22.4S, v31.s[0] -sub v22.4s, v10.4s, v21.4s -nop -mla v19.4S, v18.4S, v31.s[0] -mla v2.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v21.4s -nop -sqrdmulh v21.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v15.4s, v8.4s, v0.4s -nop -sqrdmulh v18.4S, v11.4S, v27.s[2] -mul v11.4S, v11.4S,v28.s[2] -add v8.4s, v8.4s, v0.4s -nop -sqrdmulh v0.4S, v9.4S, v27.s[3] -mul v9.4S, v9.4S,v28.s[3] -sub v20.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[3] -mul v14.4S, v14.4S,v28.s[3] -sub v12.4s, v1.4s, v6.4s -add v1.4s, v1.4s, v6.4s -mla v17.4S, v21.4S, v31.s[0] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v16.4s, v19.4s -nop -mla v9.4S, v0.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v16.4s, v16.4s, v19.4s -nop -sqrdmulh v19.4S, v1.4S, v25.s[2] -mul v1.4S, v1.4S,v26.s[2] -sub v7.4s, v3.4s, v2.4s -nop -sqrdmulh v0.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -add v3.4s, v3.4s, v2.4s -nop -sqrdmulh v2.4S, v7.4S, v25.s[1] -mul v7.4S, v7.4S,v26.s[1] -sub v21.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -sqrdmulh v17.4S, v3.4S, v25.s[0] -mul v3.4S, v3.4S,v26.s[0] -sub v6.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -mla v1.4S, v19.4S, v31.s[0] -mla v12.4S, v0.4S, v31.s[0] -sub v0.4s, v22.4s, v9.4s -nop -mla v7.4S, v2.4S, v31.s[0] -mla v3.4S, v17.4S, v31.s[0] -add v22.4s, v22.4s, v9.4s -nop -sqrdmulh v9.4S, v8.4S, v23.s[0] -mul v8.4S, v8.4S,v24.s[0] -sub v17.4s, v15.4s, v14.4s -nop -sqrdmulh v2.4S, v6.4S, v23.s[1] -mul v6.4S, v6.4S,v24.s[1] -add v15.4s, v15.4s, v14.4s -nop -sqrdmulh v14.4S, v15.4S, v23.s[2] -mul v15.4S, v15.4S,v24.s[2] -sub v19.4s, v13.4s, v1.4s -add v13.4s, v13.4s, v1.4s -sqrdmulh v1.4S, v17.4S, v23.s[3] -mul v17.4S, v17.4S,v24.s[3] -sub v11.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -mla v8.4S, v9.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -sub v2.4s, v18.4s, v7.4s -str q13, [x0, #288] -mla v15.4S, v14.4S, v31.s[0] -mla v17.4S, v1.4S, v31.s[0] -add v18.4s, v18.4s, v7.4s -str q19, [x0, #352] -ldr q19, [x0, #944] -sqrdmulh v7.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v1.4s, v16.4s, v3.4s -str q20, [x0, #416] -ldr q20, [x0, #1008] -sqrdmulh v14.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v16.4s, v16.4s, v3.4s -str q11, [x0, #480] -ldr q11, [x0, #816] -sqrdmulh v3.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -sub v13.4s, v10.4s, v8.4s -add v10.4s, v10.4s, v8.4s -ldr q8, [x0, #880] -sqrdmulh v9.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v12.4s, v21.4s, v6.4s -add v21.4s, v21.4s, v6.4s -mla v19.4S, v7.4S, v31.s[0] -mla v20.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v15.4s -str q18, [x0, #160] -mla v11.4S, v3.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -str q2, [x0, #224] -ldr q2, [x0, #560] -sqrdmulh v15.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v9.4s, v0.4s, v17.4s -str q16, [x0, #32] -ldr q16, [x0, #624] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -add v0.4s, v0.4s, v17.4s -str q1, [x0, #96] -ldr q1, [x0, #688] -ldr q17, [x0, #432] -sqrdmulh v18.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -sub v7.4s, v17.4s, v19.4s -add v17.4s, v17.4s, v19.4s -ldr q19, [x0, #752] -ldr q6, [x0, #496] -sqrdmulh v5.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v4.4s, v6.4s, v20.4s -add v6.4s, v6.4s, v20.4s -ldr q20, [x0, #304] -mla v2.4S, v15.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -sub v3.4s, v20.4s, v11.4s -str q10, [x0, #544] -mla v1.4S, v18.4S, v31.s[0] -mla v19.4S, v5.4S, v31.s[0] -add v20.4s, v20.4s, v11.4s -str q13, [x0, #608] -ldr q13, [x0, #368] -sqrdmulh v11.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v5.4s, v13.4s, v8.4s -str q21, [x0, #672] -sqrdmulh v21.4S, v6.4S, v29.s[1] -mul v6.4S, v6.4S,v30.s[1] -add v13.4s, v13.4s, v8.4s -str q12, [x0, #736] -ldr q12, [x0, #48] -sqrdmulh v8.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v18.4s, v12.4s, v2.4s -add v12.4s, v12.4s, v2.4s -ldr q2, [x0, #112] -sqrdmulh v10.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v15.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -ldr q16, [x0, #176] -mla v17.4S, v11.4S, v31.s[0] -mla v6.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v1.4s -str q22, [x0, #800] -mla v20.4S, v8.4S, v31.s[0] -mla v13.4S, v10.4S, v31.s[0] -add v16.4s, v16.4s, v1.4s -str q14, [x0, #864] -ldr q14, [x0, #240] -sqrdmulh v1.4S, v7.4S, v29.s[2] -mul v7.4S, v7.4S,v30.s[2] -sub v10.4s, v14.4s, v19.4s -str q0, [x0, #928] -sqrdmulh v0.4S, v4.4S, v29.s[2] -mul v4.4S, v4.4S,v30.s[2] -add v14.4s, v14.4s, v19.4s -str q9, [x0, #992] -sqrdmulh v9.4S, v3.4S, v29.s[2] -mul v3.4S, v3.4S,v30.s[2] -sub v19.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -sqrdmulh v17.4S, v5.4S, v29.s[2] -mul v5.4S, v5.4S,v30.s[2] -sub v8.4s, v14.4s, v6.4s -add v14.4s, v14.4s, v6.4s -mla v7.4S, v1.4S, v31.s[0] -mla v4.4S, v0.4S, v31.s[0] -sub v0.4s, v12.4s, v20.4s -nop -mla v3.4S, v9.4S, v31.s[0] -mla v5.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v20.4s -nop -sqrdmulh v20.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -sub v17.4s, v2.4s, v13.4s -nop -sqrdmulh v9.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -add v2.4s, v2.4s, v13.4s -nop -sqrdmulh v13.4S, v16.4S, v27.s[0] -mul v16.4S, v16.4S,v28.s[0] -sub v1.4s, v21.4s, v7.4s -add v21.4s, v21.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v6.4s, v10.4s, v4.4s -add v10.4s, v10.4s, v4.4s -mla v19.4S, v20.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -sub v9.4s, v18.4s, v3.4s -nop -mla v16.4S, v13.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -nop -sqrdmulh v3.4S, v21.4S, v27.s[2] -mul v21.4S, v21.4S,v28.s[2] -sub v7.4s, v15.4s, v5.4s -nop -sqrdmulh v13.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -add v15.4s, v15.4s, v5.4s -nop -sqrdmulh v5.4S, v1.4S, v27.s[3] -mul v1.4S, v1.4S,v28.s[3] -sub v20.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v27.s[3] -mul v6.4S, v6.4S,v28.s[3] -sub v4.4s, v17.4s, v8.4s -add v17.4s, v17.4s, v8.4s -mla v21.4S, v3.4S, v31.s[0] -mla v10.4S, v13.4S, v31.s[0] -sub v13.4s, v12.4s, v16.4s -nop -mla v1.4S, v5.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -nop -sqrdmulh v16.4S, v17.4S, v25.s[2] -mul v17.4S, v17.4S,v26.s[2] -sub v19.4s, v2.4s, v14.4s -nop -sqrdmulh v5.4S, v4.4S, v25.s[3] -mul v4.4S, v4.4S,v26.s[3] -add v2.4s, v2.4s, v14.4s -nop -sqrdmulh v14.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v3.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v15.4s, v10.4s -add v15.4s, v15.4s, v10.4s -mla v17.4S, v16.4S, v31.s[0] -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v9.4s, v1.4s -nop -mla v19.4S, v14.4S, v31.s[0] -mla v2.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -nop -sqrdmulh v1.4S, v15.4S, v23.s[0] -mul v15.4S, v15.4S,v24.s[0] -sub v21.4s, v7.4s, v6.4s -nop -sqrdmulh v14.4S, v8.4S, v23.s[1] -mul v8.4S, v8.4S,v24.s[1] -add v7.4s, v7.4s, v6.4s -nop -sqrdmulh v6.4S, v7.4S, v23.s[2] -mul v7.4S, v7.4S,v24.s[2] -sub v16.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v23.s[3] -mul v21.4S, v21.4S,v24.s[3] -sub v10.4s, v20.4s, v4.4s -add v20.4s, v20.4s, v4.4s -mla v15.4S, v1.4S, v31.s[0] -mla v8.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v19.4s -str q0, [x0, #304] -mla v7.4S, v6.4S, v31.s[0] -mla v21.4S, v17.4S, v31.s[0] -add v13.4s, v13.4s, v19.4s -str q16, [x0, #368] -ldr q16, [x0, #896] -sqrdmulh v19.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v17.4s, v12.4s, v2.4s -str q20, [x0, #432] -ldr q20, [x0, #960] -sqrdmulh v6.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v12.4s, v12.4s, v2.4s -str q10, [x0, #496] -ldr q10, [x0, #768] -sqrdmulh v2.4S, v10.4S, v29.s[0] -mul v10.4S, v10.4S,v30.s[0] -sub v0.4s, v18.4s, v15.4s -add v18.4s, v18.4s, v15.4s -ldr q15, [x0, #832] -sqrdmulh v1.4S, v15.4S, v29.s[0] -mul v15.4S, v15.4S,v30.s[0] -sub v4.4s, v3.4s, v8.4s -add v3.4s, v3.4s, v8.4s -mla v16.4S, v19.4S, v31.s[0] -mla v20.4S, v6.4S, v31.s[0] -sub v6.4s, v9.4s, v7.4s -str q13, [x0, #176] -mla v10.4S, v2.4S, v31.s[0] -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v7.4s -str q14, [x0, #240] -ldr q14, [x0, #512] -sqrdmulh v7.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v1.4s, v5.4s, v21.4s -str q12, [x0, #48] -ldr q12, [x0, #576] -sqrdmulh v2.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -add v5.4s, v5.4s, v21.4s -str q17, [x0, #112] -ldr q17, [x0, #640] -ldr q21, [x0, #384] -sqrdmulh v13.4S, v17.4S, v29.s[0] -mul v17.4S, v17.4S,v30.s[0] -sub v19.4s, v21.4s, v16.4s -add v21.4s, v21.4s, v16.4s -ldr q16, [x0, #704] -ldr q8, [x0, #448] -sqrdmulh v22.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v11.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -ldr q20, [x0, #256] -mla v14.4S, v7.4S, v31.s[0] -mla v12.4S, v2.4S, v31.s[0] -sub v2.4s, v20.4s, v10.4s -str q18, [x0, #560] -mla v17.4S, v13.4S, v31.s[0] -mla v16.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -str q0, [x0, #624] -ldr q0, [x0, #320] -sqrdmulh v10.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v22.4s, v0.4s, v15.4s -str q3, [x0, #688] -sqrdmulh v3.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -add v0.4s, v0.4s, v15.4s -str q4, [x0, #752] -ldr q4, [x0, #0] -sqrdmulh v15.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v13.4s, v4.4s, v14.4s -add v4.4s, v4.4s, v14.4s -ldr q14, [x0, #64] -sqrdmulh v18.4S, v0.4S, v29.s[1] -mul v0.4S, v0.4S,v30.s[1] -sub v7.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -ldr q12, [x0, #128] -mla v21.4S, v10.4S, v31.s[0] -mla v8.4S, v3.4S, v31.s[0] -sub v3.4s, v12.4s, v17.4s -str q9, [x0, #816] -mla v20.4S, v15.4S, v31.s[0] -mla v0.4S, v18.4S, v31.s[0] -add v12.4s, v12.4s, v17.4s -str q6, [x0, #880] -ldr q6, [x0, #192] -sqrdmulh v17.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v18.4s, v6.4s, v16.4s -str q5, [x0, #944] -sqrdmulh v5.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -add v6.4s, v6.4s, v16.4s -str q1, [x0, #1008] -sqrdmulh v1.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v16.4s, v12.4s, v21.4s -add v12.4s, v12.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v15.4s, v6.4s, v8.4s -add v6.4s, v6.4s, v8.4s -mla v19.4S, v17.4S, v31.s[0] -mla v11.4S, v5.4S, v31.s[0] -sub v5.4s, v4.4s, v20.4s -nop -mla v2.4S, v1.4S, v31.s[0] -mla v22.4S, v21.4S, v31.s[0] -add v4.4s, v4.4s, v20.4s -nop -sqrdmulh v20.4S, v16.4S, v27.s[1] -mul v16.4S, v16.4S,v28.s[1] -sub v21.4s, v14.4s, v0.4s -nop -sqrdmulh v1.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -add v14.4s, v14.4s, v0.4s -nop -sqrdmulh v0.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v17.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v27.s[0] -mul v6.4S, v6.4S,v28.s[0] -sub v8.4s, v18.4s, v11.4s -add v18.4s, v18.4s, v11.4s -mla v16.4S, v20.4S, v31.s[0] -mla v15.4S, v1.4S, v31.s[0] -sub v1.4s, v13.4s, v2.4s -nop -mla v12.4S, v0.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -nop -sqrdmulh v2.4S, v3.4S, v27.s[2] -mul v3.4S, v3.4S,v28.s[2] -sub v19.4s, v7.4s, v22.4s -nop -sqrdmulh v0.4S, v18.4S, v27.s[2] -mul v18.4S, v18.4S,v28.s[2] -add v7.4s, v7.4s, v22.4s -nop -sqrdmulh v22.4S, v17.4S, v27.s[3] -mul v17.4S, v17.4S,v28.s[3] -sub v20.4s, v5.4s, v16.4s -add v5.4s, v5.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v11.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -mla v3.4S, v2.4S, v31.s[0] -mla v18.4S, v0.4S, v31.s[0] -sub v0.4s, v4.4s, v12.4s -nop -mla v17.4S, v22.4S, v31.s[0] -mla v8.4S, v16.4S, v31.s[0] -add v4.4s, v4.4s, v12.4s -nop -sqrdmulh v12.4S, v21.4S, v25.s[2] -mul v21.4S, v21.4S,v26.s[2] -sub v16.4s, v14.4s, v6.4s -nop -sqrdmulh v22.4S, v11.4S, v25.s[3] -mul v11.4S, v11.4S,v26.s[3] -add v14.4s, v14.4s, v6.4s -nop -sqrdmulh v6.4S, v16.4S, v25.s[1] -mul v16.4S, v16.4S,v26.s[1] -sub v2.4s, v13.4s, v3.4s -add v13.4s, v13.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v25.s[0] -mul v14.4S, v14.4S,v26.s[0] -sub v15.4s, v7.4s, v18.4s -add v7.4s, v7.4s, v18.4s -mla v21.4S, v12.4S, v31.s[0] -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v1.4s, v17.4s -nop -mla v16.4S, v6.4S, v31.s[0] -mla v14.4S, v3.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -nop -sqrdmulh v17.4S, v7.4S, v23.s[0] -mul v7.4S, v7.4S,v24.s[0] -sub v3.4s, v19.4s, v8.4s -nop -sqrdmulh v6.4S, v15.4S, v23.s[1] -mul v15.4S, v15.4S,v24.s[1] -add v19.4s, v19.4s, v8.4s -nop -sqrdmulh v8.4S, v19.4S, v23.s[2] -mul v19.4S, v19.4S,v24.s[2] -sub v12.4s, v5.4s, v21.4s -add v5.4s, v5.4s, v21.4s -sqrdmulh v21.4S, v3.4S, v23.s[3] -mul v3.4S, v3.4S,v24.s[3] -sub v18.4s, v20.4s, v11.4s -add v20.4s, v20.4s, v11.4s -mla v7.4S, v17.4S, v31.s[0] -mla v15.4S, v6.4S, v31.s[0] -sub v6.4s, v0.4s, v16.4s -str q5, [x0, #256] -mla v19.4S, v8.4S, v31.s[0] -mla v3.4S, v21.4S, v31.s[0] -add v0.4s, v0.4s, v16.4s -str q12, [x0, #320] -ldr q12, [x0, #912] -sqrdmulh v16.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -sub v21.4s, v4.4s, v14.4s -str q20, [x0, #384] -ldr q20, [x0, #976] -sqrdmulh v8.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v4.4s, v4.4s, v14.4s -str q18, [x0, #448] -ldr q18, [x0, #784] -sqrdmulh v14.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -sub v5.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -ldr q7, [x0, #848] -sqrdmulh v17.4S, v7.4S, v29.s[0] -mul v7.4S, v7.4S,v30.s[0] -sub v11.4s, v2.4s, v15.4s -add v2.4s, v2.4s, v15.4s -mla v12.4S, v16.4S, v31.s[0] -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v1.4s, v19.4s -str q0, [x0, #128] -mla v18.4S, v14.4S, v31.s[0] -mla v7.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v19.4s -str q6, [x0, #192] -ldr q6, [x0, #528] -sqrdmulh v19.4S, v6.4S, v29.s[0] -mul v6.4S, v6.4S,v30.s[0] -sub v17.4s, v22.4s, v3.4s -str q4, [x0, #0] -ldr q4, [x0, #592] -sqrdmulh v14.4S, v4.4S, v29.s[0] -mul v4.4S, v4.4S,v30.s[0] -add v22.4s, v22.4s, v3.4s -str q21, [x0, #64] -ldr q21, [x0, #656] -ldr q3, [x0, #400] -sqrdmulh v0.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -sub v16.4s, v3.4s, v12.4s -add v3.4s, v3.4s, v12.4s -ldr q12, [x0, #720] -ldr q15, [x0, #464] -sqrdmulh v9.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -sub v10.4s, v15.4s, v20.4s -add v15.4s, v15.4s, v20.4s -ldr q20, [x0, #272] -mla v6.4S, v19.4S, v31.s[0] -mla v4.4S, v14.4S, v31.s[0] -sub v14.4s, v20.4s, v18.4s -str q13, [x0, #512] -mla v21.4S, v0.4S, v31.s[0] -mla v12.4S, v9.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -str q5, [x0, #576] -ldr q5, [x0, #336] -sqrdmulh v18.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v9.4s, v5.4s, v7.4s -str q2, [x0, #640] -sqrdmulh v2.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -add v5.4s, v5.4s, v7.4s -str q11, [x0, #704] -ldr q11, [x0, #16] -sqrdmulh v7.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v0.4s, v11.4s, v6.4s -add v11.4s, v11.4s, v6.4s -ldr q6, [x0, #80] -sqrdmulh v13.4S, v5.4S, v29.s[1] -mul v5.4S, v5.4S,v30.s[1] -sub v19.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -ldr q4, [x0, #144] -mla v3.4S, v18.4S, v31.s[0] -mla v15.4S, v2.4S, v31.s[0] -sub v2.4s, v4.4s, v21.4s -str q1, [x0, #768] -mla v20.4S, v7.4S, v31.s[0] -mla v5.4S, v13.4S, v31.s[0] -add v4.4s, v4.4s, v21.4s -str q8, [x0, #832] -ldr q8, [x0, #208] -sqrdmulh v21.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -sub v13.4s, v8.4s, v12.4s -str q22, [x0, #896] -sqrdmulh v22.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -add v8.4s, v8.4s, v12.4s -str q17, [x0, #960] -sqrdmulh v17.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v12.4s, v4.4s, v3.4s -add v4.4s, v4.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v7.4s, v8.4s, v15.4s -add v8.4s, v8.4s, v15.4s -mla v16.4S, v21.4S, v31.s[0] -mla v10.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v20.4s -nop -mla v14.4S, v17.4S, v31.s[0] -mla v9.4S, v3.4S, v31.s[0] -add v11.4s, v11.4s, v20.4s -nop -sqrdmulh v20.4S, v12.4S, v27.s[1] -mul v12.4S, v12.4S,v28.s[1] -sub v3.4s, v6.4s, v5.4s -nop -sqrdmulh v17.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -add v6.4s, v6.4s, v5.4s -nop -sqrdmulh v5.4S, v4.4S, v27.s[0] -mul v4.4S, v4.4S,v28.s[0] -sub v21.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v27.s[0] -mul v8.4S, v8.4S,v28.s[0] -sub v15.4s, v13.4s, v10.4s -add v13.4s, v13.4s, v10.4s -mla v12.4S, v20.4S, v31.s[0] -mla v7.4S, v17.4S, v31.s[0] -sub v17.4s, v0.4s, v14.4s -nop -mla v4.4S, v5.4S, v31.s[0] -mla v8.4S, v16.4S, v31.s[0] -add v0.4s, v0.4s, v14.4s -nop -sqrdmulh v14.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v16.4s, v19.4s, v9.4s -nop -sqrdmulh v5.4S, v13.4S, v27.s[2] -mul v13.4S, v13.4S,v28.s[2] -add v19.4s, v19.4s, v9.4s -nop -sqrdmulh v9.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v10.4s, v3.4s, v7.4s -add v3.4s, v3.4s, v7.4s -mla v2.4S, v14.4S, v31.s[0] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v11.4s, v4.4s -nop -mla v21.4S, v9.4S, v31.s[0] -mla v15.4S, v12.4S, v31.s[0] -add v11.4s, v11.4s, v4.4s -nop -sqrdmulh v4.4S, v3.4S, v25.s[2] -mul v3.4S, v3.4S,v26.s[2] -sub v12.4s, v6.4s, v8.4s -nop -sqrdmulh v9.4S, v10.4S, v25.s[3] -mul v10.4S, v10.4S,v26.s[3] -add v6.4s, v6.4s, v8.4s -nop -sqrdmulh v8.4S, v12.4S, v25.s[1] -mul v12.4S, v12.4S,v26.s[1] -sub v14.4s, v0.4s, v2.4s -add v0.4s, v0.4s, v2.4s -sqrdmulh v2.4S, v6.4S, v25.s[0] -mul v6.4S, v6.4S,v26.s[0] -sub v7.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -mla v3.4S, v4.4S, v31.s[0] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v17.4s, v21.4s -nop -mla v12.4S, v8.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -nop -sqrdmulh v21.4S, v19.4S, v23.s[0] -mul v19.4S, v19.4S,v24.s[0] -sub v2.4s, v16.4s, v15.4s -nop -sqrdmulh v8.4S, v7.4S, v23.s[1] -mul v7.4S, v7.4S,v24.s[1] -add v16.4s, v16.4s, v15.4s -nop -sqrdmulh v15.4S, v16.4S, v23.s[2] -mul v16.4S, v16.4S,v24.s[2] -sub v4.4s, v22.4s, v3.4s -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v2.4S, v23.s[3] -mul v2.4S, v2.4S,v24.s[3] -sub v13.4s, v20.4s, v10.4s -add v20.4s, v20.4s, v10.4s -mla v19.4S, v21.4S, v31.s[0] -mla v7.4S, v8.4S, v31.s[0] -sub v8.4s, v5.4s, v12.4s -str q22, [x0, #272] -mla v16.4S, v15.4S, v31.s[0] -mla v2.4S, v3.4S, v31.s[0] -add v5.4s, v5.4s, v12.4s -str q4, [x0, #336] -sub v23.4s, v11.4s, v6.4s -str q20, [x0, #400] -add v11.4s, v11.4s, v6.4s -str q13, [x0, #464] -sub v13.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sub v19.4s, v14.4s, v7.4s -add v14.4s, v14.4s, v7.4s -sub v7.4s, v17.4s, v16.4s -str q5, [x0, #144] -add v17.4s, v17.4s, v16.4s -str q8, [x0, #208] -sub v8.4s, v9.4s, v2.4s -str q11, [x0, #16] -add v9.4s, v9.4s, v2.4s -str q23, [x0, #80] -str q0, [x0, #528] -str q13, [x0, #592] -str q14, [x0, #656] -str q19, [x0, #720] -str q17, [x0, #784] -str q7, [x0, #848] -str q9, [x0, #912] -str q8, [x0, #976] -ldr q18, [x17, #+128] -ldr q1, [x17, #+144] -ldr q10, [x17, #+160] -ldr q21, [x17, #+176] -ldr q22, [x17, #+192] -ldr q15, [x17, #+208] -ldr q3, [x17, #+224] -ldr q12, [x17, #+240] -ldr q4, [x0, #32] -ldr q30, [x0, #48] -ldr q29, [x0, #0] -ldr q28, [x0, #16] -sqrdmulh v27.4S, v4.4S, v1.s[0] -mul v4.4S, v4.4S,v18.s[0] -mla v4.4S, v27.4S, v31.s[0] -sub v27.4s, v29.4s, v4.4s -add v29.4s, v29.4s, v4.4s -sqrdmulh v4.4S, v30.4S, v1.s[0] -mul v30.4S, v30.4S,v18.s[0] -mla v30.4S, v4.4S, v31.s[0] -sub v4.4s, v28.4s, v30.4s -add v28.4s, v28.4s, v30.4s -sqrdmulh v30.4S, v28.4S, v1.s[1] -mul v28.4S, v28.4S,v18.s[1] -mla v28.4S, v30.4S, v31.s[0] -sub v30.4s, v29.4s, v28.4s -add v29.4s, v29.4s, v28.4s -sqrdmulh v28.4S, v4.4S, v1.s[2] -mul v4.4S, v4.4S,v18.s[2] -mla v4.4S, v28.4S, v31.s[0] -sub v28.4s, v27.4s, v4.4s -add v27.4s, v27.4s, v4.4s -trn1 v4.4S, v29.4S, v30.4S -trn2 v26.4S, v29.4S, v30.4S -trn1 v25.4S, v27.4S, v28.4S -trn2 v24.4S, v27.4S, v28.4S -trn2 v27.2D, v4.2D, v25.2D -trn2 v28.2D, v26.2D, v24.2D -trn1 v29.2D, v4.2D, v25.2D -trn1 v30.2D, v26.2D, v24.2D -sqrdmulh v24.4S, v27.4S, v21.4S -mul v27.4S, v27.4S,v10.4S -mla v27.4S, v24.4S, v31.s[0] -sub v24.4s, v29.4s, v27.4s -add v29.4s, v29.4s, v27.4s -sqrdmulh v27.4S, v28.4S, v21.4S -mul v28.4S, v28.4S,v10.4S -mla v28.4S, v27.4S, v31.s[0] -sub v27.4s, v30.4s, v28.4s -add v30.4s, v30.4s, v28.4s -sqrdmulh v28.4S, v30.4S, v15.4S -mul v30.4S, v30.4S,v22.4S -mla v30.4S, v28.4S, v31.s[0] -sub v28.4s, v29.4s, v30.4s -add v29.4s, v29.4s, v30.4s -sqrdmulh v30.4S, v27.4S, v12.4S -mul v27.4S, v27.4S,v3.4S -mla v27.4S, v30.4S, v31.s[0] -sub v30.4s, v24.4s, v27.4s -add v24.4s, v24.4s, v27.4s -str q29, [x0, #0] -str q28, [x0, #16] -str q24, [x0, #32] -str q30, [x0, #48] -ldr q30, [x17, #+256] -ldr q24, [x17, #+272] -ldr q28, [x17, #+288] -ldr q29, [x17, #+304] -ldr q27, [x17, #+320] -ldr q26, [x17, #+336] -ldr q25, [x17, #+352] -ldr q4, [x17, #+368] -ldr q12, [x0, #96] -ldr q3, [x0, #112] -ldr q15, [x0, #64] -ldr q22, [x0, #80] -sqrdmulh v21.4S, v12.4S, v24.s[0] -mul v12.4S, v12.4S,v30.s[0] -mla v12.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v12.4s -add v15.4s, v15.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v24.s[0] -mul v3.4S, v3.4S,v30.s[0] -mla v3.4S, v12.4S, v31.s[0] -sub v12.4s, v22.4s, v3.4s -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v22.4S, v24.s[1] -mul v22.4S, v22.4S,v30.s[1] -mla v22.4S, v3.4S, v31.s[0] -sub v3.4s, v15.4s, v22.4s -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v12.4S, v24.s[2] -mul v12.4S, v12.4S,v30.s[2] -mla v12.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -trn1 v12.4S, v15.4S, v3.4S -trn2 v10.4S, v15.4S, v3.4S -trn1 v1.4S, v21.4S, v22.4S -trn2 v18.4S, v21.4S, v22.4S -trn2 v21.2D, v12.2D, v1.2D -trn2 v22.2D, v10.2D, v18.2D -trn1 v15.2D, v12.2D, v1.2D -trn1 v3.2D, v10.2D, v18.2D -sqrdmulh v18.4S, v21.4S, v29.4S -mul v21.4S, v21.4S,v28.4S -mla v21.4S, v18.4S, v31.s[0] -sub v18.4s, v15.4s, v21.4s -add v15.4s, v15.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v29.4S -mul v22.4S, v22.4S,v28.4S -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v3.4s, v22.4s -add v3.4s, v3.4s, v22.4s -sqrdmulh v22.4S, v3.4S, v26.4S -mul v3.4S, v3.4S,v27.4S -mla v3.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v3.4s -add v15.4s, v15.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v4.4S -mul v21.4S, v21.4S,v25.4S -mla v21.4S, v3.4S, v31.s[0] -sub v3.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -str q15, [x0, #64] -str q22, [x0, #80] -str q18, [x0, #96] -str q3, [x0, #112] -ldr q3, [x17, #+384] -ldr q18, [x17, #+400] -ldr q22, [x17, #+416] -ldr q15, [x17, #+432] -ldr q21, [x17, #+448] -ldr q10, [x17, #+464] -ldr q1, [x17, #+480] -ldr q12, [x17, #+496] -ldr q4, [x0, #160] -ldr q25, [x0, #176] -ldr q26, [x0, #128] -ldr q27, [x0, #144] -sqrdmulh v29.4S, v4.4S, v18.s[0] -mul v4.4S, v4.4S,v3.s[0] -mla v4.4S, v29.4S, v31.s[0] -sub v29.4s, v26.4s, v4.4s -add v26.4s, v26.4s, v4.4s -sqrdmulh v4.4S, v25.4S, v18.s[0] -mul v25.4S, v25.4S,v3.s[0] -mla v25.4S, v4.4S, v31.s[0] -sub v4.4s, v27.4s, v25.4s -add v27.4s, v27.4s, v25.4s -sqrdmulh v25.4S, v27.4S, v18.s[1] -mul v27.4S, v27.4S,v3.s[1] -mla v27.4S, v25.4S, v31.s[0] -sub v25.4s, v26.4s, v27.4s -add v26.4s, v26.4s, v27.4s -sqrdmulh v27.4S, v4.4S, v18.s[2] -mul v4.4S, v4.4S,v3.s[2] -mla v4.4S, v27.4S, v31.s[0] -sub v27.4s, v29.4s, v4.4s -add v29.4s, v29.4s, v4.4s -trn1 v4.4S, v26.4S, v25.4S -trn2 v28.4S, v26.4S, v25.4S -trn1 v24.4S, v29.4S, v27.4S -trn2 v30.4S, v29.4S, v27.4S -trn2 v29.2D, v4.2D, v24.2D -trn2 v27.2D, v28.2D, v30.2D -trn1 v26.2D, v4.2D, v24.2D -trn1 v25.2D, v28.2D, v30.2D -sqrdmulh v30.4S, v29.4S, v15.4S -mul v29.4S, v29.4S,v22.4S -mla v29.4S, v30.4S, v31.s[0] -sub v30.4s, v26.4s, v29.4s -add v26.4s, v26.4s, v29.4s -sqrdmulh v29.4S, v27.4S, v15.4S -mul v27.4S, v27.4S,v22.4S -mla v27.4S, v29.4S, v31.s[0] -sub v29.4s, v25.4s, v27.4s -add v25.4s, v25.4s, v27.4s -sqrdmulh v27.4S, v25.4S, v10.4S -mul v25.4S, v25.4S,v21.4S -mla v25.4S, v27.4S, v31.s[0] -sub v27.4s, v26.4s, v25.4s -add v26.4s, v26.4s, v25.4s -sqrdmulh v25.4S, v29.4S, v12.4S -mul v29.4S, v29.4S,v1.4S -mla v29.4S, v25.4S, v31.s[0] -sub v25.4s, v30.4s, v29.4s -add v30.4s, v30.4s, v29.4s -str q26, [x0, #128] -str q27, [x0, #144] -str q30, [x0, #160] -str q25, [x0, #176] -ldr q25, [x17, #+512] -ldr q30, [x17, #+528] -ldr q27, [x17, #+544] -ldr q26, [x17, #+560] -ldr q29, [x17, #+576] -ldr q28, [x17, #+592] -ldr q24, [x17, #+608] -ldr q4, [x17, #+624] -ldr q12, [x0, #224] -ldr q1, [x0, #240] -ldr q10, [x0, #192] -ldr q21, [x0, #208] -sqrdmulh v15.4S, v12.4S, v30.s[0] -mul v12.4S, v12.4S,v25.s[0] -mla v12.4S, v15.4S, v31.s[0] -sub v15.4s, v10.4s, v12.4s -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v1.4S, v30.s[0] -mul v1.4S, v1.4S,v25.s[0] -mla v1.4S, v12.4S, v31.s[0] -sub v12.4s, v21.4s, v1.4s -add v21.4s, v21.4s, v1.4s -sqrdmulh v1.4S, v21.4S, v30.s[1] -mul v21.4S, v21.4S,v25.s[1] -mla v21.4S, v1.4S, v31.s[0] -sub v1.4s, v10.4s, v21.4s -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v12.4S, v30.s[2] -mul v12.4S, v12.4S,v25.s[2] -mla v12.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v12.4s -add v15.4s, v15.4s, v12.4s -trn1 v12.4S, v10.4S, v1.4S -trn2 v22.4S, v10.4S, v1.4S -trn1 v18.4S, v15.4S, v21.4S -trn2 v3.4S, v15.4S, v21.4S -trn2 v15.2D, v12.2D, v18.2D -trn2 v21.2D, v22.2D, v3.2D -trn1 v10.2D, v12.2D, v18.2D -trn1 v1.2D, v22.2D, v3.2D -sqrdmulh v3.4S, v15.4S, v26.4S -mul v15.4S, v15.4S,v27.4S -mla v15.4S, v3.4S, v31.s[0] -sub v3.4s, v10.4s, v15.4s -add v10.4s, v10.4s, v15.4s -sqrdmulh v15.4S, v21.4S, v26.4S -mul v21.4S, v21.4S,v27.4S -mla v21.4S, v15.4S, v31.s[0] -sub v15.4s, v1.4s, v21.4s -add v1.4s, v1.4s, v21.4s -sqrdmulh v21.4S, v1.4S, v28.4S -mul v1.4S, v1.4S,v29.4S -mla v1.4S, v21.4S, v31.s[0] -sub v21.4s, v10.4s, v1.4s -add v10.4s, v10.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v4.4S -mul v15.4S, v15.4S,v24.4S -mla v15.4S, v1.4S, v31.s[0] -sub v1.4s, v3.4s, v15.4s -add v3.4s, v3.4s, v15.4s -str q10, [x0, #192] -str q21, [x0, #208] -str q3, [x0, #224] -str q1, [x0, #240] -ldr q1, [x17, #+640] -ldr q3, [x17, #+656] -ldr q21, [x17, #+672] -ldr q10, [x17, #+688] -ldr q15, [x17, #+704] -ldr q22, [x17, #+720] -ldr q18, [x17, #+736] -ldr q12, [x17, #+752] -ldr q4, [x0, #288] -ldr q24, [x0, #304] -ldr q28, [x0, #256] -ldr q29, [x0, #272] -sqrdmulh v26.4S, v4.4S, v3.s[0] -mul v4.4S, v4.4S,v1.s[0] -mla v4.4S, v26.4S, v31.s[0] -sub v26.4s, v28.4s, v4.4s -add v28.4s, v28.4s, v4.4s -sqrdmulh v4.4S, v24.4S, v3.s[0] -mul v24.4S, v24.4S,v1.s[0] -mla v24.4S, v4.4S, v31.s[0] -sub v4.4s, v29.4s, v24.4s -add v29.4s, v29.4s, v24.4s -sqrdmulh v24.4S, v29.4S, v3.s[1] -mul v29.4S, v29.4S,v1.s[1] -mla v29.4S, v24.4S, v31.s[0] -sub v24.4s, v28.4s, v29.4s -add v28.4s, v28.4s, v29.4s -sqrdmulh v29.4S, v4.4S, v3.s[2] -mul v4.4S, v4.4S,v1.s[2] -mla v4.4S, v29.4S, v31.s[0] -sub v29.4s, v26.4s, v4.4s -add v26.4s, v26.4s, v4.4s -trn1 v4.4S, v28.4S, v24.4S -trn2 v27.4S, v28.4S, v24.4S -trn1 v30.4S, v26.4S, v29.4S -trn2 v25.4S, v26.4S, v29.4S -trn2 v26.2D, v4.2D, v30.2D -trn2 v29.2D, v27.2D, v25.2D -trn1 v28.2D, v4.2D, v30.2D -trn1 v24.2D, v27.2D, v25.2D -sqrdmulh v25.4S, v26.4S, v10.4S -mul v26.4S, v26.4S,v21.4S -mla v26.4S, v25.4S, v31.s[0] -sub v25.4s, v28.4s, v26.4s -add v28.4s, v28.4s, v26.4s -sqrdmulh v26.4S, v29.4S, v10.4S -mul v29.4S, v29.4S,v21.4S -mla v29.4S, v26.4S, v31.s[0] -sub v26.4s, v24.4s, v29.4s -add v24.4s, v24.4s, v29.4s -sqrdmulh v29.4S, v24.4S, v22.4S -mul v24.4S, v24.4S,v15.4S -mla v24.4S, v29.4S, v31.s[0] -sub v29.4s, v28.4s, v24.4s -add v28.4s, v28.4s, v24.4s -sqrdmulh v24.4S, v26.4S, v12.4S -mul v26.4S, v26.4S,v18.4S -mla v26.4S, v24.4S, v31.s[0] -sub v24.4s, v25.4s, v26.4s -add v25.4s, v25.4s, v26.4s -str q28, [x0, #256] -str q29, [x0, #272] -str q25, [x0, #288] -str q24, [x0, #304] -ldr q24, [x17, #+768] -ldr q25, [x17, #+784] -ldr q29, [x17, #+800] -ldr q28, [x17, #+816] -ldr q26, [x17, #+832] -ldr q27, [x17, #+848] -ldr q30, [x17, #+864] -ldr q4, [x17, #+880] -ldr q12, [x0, #352] -ldr q18, [x0, #368] -ldr q22, [x0, #320] -ldr q15, [x0, #336] -sqrdmulh v10.4S, v12.4S, v25.s[0] -mul v12.4S, v12.4S,v24.s[0] -mla v12.4S, v10.4S, v31.s[0] -sub v10.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v18.4S, v25.s[0] -mul v18.4S, v18.4S,v24.s[0] -mla v18.4S, v12.4S, v31.s[0] -sub v12.4s, v15.4s, v18.4s -add v15.4s, v15.4s, v18.4s -sqrdmulh v18.4S, v15.4S, v25.s[1] -mul v15.4S, v15.4S,v24.s[1] -mla v15.4S, v18.4S, v31.s[0] -sub v18.4s, v22.4s, v15.4s -add v22.4s, v22.4s, v15.4s -sqrdmulh v15.4S, v12.4S, v25.s[2] -mul v12.4S, v12.4S,v24.s[2] -mla v12.4S, v15.4S, v31.s[0] -sub v15.4s, v10.4s, v12.4s -add v10.4s, v10.4s, v12.4s -trn1 v12.4S, v22.4S, v18.4S -trn2 v21.4S, v22.4S, v18.4S -trn1 v3.4S, v10.4S, v15.4S -trn2 v1.4S, v10.4S, v15.4S -trn2 v10.2D, v12.2D, v3.2D -trn2 v15.2D, v21.2D, v1.2D -trn1 v22.2D, v12.2D, v3.2D -trn1 v18.2D, v21.2D, v1.2D -sqrdmulh v1.4S, v10.4S, v28.4S -mul v10.4S, v10.4S,v29.4S -mla v10.4S, v1.4S, v31.s[0] -sub v1.4s, v22.4s, v10.4s -add v22.4s, v22.4s, v10.4s -sqrdmulh v10.4S, v15.4S, v28.4S -mul v15.4S, v15.4S,v29.4S -mla v15.4S, v10.4S, v31.s[0] -sub v10.4s, v18.4s, v15.4s -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v18.4S, v27.4S -mul v18.4S, v18.4S,v26.4S -mla v18.4S, v15.4S, v31.s[0] -sub v15.4s, v22.4s, v18.4s -add v22.4s, v22.4s, v18.4s -sqrdmulh v18.4S, v10.4S, v4.4S -mul v10.4S, v10.4S,v30.4S -mla v10.4S, v18.4S, v31.s[0] -sub v18.4s, v1.4s, v10.4s -add v1.4s, v1.4s, v10.4s -str q22, [x0, #320] -str q15, [x0, #336] -str q1, [x0, #352] -str q18, [x0, #368] -ldr q18, [x17, #+896] -ldr q1, [x17, #+912] -ldr q15, [x17, #+928] -ldr q22, [x17, #+944] -ldr q10, [x17, #+960] -ldr q21, [x17, #+976] -ldr q3, [x17, #+992] -ldr q12, [x17, #+1008] -ldr q4, [x0, #416] -ldr q30, [x0, #432] -ldr q27, [x0, #384] -ldr q26, [x0, #400] -sqrdmulh v28.4S, v4.4S, v1.s[0] -mul v4.4S, v4.4S,v18.s[0] -mla v4.4S, v28.4S, v31.s[0] -sub v28.4s, v27.4s, v4.4s -add v27.4s, v27.4s, v4.4s -sqrdmulh v4.4S, v30.4S, v1.s[0] -mul v30.4S, v30.4S,v18.s[0] -mla v30.4S, v4.4S, v31.s[0] -sub v4.4s, v26.4s, v30.4s -add v26.4s, v26.4s, v30.4s -sqrdmulh v30.4S, v26.4S, v1.s[1] -mul v26.4S, v26.4S,v18.s[1] -mla v26.4S, v30.4S, v31.s[0] -sub v30.4s, v27.4s, v26.4s -add v27.4s, v27.4s, v26.4s -sqrdmulh v26.4S, v4.4S, v1.s[2] -mul v4.4S, v4.4S,v18.s[2] -mla v4.4S, v26.4S, v31.s[0] -sub v26.4s, v28.4s, v4.4s -add v28.4s, v28.4s, v4.4s -trn1 v4.4S, v27.4S, v30.4S -trn2 v29.4S, v27.4S, v30.4S -trn1 v25.4S, v28.4S, v26.4S -trn2 v24.4S, v28.4S, v26.4S -trn2 v28.2D, v4.2D, v25.2D -trn2 v26.2D, v29.2D, v24.2D -trn1 v27.2D, v4.2D, v25.2D -trn1 v30.2D, v29.2D, v24.2D -sqrdmulh v24.4S, v28.4S, v22.4S -mul v28.4S, v28.4S,v15.4S -mla v28.4S, v24.4S, v31.s[0] -sub v24.4s, v27.4s, v28.4s -add v27.4s, v27.4s, v28.4s -sqrdmulh v28.4S, v26.4S, v22.4S -mul v26.4S, v26.4S,v15.4S -mla v26.4S, v28.4S, v31.s[0] -sub v28.4s, v30.4s, v26.4s -add v30.4s, v30.4s, v26.4s -sqrdmulh v26.4S, v30.4S, v21.4S -mul v30.4S, v30.4S,v10.4S -mla v30.4S, v26.4S, v31.s[0] -sub v26.4s, v27.4s, v30.4s -add v27.4s, v27.4s, v30.4s -sqrdmulh v30.4S, v28.4S, v12.4S -mul v28.4S, v28.4S,v3.4S -mla v28.4S, v30.4S, v31.s[0] -sub v30.4s, v24.4s, v28.4s -add v24.4s, v24.4s, v28.4s -str q27, [x0, #384] -str q26, [x0, #400] -str q24, [x0, #416] -str q30, [x0, #432] -ldr q30, [x17, #+1024] -ldr q24, [x17, #+1040] -ldr q26, [x17, #+1056] -ldr q27, [x17, #+1072] -ldr q28, [x17, #+1088] -ldr q29, [x17, #+1104] -ldr q25, [x17, #+1120] -ldr q4, [x17, #+1136] -ldr q12, [x0, #480] -ldr q3, [x0, #496] -ldr q21, [x0, #448] -ldr q10, [x0, #464] -sqrdmulh v22.4S, v12.4S, v24.s[0] -mul v12.4S, v12.4S,v30.s[0] -mla v12.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v24.s[0] -mul v3.4S, v3.4S,v30.s[0] -mla v3.4S, v12.4S, v31.s[0] -sub v12.4s, v10.4s, v3.4s -add v10.4s, v10.4s, v3.4s -sqrdmulh v3.4S, v10.4S, v24.s[1] -mul v10.4S, v10.4S,v30.s[1] -mla v10.4S, v3.4S, v31.s[0] -sub v3.4s, v21.4s, v10.4s -add v21.4s, v21.4s, v10.4s -sqrdmulh v10.4S, v12.4S, v24.s[2] -mul v12.4S, v12.4S,v30.s[2] -mla v12.4S, v10.4S, v31.s[0] -sub v10.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -trn1 v12.4S, v21.4S, v3.4S -trn2 v15.4S, v21.4S, v3.4S -trn1 v1.4S, v22.4S, v10.4S -trn2 v18.4S, v22.4S, v10.4S -trn2 v22.2D, v12.2D, v1.2D -trn2 v10.2D, v15.2D, v18.2D -trn1 v21.2D, v12.2D, v1.2D -trn1 v3.2D, v15.2D, v18.2D -sqrdmulh v18.4S, v22.4S, v27.4S -mul v22.4S, v22.4S,v26.4S -mla v22.4S, v18.4S, v31.s[0] -sub v18.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v10.4S, v27.4S -mul v10.4S, v10.4S,v26.4S -mla v10.4S, v22.4S, v31.s[0] -sub v22.4s, v3.4s, v10.4s -add v3.4s, v3.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v29.4S -mul v3.4S, v3.4S,v28.4S -mla v3.4S, v10.4S, v31.s[0] -sub v10.4s, v21.4s, v3.4s -add v21.4s, v21.4s, v3.4s -sqrdmulh v3.4S, v22.4S, v4.4S -mul v22.4S, v22.4S,v25.4S -mla v22.4S, v3.4S, v31.s[0] -sub v3.4s, v18.4s, v22.4s -add v18.4s, v18.4s, v22.4s -str q21, [x0, #448] -str q10, [x0, #464] -str q18, [x0, #480] -str q3, [x0, #496] -ldr q3, [x17, #+1152] -ldr q18, [x17, #+1168] -ldr q10, [x17, #+1184] -ldr q21, [x17, #+1200] -ldr q22, [x17, #+1216] -ldr q15, [x17, #+1232] -ldr q1, [x17, #+1248] -ldr q12, [x17, #+1264] -ldr q4, [x0, #544] -ldr q25, [x0, #560] -ldr q29, [x0, #512] -ldr q28, [x0, #528] -sqrdmulh v27.4S, v4.4S, v18.s[0] -mul v4.4S, v4.4S,v3.s[0] -mla v4.4S, v27.4S, v31.s[0] -sub v27.4s, v29.4s, v4.4s -add v29.4s, v29.4s, v4.4s -sqrdmulh v4.4S, v25.4S, v18.s[0] -mul v25.4S, v25.4S,v3.s[0] -mla v25.4S, v4.4S, v31.s[0] -sub v4.4s, v28.4s, v25.4s -add v28.4s, v28.4s, v25.4s -sqrdmulh v25.4S, v28.4S, v18.s[1] -mul v28.4S, v28.4S,v3.s[1] -mla v28.4S, v25.4S, v31.s[0] -sub v25.4s, v29.4s, v28.4s -add v29.4s, v29.4s, v28.4s -sqrdmulh v28.4S, v4.4S, v18.s[2] -mul v4.4S, v4.4S,v3.s[2] -mla v4.4S, v28.4S, v31.s[0] -sub v28.4s, v27.4s, v4.4s -add v27.4s, v27.4s, v4.4s -trn1 v4.4S, v29.4S, v25.4S -trn2 v26.4S, v29.4S, v25.4S -trn1 v24.4S, v27.4S, v28.4S -trn2 v30.4S, v27.4S, v28.4S -trn2 v27.2D, v4.2D, v24.2D -trn2 v28.2D, v26.2D, v30.2D -trn1 v29.2D, v4.2D, v24.2D -trn1 v25.2D, v26.2D, v30.2D -sqrdmulh v30.4S, v27.4S, v21.4S -mul v27.4S, v27.4S,v10.4S -mla v27.4S, v30.4S, v31.s[0] -sub v30.4s, v29.4s, v27.4s -add v29.4s, v29.4s, v27.4s -sqrdmulh v27.4S, v28.4S, v21.4S -mul v28.4S, v28.4S,v10.4S -mla v28.4S, v27.4S, v31.s[0] -sub v27.4s, v25.4s, v28.4s -add v25.4s, v25.4s, v28.4s -sqrdmulh v28.4S, v25.4S, v15.4S -mul v25.4S, v25.4S,v22.4S -mla v25.4S, v28.4S, v31.s[0] -sub v28.4s, v29.4s, v25.4s -add v29.4s, v29.4s, v25.4s -sqrdmulh v25.4S, v27.4S, v12.4S -mul v27.4S, v27.4S,v1.4S -mla v27.4S, v25.4S, v31.s[0] -sub v25.4s, v30.4s, v27.4s -add v30.4s, v30.4s, v27.4s -str q29, [x0, #512] -str q28, [x0, #528] -str q30, [x0, #544] -str q25, [x0, #560] -ldr q25, [x17, #+1280] -ldr q30, [x17, #+1296] -ldr q28, [x17, #+1312] -ldr q29, [x17, #+1328] -ldr q27, [x17, #+1344] -ldr q26, [x17, #+1360] -ldr q24, [x17, #+1376] -ldr q4, [x17, #+1392] -ldr q12, [x0, #608] -ldr q1, [x0, #624] -ldr q15, [x0, #576] -ldr q22, [x0, #592] -sqrdmulh v21.4S, v12.4S, v30.s[0] -mul v12.4S, v12.4S,v25.s[0] -mla v12.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v12.4s -add v15.4s, v15.4s, v12.4s -sqrdmulh v12.4S, v1.4S, v30.s[0] -mul v1.4S, v1.4S,v25.s[0] -mla v1.4S, v12.4S, v31.s[0] -sub v12.4s, v22.4s, v1.4s -add v22.4s, v22.4s, v1.4s -sqrdmulh v1.4S, v22.4S, v30.s[1] -mul v22.4S, v22.4S,v25.s[1] -mla v22.4S, v1.4S, v31.s[0] -sub v1.4s, v15.4s, v22.4s -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v12.4S, v30.s[2] -mul v12.4S, v12.4S,v25.s[2] -mla v12.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -trn1 v12.4S, v15.4S, v1.4S -trn2 v10.4S, v15.4S, v1.4S -trn1 v18.4S, v21.4S, v22.4S -trn2 v3.4S, v21.4S, v22.4S -trn2 v21.2D, v12.2D, v18.2D -trn2 v22.2D, v10.2D, v3.2D -trn1 v15.2D, v12.2D, v18.2D -trn1 v1.2D, v10.2D, v3.2D -sqrdmulh v3.4S, v21.4S, v29.4S -mul v21.4S, v21.4S,v28.4S -mla v21.4S, v3.4S, v31.s[0] -sub v3.4s, v15.4s, v21.4s -add v15.4s, v15.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v29.4S -mul v22.4S, v22.4S,v28.4S -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v1.4S, v26.4S -mul v1.4S, v1.4S,v27.4S -mla v1.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v1.4s -add v15.4s, v15.4s, v1.4s -sqrdmulh v1.4S, v21.4S, v4.4S -mul v21.4S, v21.4S,v24.4S -mla v21.4S, v1.4S, v31.s[0] -sub v1.4s, v3.4s, v21.4s -add v3.4s, v3.4s, v21.4s -str q15, [x0, #576] -str q22, [x0, #592] -str q3, [x0, #608] -str q1, [x0, #624] -ldr q1, [x17, #+1408] -ldr q3, [x17, #+1424] -ldr q22, [x17, #+1440] -ldr q15, [x17, #+1456] -ldr q21, [x17, #+1472] -ldr q10, [x17, #+1488] -ldr q18, [x17, #+1504] -ldr q12, [x17, #+1520] -ldr q4, [x0, #672] -ldr q24, [x0, #688] -ldr q26, [x0, #640] -ldr q27, [x0, #656] -sqrdmulh v29.4S, v4.4S, v3.s[0] -mul v4.4S, v4.4S,v1.s[0] -mla v4.4S, v29.4S, v31.s[0] -sub v29.4s, v26.4s, v4.4s -add v26.4s, v26.4s, v4.4s -sqrdmulh v4.4S, v24.4S, v3.s[0] -mul v24.4S, v24.4S,v1.s[0] -mla v24.4S, v4.4S, v31.s[0] -sub v4.4s, v27.4s, v24.4s -add v27.4s, v27.4s, v24.4s -sqrdmulh v24.4S, v27.4S, v3.s[1] -mul v27.4S, v27.4S,v1.s[1] -mla v27.4S, v24.4S, v31.s[0] -sub v24.4s, v26.4s, v27.4s -add v26.4s, v26.4s, v27.4s -sqrdmulh v27.4S, v4.4S, v3.s[2] -mul v4.4S, v4.4S,v1.s[2] -mla v4.4S, v27.4S, v31.s[0] -sub v27.4s, v29.4s, v4.4s -add v29.4s, v29.4s, v4.4s -trn1 v4.4S, v26.4S, v24.4S -trn2 v28.4S, v26.4S, v24.4S -trn1 v30.4S, v29.4S, v27.4S -trn2 v25.4S, v29.4S, v27.4S -trn2 v29.2D, v4.2D, v30.2D -trn2 v27.2D, v28.2D, v25.2D -trn1 v26.2D, v4.2D, v30.2D -trn1 v24.2D, v28.2D, v25.2D -sqrdmulh v25.4S, v29.4S, v15.4S -mul v29.4S, v29.4S,v22.4S -mla v29.4S, v25.4S, v31.s[0] -sub v25.4s, v26.4s, v29.4s -add v26.4s, v26.4s, v29.4s -sqrdmulh v29.4S, v27.4S, v15.4S -mul v27.4S, v27.4S,v22.4S -mla v27.4S, v29.4S, v31.s[0] -sub v29.4s, v24.4s, v27.4s -add v24.4s, v24.4s, v27.4s -sqrdmulh v27.4S, v24.4S, v10.4S -mul v24.4S, v24.4S,v21.4S -mla v24.4S, v27.4S, v31.s[0] -sub v27.4s, v26.4s, v24.4s -add v26.4s, v26.4s, v24.4s -sqrdmulh v24.4S, v29.4S, v12.4S -mul v29.4S, v29.4S,v18.4S -mla v29.4S, v24.4S, v31.s[0] -sub v24.4s, v25.4s, v29.4s -add v25.4s, v25.4s, v29.4s -str q26, [x0, #640] -str q27, [x0, #656] -str q25, [x0, #672] -str q24, [x0, #688] -ldr q24, [x17, #+1536] -ldr q25, [x17, #+1552] -ldr q27, [x17, #+1568] -ldr q26, [x17, #+1584] -ldr q29, [x17, #+1600] -ldr q28, [x17, #+1616] -ldr q30, [x17, #+1632] -ldr q4, [x17, #+1648] -ldr q12, [x0, #736] -ldr q18, [x0, #752] -ldr q10, [x0, #704] -ldr q21, [x0, #720] -sqrdmulh v15.4S, v12.4S, v25.s[0] -mul v12.4S, v12.4S,v24.s[0] -mla v12.4S, v15.4S, v31.s[0] -sub v15.4s, v10.4s, v12.4s -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v18.4S, v25.s[0] -mul v18.4S, v18.4S,v24.s[0] -mla v18.4S, v12.4S, v31.s[0] -sub v12.4s, v21.4s, v18.4s -add v21.4s, v21.4s, v18.4s -sqrdmulh v18.4S, v21.4S, v25.s[1] -mul v21.4S, v21.4S,v24.s[1] -mla v21.4S, v18.4S, v31.s[0] -sub v18.4s, v10.4s, v21.4s -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v12.4S, v25.s[2] -mul v12.4S, v12.4S,v24.s[2] -mla v12.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v12.4s -add v15.4s, v15.4s, v12.4s -trn1 v12.4S, v10.4S, v18.4S -trn2 v22.4S, v10.4S, v18.4S -trn1 v3.4S, v15.4S, v21.4S -trn2 v1.4S, v15.4S, v21.4S -trn2 v15.2D, v12.2D, v3.2D -trn2 v21.2D, v22.2D, v1.2D -trn1 v10.2D, v12.2D, v3.2D -trn1 v18.2D, v22.2D, v1.2D -sqrdmulh v1.4S, v15.4S, v26.4S -mul v15.4S, v15.4S,v27.4S -mla v15.4S, v1.4S, v31.s[0] -sub v1.4s, v10.4s, v15.4s -add v10.4s, v10.4s, v15.4s -sqrdmulh v15.4S, v21.4S, v26.4S -mul v21.4S, v21.4S,v27.4S -mla v21.4S, v15.4S, v31.s[0] -sub v15.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v18.4S, v28.4S -mul v18.4S, v18.4S,v29.4S -mla v18.4S, v21.4S, v31.s[0] -sub v21.4s, v10.4s, v18.4s -add v10.4s, v10.4s, v18.4s -sqrdmulh v18.4S, v15.4S, v4.4S -mul v15.4S, v15.4S,v30.4S -mla v15.4S, v18.4S, v31.s[0] -sub v18.4s, v1.4s, v15.4s -add v1.4s, v1.4s, v15.4s -str q10, [x0, #704] -str q21, [x0, #720] -str q1, [x0, #736] -str q18, [x0, #752] -ldr q18, [x17, #+1664] -ldr q1, [x17, #+1680] -ldr q21, [x17, #+1696] -ldr q10, [x17, #+1712] -ldr q15, [x17, #+1728] -ldr q22, [x17, #+1744] -ldr q3, [x17, #+1760] -ldr q12, [x17, #+1776] -ldr q4, [x0, #800] -ldr q30, [x0, #816] -ldr q28, [x0, #768] -ldr q29, [x0, #784] -sqrdmulh v26.4S, v4.4S, v1.s[0] -mul v4.4S, v4.4S,v18.s[0] -mla v4.4S, v26.4S, v31.s[0] -sub v26.4s, v28.4s, v4.4s -add v28.4s, v28.4s, v4.4s -sqrdmulh v4.4S, v30.4S, v1.s[0] -mul v30.4S, v30.4S,v18.s[0] -mla v30.4S, v4.4S, v31.s[0] -sub v4.4s, v29.4s, v30.4s -add v29.4s, v29.4s, v30.4s -sqrdmulh v30.4S, v29.4S, v1.s[1] -mul v29.4S, v29.4S,v18.s[1] -mla v29.4S, v30.4S, v31.s[0] -sub v30.4s, v28.4s, v29.4s -add v28.4s, v28.4s, v29.4s -sqrdmulh v29.4S, v4.4S, v1.s[2] -mul v4.4S, v4.4S,v18.s[2] -mla v4.4S, v29.4S, v31.s[0] -sub v29.4s, v26.4s, v4.4s -add v26.4s, v26.4s, v4.4s -trn1 v4.4S, v28.4S, v30.4S -trn2 v27.4S, v28.4S, v30.4S -trn1 v25.4S, v26.4S, v29.4S -trn2 v24.4S, v26.4S, v29.4S -trn2 v26.2D, v4.2D, v25.2D -trn2 v29.2D, v27.2D, v24.2D -trn1 v28.2D, v4.2D, v25.2D -trn1 v30.2D, v27.2D, v24.2D -sqrdmulh v24.4S, v26.4S, v10.4S -mul v26.4S, v26.4S,v21.4S -mla v26.4S, v24.4S, v31.s[0] -sub v24.4s, v28.4s, v26.4s -add v28.4s, v28.4s, v26.4s -sqrdmulh v26.4S, v29.4S, v10.4S -mul v29.4S, v29.4S,v21.4S -mla v29.4S, v26.4S, v31.s[0] -sub v26.4s, v30.4s, v29.4s -add v30.4s, v30.4s, v29.4s -sqrdmulh v29.4S, v30.4S, v22.4S -mul v30.4S, v30.4S,v15.4S -mla v30.4S, v29.4S, v31.s[0] -sub v29.4s, v28.4s, v30.4s -add v28.4s, v28.4s, v30.4s -sqrdmulh v30.4S, v26.4S, v12.4S -mul v26.4S, v26.4S,v3.4S -mla v26.4S, v30.4S, v31.s[0] -sub v30.4s, v24.4s, v26.4s -add v24.4s, v24.4s, v26.4s -str q28, [x0, #768] -str q29, [x0, #784] -str q24, [x0, #800] -str q30, [x0, #816] -ldr q30, [x17, #+1792] -ldr q24, [x17, #+1808] -ldr q29, [x17, #+1824] -ldr q28, [x17, #+1840] -ldr q26, [x17, #+1856] -ldr q27, [x17, #+1872] -ldr q25, [x17, #+1888] -ldr q4, [x17, #+1904] -ldr q12, [x0, #864] -ldr q3, [x0, #880] -ldr q22, [x0, #832] -ldr q15, [x0, #848] -sqrdmulh v10.4S, v12.4S, v24.s[0] -mul v12.4S, v12.4S,v30.s[0] -mla v12.4S, v10.4S, v31.s[0] -sub v10.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v24.s[0] -mul v3.4S, v3.4S,v30.s[0] -mla v3.4S, v12.4S, v31.s[0] -sub v12.4s, v15.4s, v3.4s -add v15.4s, v15.4s, v3.4s -sqrdmulh v3.4S, v15.4S, v24.s[1] -mul v15.4S, v15.4S,v30.s[1] -mla v15.4S, v3.4S, v31.s[0] -sub v3.4s, v22.4s, v15.4s -add v22.4s, v22.4s, v15.4s -sqrdmulh v15.4S, v12.4S, v24.s[2] -mul v12.4S, v12.4S,v30.s[2] -mla v12.4S, v15.4S, v31.s[0] -sub v15.4s, v10.4s, v12.4s -add v10.4s, v10.4s, v12.4s -trn1 v12.4S, v22.4S, v3.4S -trn2 v21.4S, v22.4S, v3.4S -trn1 v1.4S, v10.4S, v15.4S -trn2 v18.4S, v10.4S, v15.4S -trn2 v10.2D, v12.2D, v1.2D -trn2 v15.2D, v21.2D, v18.2D -trn1 v22.2D, v12.2D, v1.2D -trn1 v3.2D, v21.2D, v18.2D -sqrdmulh v18.4S, v10.4S, v28.4S -mul v10.4S, v10.4S,v29.4S -mla v10.4S, v18.4S, v31.s[0] -sub v18.4s, v22.4s, v10.4s -add v22.4s, v22.4s, v10.4s -sqrdmulh v10.4S, v15.4S, v28.4S -mul v15.4S, v15.4S,v29.4S -mla v15.4S, v10.4S, v31.s[0] -sub v10.4s, v3.4s, v15.4s -add v3.4s, v3.4s, v15.4s -sqrdmulh v15.4S, v3.4S, v27.4S -mul v3.4S, v3.4S,v26.4S -mla v3.4S, v15.4S, v31.s[0] -sub v15.4s, v22.4s, v3.4s -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v10.4S, v4.4S -mul v10.4S, v10.4S,v25.4S -mla v10.4S, v3.4S, v31.s[0] -sub v3.4s, v18.4s, v10.4s -add v18.4s, v18.4s, v10.4s -str q22, [x0, #832] -str q15, [x0, #848] -str q18, [x0, #864] -str q3, [x0, #880] -ldr q3, [x17, #+1920] -ldr q18, [x17, #+1936] -ldr q15, [x17, #+1952] -ldr q22, [x17, #+1968] -ldr q10, [x17, #+1984] -ldr q21, [x17, #+2000] -ldr q1, [x17, #+2016] -ldr q12, [x17, #+2032] -ldr q4, [x0, #928] -ldr q25, [x0, #944] -ldr q27, [x0, #896] -ldr q26, [x0, #912] -sqrdmulh v28.4S, v4.4S, v18.s[0] -mul v4.4S, v4.4S,v3.s[0] -mla v4.4S, v28.4S, v31.s[0] -sub v28.4s, v27.4s, v4.4s -add v27.4s, v27.4s, v4.4s -sqrdmulh v4.4S, v25.4S, v18.s[0] -mul v25.4S, v25.4S,v3.s[0] -mla v25.4S, v4.4S, v31.s[0] -sub v4.4s, v26.4s, v25.4s -add v26.4s, v26.4s, v25.4s -sqrdmulh v25.4S, v26.4S, v18.s[1] -mul v26.4S, v26.4S,v3.s[1] -mla v26.4S, v25.4S, v31.s[0] -sub v25.4s, v27.4s, v26.4s -add v27.4s, v27.4s, v26.4s -sqrdmulh v26.4S, v4.4S, v18.s[2] -mul v4.4S, v4.4S,v3.s[2] -mla v4.4S, v26.4S, v31.s[0] -sub v26.4s, v28.4s, v4.4s -add v28.4s, v28.4s, v4.4s -trn1 v4.4S, v27.4S, v25.4S -trn2 v29.4S, v27.4S, v25.4S -trn1 v24.4S, v28.4S, v26.4S -trn2 v30.4S, v28.4S, v26.4S -trn2 v28.2D, v4.2D, v24.2D -trn2 v26.2D, v29.2D, v30.2D -trn1 v27.2D, v4.2D, v24.2D -trn1 v25.2D, v29.2D, v30.2D -sqrdmulh v30.4S, v28.4S, v22.4S -mul v28.4S, v28.4S,v15.4S -mla v28.4S, v30.4S, v31.s[0] -sub v30.4s, v27.4s, v28.4s -add v27.4s, v27.4s, v28.4s -sqrdmulh v28.4S, v26.4S, v22.4S -mul v26.4S, v26.4S,v15.4S -mla v26.4S, v28.4S, v31.s[0] -sub v28.4s, v25.4s, v26.4s -add v25.4s, v25.4s, v26.4s -sqrdmulh v26.4S, v25.4S, v21.4S -mul v25.4S, v25.4S,v10.4S -mla v25.4S, v26.4S, v31.s[0] -sub v26.4s, v27.4s, v25.4s -add v27.4s, v27.4s, v25.4s -sqrdmulh v25.4S, v28.4S, v12.4S -mul v28.4S, v28.4S,v1.4S -mla v28.4S, v25.4S, v31.s[0] -sub v25.4s, v30.4s, v28.4s -add v30.4s, v30.4s, v28.4s -str q27, [x0, #896] -str q26, [x0, #912] -str q30, [x0, #928] -str q25, [x0, #944] -ldr q25, [x17, #+2048] -ldr q30, [x17, #+2064] -ldr q26, [x17, #+2080] -ldr q27, [x17, #+2096] -ldr q28, [x17, #+2112] -ldr q29, [x17, #+2128] -ldr q24, [x17, #+2144] -ldr q4, [x17, #+2160] -ldr q12, [x0, #992] -ldr q1, [x0, #1008] -ldr q21, [x0, #960] -ldr q10, [x0, #976] -sqrdmulh v22.4S, v12.4S, v30.s[0] -mul v12.4S, v12.4S,v25.s[0] -mla v12.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v1.4S, v30.s[0] -mul v1.4S, v1.4S,v25.s[0] -mla v1.4S, v12.4S, v31.s[0] -sub v12.4s, v10.4s, v1.4s -add v10.4s, v10.4s, v1.4s -sqrdmulh v1.4S, v10.4S, v30.s[1] -mul v10.4S, v10.4S,v25.s[1] -mla v10.4S, v1.4S, v31.s[0] -sub v1.4s, v21.4s, v10.4s -add v21.4s, v21.4s, v10.4s -sqrdmulh v10.4S, v12.4S, v30.s[2] -mul v12.4S, v12.4S,v25.s[2] -mla v12.4S, v10.4S, v31.s[0] -sub v10.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -trn1 v12.4S, v21.4S, v1.4S -trn2 v15.4S, v21.4S, v1.4S -trn1 v18.4S, v22.4S, v10.4S -trn2 v3.4S, v22.4S, v10.4S -trn2 v22.2D, v12.2D, v18.2D -trn2 v10.2D, v15.2D, v3.2D -trn1 v21.2D, v12.2D, v18.2D -trn1 v1.2D, v15.2D, v3.2D -sqrdmulh v3.4S, v22.4S, v27.4S -mul v22.4S, v22.4S,v26.4S -mla v22.4S, v3.4S, v31.s[0] -sub v3.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v10.4S, v27.4S -mul v10.4S, v10.4S,v26.4S -mla v10.4S, v22.4S, v31.s[0] -sub v22.4s, v1.4s, v10.4s -add v1.4s, v1.4s, v10.4s -sqrdmulh v10.4S, v1.4S, v29.4S -mul v1.4S, v1.4S,v28.4S -mla v1.4S, v10.4S, v31.s[0] -sub v10.4s, v21.4s, v1.4s -add v21.4s, v21.4s, v1.4s -sqrdmulh v1.4S, v22.4S, v4.4S -mul v22.4S, v22.4S,v24.4S -mla v22.4S, v1.4S, v31.s[0] -sub v1.4s, v3.4s, v22.4s -add v3.4s, v3.4s, v22.4s -str q21, [x0, #960] -str q10, [x0, #976] -str q3, [x0, #992] -str q1, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2456 -// Instruction count: 2452 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_11_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_11_0.s deleted file mode 100644 index 4ee80f8..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_11_0.s +++ /dev/null @@ -1,2422 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_11_0 -.global _ntt_u32_full_neon_asm_var_4_4_11_0 -ntt_u32_full_neon_asm_var_4_4_11_0: -_ntt_u32_full_neon_asm_var_4_4_11_0: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x0, #928] -ldr q29, [x17, #+0] -ldr q28, [x17, #+16] -sqrdmulh v27.4S, v30.4S, v28.s[0] -mul v30.4S, v30.4S,v29.s[0] -ldr q26, [x0, #992] -sqrdmulh v25.4S, v26.4S, v28.s[0] -mul v26.4S, v26.4S,v29.s[0] -ldr q24, [x0, #800] -sqrdmulh v23.4S, v24.4S, v28.s[0] -mul v24.4S, v24.4S,v29.s[0] -ldr q22, [x0, #864] -sqrdmulh v21.4S, v22.4S, v28.s[0] -mul v22.4S, v22.4S,v29.s[0] -ldr q20, [x0, #544] -mla v30.4S, v27.4S, v31.s[0] -sqrdmulh v27.4S, v20.4S, v28.s[0] -ldr q19, [x0, #608] -mla v26.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v19.4S, v28.s[0] -ldr q18, [x0, #672] -mla v24.4S, v23.4S, v31.s[0] -sqrdmulh v23.4S, v18.4S, v28.s[0] -ldr q17, [x0, #736] -mla v22.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v17.4S, v28.s[0] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -mul v20.4S, v20.4S,v29.s[0] -sub v2.4s, v16.4s, v30.4s -mul v19.4S, v19.4S,v29.s[0] -add v16.4s, v16.4s, v30.4s -ldr q30, [x0, #288] -ldr q1, [x0, #352] -mla v20.4S, v27.4S, v31.s[0] -sub v27.4s, v3.4s, v26.4s -mla v19.4S, v25.4S, v31.s[0] -add v3.4s, v3.4s, v26.4s -ldr q26, [x0, #32] -ldr q25, [x0, #96] -mul v18.4S, v18.4S,v29.s[0] -sub v0.4s, v30.4s, v24.4s -mul v17.4S, v17.4S,v29.s[0] -add v30.4s, v30.4s, v24.4s -ldr q24, [x0, #160] -ldr q15, [x0, #224] -mla v18.4S, v23.4S, v31.s[0] -sub v23.4s, v1.4s, v22.4s -mla v17.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v16.4S, v28.s[1] -mul v16.4S, v16.4S,v29.s[1] -sqrdmulh v21.4S, v3.4S, v28.s[1] -sub v14.4s, v26.4s, v20.4s -mul v3.4S, v3.4S,v29.s[1] -add v26.4s, v26.4s, v20.4s -sqrdmulh v20.4S, v30.4S, v28.s[1] -sub v13.4s, v25.4s, v19.4s -mul v30.4S, v30.4S,v29.s[1] -add v25.4s, v25.4s, v19.4s -sqrdmulh v19.4S, v1.4S, v28.s[1] -sub v12.4s, v24.4s, v18.4s -mul v1.4S, v1.4S,v29.s[1] -add v24.4s, v24.4s, v18.4s -mla v16.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v17.4s -sqrdmulh v18.4S, v2.4S, v28.s[2] -add v15.4s, v15.4s, v17.4s -mla v3.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v27.4S, v28.s[2] -mla v30.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v0.4S, v28.s[2] -mla v1.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v23.4S, v28.s[2] -ldr q17, [x17, #+32] -ldr q11, [x17, #+48] -mul v2.4S, v2.4S,v29.s[2] -sub v10.4s, v24.4s, v16.4s -mul v27.4S, v27.4S,v29.s[2] -add v24.4s, v24.4s, v16.4s -mla v2.4S, v18.4S, v31.s[0] -sub v18.4s, v15.4s, v3.4s -mla v27.4S, v21.4S, v31.s[0] -add v15.4s, v15.4s, v3.4s -mul v0.4S, v0.4S,v29.s[2] -sub v3.4s, v26.4s, v30.4s -mul v23.4S, v23.4S,v29.s[2] -add v26.4s, v26.4s, v30.4s -mla v0.4S, v20.4S, v31.s[0] -sub v20.4s, v25.4s, v1.4s -mla v23.4S, v19.4S, v31.s[0] -add v25.4s, v25.4s, v1.4s -sqrdmulh v1.4S, v10.4S, v11.s[1] -mul v10.4S, v10.4S,v17.s[1] -sqrdmulh v19.4S, v18.4S, v11.s[1] -sub v30.4s, v12.4s, v2.4s -mul v18.4S, v18.4S,v17.s[1] -add v12.4s, v12.4s, v2.4s -sqrdmulh v2.4S, v24.4S, v11.s[0] -sub v21.4s, v22.4s, v27.4s -mul v24.4S, v24.4S,v17.s[0] -add v22.4s, v22.4s, v27.4s -sqrdmulh v27.4S, v15.4S, v11.s[0] -sub v16.4s, v14.4s, v0.4s -mul v15.4S, v15.4S,v17.s[0] -add v14.4s, v14.4s, v0.4s -ldr q0, [x17, #+64] -ldr q9, [x17, #+80] -mla v10.4S, v1.4S, v31.s[0] -sub v1.4s, v13.4s, v23.4s -sqrdmulh v8.4S, v12.4S, v11.s[2] -add v13.4s, v13.4s, v23.4s -mla v18.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v22.4S, v11.s[2] -mla v24.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v30.4S, v11.s[3] -mla v15.4S, v27.4S, v31.s[0] -sqrdmulh v27.4S, v21.4S, v11.s[3] -ldr q23, [x17, #+96] -ldr q7, [x17, #+112] -mul v12.4S, v12.4S,v17.s[2] -sub v6.4s, v3.4s, v10.4s -mul v22.4S, v22.4S,v17.s[2] -add v3.4s, v3.4s, v10.4s -mla v12.4S, v8.4S, v31.s[0] -sub v8.4s, v20.4s, v18.4s -mla v22.4S, v19.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -mul v30.4S, v30.4S,v17.s[3] -sub v18.4s, v26.4s, v24.4s -mul v21.4S, v21.4S,v17.s[3] -add v26.4s, v26.4s, v24.4s -mla v30.4S, v2.4S, v31.s[0] -sub v2.4s, v25.4s, v15.4s -mla v21.4S, v27.4S, v31.s[0] -add v25.4s, v25.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v9.s[2] -mul v20.4S, v20.4S,v0.s[2] -sqrdmulh v27.4S, v8.4S, v9.s[3] -sub v24.4s, v14.4s, v12.4s -mul v8.4S, v8.4S,v0.s[3] -add v14.4s, v14.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v9.s[1] -sub v19.4s, v13.4s, v22.4s -mul v2.4S, v2.4S,v0.s[1] -add v13.4s, v13.4s, v22.4s -sqrdmulh v22.4S, v25.4S, v9.s[0] -sub v10.4s, v16.4s, v30.4s -mul v25.4S, v25.4S,v0.s[0] -add v16.4s, v16.4s, v30.4s -mla v20.4S, v15.4S, v31.s[0] -sub v15.4s, v1.4s, v21.4s -sqrdmulh v30.4S, v13.4S, v7.s[0] -add v1.4s, v1.4s, v21.4s -mla v8.4S, v27.4S, v31.s[0] -sqrdmulh v27.4S, v19.4S, v7.s[1] -mla v2.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v1.4S, v7.s[2] -mla v25.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v15.4S, v7.s[3] -mul v13.4S, v13.4S,v23.s[0] -sub v21.4s, v3.4s, v20.4s -str q21, [x0, #352] -mul v19.4S, v19.4S,v23.s[1] -add v3.4s, v3.4s, v20.4s -str q3, [x0, #288] -mla v13.4S, v30.4S, v31.s[0] -sub v30.4s, v6.4s, v8.4s -str q30, [x0, #480] -mla v19.4S, v27.4S, v31.s[0] -add v6.4s, v6.4s, v8.4s -str q6, [x0, #416] -mul v1.4S, v1.4S,v23.s[2] -sub v6.4s, v18.4s, v2.4s -str q6, [x0, #224] -mul v15.4S, v15.4S,v23.s[3] -add v18.4s, v18.4s, v2.4s -str q18, [x0, #160] -mla v1.4S, v12.4S, v31.s[0] -sub v12.4s, v26.4s, v25.4s -str q12, [x0, #96] -mla v15.4S, v22.4S, v31.s[0] -add v26.4s, v26.4s, v25.4s -str q26, [x0, #32] -ldr q26, [x0, #944] -sqrdmulh v25.4S, v26.4S, v28.s[0] -mul v26.4S, v26.4S,v29.s[0] -ldr q22, [x0, #1008] -sqrdmulh v12.4S, v22.4S, v28.s[0] -sub v18.4s, v14.4s, v13.4s -str q18, [x0, #608] -mul v22.4S, v22.4S,v29.s[0] -add v14.4s, v14.4s, v13.4s -str q14, [x0, #544] -ldr q14, [x0, #816] -sqrdmulh v13.4S, v14.4S, v28.s[0] -sub v18.4s, v24.4s, v19.4s -str q18, [x0, #736] -mul v14.4S, v14.4S,v29.s[0] -add v24.4s, v24.4s, v19.4s -str q24, [x0, #672] -ldr q24, [x0, #880] -sqrdmulh v19.4S, v24.4S, v28.s[0] -sub v18.4s, v16.4s, v1.4s -str q18, [x0, #864] -mul v24.4S, v24.4S,v29.s[0] -add v16.4s, v16.4s, v1.4s -str q16, [x0, #800] -ldr q16, [x0, #560] -mla v26.4S, v25.4S, v31.s[0] -sub v25.4s, v10.4s, v15.4s -str q25, [x0, #992] -sqrdmulh v25.4S, v16.4S, v28.s[0] -add v10.4s, v10.4s, v15.4s -str q10, [x0, #928] -ldr q10, [x0, #624] -mla v22.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v10.4S, v28.s[0] -ldr q15, [x0, #688] -mla v14.4S, v13.4S, v31.s[0] -sqrdmulh v13.4S, v15.4S, v28.s[0] -ldr q1, [x0, #752] -mla v24.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v1.4S, v28.s[0] -ldr q18, [x0, #432] -ldr q2, [x0, #496] -mul v16.4S, v16.4S,v29.s[0] -sub v6.4s, v18.4s, v26.4s -mul v10.4S, v10.4S,v29.s[0] -add v18.4s, v18.4s, v26.4s -ldr q26, [x0, #304] -ldr q8, [x0, #368] -mla v16.4S, v25.4S, v31.s[0] -sub v25.4s, v2.4s, v22.4s -mla v10.4S, v12.4S, v31.s[0] -add v2.4s, v2.4s, v22.4s -ldr q22, [x0, #48] -ldr q12, [x0, #112] -mul v15.4S, v15.4S,v29.s[0] -sub v27.4s, v26.4s, v14.4s -mul v1.4S, v1.4S,v29.s[0] -add v26.4s, v26.4s, v14.4s -ldr q14, [x0, #176] -ldr q30, [x0, #240] -mla v15.4S, v13.4S, v31.s[0] -sub v13.4s, v8.4s, v24.4s -mla v1.4S, v19.4S, v31.s[0] -add v8.4s, v8.4s, v24.4s -sqrdmulh v24.4S, v18.4S, v28.s[1] -mul v18.4S, v18.4S,v29.s[1] -sqrdmulh v19.4S, v2.4S, v28.s[1] -sub v3.4s, v22.4s, v16.4s -mul v2.4S, v2.4S,v29.s[1] -add v22.4s, v22.4s, v16.4s -sqrdmulh v16.4S, v26.4S, v28.s[1] -sub v20.4s, v12.4s, v10.4s -mul v26.4S, v26.4S,v29.s[1] -add v12.4s, v12.4s, v10.4s -sqrdmulh v10.4S, v8.4S, v28.s[1] -sub v21.4s, v14.4s, v15.4s -mul v8.4S, v8.4S,v29.s[1] -add v14.4s, v14.4s, v15.4s -mla v18.4S, v24.4S, v31.s[0] -sub v24.4s, v30.4s, v1.4s -sqrdmulh v15.4S, v6.4S, v28.s[2] -add v30.4s, v30.4s, v1.4s -mla v2.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v25.4S, v28.s[2] -mla v26.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v27.4S, v28.s[2] -mla v8.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v13.4S, v28.s[2] -mul v6.4S, v6.4S,v29.s[2] -sub v1.4s, v14.4s, v18.4s -mul v25.4S, v25.4S,v29.s[2] -add v14.4s, v14.4s, v18.4s -mla v6.4S, v15.4S, v31.s[0] -sub v15.4s, v30.4s, v2.4s -mla v25.4S, v19.4S, v31.s[0] -add v30.4s, v30.4s, v2.4s -mul v27.4S, v27.4S,v29.s[2] -sub v2.4s, v22.4s, v26.4s -mul v13.4S, v13.4S,v29.s[2] -add v22.4s, v22.4s, v26.4s -mla v27.4S, v16.4S, v31.s[0] -sub v16.4s, v12.4s, v8.4s -mla v13.4S, v10.4S, v31.s[0] -add v12.4s, v12.4s, v8.4s -sqrdmulh v8.4S, v1.4S, v11.s[1] -mul v1.4S, v1.4S,v17.s[1] -sqrdmulh v10.4S, v15.4S, v11.s[1] -sub v26.4s, v21.4s, v6.4s -mul v15.4S, v15.4S,v17.s[1] -add v21.4s, v21.4s, v6.4s -sqrdmulh v6.4S, v14.4S, v11.s[0] -sub v19.4s, v24.4s, v25.4s -mul v14.4S, v14.4S,v17.s[0] -add v24.4s, v24.4s, v25.4s -sqrdmulh v25.4S, v30.4S, v11.s[0] -sub v18.4s, v3.4s, v27.4s -mul v30.4S, v30.4S,v17.s[0] -add v3.4s, v3.4s, v27.4s -mla v1.4S, v8.4S, v31.s[0] -sub v8.4s, v20.4s, v13.4s -sqrdmulh v27.4S, v21.4S, v11.s[2] -add v20.4s, v20.4s, v13.4s -mla v15.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v24.4S, v11.s[2] -mla v14.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v26.4S, v11.s[3] -mla v30.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v19.4S, v11.s[3] -mul v21.4S, v21.4S,v17.s[2] -sub v13.4s, v2.4s, v1.4s -mul v24.4S, v24.4S,v17.s[2] -add v2.4s, v2.4s, v1.4s -mla v21.4S, v27.4S, v31.s[0] -sub v27.4s, v16.4s, v15.4s -mla v24.4S, v10.4S, v31.s[0] -add v16.4s, v16.4s, v15.4s -mul v26.4S, v26.4S,v17.s[3] -sub v15.4s, v22.4s, v14.4s -mul v19.4S, v19.4S,v17.s[3] -add v22.4s, v22.4s, v14.4s -mla v26.4S, v6.4S, v31.s[0] -sub v6.4s, v12.4s, v30.4s -mla v19.4S, v25.4S, v31.s[0] -add v12.4s, v12.4s, v30.4s -sqrdmulh v30.4S, v16.4S, v9.s[2] -mul v16.4S, v16.4S,v0.s[2] -sqrdmulh v25.4S, v27.4S, v9.s[3] -sub v14.4s, v3.4s, v21.4s -mul v27.4S, v27.4S,v0.s[3] -add v3.4s, v3.4s, v21.4s -sqrdmulh v21.4S, v6.4S, v9.s[1] -sub v10.4s, v20.4s, v24.4s -mul v6.4S, v6.4S,v0.s[1] -add v20.4s, v20.4s, v24.4s -sqrdmulh v24.4S, v12.4S, v9.s[0] -sub v1.4s, v18.4s, v26.4s -mul v12.4S, v12.4S,v0.s[0] -add v18.4s, v18.4s, v26.4s -mla v16.4S, v30.4S, v31.s[0] -sub v30.4s, v8.4s, v19.4s -sqrdmulh v26.4S, v20.4S, v7.s[0] -add v8.4s, v8.4s, v19.4s -mla v27.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v10.4S, v7.s[1] -mla v6.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v8.4S, v7.s[2] -mla v12.4S, v24.4S, v31.s[0] -sqrdmulh v24.4S, v30.4S, v7.s[3] -mul v20.4S, v20.4S,v23.s[0] -sub v19.4s, v2.4s, v16.4s -str q19, [x0, #368] -mul v10.4S, v10.4S,v23.s[1] -add v2.4s, v2.4s, v16.4s -str q2, [x0, #304] -mla v20.4S, v26.4S, v31.s[0] -sub v26.4s, v13.4s, v27.4s -str q26, [x0, #496] -mla v10.4S, v25.4S, v31.s[0] -add v13.4s, v13.4s, v27.4s -str q13, [x0, #432] -mul v8.4S, v8.4S,v23.s[2] -sub v13.4s, v15.4s, v6.4s -str q13, [x0, #240] -mul v30.4S, v30.4S,v23.s[3] -add v15.4s, v15.4s, v6.4s -str q15, [x0, #176] -mla v8.4S, v21.4S, v31.s[0] -sub v21.4s, v22.4s, v12.4s -str q21, [x0, #112] -mla v30.4S, v24.4S, v31.s[0] -add v22.4s, v22.4s, v12.4s -str q22, [x0, #48] -ldr q22, [x0, #896] -sqrdmulh v12.4S, v22.4S, v28.s[0] -mul v22.4S, v22.4S,v29.s[0] -ldr q24, [x0, #960] -sqrdmulh v21.4S, v24.4S, v28.s[0] -sub v15.4s, v3.4s, v20.4s -str q15, [x0, #624] -mul v24.4S, v24.4S,v29.s[0] -add v3.4s, v3.4s, v20.4s -str q3, [x0, #560] -ldr q3, [x0, #768] -sqrdmulh v20.4S, v3.4S, v28.s[0] -sub v15.4s, v14.4s, v10.4s -str q15, [x0, #752] -mul v3.4S, v3.4S,v29.s[0] -add v14.4s, v14.4s, v10.4s -str q14, [x0, #688] -ldr q14, [x0, #832] -sqrdmulh v10.4S, v14.4S, v28.s[0] -sub v15.4s, v18.4s, v8.4s -str q15, [x0, #880] -mul v14.4S, v14.4S,v29.s[0] -add v18.4s, v18.4s, v8.4s -str q18, [x0, #816] -ldr q18, [x0, #512] -mla v22.4S, v12.4S, v31.s[0] -sub v12.4s, v1.4s, v30.4s -str q12, [x0, #1008] -sqrdmulh v12.4S, v18.4S, v28.s[0] -add v1.4s, v1.4s, v30.4s -str q1, [x0, #944] -ldr q1, [x0, #576] -mla v24.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v1.4S, v28.s[0] -ldr q30, [x0, #640] -mla v3.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v30.4S, v28.s[0] -ldr q8, [x0, #704] -mla v14.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v8.4S, v28.s[0] -ldr q15, [x0, #384] -ldr q6, [x0, #448] -mul v18.4S, v18.4S,v29.s[0] -sub v13.4s, v15.4s, v22.4s -mul v1.4S, v1.4S,v29.s[0] -add v15.4s, v15.4s, v22.4s -ldr q22, [x0, #256] -ldr q27, [x0, #320] -mla v18.4S, v12.4S, v31.s[0] -sub v12.4s, v6.4s, v24.4s -mla v1.4S, v21.4S, v31.s[0] -add v6.4s, v6.4s, v24.4s -ldr q24, [x0, #0] -ldr q21, [x0, #64] -mul v30.4S, v30.4S,v29.s[0] -sub v25.4s, v22.4s, v3.4s -mul v8.4S, v8.4S,v29.s[0] -add v22.4s, v22.4s, v3.4s -ldr q3, [x0, #128] -ldr q26, [x0, #192] -mla v30.4S, v20.4S, v31.s[0] -sub v20.4s, v27.4s, v14.4s -mla v8.4S, v10.4S, v31.s[0] -add v27.4s, v27.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v28.s[1] -mul v15.4S, v15.4S,v29.s[1] -sqrdmulh v10.4S, v6.4S, v28.s[1] -sub v2.4s, v24.4s, v18.4s -mul v6.4S, v6.4S,v29.s[1] -add v24.4s, v24.4s, v18.4s -sqrdmulh v18.4S, v22.4S, v28.s[1] -sub v16.4s, v21.4s, v1.4s -mul v22.4S, v22.4S,v29.s[1] -add v21.4s, v21.4s, v1.4s -sqrdmulh v1.4S, v27.4S, v28.s[1] -sub v19.4s, v3.4s, v30.4s -mul v27.4S, v27.4S,v29.s[1] -add v3.4s, v3.4s, v30.4s -mla v15.4S, v14.4S, v31.s[0] -sub v14.4s, v26.4s, v8.4s -sqrdmulh v30.4S, v13.4S, v28.s[2] -add v26.4s, v26.4s, v8.4s -mla v6.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v12.4S, v28.s[2] -mla v22.4S, v18.4S, v31.s[0] -sqrdmulh v18.4S, v25.4S, v28.s[2] -mla v27.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v20.4S, v28.s[2] -mul v13.4S, v13.4S,v29.s[2] -sub v8.4s, v3.4s, v15.4s -mul v12.4S, v12.4S,v29.s[2] -add v3.4s, v3.4s, v15.4s -mla v13.4S, v30.4S, v31.s[0] -sub v30.4s, v26.4s, v6.4s -mla v12.4S, v10.4S, v31.s[0] -add v26.4s, v26.4s, v6.4s -mul v25.4S, v25.4S,v29.s[2] -sub v6.4s, v24.4s, v22.4s -mul v20.4S, v20.4S,v29.s[2] -add v24.4s, v24.4s, v22.4s -mla v25.4S, v18.4S, v31.s[0] -sub v18.4s, v21.4s, v27.4s -mla v20.4S, v1.4S, v31.s[0] -add v21.4s, v21.4s, v27.4s -sqrdmulh v27.4S, v8.4S, v11.s[1] -mul v8.4S, v8.4S,v17.s[1] -sqrdmulh v1.4S, v30.4S, v11.s[1] -sub v22.4s, v19.4s, v13.4s -mul v30.4S, v30.4S,v17.s[1] -add v19.4s, v19.4s, v13.4s -sqrdmulh v13.4S, v3.4S, v11.s[0] -sub v10.4s, v14.4s, v12.4s -mul v3.4S, v3.4S,v17.s[0] -add v14.4s, v14.4s, v12.4s -sqrdmulh v12.4S, v26.4S, v11.s[0] -sub v15.4s, v2.4s, v25.4s -mul v26.4S, v26.4S,v17.s[0] -add v2.4s, v2.4s, v25.4s -mla v8.4S, v27.4S, v31.s[0] -sub v27.4s, v16.4s, v20.4s -sqrdmulh v25.4S, v19.4S, v11.s[2] -add v16.4s, v16.4s, v20.4s -mla v30.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v14.4S, v11.s[2] -mla v3.4S, v13.4S, v31.s[0] -sqrdmulh v13.4S, v22.4S, v11.s[3] -mla v26.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v10.4S, v11.s[3] -mul v19.4S, v19.4S,v17.s[2] -sub v20.4s, v6.4s, v8.4s -mul v14.4S, v14.4S,v17.s[2] -add v6.4s, v6.4s, v8.4s -mla v19.4S, v25.4S, v31.s[0] -sub v25.4s, v18.4s, v30.4s -mla v14.4S, v1.4S, v31.s[0] -add v18.4s, v18.4s, v30.4s -mul v22.4S, v22.4S,v17.s[3] -sub v30.4s, v24.4s, v3.4s -mul v10.4S, v10.4S,v17.s[3] -add v24.4s, v24.4s, v3.4s -mla v22.4S, v13.4S, v31.s[0] -sub v13.4s, v21.4s, v26.4s -mla v10.4S, v12.4S, v31.s[0] -add v21.4s, v21.4s, v26.4s -sqrdmulh v26.4S, v18.4S, v9.s[2] -mul v18.4S, v18.4S,v0.s[2] -sqrdmulh v12.4S, v25.4S, v9.s[3] -sub v3.4s, v2.4s, v19.4s -mul v25.4S, v25.4S,v0.s[3] -add v2.4s, v2.4s, v19.4s -sqrdmulh v19.4S, v13.4S, v9.s[1] -sub v1.4s, v16.4s, v14.4s -mul v13.4S, v13.4S,v0.s[1] -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v9.s[0] -sub v8.4s, v15.4s, v22.4s -mul v21.4S, v21.4S,v0.s[0] -add v15.4s, v15.4s, v22.4s -mla v18.4S, v26.4S, v31.s[0] -sub v26.4s, v27.4s, v10.4s -sqrdmulh v22.4S, v16.4S, v7.s[0] -add v27.4s, v27.4s, v10.4s -mla v25.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v1.4S, v7.s[1] -mla v13.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v27.4S, v7.s[2] -mla v21.4S, v14.4S, v31.s[0] -sqrdmulh v14.4S, v26.4S, v7.s[3] -mul v16.4S, v16.4S,v23.s[0] -sub v10.4s, v6.4s, v18.4s -str q10, [x0, #320] -mul v1.4S, v1.4S,v23.s[1] -add v6.4s, v6.4s, v18.4s -str q6, [x0, #256] -mla v16.4S, v22.4S, v31.s[0] -sub v22.4s, v20.4s, v25.4s -str q22, [x0, #448] -mla v1.4S, v12.4S, v31.s[0] -add v20.4s, v20.4s, v25.4s -str q20, [x0, #384] -mul v27.4S, v27.4S,v23.s[2] -sub v20.4s, v30.4s, v13.4s -str q20, [x0, #192] -mul v26.4S, v26.4S,v23.s[3] -add v30.4s, v30.4s, v13.4s -str q30, [x0, #128] -mla v27.4S, v19.4S, v31.s[0] -sub v19.4s, v24.4s, v21.4s -str q19, [x0, #64] -mla v26.4S, v14.4S, v31.s[0] -add v24.4s, v24.4s, v21.4s -str q24, [x0, #0] -ldr q24, [x0, #912] -sqrdmulh v21.4S, v24.4S, v28.s[0] -mul v24.4S, v24.4S,v29.s[0] -ldr q14, [x0, #976] -sqrdmulh v19.4S, v14.4S, v28.s[0] -sub v30.4s, v2.4s, v16.4s -str q30, [x0, #576] -mul v14.4S, v14.4S,v29.s[0] -add v2.4s, v2.4s, v16.4s -str q2, [x0, #512] -ldr q2, [x0, #784] -sqrdmulh v16.4S, v2.4S, v28.s[0] -sub v30.4s, v3.4s, v1.4s -str q30, [x0, #704] -mul v2.4S, v2.4S,v29.s[0] -add v3.4s, v3.4s, v1.4s -str q3, [x0, #640] -ldr q3, [x0, #848] -sqrdmulh v1.4S, v3.4S, v28.s[0] -sub v30.4s, v15.4s, v27.4s -str q30, [x0, #832] -mul v3.4S, v3.4S,v29.s[0] -add v15.4s, v15.4s, v27.4s -str q15, [x0, #768] -ldr q15, [x0, #528] -mla v24.4S, v21.4S, v31.s[0] -sub v21.4s, v8.4s, v26.4s -str q21, [x0, #960] -sqrdmulh v21.4S, v15.4S, v28.s[0] -add v8.4s, v8.4s, v26.4s -str q8, [x0, #896] -ldr q8, [x0, #592] -mla v14.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v8.4S, v28.s[0] -ldr q26, [x0, #656] -mla v2.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v26.4S, v28.s[0] -ldr q27, [x0, #720] -mla v3.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v27.4S, v28.s[0] -ldr q30, [x0, #400] -ldr q13, [x0, #464] -mul v15.4S, v15.4S,v29.s[0] -sub v20.4s, v30.4s, v24.4s -mul v8.4S, v8.4S,v29.s[0] -add v30.4s, v30.4s, v24.4s -ldr q24, [x0, #272] -ldr q25, [x0, #336] -mla v15.4S, v21.4S, v31.s[0] -sub v21.4s, v13.4s, v14.4s -mla v8.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v14.4s -ldr q14, [x0, #16] -ldr q19, [x0, #80] -mul v26.4S, v26.4S,v29.s[0] -sub v12.4s, v24.4s, v2.4s -mul v27.4S, v27.4S,v29.s[0] -add v24.4s, v24.4s, v2.4s -ldr q2, [x0, #144] -ldr q22, [x0, #208] -mla v26.4S, v16.4S, v31.s[0] -sub v16.4s, v25.4s, v3.4s -mla v27.4S, v1.4S, v31.s[0] -add v25.4s, v25.4s, v3.4s -sqrdmulh v3.4S, v30.4S, v28.s[1] -mul v30.4S, v30.4S,v29.s[1] -sqrdmulh v1.4S, v13.4S, v28.s[1] -sub v6.4s, v14.4s, v15.4s -mul v13.4S, v13.4S,v29.s[1] -add v14.4s, v14.4s, v15.4s -sqrdmulh v15.4S, v24.4S, v28.s[1] -sub v18.4s, v19.4s, v8.4s -mul v24.4S, v24.4S,v29.s[1] -add v19.4s, v19.4s, v8.4s -sqrdmulh v8.4S, v25.4S, v28.s[1] -sub v10.4s, v2.4s, v26.4s -mul v25.4S, v25.4S,v29.s[1] -add v2.4s, v2.4s, v26.4s -mla v30.4S, v3.4S, v31.s[0] -sub v3.4s, v22.4s, v27.4s -sqrdmulh v26.4S, v20.4S, v28.s[2] -add v22.4s, v22.4s, v27.4s -mla v13.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v21.4S, v28.s[2] -mla v24.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v12.4S, v28.s[2] -mla v25.4S, v8.4S, v31.s[0] -sqrdmulh v8.4S, v16.4S, v28.s[2] -mul v20.4S, v20.4S,v29.s[2] -sub v27.4s, v2.4s, v30.4s -mul v21.4S, v21.4S,v29.s[2] -add v2.4s, v2.4s, v30.4s -mla v20.4S, v26.4S, v31.s[0] -sub v26.4s, v22.4s, v13.4s -mla v21.4S, v1.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -mul v12.4S, v12.4S,v29.s[2] -sub v13.4s, v14.4s, v24.4s -mul v16.4S, v16.4S,v29.s[2] -add v14.4s, v14.4s, v24.4s -mla v12.4S, v15.4S, v31.s[0] -sub v15.4s, v19.4s, v25.4s -mla v16.4S, v8.4S, v31.s[0] -add v19.4s, v19.4s, v25.4s -sqrdmulh v28.4S, v27.4S, v11.s[1] -mul v27.4S, v27.4S,v17.s[1] -sqrdmulh v29.4S, v26.4S, v11.s[1] -sub v25.4s, v10.4s, v20.4s -mul v26.4S, v26.4S,v17.s[1] -add v10.4s, v10.4s, v20.4s -sqrdmulh v20.4S, v2.4S, v11.s[0] -sub v8.4s, v3.4s, v21.4s -mul v2.4S, v2.4S,v17.s[0] -add v3.4s, v3.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v11.s[0] -sub v24.4s, v6.4s, v12.4s -mul v22.4S, v22.4S,v17.s[0] -add v6.4s, v6.4s, v12.4s -mla v27.4S, v28.4S, v31.s[0] -sub v28.4s, v18.4s, v16.4s -sqrdmulh v12.4S, v10.4S, v11.s[2] -add v18.4s, v18.4s, v16.4s -mla v26.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v3.4S, v11.s[2] -mla v2.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v25.4S, v11.s[3] -mla v22.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v8.4S, v11.s[3] -mul v10.4S, v10.4S,v17.s[2] -sub v16.4s, v13.4s, v27.4s -mul v3.4S, v3.4S,v17.s[2] -add v13.4s, v13.4s, v27.4s -mla v10.4S, v12.4S, v31.s[0] -sub v12.4s, v15.4s, v26.4s -mla v3.4S, v29.4S, v31.s[0] -add v15.4s, v15.4s, v26.4s -mul v25.4S, v25.4S,v17.s[3] -sub v26.4s, v14.4s, v2.4s -mul v8.4S, v8.4S,v17.s[3] -add v14.4s, v14.4s, v2.4s -mla v25.4S, v20.4S, v31.s[0] -sub v20.4s, v19.4s, v22.4s -mla v8.4S, v21.4S, v31.s[0] -add v19.4s, v19.4s, v22.4s -sqrdmulh v11.4S, v15.4S, v9.s[2] -mul v15.4S, v15.4S,v0.s[2] -sqrdmulh v17.4S, v12.4S, v9.s[3] -sub v22.4s, v6.4s, v10.4s -mul v12.4S, v12.4S,v0.s[3] -add v6.4s, v6.4s, v10.4s -sqrdmulh v10.4S, v20.4S, v9.s[1] -sub v21.4s, v18.4s, v3.4s -mul v20.4S, v20.4S,v0.s[1] -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v19.4S, v9.s[0] -sub v2.4s, v24.4s, v25.4s -mul v19.4S, v19.4S,v0.s[0] -add v24.4s, v24.4s, v25.4s -mla v15.4S, v11.4S, v31.s[0] -sub v11.4s, v28.4s, v8.4s -sqrdmulh v25.4S, v18.4S, v7.s[0] -add v28.4s, v28.4s, v8.4s -mla v12.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v21.4S, v7.s[1] -mla v20.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v28.4S, v7.s[2] -mla v19.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v11.4S, v7.s[3] -mul v18.4S, v18.4S,v23.s[0] -sub v8.4s, v13.4s, v15.4s -str q8, [x0, #336] -mul v21.4S, v21.4S,v23.s[1] -add v13.4s, v13.4s, v15.4s -str q13, [x0, #272] -mla v18.4S, v25.4S, v31.s[0] -sub v25.4s, v16.4s, v12.4s -str q25, [x0, #464] -mla v21.4S, v17.4S, v31.s[0] -add v16.4s, v16.4s, v12.4s -str q16, [x0, #400] -mul v28.4S, v28.4S,v23.s[2] -sub v16.4s, v26.4s, v20.4s -str q16, [x0, #208] -mul v11.4S, v11.4S,v23.s[3] -add v26.4s, v26.4s, v20.4s -str q26, [x0, #144] -mla v28.4S, v10.4S, v31.s[0] -sub v10.4s, v14.4s, v19.4s -str q10, [x0, #80] -mla v11.4S, v3.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -str q14, [x0, #16] -sub v7.4s, v6.4s, v18.4s -str q7, [x0, #592] -add v6.4s, v6.4s, v18.4s -str q6, [x0, #528] -sub v6.4s, v22.4s, v21.4s -str q6, [x0, #720] -add v22.4s, v22.4s, v21.4s -str q22, [x0, #656] -sub v22.4s, v24.4s, v28.4s -str q22, [x0, #848] -add v24.4s, v24.4s, v28.4s -str q24, [x0, #784] -sub v24.4s, v2.4s, v11.4s -str q24, [x0, #976] -add v2.4s, v2.4s, v11.4s -str q2, [x0, #912] -ldr q4, [x17, #+128] -ldr q5, [x17, #+144] -ldr q30, [x17, #+160] -ldr q1, [x17, #+176] -ldr q27, [x17, #+192] -ldr q29, [x17, #+208] -ldr q8, [x17, #+224] -ldr q15, [x17, #+240] -ldr q13, [x0, #32] -ldr q25, [x0, #48] -ldr q17, [x0, #0] -ldr q12, [x0, #16] -sqrdmulh v16.4S, v13.4S, v5.s[0] -mul v13.4S, v13.4S,v4.s[0] -mla v13.4S, v16.4S, v31.s[0] -sub v16.4s, v17.4s, v13.4s -add v17.4s, v17.4s, v13.4s -sqrdmulh v13.4S, v25.4S, v5.s[0] -mul v25.4S, v25.4S,v4.s[0] -mla v25.4S, v13.4S, v31.s[0] -sub v13.4s, v12.4s, v25.4s -add v12.4s, v12.4s, v25.4s -sqrdmulh v25.4S, v12.4S, v5.s[1] -mul v12.4S, v12.4S,v4.s[1] -mla v12.4S, v25.4S, v31.s[0] -sub v25.4s, v17.4s, v12.4s -add v17.4s, v17.4s, v12.4s -sqrdmulh v12.4S, v13.4S, v5.s[2] -mul v13.4S, v13.4S,v4.s[2] -mla v13.4S, v12.4S, v31.s[0] -sub v12.4s, v16.4s, v13.4s -add v16.4s, v16.4s, v13.4s -trn1 v13.4S, v17.4S, v25.4S -trn2 v20.4S, v17.4S, v25.4S -trn1 v26.4S, v16.4S, v12.4S -trn2 v10.4S, v16.4S, v12.4S -trn2 v16.2D, v13.2D, v26.2D -trn2 v12.2D, v20.2D, v10.2D -trn1 v17.2D, v13.2D, v26.2D -trn1 v25.2D, v20.2D, v10.2D -sqrdmulh v10.4S, v16.4S, v1.4S -mul v16.4S, v16.4S,v30.4S -mla v16.4S, v10.4S, v31.s[0] -sub v10.4s, v17.4s, v16.4s -add v17.4s, v17.4s, v16.4s -sqrdmulh v16.4S, v12.4S, v1.4S -mul v12.4S, v12.4S,v30.4S -mla v12.4S, v16.4S, v31.s[0] -sub v16.4s, v25.4s, v12.4s -add v25.4s, v25.4s, v12.4s -sqrdmulh v12.4S, v25.4S, v29.4S -mul v25.4S, v25.4S,v27.4S -mla v25.4S, v12.4S, v31.s[0] -sub v12.4s, v17.4s, v25.4s -add v17.4s, v17.4s, v25.4s -sqrdmulh v25.4S, v16.4S, v15.4S -mul v16.4S, v16.4S,v8.4S -mla v16.4S, v25.4S, v31.s[0] -sub v25.4s, v10.4s, v16.4s -add v10.4s, v10.4s, v16.4s -str q17, [x0, #0] -str q12, [x0, #16] -str q10, [x0, #32] -str q25, [x0, #48] -ldr q25, [x17, #+256] -ldr q10, [x17, #+272] -ldr q12, [x17, #+288] -ldr q17, [x17, #+304] -ldr q16, [x17, #+320] -ldr q20, [x17, #+336] -ldr q26, [x17, #+352] -ldr q13, [x17, #+368] -ldr q15, [x0, #96] -ldr q8, [x0, #112] -ldr q29, [x0, #64] -ldr q27, [x0, #80] -sqrdmulh v1.4S, v15.4S, v10.s[0] -mul v15.4S, v15.4S,v25.s[0] -mla v15.4S, v1.4S, v31.s[0] -sub v1.4s, v29.4s, v15.4s -add v29.4s, v29.4s, v15.4s -sqrdmulh v15.4S, v8.4S, v10.s[0] -mul v8.4S, v8.4S,v25.s[0] -mla v8.4S, v15.4S, v31.s[0] -sub v15.4s, v27.4s, v8.4s -add v27.4s, v27.4s, v8.4s -sqrdmulh v8.4S, v27.4S, v10.s[1] -mul v27.4S, v27.4S,v25.s[1] -mla v27.4S, v8.4S, v31.s[0] -sub v8.4s, v29.4s, v27.4s -add v29.4s, v29.4s, v27.4s -sqrdmulh v27.4S, v15.4S, v10.s[2] -mul v15.4S, v15.4S,v25.s[2] -mla v15.4S, v27.4S, v31.s[0] -sub v27.4s, v1.4s, v15.4s -add v1.4s, v1.4s, v15.4s -trn1 v15.4S, v29.4S, v8.4S -trn2 v30.4S, v29.4S, v8.4S -trn1 v5.4S, v1.4S, v27.4S -trn2 v4.4S, v1.4S, v27.4S -trn2 v1.2D, v15.2D, v5.2D -trn2 v27.2D, v30.2D, v4.2D -trn1 v29.2D, v15.2D, v5.2D -trn1 v8.2D, v30.2D, v4.2D -sqrdmulh v4.4S, v1.4S, v17.4S -mul v1.4S, v1.4S,v12.4S -mla v1.4S, v4.4S, v31.s[0] -sub v4.4s, v29.4s, v1.4s -add v29.4s, v29.4s, v1.4s -sqrdmulh v1.4S, v27.4S, v17.4S -mul v27.4S, v27.4S,v12.4S -mla v27.4S, v1.4S, v31.s[0] -sub v1.4s, v8.4s, v27.4s -add v8.4s, v8.4s, v27.4s -sqrdmulh v27.4S, v8.4S, v20.4S -mul v8.4S, v8.4S,v16.4S -mla v8.4S, v27.4S, v31.s[0] -sub v27.4s, v29.4s, v8.4s -add v29.4s, v29.4s, v8.4s -sqrdmulh v8.4S, v1.4S, v13.4S -mul v1.4S, v1.4S,v26.4S -mla v1.4S, v8.4S, v31.s[0] -sub v8.4s, v4.4s, v1.4s -add v4.4s, v4.4s, v1.4s -str q29, [x0, #64] -str q27, [x0, #80] -str q4, [x0, #96] -str q8, [x0, #112] -ldr q8, [x17, #+384] -ldr q4, [x17, #+400] -ldr q27, [x17, #+416] -ldr q29, [x17, #+432] -ldr q1, [x17, #+448] -ldr q30, [x17, #+464] -ldr q5, [x17, #+480] -ldr q15, [x17, #+496] -ldr q13, [x0, #160] -ldr q26, [x0, #176] -ldr q20, [x0, #128] -ldr q16, [x0, #144] -sqrdmulh v17.4S, v13.4S, v4.s[0] -mul v13.4S, v13.4S,v8.s[0] -mla v13.4S, v17.4S, v31.s[0] -sub v17.4s, v20.4s, v13.4s -add v20.4s, v20.4s, v13.4s -sqrdmulh v13.4S, v26.4S, v4.s[0] -mul v26.4S, v26.4S,v8.s[0] -mla v26.4S, v13.4S, v31.s[0] -sub v13.4s, v16.4s, v26.4s -add v16.4s, v16.4s, v26.4s -sqrdmulh v26.4S, v16.4S, v4.s[1] -mul v16.4S, v16.4S,v8.s[1] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v20.4s, v16.4s -add v20.4s, v20.4s, v16.4s -sqrdmulh v16.4S, v13.4S, v4.s[2] -mul v13.4S, v13.4S,v8.s[2] -mla v13.4S, v16.4S, v31.s[0] -sub v16.4s, v17.4s, v13.4s -add v17.4s, v17.4s, v13.4s -trn1 v13.4S, v20.4S, v26.4S -trn2 v12.4S, v20.4S, v26.4S -trn1 v10.4S, v17.4S, v16.4S -trn2 v25.4S, v17.4S, v16.4S -trn2 v17.2D, v13.2D, v10.2D -trn2 v16.2D, v12.2D, v25.2D -trn1 v20.2D, v13.2D, v10.2D -trn1 v26.2D, v12.2D, v25.2D -sqrdmulh v25.4S, v17.4S, v29.4S -mul v17.4S, v17.4S,v27.4S -mla v17.4S, v25.4S, v31.s[0] -sub v25.4s, v20.4s, v17.4s -add v20.4s, v20.4s, v17.4s -sqrdmulh v17.4S, v16.4S, v29.4S -mul v16.4S, v16.4S,v27.4S -mla v16.4S, v17.4S, v31.s[0] -sub v17.4s, v26.4s, v16.4s -add v26.4s, v26.4s, v16.4s -sqrdmulh v16.4S, v26.4S, v30.4S -mul v26.4S, v26.4S,v1.4S -mla v26.4S, v16.4S, v31.s[0] -sub v16.4s, v20.4s, v26.4s -add v20.4s, v20.4s, v26.4s -sqrdmulh v26.4S, v17.4S, v15.4S -mul v17.4S, v17.4S,v5.4S -mla v17.4S, v26.4S, v31.s[0] -sub v26.4s, v25.4s, v17.4s -add v25.4s, v25.4s, v17.4s -str q20, [x0, #128] -str q16, [x0, #144] -str q25, [x0, #160] -str q26, [x0, #176] -ldr q26, [x17, #+512] -ldr q25, [x17, #+528] -ldr q16, [x17, #+544] -ldr q20, [x17, #+560] -ldr q17, [x17, #+576] -ldr q12, [x17, #+592] -ldr q10, [x17, #+608] -ldr q13, [x17, #+624] -ldr q15, [x0, #224] -ldr q5, [x0, #240] -ldr q30, [x0, #192] -ldr q1, [x0, #208] -sqrdmulh v29.4S, v15.4S, v25.s[0] -mul v15.4S, v15.4S,v26.s[0] -mla v15.4S, v29.4S, v31.s[0] -sub v29.4s, v30.4s, v15.4s -add v30.4s, v30.4s, v15.4s -sqrdmulh v15.4S, v5.4S, v25.s[0] -mul v5.4S, v5.4S,v26.s[0] -mla v5.4S, v15.4S, v31.s[0] -sub v15.4s, v1.4s, v5.4s -add v1.4s, v1.4s, v5.4s -sqrdmulh v5.4S, v1.4S, v25.s[1] -mul v1.4S, v1.4S,v26.s[1] -mla v1.4S, v5.4S, v31.s[0] -sub v5.4s, v30.4s, v1.4s -add v30.4s, v30.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v25.s[2] -mul v15.4S, v15.4S,v26.s[2] -mla v15.4S, v1.4S, v31.s[0] -sub v1.4s, v29.4s, v15.4s -add v29.4s, v29.4s, v15.4s -trn1 v15.4S, v30.4S, v5.4S -trn2 v27.4S, v30.4S, v5.4S -trn1 v4.4S, v29.4S, v1.4S -trn2 v8.4S, v29.4S, v1.4S -trn2 v29.2D, v15.2D, v4.2D -trn2 v1.2D, v27.2D, v8.2D -trn1 v30.2D, v15.2D, v4.2D -trn1 v5.2D, v27.2D, v8.2D -sqrdmulh v8.4S, v29.4S, v20.4S -mul v29.4S, v29.4S,v16.4S -mla v29.4S, v8.4S, v31.s[0] -sub v8.4s, v30.4s, v29.4s -add v30.4s, v30.4s, v29.4s -sqrdmulh v29.4S, v1.4S, v20.4S -mul v1.4S, v1.4S,v16.4S -mla v1.4S, v29.4S, v31.s[0] -sub v29.4s, v5.4s, v1.4s -add v5.4s, v5.4s, v1.4s -sqrdmulh v1.4S, v5.4S, v12.4S -mul v5.4S, v5.4S,v17.4S -mla v5.4S, v1.4S, v31.s[0] -sub v1.4s, v30.4s, v5.4s -add v30.4s, v30.4s, v5.4s -sqrdmulh v5.4S, v29.4S, v13.4S -mul v29.4S, v29.4S,v10.4S -mla v29.4S, v5.4S, v31.s[0] -sub v5.4s, v8.4s, v29.4s -add v8.4s, v8.4s, v29.4s -str q30, [x0, #192] -str q1, [x0, #208] -str q8, [x0, #224] -str q5, [x0, #240] -ldr q5, [x17, #+640] -ldr q8, [x17, #+656] -ldr q1, [x17, #+672] -ldr q30, [x17, #+688] -ldr q29, [x17, #+704] -ldr q27, [x17, #+720] -ldr q4, [x17, #+736] -ldr q15, [x17, #+752] -ldr q13, [x0, #288] -ldr q10, [x0, #304] -ldr q12, [x0, #256] -ldr q17, [x0, #272] -sqrdmulh v20.4S, v13.4S, v8.s[0] -mul v13.4S, v13.4S,v5.s[0] -mla v13.4S, v20.4S, v31.s[0] -sub v20.4s, v12.4s, v13.4s -add v12.4s, v12.4s, v13.4s -sqrdmulh v13.4S, v10.4S, v8.s[0] -mul v10.4S, v10.4S,v5.s[0] -mla v10.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v10.4s -add v17.4s, v17.4s, v10.4s -sqrdmulh v10.4S, v17.4S, v8.s[1] -mul v17.4S, v17.4S,v5.s[1] -mla v17.4S, v10.4S, v31.s[0] -sub v10.4s, v12.4s, v17.4s -add v12.4s, v12.4s, v17.4s -sqrdmulh v17.4S, v13.4S, v8.s[2] -mul v13.4S, v13.4S,v5.s[2] -mla v13.4S, v17.4S, v31.s[0] -sub v17.4s, v20.4s, v13.4s -add v20.4s, v20.4s, v13.4s -trn1 v13.4S, v12.4S, v10.4S -trn2 v16.4S, v12.4S, v10.4S -trn1 v25.4S, v20.4S, v17.4S -trn2 v26.4S, v20.4S, v17.4S -trn2 v20.2D, v13.2D, v25.2D -trn2 v17.2D, v16.2D, v26.2D -trn1 v12.2D, v13.2D, v25.2D -trn1 v10.2D, v16.2D, v26.2D -sqrdmulh v26.4S, v20.4S, v30.4S -mul v20.4S, v20.4S,v1.4S -mla v20.4S, v26.4S, v31.s[0] -sub v26.4s, v12.4s, v20.4s -add v12.4s, v12.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v30.4S -mul v17.4S, v17.4S,v1.4S -mla v17.4S, v20.4S, v31.s[0] -sub v20.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -sqrdmulh v17.4S, v10.4S, v27.4S -mul v10.4S, v10.4S,v29.4S -mla v10.4S, v17.4S, v31.s[0] -sub v17.4s, v12.4s, v10.4s -add v12.4s, v12.4s, v10.4s -sqrdmulh v10.4S, v20.4S, v15.4S -mul v20.4S, v20.4S,v4.4S -mla v20.4S, v10.4S, v31.s[0] -sub v10.4s, v26.4s, v20.4s -add v26.4s, v26.4s, v20.4s -str q12, [x0, #256] -str q17, [x0, #272] -str q26, [x0, #288] -str q10, [x0, #304] -ldr q10, [x17, #+768] -ldr q26, [x17, #+784] -ldr q17, [x17, #+800] -ldr q12, [x17, #+816] -ldr q20, [x17, #+832] -ldr q16, [x17, #+848] -ldr q25, [x17, #+864] -ldr q13, [x17, #+880] -ldr q15, [x0, #352] -ldr q4, [x0, #368] -ldr q27, [x0, #320] -ldr q29, [x0, #336] -sqrdmulh v30.4S, v15.4S, v26.s[0] -mul v15.4S, v15.4S,v10.s[0] -mla v15.4S, v30.4S, v31.s[0] -sub v30.4s, v27.4s, v15.4s -add v27.4s, v27.4s, v15.4s -sqrdmulh v15.4S, v4.4S, v26.s[0] -mul v4.4S, v4.4S,v10.s[0] -mla v4.4S, v15.4S, v31.s[0] -sub v15.4s, v29.4s, v4.4s -add v29.4s, v29.4s, v4.4s -sqrdmulh v4.4S, v29.4S, v26.s[1] -mul v29.4S, v29.4S,v10.s[1] -mla v29.4S, v4.4S, v31.s[0] -sub v4.4s, v27.4s, v29.4s -add v27.4s, v27.4s, v29.4s -sqrdmulh v29.4S, v15.4S, v26.s[2] -mul v15.4S, v15.4S,v10.s[2] -mla v15.4S, v29.4S, v31.s[0] -sub v29.4s, v30.4s, v15.4s -add v30.4s, v30.4s, v15.4s -trn1 v15.4S, v27.4S, v4.4S -trn2 v1.4S, v27.4S, v4.4S -trn1 v8.4S, v30.4S, v29.4S -trn2 v5.4S, v30.4S, v29.4S -trn2 v30.2D, v15.2D, v8.2D -trn2 v29.2D, v1.2D, v5.2D -trn1 v27.2D, v15.2D, v8.2D -trn1 v4.2D, v1.2D, v5.2D -sqrdmulh v5.4S, v30.4S, v12.4S -mul v30.4S, v30.4S,v17.4S -mla v30.4S, v5.4S, v31.s[0] -sub v5.4s, v27.4s, v30.4s -add v27.4s, v27.4s, v30.4s -sqrdmulh v30.4S, v29.4S, v12.4S -mul v29.4S, v29.4S,v17.4S -mla v29.4S, v30.4S, v31.s[0] -sub v30.4s, v4.4s, v29.4s -add v4.4s, v4.4s, v29.4s -sqrdmulh v29.4S, v4.4S, v16.4S -mul v4.4S, v4.4S,v20.4S -mla v4.4S, v29.4S, v31.s[0] -sub v29.4s, v27.4s, v4.4s -add v27.4s, v27.4s, v4.4s -sqrdmulh v4.4S, v30.4S, v13.4S -mul v30.4S, v30.4S,v25.4S -mla v30.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v30.4s -add v5.4s, v5.4s, v30.4s -str q27, [x0, #320] -str q29, [x0, #336] -str q5, [x0, #352] -str q4, [x0, #368] -ldr q4, [x17, #+896] -ldr q5, [x17, #+912] -ldr q29, [x17, #+928] -ldr q27, [x17, #+944] -ldr q30, [x17, #+960] -ldr q1, [x17, #+976] -ldr q8, [x17, #+992] -ldr q15, [x17, #+1008] -ldr q13, [x0, #416] -ldr q25, [x0, #432] -ldr q16, [x0, #384] -ldr q20, [x0, #400] -sqrdmulh v12.4S, v13.4S, v5.s[0] -mul v13.4S, v13.4S,v4.s[0] -mla v13.4S, v12.4S, v31.s[0] -sub v12.4s, v16.4s, v13.4s -add v16.4s, v16.4s, v13.4s -sqrdmulh v13.4S, v25.4S, v5.s[0] -mul v25.4S, v25.4S,v4.s[0] -mla v25.4S, v13.4S, v31.s[0] -sub v13.4s, v20.4s, v25.4s -add v20.4s, v20.4s, v25.4s -sqrdmulh v25.4S, v20.4S, v5.s[1] -mul v20.4S, v20.4S,v4.s[1] -mla v20.4S, v25.4S, v31.s[0] -sub v25.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v13.4S, v5.s[2] -mul v13.4S, v13.4S,v4.s[2] -mla v13.4S, v20.4S, v31.s[0] -sub v20.4s, v12.4s, v13.4s -add v12.4s, v12.4s, v13.4s -trn1 v13.4S, v16.4S, v25.4S -trn2 v17.4S, v16.4S, v25.4S -trn1 v26.4S, v12.4S, v20.4S -trn2 v10.4S, v12.4S, v20.4S -trn2 v12.2D, v13.2D, v26.2D -trn2 v20.2D, v17.2D, v10.2D -trn1 v16.2D, v13.2D, v26.2D -trn1 v25.2D, v17.2D, v10.2D -sqrdmulh v10.4S, v12.4S, v27.4S -mul v12.4S, v12.4S,v29.4S -mla v12.4S, v10.4S, v31.s[0] -sub v10.4s, v16.4s, v12.4s -add v16.4s, v16.4s, v12.4s -sqrdmulh v12.4S, v20.4S, v27.4S -mul v20.4S, v20.4S,v29.4S -mla v20.4S, v12.4S, v31.s[0] -sub v12.4s, v25.4s, v20.4s -add v25.4s, v25.4s, v20.4s -sqrdmulh v20.4S, v25.4S, v1.4S -mul v25.4S, v25.4S,v30.4S -mla v25.4S, v20.4S, v31.s[0] -sub v20.4s, v16.4s, v25.4s -add v16.4s, v16.4s, v25.4s -sqrdmulh v25.4S, v12.4S, v15.4S -mul v12.4S, v12.4S,v8.4S -mla v12.4S, v25.4S, v31.s[0] -sub v25.4s, v10.4s, v12.4s -add v10.4s, v10.4s, v12.4s -str q16, [x0, #384] -str q20, [x0, #400] -str q10, [x0, #416] -str q25, [x0, #432] -ldr q25, [x17, #+1024] -ldr q10, [x17, #+1040] -ldr q20, [x17, #+1056] -ldr q16, [x17, #+1072] -ldr q12, [x17, #+1088] -ldr q17, [x17, #+1104] -ldr q26, [x17, #+1120] -ldr q13, [x17, #+1136] -ldr q15, [x0, #480] -ldr q8, [x0, #496] -ldr q1, [x0, #448] -ldr q30, [x0, #464] -sqrdmulh v27.4S, v15.4S, v10.s[0] -mul v15.4S, v15.4S,v25.s[0] -mla v15.4S, v27.4S, v31.s[0] -sub v27.4s, v1.4s, v15.4s -add v1.4s, v1.4s, v15.4s -sqrdmulh v15.4S, v8.4S, v10.s[0] -mul v8.4S, v8.4S,v25.s[0] -mla v8.4S, v15.4S, v31.s[0] -sub v15.4s, v30.4s, v8.4s -add v30.4s, v30.4s, v8.4s -sqrdmulh v8.4S, v30.4S, v10.s[1] -mul v30.4S, v30.4S,v25.s[1] -mla v30.4S, v8.4S, v31.s[0] -sub v8.4s, v1.4s, v30.4s -add v1.4s, v1.4s, v30.4s -sqrdmulh v30.4S, v15.4S, v10.s[2] -mul v15.4S, v15.4S,v25.s[2] -mla v15.4S, v30.4S, v31.s[0] -sub v30.4s, v27.4s, v15.4s -add v27.4s, v27.4s, v15.4s -trn1 v15.4S, v1.4S, v8.4S -trn2 v29.4S, v1.4S, v8.4S -trn1 v5.4S, v27.4S, v30.4S -trn2 v4.4S, v27.4S, v30.4S -trn2 v27.2D, v15.2D, v5.2D -trn2 v30.2D, v29.2D, v4.2D -trn1 v1.2D, v15.2D, v5.2D -trn1 v8.2D, v29.2D, v4.2D -sqrdmulh v4.4S, v27.4S, v16.4S -mul v27.4S, v27.4S,v20.4S -mla v27.4S, v4.4S, v31.s[0] -sub v4.4s, v1.4s, v27.4s -add v1.4s, v1.4s, v27.4s -sqrdmulh v27.4S, v30.4S, v16.4S -mul v30.4S, v30.4S,v20.4S -mla v30.4S, v27.4S, v31.s[0] -sub v27.4s, v8.4s, v30.4s -add v8.4s, v8.4s, v30.4s -sqrdmulh v30.4S, v8.4S, v17.4S -mul v8.4S, v8.4S,v12.4S -mla v8.4S, v30.4S, v31.s[0] -sub v30.4s, v1.4s, v8.4s -add v1.4s, v1.4s, v8.4s -sqrdmulh v8.4S, v27.4S, v13.4S -mul v27.4S, v27.4S,v26.4S -mla v27.4S, v8.4S, v31.s[0] -sub v8.4s, v4.4s, v27.4s -add v4.4s, v4.4s, v27.4s -str q1, [x0, #448] -str q30, [x0, #464] -str q4, [x0, #480] -str q8, [x0, #496] -ldr q8, [x17, #+1152] -ldr q4, [x17, #+1168] -ldr q30, [x17, #+1184] -ldr q1, [x17, #+1200] -ldr q27, [x17, #+1216] -ldr q29, [x17, #+1232] -ldr q5, [x17, #+1248] -ldr q15, [x17, #+1264] -ldr q13, [x0, #544] -ldr q26, [x0, #560] -ldr q17, [x0, #512] -ldr q12, [x0, #528] -sqrdmulh v16.4S, v13.4S, v4.s[0] -mul v13.4S, v13.4S,v8.s[0] -mla v13.4S, v16.4S, v31.s[0] -sub v16.4s, v17.4s, v13.4s -add v17.4s, v17.4s, v13.4s -sqrdmulh v13.4S, v26.4S, v4.s[0] -mul v26.4S, v26.4S,v8.s[0] -mla v26.4S, v13.4S, v31.s[0] -sub v13.4s, v12.4s, v26.4s -add v12.4s, v12.4s, v26.4s -sqrdmulh v26.4S, v12.4S, v4.s[1] -mul v12.4S, v12.4S,v8.s[1] -mla v12.4S, v26.4S, v31.s[0] -sub v26.4s, v17.4s, v12.4s -add v17.4s, v17.4s, v12.4s -sqrdmulh v12.4S, v13.4S, v4.s[2] -mul v13.4S, v13.4S,v8.s[2] -mla v13.4S, v12.4S, v31.s[0] -sub v12.4s, v16.4s, v13.4s -add v16.4s, v16.4s, v13.4s -trn1 v13.4S, v17.4S, v26.4S -trn2 v20.4S, v17.4S, v26.4S -trn1 v10.4S, v16.4S, v12.4S -trn2 v25.4S, v16.4S, v12.4S -trn2 v16.2D, v13.2D, v10.2D -trn2 v12.2D, v20.2D, v25.2D -trn1 v17.2D, v13.2D, v10.2D -trn1 v26.2D, v20.2D, v25.2D -sqrdmulh v25.4S, v16.4S, v1.4S -mul v16.4S, v16.4S,v30.4S -mla v16.4S, v25.4S, v31.s[0] -sub v25.4s, v17.4s, v16.4s -add v17.4s, v17.4s, v16.4s -sqrdmulh v16.4S, v12.4S, v1.4S -mul v12.4S, v12.4S,v30.4S -mla v12.4S, v16.4S, v31.s[0] -sub v16.4s, v26.4s, v12.4s -add v26.4s, v26.4s, v12.4s -sqrdmulh v12.4S, v26.4S, v29.4S -mul v26.4S, v26.4S,v27.4S -mla v26.4S, v12.4S, v31.s[0] -sub v12.4s, v17.4s, v26.4s -add v17.4s, v17.4s, v26.4s -sqrdmulh v26.4S, v16.4S, v15.4S -mul v16.4S, v16.4S,v5.4S -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v25.4s, v16.4s -add v25.4s, v25.4s, v16.4s -str q17, [x0, #512] -str q12, [x0, #528] -str q25, [x0, #544] -str q26, [x0, #560] -ldr q26, [x17, #+1280] -ldr q25, [x17, #+1296] -ldr q12, [x17, #+1312] -ldr q17, [x17, #+1328] -ldr q16, [x17, #+1344] -ldr q20, [x17, #+1360] -ldr q10, [x17, #+1376] -ldr q13, [x17, #+1392] -ldr q15, [x0, #608] -ldr q5, [x0, #624] -ldr q29, [x0, #576] -ldr q27, [x0, #592] -sqrdmulh v1.4S, v15.4S, v25.s[0] -mul v15.4S, v15.4S,v26.s[0] -mla v15.4S, v1.4S, v31.s[0] -sub v1.4s, v29.4s, v15.4s -add v29.4s, v29.4s, v15.4s -sqrdmulh v15.4S, v5.4S, v25.s[0] -mul v5.4S, v5.4S,v26.s[0] -mla v5.4S, v15.4S, v31.s[0] -sub v15.4s, v27.4s, v5.4s -add v27.4s, v27.4s, v5.4s -sqrdmulh v5.4S, v27.4S, v25.s[1] -mul v27.4S, v27.4S,v26.s[1] -mla v27.4S, v5.4S, v31.s[0] -sub v5.4s, v29.4s, v27.4s -add v29.4s, v29.4s, v27.4s -sqrdmulh v27.4S, v15.4S, v25.s[2] -mul v15.4S, v15.4S,v26.s[2] -mla v15.4S, v27.4S, v31.s[0] -sub v27.4s, v1.4s, v15.4s -add v1.4s, v1.4s, v15.4s -trn1 v15.4S, v29.4S, v5.4S -trn2 v30.4S, v29.4S, v5.4S -trn1 v4.4S, v1.4S, v27.4S -trn2 v8.4S, v1.4S, v27.4S -trn2 v1.2D, v15.2D, v4.2D -trn2 v27.2D, v30.2D, v8.2D -trn1 v29.2D, v15.2D, v4.2D -trn1 v5.2D, v30.2D, v8.2D -sqrdmulh v8.4S, v1.4S, v17.4S -mul v1.4S, v1.4S,v12.4S -mla v1.4S, v8.4S, v31.s[0] -sub v8.4s, v29.4s, v1.4s -add v29.4s, v29.4s, v1.4s -sqrdmulh v1.4S, v27.4S, v17.4S -mul v27.4S, v27.4S,v12.4S -mla v27.4S, v1.4S, v31.s[0] -sub v1.4s, v5.4s, v27.4s -add v5.4s, v5.4s, v27.4s -sqrdmulh v27.4S, v5.4S, v20.4S -mul v5.4S, v5.4S,v16.4S -mla v5.4S, v27.4S, v31.s[0] -sub v27.4s, v29.4s, v5.4s -add v29.4s, v29.4s, v5.4s -sqrdmulh v5.4S, v1.4S, v13.4S -mul v1.4S, v1.4S,v10.4S -mla v1.4S, v5.4S, v31.s[0] -sub v5.4s, v8.4s, v1.4s -add v8.4s, v8.4s, v1.4s -str q29, [x0, #576] -str q27, [x0, #592] -str q8, [x0, #608] -str q5, [x0, #624] -ldr q5, [x17, #+1408] -ldr q8, [x17, #+1424] -ldr q27, [x17, #+1440] -ldr q29, [x17, #+1456] -ldr q1, [x17, #+1472] -ldr q30, [x17, #+1488] -ldr q4, [x17, #+1504] -ldr q15, [x17, #+1520] -ldr q13, [x0, #672] -ldr q10, [x0, #688] -ldr q20, [x0, #640] -ldr q16, [x0, #656] -sqrdmulh v17.4S, v13.4S, v8.s[0] -mul v13.4S, v13.4S,v5.s[0] -mla v13.4S, v17.4S, v31.s[0] -sub v17.4s, v20.4s, v13.4s -add v20.4s, v20.4s, v13.4s -sqrdmulh v13.4S, v10.4S, v8.s[0] -mul v10.4S, v10.4S,v5.s[0] -mla v10.4S, v13.4S, v31.s[0] -sub v13.4s, v16.4s, v10.4s -add v16.4s, v16.4s, v10.4s -sqrdmulh v10.4S, v16.4S, v8.s[1] -mul v16.4S, v16.4S,v5.s[1] -mla v16.4S, v10.4S, v31.s[0] -sub v10.4s, v20.4s, v16.4s -add v20.4s, v20.4s, v16.4s -sqrdmulh v16.4S, v13.4S, v8.s[2] -mul v13.4S, v13.4S,v5.s[2] -mla v13.4S, v16.4S, v31.s[0] -sub v16.4s, v17.4s, v13.4s -add v17.4s, v17.4s, v13.4s -trn1 v13.4S, v20.4S, v10.4S -trn2 v12.4S, v20.4S, v10.4S -trn1 v25.4S, v17.4S, v16.4S -trn2 v26.4S, v17.4S, v16.4S -trn2 v17.2D, v13.2D, v25.2D -trn2 v16.2D, v12.2D, v26.2D -trn1 v20.2D, v13.2D, v25.2D -trn1 v10.2D, v12.2D, v26.2D -sqrdmulh v26.4S, v17.4S, v29.4S -mul v17.4S, v17.4S,v27.4S -mla v17.4S, v26.4S, v31.s[0] -sub v26.4s, v20.4s, v17.4s -add v20.4s, v20.4s, v17.4s -sqrdmulh v17.4S, v16.4S, v29.4S -mul v16.4S, v16.4S,v27.4S -mla v16.4S, v17.4S, v31.s[0] -sub v17.4s, v10.4s, v16.4s -add v10.4s, v10.4s, v16.4s -sqrdmulh v16.4S, v10.4S, v30.4S -mul v10.4S, v10.4S,v1.4S -mla v10.4S, v16.4S, v31.s[0] -sub v16.4s, v20.4s, v10.4s -add v20.4s, v20.4s, v10.4s -sqrdmulh v10.4S, v17.4S, v15.4S -mul v17.4S, v17.4S,v4.4S -mla v17.4S, v10.4S, v31.s[0] -sub v10.4s, v26.4s, v17.4s -add v26.4s, v26.4s, v17.4s -str q20, [x0, #640] -str q16, [x0, #656] -str q26, [x0, #672] -str q10, [x0, #688] -ldr q10, [x17, #+1536] -ldr q26, [x17, #+1552] -ldr q16, [x17, #+1568] -ldr q20, [x17, #+1584] -ldr q17, [x17, #+1600] -ldr q12, [x17, #+1616] -ldr q25, [x17, #+1632] -ldr q13, [x17, #+1648] -ldr q15, [x0, #736] -ldr q4, [x0, #752] -ldr q30, [x0, #704] -ldr q1, [x0, #720] -sqrdmulh v29.4S, v15.4S, v26.s[0] -mul v15.4S, v15.4S,v10.s[0] -mla v15.4S, v29.4S, v31.s[0] -sub v29.4s, v30.4s, v15.4s -add v30.4s, v30.4s, v15.4s -sqrdmulh v15.4S, v4.4S, v26.s[0] -mul v4.4S, v4.4S,v10.s[0] -mla v4.4S, v15.4S, v31.s[0] -sub v15.4s, v1.4s, v4.4s -add v1.4s, v1.4s, v4.4s -sqrdmulh v4.4S, v1.4S, v26.s[1] -mul v1.4S, v1.4S,v10.s[1] -mla v1.4S, v4.4S, v31.s[0] -sub v4.4s, v30.4s, v1.4s -add v30.4s, v30.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v26.s[2] -mul v15.4S, v15.4S,v10.s[2] -mla v15.4S, v1.4S, v31.s[0] -sub v1.4s, v29.4s, v15.4s -add v29.4s, v29.4s, v15.4s -trn1 v15.4S, v30.4S, v4.4S -trn2 v27.4S, v30.4S, v4.4S -trn1 v8.4S, v29.4S, v1.4S -trn2 v5.4S, v29.4S, v1.4S -trn2 v29.2D, v15.2D, v8.2D -trn2 v1.2D, v27.2D, v5.2D -trn1 v30.2D, v15.2D, v8.2D -trn1 v4.2D, v27.2D, v5.2D -sqrdmulh v5.4S, v29.4S, v20.4S -mul v29.4S, v29.4S,v16.4S -mla v29.4S, v5.4S, v31.s[0] -sub v5.4s, v30.4s, v29.4s -add v30.4s, v30.4s, v29.4s -sqrdmulh v29.4S, v1.4S, v20.4S -mul v1.4S, v1.4S,v16.4S -mla v1.4S, v29.4S, v31.s[0] -sub v29.4s, v4.4s, v1.4s -add v4.4s, v4.4s, v1.4s -sqrdmulh v1.4S, v4.4S, v12.4S -mul v4.4S, v4.4S,v17.4S -mla v4.4S, v1.4S, v31.s[0] -sub v1.4s, v30.4s, v4.4s -add v30.4s, v30.4s, v4.4s -sqrdmulh v4.4S, v29.4S, v13.4S -mul v29.4S, v29.4S,v25.4S -mla v29.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v29.4s -add v5.4s, v5.4s, v29.4s -str q30, [x0, #704] -str q1, [x0, #720] -str q5, [x0, #736] -str q4, [x0, #752] -ldr q4, [x17, #+1664] -ldr q5, [x17, #+1680] -ldr q1, [x17, #+1696] -ldr q30, [x17, #+1712] -ldr q29, [x17, #+1728] -ldr q27, [x17, #+1744] -ldr q8, [x17, #+1760] -ldr q15, [x17, #+1776] -ldr q13, [x0, #800] -ldr q25, [x0, #816] -ldr q12, [x0, #768] -ldr q17, [x0, #784] -sqrdmulh v20.4S, v13.4S, v5.s[0] -mul v13.4S, v13.4S,v4.s[0] -mla v13.4S, v20.4S, v31.s[0] -sub v20.4s, v12.4s, v13.4s -add v12.4s, v12.4s, v13.4s -sqrdmulh v13.4S, v25.4S, v5.s[0] -mul v25.4S, v25.4S,v4.s[0] -mla v25.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v25.4s -add v17.4s, v17.4s, v25.4s -sqrdmulh v25.4S, v17.4S, v5.s[1] -mul v17.4S, v17.4S,v4.s[1] -mla v17.4S, v25.4S, v31.s[0] -sub v25.4s, v12.4s, v17.4s -add v12.4s, v12.4s, v17.4s -sqrdmulh v17.4S, v13.4S, v5.s[2] -mul v13.4S, v13.4S,v4.s[2] -mla v13.4S, v17.4S, v31.s[0] -sub v17.4s, v20.4s, v13.4s -add v20.4s, v20.4s, v13.4s -trn1 v13.4S, v12.4S, v25.4S -trn2 v16.4S, v12.4S, v25.4S -trn1 v26.4S, v20.4S, v17.4S -trn2 v10.4S, v20.4S, v17.4S -trn2 v20.2D, v13.2D, v26.2D -trn2 v17.2D, v16.2D, v10.2D -trn1 v12.2D, v13.2D, v26.2D -trn1 v25.2D, v16.2D, v10.2D -sqrdmulh v10.4S, v20.4S, v30.4S -mul v20.4S, v20.4S,v1.4S -mla v20.4S, v10.4S, v31.s[0] -sub v10.4s, v12.4s, v20.4s -add v12.4s, v12.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v30.4S -mul v17.4S, v17.4S,v1.4S -mla v17.4S, v20.4S, v31.s[0] -sub v20.4s, v25.4s, v17.4s -add v25.4s, v25.4s, v17.4s -sqrdmulh v17.4S, v25.4S, v27.4S -mul v25.4S, v25.4S,v29.4S -mla v25.4S, v17.4S, v31.s[0] -sub v17.4s, v12.4s, v25.4s -add v12.4s, v12.4s, v25.4s -sqrdmulh v25.4S, v20.4S, v15.4S -mul v20.4S, v20.4S,v8.4S -mla v20.4S, v25.4S, v31.s[0] -sub v25.4s, v10.4s, v20.4s -add v10.4s, v10.4s, v20.4s -str q12, [x0, #768] -str q17, [x0, #784] -str q10, [x0, #800] -str q25, [x0, #816] -ldr q25, [x17, #+1792] -ldr q10, [x17, #+1808] -ldr q17, [x17, #+1824] -ldr q12, [x17, #+1840] -ldr q20, [x17, #+1856] -ldr q16, [x17, #+1872] -ldr q26, [x17, #+1888] -ldr q13, [x17, #+1904] -ldr q15, [x0, #864] -ldr q8, [x0, #880] -ldr q27, [x0, #832] -ldr q29, [x0, #848] -sqrdmulh v30.4S, v15.4S, v10.s[0] -mul v15.4S, v15.4S,v25.s[0] -mla v15.4S, v30.4S, v31.s[0] -sub v30.4s, v27.4s, v15.4s -add v27.4s, v27.4s, v15.4s -sqrdmulh v15.4S, v8.4S, v10.s[0] -mul v8.4S, v8.4S,v25.s[0] -mla v8.4S, v15.4S, v31.s[0] -sub v15.4s, v29.4s, v8.4s -add v29.4s, v29.4s, v8.4s -sqrdmulh v8.4S, v29.4S, v10.s[1] -mul v29.4S, v29.4S,v25.s[1] -mla v29.4S, v8.4S, v31.s[0] -sub v8.4s, v27.4s, v29.4s -add v27.4s, v27.4s, v29.4s -sqrdmulh v29.4S, v15.4S, v10.s[2] -mul v15.4S, v15.4S,v25.s[2] -mla v15.4S, v29.4S, v31.s[0] -sub v29.4s, v30.4s, v15.4s -add v30.4s, v30.4s, v15.4s -trn1 v15.4S, v27.4S, v8.4S -trn2 v1.4S, v27.4S, v8.4S -trn1 v5.4S, v30.4S, v29.4S -trn2 v4.4S, v30.4S, v29.4S -trn2 v30.2D, v15.2D, v5.2D -trn2 v29.2D, v1.2D, v4.2D -trn1 v27.2D, v15.2D, v5.2D -trn1 v8.2D, v1.2D, v4.2D -sqrdmulh v4.4S, v30.4S, v12.4S -mul v30.4S, v30.4S,v17.4S -mla v30.4S, v4.4S, v31.s[0] -sub v4.4s, v27.4s, v30.4s -add v27.4s, v27.4s, v30.4s -sqrdmulh v30.4S, v29.4S, v12.4S -mul v29.4S, v29.4S,v17.4S -mla v29.4S, v30.4S, v31.s[0] -sub v30.4s, v8.4s, v29.4s -add v8.4s, v8.4s, v29.4s -sqrdmulh v29.4S, v8.4S, v16.4S -mul v8.4S, v8.4S,v20.4S -mla v8.4S, v29.4S, v31.s[0] -sub v29.4s, v27.4s, v8.4s -add v27.4s, v27.4s, v8.4s -sqrdmulh v8.4S, v30.4S, v13.4S -mul v30.4S, v30.4S,v26.4S -mla v30.4S, v8.4S, v31.s[0] -sub v8.4s, v4.4s, v30.4s -add v4.4s, v4.4s, v30.4s -str q27, [x0, #832] -str q29, [x0, #848] -str q4, [x0, #864] -str q8, [x0, #880] -ldr q8, [x17, #+1920] -ldr q4, [x17, #+1936] -ldr q29, [x17, #+1952] -ldr q27, [x17, #+1968] -ldr q30, [x17, #+1984] -ldr q1, [x17, #+2000] -ldr q5, [x17, #+2016] -ldr q15, [x17, #+2032] -ldr q13, [x0, #928] -ldr q26, [x0, #944] -ldr q16, [x0, #896] -ldr q20, [x0, #912] -sqrdmulh v12.4S, v13.4S, v4.s[0] -mul v13.4S, v13.4S,v8.s[0] -mla v13.4S, v12.4S, v31.s[0] -sub v12.4s, v16.4s, v13.4s -add v16.4s, v16.4s, v13.4s -sqrdmulh v13.4S, v26.4S, v4.s[0] -mul v26.4S, v26.4S,v8.s[0] -mla v26.4S, v13.4S, v31.s[0] -sub v13.4s, v20.4s, v26.4s -add v20.4s, v20.4s, v26.4s -sqrdmulh v26.4S, v20.4S, v4.s[1] -mul v20.4S, v20.4S,v8.s[1] -mla v20.4S, v26.4S, v31.s[0] -sub v26.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v13.4S, v4.s[2] -mul v13.4S, v13.4S,v8.s[2] -mla v13.4S, v20.4S, v31.s[0] -sub v20.4s, v12.4s, v13.4s -add v12.4s, v12.4s, v13.4s -trn1 v13.4S, v16.4S, v26.4S -trn2 v17.4S, v16.4S, v26.4S -trn1 v10.4S, v12.4S, v20.4S -trn2 v25.4S, v12.4S, v20.4S -trn2 v12.2D, v13.2D, v10.2D -trn2 v20.2D, v17.2D, v25.2D -trn1 v16.2D, v13.2D, v10.2D -trn1 v26.2D, v17.2D, v25.2D -sqrdmulh v25.4S, v12.4S, v27.4S -mul v12.4S, v12.4S,v29.4S -mla v12.4S, v25.4S, v31.s[0] -sub v25.4s, v16.4s, v12.4s -add v16.4s, v16.4s, v12.4s -sqrdmulh v12.4S, v20.4S, v27.4S -mul v20.4S, v20.4S,v29.4S -mla v20.4S, v12.4S, v31.s[0] -sub v12.4s, v26.4s, v20.4s -add v26.4s, v26.4s, v20.4s -sqrdmulh v20.4S, v26.4S, v1.4S -mul v26.4S, v26.4S,v30.4S -mla v26.4S, v20.4S, v31.s[0] -sub v20.4s, v16.4s, v26.4s -add v16.4s, v16.4s, v26.4s -sqrdmulh v26.4S, v12.4S, v15.4S -mul v12.4S, v12.4S,v5.4S -mla v12.4S, v26.4S, v31.s[0] -sub v26.4s, v25.4s, v12.4s -add v25.4s, v25.4s, v12.4s -str q16, [x0, #896] -str q20, [x0, #912] -str q25, [x0, #928] -str q26, [x0, #944] -ldr q26, [x17, #+2048] -ldr q25, [x17, #+2064] -ldr q20, [x17, #+2080] -ldr q16, [x17, #+2096] -ldr q12, [x17, #+2112] -ldr q17, [x17, #+2128] -ldr q10, [x17, #+2144] -ldr q13, [x17, #+2160] -ldr q15, [x0, #992] -ldr q5, [x0, #1008] -ldr q1, [x0, #960] -ldr q30, [x0, #976] -sqrdmulh v27.4S, v15.4S, v25.s[0] -mul v15.4S, v15.4S,v26.s[0] -mla v15.4S, v27.4S, v31.s[0] -sub v27.4s, v1.4s, v15.4s -add v1.4s, v1.4s, v15.4s -sqrdmulh v15.4S, v5.4S, v25.s[0] -mul v5.4S, v5.4S,v26.s[0] -mla v5.4S, v15.4S, v31.s[0] -sub v15.4s, v30.4s, v5.4s -add v30.4s, v30.4s, v5.4s -sqrdmulh v5.4S, v30.4S, v25.s[1] -mul v30.4S, v30.4S,v26.s[1] -mla v30.4S, v5.4S, v31.s[0] -sub v5.4s, v1.4s, v30.4s -add v1.4s, v1.4s, v30.4s -sqrdmulh v30.4S, v15.4S, v25.s[2] -mul v15.4S, v15.4S,v26.s[2] -mla v15.4S, v30.4S, v31.s[0] -sub v30.4s, v27.4s, v15.4s -add v27.4s, v27.4s, v15.4s -trn1 v15.4S, v1.4S, v5.4S -trn2 v29.4S, v1.4S, v5.4S -trn1 v4.4S, v27.4S, v30.4S -trn2 v8.4S, v27.4S, v30.4S -trn2 v27.2D, v15.2D, v4.2D -trn2 v30.2D, v29.2D, v8.2D -trn1 v1.2D, v15.2D, v4.2D -trn1 v5.2D, v29.2D, v8.2D -sqrdmulh v8.4S, v27.4S, v16.4S -mul v27.4S, v27.4S,v20.4S -mla v27.4S, v8.4S, v31.s[0] -sub v8.4s, v1.4s, v27.4s -add v1.4s, v1.4s, v27.4s -sqrdmulh v27.4S, v30.4S, v16.4S -mul v30.4S, v30.4S,v20.4S -mla v30.4S, v27.4S, v31.s[0] -sub v27.4s, v5.4s, v30.4s -add v5.4s, v5.4s, v30.4s -sqrdmulh v30.4S, v5.4S, v17.4S -mul v5.4S, v5.4S,v12.4S -mla v5.4S, v30.4S, v31.s[0] -sub v30.4s, v1.4s, v5.4s -add v1.4s, v1.4s, v5.4s -sqrdmulh v5.4S, v27.4S, v13.4S -mul v27.4S, v27.4S,v10.4S -mla v27.4S, v5.4S, v31.s[0] -sub v5.4s, v8.4s, v27.4s -add v8.4s, v8.4s, v27.4s -str q1, [x0, #960] -str q30, [x0, #976] -str q8, [x0, #992] -str q5, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2392 -// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_12_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_12_0.s deleted file mode 100644 index 01a8251..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_12_0.s +++ /dev/null @@ -1,2422 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_12_0 -.global _ntt_u32_full_neon_asm_var_4_4_12_0 -ntt_u32_full_neon_asm_var_4_4_12_0: -_ntt_u32_full_neon_asm_var_4_4_12_0: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x0, #928] -ldr q29, [x17, #+0] -ldr q28, [x17, #+16] -sqrdmulh v27.4S, v30.4S, v28.s[0] -mul v30.4S, v30.4S,v29.s[0] -ldr q26, [x0, #992] -sqrdmulh v25.4S, v26.4S, v28.s[0] -mul v26.4S, v26.4S,v29.s[0] -ldr q24, [x0, #800] -sqrdmulh v23.4S, v24.4S, v28.s[0] -mul v24.4S, v24.4S,v29.s[0] -ldr q22, [x0, #864] -sqrdmulh v21.4S, v22.4S, v28.s[0] -mul v22.4S, v22.4S,v29.s[0] -ldr q20, [x0, #544] -mla v30.4S, v27.4S, v31.s[0] -sqrdmulh v27.4S, v20.4S, v28.s[0] -ldr q19, [x0, #608] -mla v26.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v19.4S, v28.s[0] -ldr q18, [x0, #672] -mla v24.4S, v23.4S, v31.s[0] -sqrdmulh v23.4S, v18.4S, v28.s[0] -ldr q17, [x0, #736] -mla v22.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v17.4S, v28.s[0] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -mul v20.4S, v20.4S,v29.s[0] -sub v2.4s, v16.4s, v30.4s -mul v19.4S, v19.4S,v29.s[0] -add v16.4s, v16.4s, v30.4s -ldr q30, [x0, #288] -ldr q1, [x0, #352] -mla v20.4S, v27.4S, v31.s[0] -sub v27.4s, v3.4s, v26.4s -mla v19.4S, v25.4S, v31.s[0] -add v3.4s, v3.4s, v26.4s -ldr q26, [x0, #32] -ldr q25, [x0, #96] -mul v18.4S, v18.4S,v29.s[0] -sub v0.4s, v30.4s, v24.4s -mul v17.4S, v17.4S,v29.s[0] -add v30.4s, v30.4s, v24.4s -ldr q24, [x0, #160] -ldr q15, [x0, #224] -mla v18.4S, v23.4S, v31.s[0] -sub v23.4s, v1.4s, v22.4s -mla v17.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v16.4S, v28.s[1] -mul v16.4S, v16.4S,v29.s[1] -sqrdmulh v21.4S, v3.4S, v28.s[1] -sub v14.4s, v26.4s, v20.4s -mul v3.4S, v3.4S,v29.s[1] -add v26.4s, v26.4s, v20.4s -sqrdmulh v20.4S, v30.4S, v28.s[1] -sub v13.4s, v25.4s, v19.4s -mul v30.4S, v30.4S,v29.s[1] -add v25.4s, v25.4s, v19.4s -sqrdmulh v19.4S, v1.4S, v28.s[1] -sub v12.4s, v24.4s, v18.4s -mul v1.4S, v1.4S,v29.s[1] -add v24.4s, v24.4s, v18.4s -mla v16.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v17.4s -sqrdmulh v18.4S, v2.4S, v28.s[2] -add v15.4s, v15.4s, v17.4s -mla v3.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v27.4S, v28.s[2] -mla v30.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v0.4S, v28.s[2] -mla v1.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v23.4S, v28.s[2] -ldr q17, [x17, #+32] -ldr q11, [x17, #+48] -mul v2.4S, v2.4S,v29.s[2] -sub v10.4s, v24.4s, v16.4s -mul v27.4S, v27.4S,v29.s[2] -add v24.4s, v24.4s, v16.4s -mla v2.4S, v18.4S, v31.s[0] -sub v18.4s, v15.4s, v3.4s -mla v27.4S, v21.4S, v31.s[0] -add v15.4s, v15.4s, v3.4s -mul v0.4S, v0.4S,v29.s[2] -sub v3.4s, v26.4s, v30.4s -mul v23.4S, v23.4S,v29.s[2] -add v26.4s, v26.4s, v30.4s -mla v0.4S, v20.4S, v31.s[0] -sub v20.4s, v25.4s, v1.4s -mla v23.4S, v19.4S, v31.s[0] -add v25.4s, v25.4s, v1.4s -sqrdmulh v1.4S, v10.4S, v11.s[1] -mul v10.4S, v10.4S,v17.s[1] -sqrdmulh v19.4S, v18.4S, v11.s[1] -sub v30.4s, v12.4s, v2.4s -mul v18.4S, v18.4S,v17.s[1] -add v12.4s, v12.4s, v2.4s -sqrdmulh v2.4S, v24.4S, v11.s[0] -sub v21.4s, v22.4s, v27.4s -mul v24.4S, v24.4S,v17.s[0] -add v22.4s, v22.4s, v27.4s -sqrdmulh v27.4S, v15.4S, v11.s[0] -sub v16.4s, v14.4s, v0.4s -mul v15.4S, v15.4S,v17.s[0] -add v14.4s, v14.4s, v0.4s -ldr q0, [x17, #+64] -ldr q9, [x17, #+80] -mla v10.4S, v1.4S, v31.s[0] -sub v1.4s, v13.4s, v23.4s -sqrdmulh v8.4S, v12.4S, v11.s[2] -add v13.4s, v13.4s, v23.4s -mla v18.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v22.4S, v11.s[2] -mla v24.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v30.4S, v11.s[3] -mla v15.4S, v27.4S, v31.s[0] -sqrdmulh v27.4S, v21.4S, v11.s[3] -ldr q23, [x17, #+96] -ldr q7, [x17, #+112] -mul v12.4S, v12.4S,v17.s[2] -sub v6.4s, v3.4s, v10.4s -mul v22.4S, v22.4S,v17.s[2] -add v3.4s, v3.4s, v10.4s -mla v12.4S, v8.4S, v31.s[0] -sub v8.4s, v20.4s, v18.4s -mla v22.4S, v19.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -mul v30.4S, v30.4S,v17.s[3] -sub v18.4s, v26.4s, v24.4s -mul v21.4S, v21.4S,v17.s[3] -add v26.4s, v26.4s, v24.4s -mla v30.4S, v2.4S, v31.s[0] -sub v2.4s, v25.4s, v15.4s -mla v21.4S, v27.4S, v31.s[0] -add v25.4s, v25.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v9.s[2] -mul v20.4S, v20.4S,v0.s[2] -sqrdmulh v27.4S, v8.4S, v9.s[3] -sub v24.4s, v14.4s, v12.4s -mul v8.4S, v8.4S,v0.s[3] -add v14.4s, v14.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v9.s[1] -sub v19.4s, v13.4s, v22.4s -mul v2.4S, v2.4S,v0.s[1] -add v13.4s, v13.4s, v22.4s -sqrdmulh v22.4S, v25.4S, v9.s[0] -sub v10.4s, v16.4s, v30.4s -mul v25.4S, v25.4S,v0.s[0] -add v16.4s, v16.4s, v30.4s -mla v20.4S, v15.4S, v31.s[0] -sub v15.4s, v1.4s, v21.4s -sqrdmulh v30.4S, v13.4S, v7.s[0] -add v1.4s, v1.4s, v21.4s -mla v8.4S, v27.4S, v31.s[0] -sub v27.4s, v3.4s, v20.4s -sqrdmulh v21.4S, v19.4S, v7.s[1] -add v3.4s, v3.4s, v20.4s -mla v2.4S, v12.4S, v31.s[0] -sub v12.4s, v6.4s, v8.4s -sqrdmulh v20.4S, v1.4S, v7.s[2] -add v6.4s, v6.4s, v8.4s -mla v25.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v2.4s -sqrdmulh v8.4S, v15.4S, v7.s[3] -add v18.4s, v18.4s, v2.4s -mul v13.4S, v13.4S,v23.s[0] -sub v2.4s, v26.4s, v25.4s -mul v19.4S, v19.4S,v23.s[1] -add v26.4s, v26.4s, v25.4s -mla v13.4S, v30.4S, v31.s[0] -str q27, [x0, #352] -mla v19.4S, v21.4S, v31.s[0] -str q3, [x0, #288] -mul v1.4S, v1.4S,v23.s[2] -str q12, [x0, #480] -mul v15.4S, v15.4S,v23.s[3] -str q6, [x0, #416] -mla v1.4S, v20.4S, v31.s[0] -str q22, [x0, #224] -mla v15.4S, v8.4S, v31.s[0] -str q18, [x0, #160] -ldr q18, [x0, #944] -sqrdmulh v8.4S, v18.4S, v28.s[0] -str q2, [x0, #96] -mul v18.4S, v18.4S,v29.s[0] -str q26, [x0, #32] -ldr q26, [x0, #1008] -sqrdmulh v2.4S, v26.4S, v28.s[0] -sub v22.4s, v14.4s, v13.4s -str q22, [x0, #608] -mul v26.4S, v26.4S,v29.s[0] -add v14.4s, v14.4s, v13.4s -ldr q13, [x0, #816] -sqrdmulh v22.4S, v13.4S, v28.s[0] -sub v20.4s, v24.4s, v19.4s -str q14, [x0, #544] -mul v13.4S, v13.4S,v29.s[0] -add v24.4s, v24.4s, v19.4s -ldr q19, [x0, #880] -sqrdmulh v14.4S, v19.4S, v28.s[0] -sub v6.4s, v16.4s, v1.4s -str q20, [x0, #736] -mul v19.4S, v19.4S,v29.s[0] -add v16.4s, v16.4s, v1.4s -ldr q1, [x0, #560] -mla v18.4S, v8.4S, v31.s[0] -sub v8.4s, v10.4s, v15.4s -str q24, [x0, #672] -sqrdmulh v24.4S, v1.4S, v28.s[0] -add v10.4s, v10.4s, v15.4s -ldr q15, [x0, #624] -mla v26.4S, v2.4S, v31.s[0] -str q6, [x0, #864] -sqrdmulh v6.4S, v15.4S, v28.s[0] -ldr q2, [x0, #688] -mla v13.4S, v22.4S, v31.s[0] -str q16, [x0, #800] -sqrdmulh v16.4S, v2.4S, v28.s[0] -ldr q22, [x0, #752] -mla v19.4S, v14.4S, v31.s[0] -str q8, [x0, #992] -sqrdmulh v8.4S, v22.4S, v28.s[0] -ldr q14, [x0, #432] -ldr q20, [x0, #496] -mul v1.4S, v1.4S,v29.s[0] -sub v12.4s, v14.4s, v18.4s -str q10, [x0, #928] -mul v15.4S, v15.4S,v29.s[0] -add v14.4s, v14.4s, v18.4s -ldr q18, [x0, #304] -ldr q10, [x0, #368] -mla v1.4S, v24.4S, v31.s[0] -sub v24.4s, v20.4s, v26.4s -mla v15.4S, v6.4S, v31.s[0] -add v20.4s, v20.4s, v26.4s -ldr q26, [x0, #48] -ldr q6, [x0, #112] -mul v2.4S, v2.4S,v29.s[0] -sub v3.4s, v18.4s, v13.4s -mul v22.4S, v22.4S,v29.s[0] -add v18.4s, v18.4s, v13.4s -ldr q13, [x0, #176] -ldr q21, [x0, #240] -mla v2.4S, v16.4S, v31.s[0] -sub v16.4s, v10.4s, v19.4s -mla v22.4S, v8.4S, v31.s[0] -add v10.4s, v10.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v28.s[1] -mul v14.4S, v14.4S,v29.s[1] -sqrdmulh v8.4S, v20.4S, v28.s[1] -sub v27.4s, v26.4s, v1.4s -mul v20.4S, v20.4S,v29.s[1] -add v26.4s, v26.4s, v1.4s -sqrdmulh v1.4S, v18.4S, v28.s[1] -sub v30.4s, v6.4s, v15.4s -mul v18.4S, v18.4S,v29.s[1] -add v6.4s, v6.4s, v15.4s -sqrdmulh v15.4S, v10.4S, v28.s[1] -sub v25.4s, v13.4s, v2.4s -mul v10.4S, v10.4S,v29.s[1] -add v13.4s, v13.4s, v2.4s -mla v14.4S, v19.4S, v31.s[0] -sub v19.4s, v21.4s, v22.4s -sqrdmulh v2.4S, v12.4S, v28.s[2] -add v21.4s, v21.4s, v22.4s -mla v20.4S, v8.4S, v31.s[0] -sqrdmulh v8.4S, v24.4S, v28.s[2] -mla v18.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v3.4S, v28.s[2] -mla v10.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v16.4S, v28.s[2] -mul v12.4S, v12.4S,v29.s[2] -sub v22.4s, v13.4s, v14.4s -mul v24.4S, v24.4S,v29.s[2] -add v13.4s, v13.4s, v14.4s -mla v12.4S, v2.4S, v31.s[0] -sub v2.4s, v21.4s, v20.4s -mla v24.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v20.4s -mul v3.4S, v3.4S,v29.s[2] -sub v20.4s, v26.4s, v18.4s -mul v16.4S, v16.4S,v29.s[2] -add v26.4s, v26.4s, v18.4s -mla v3.4S, v1.4S, v31.s[0] -sub v1.4s, v6.4s, v10.4s -mla v16.4S, v15.4S, v31.s[0] -add v6.4s, v6.4s, v10.4s -sqrdmulh v10.4S, v22.4S, v11.s[1] -mul v22.4S, v22.4S,v17.s[1] -sqrdmulh v15.4S, v2.4S, v11.s[1] -sub v18.4s, v25.4s, v12.4s -mul v2.4S, v2.4S,v17.s[1] -add v25.4s, v25.4s, v12.4s -sqrdmulh v12.4S, v13.4S, v11.s[0] -sub v8.4s, v19.4s, v24.4s -mul v13.4S, v13.4S,v17.s[0] -add v19.4s, v19.4s, v24.4s -sqrdmulh v24.4S, v21.4S, v11.s[0] -sub v14.4s, v27.4s, v3.4s -mul v21.4S, v21.4S,v17.s[0] -add v27.4s, v27.4s, v3.4s -mla v22.4S, v10.4S, v31.s[0] -sub v10.4s, v30.4s, v16.4s -sqrdmulh v3.4S, v25.4S, v11.s[2] -add v30.4s, v30.4s, v16.4s -mla v2.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v19.4S, v11.s[2] -mla v13.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v18.4S, v11.s[3] -mla v21.4S, v24.4S, v31.s[0] -sqrdmulh v24.4S, v8.4S, v11.s[3] -mul v25.4S, v25.4S,v17.s[2] -sub v16.4s, v20.4s, v22.4s -mul v19.4S, v19.4S,v17.s[2] -add v20.4s, v20.4s, v22.4s -mla v25.4S, v3.4S, v31.s[0] -sub v3.4s, v1.4s, v2.4s -mla v19.4S, v15.4S, v31.s[0] -add v1.4s, v1.4s, v2.4s -mul v18.4S, v18.4S,v17.s[3] -sub v2.4s, v26.4s, v13.4s -mul v8.4S, v8.4S,v17.s[3] -add v26.4s, v26.4s, v13.4s -mla v18.4S, v12.4S, v31.s[0] -sub v12.4s, v6.4s, v21.4s -mla v8.4S, v24.4S, v31.s[0] -add v6.4s, v6.4s, v21.4s -sqrdmulh v21.4S, v1.4S, v9.s[2] -mul v1.4S, v1.4S,v0.s[2] -sqrdmulh v24.4S, v3.4S, v9.s[3] -sub v13.4s, v27.4s, v25.4s -mul v3.4S, v3.4S,v0.s[3] -add v27.4s, v27.4s, v25.4s -sqrdmulh v25.4S, v12.4S, v9.s[1] -sub v15.4s, v30.4s, v19.4s -mul v12.4S, v12.4S,v0.s[1] -add v30.4s, v30.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v9.s[0] -sub v22.4s, v14.4s, v18.4s -mul v6.4S, v6.4S,v0.s[0] -add v14.4s, v14.4s, v18.4s -mla v1.4S, v21.4S, v31.s[0] -sub v21.4s, v10.4s, v8.4s -sqrdmulh v18.4S, v30.4S, v7.s[0] -add v10.4s, v10.4s, v8.4s -mla v3.4S, v24.4S, v31.s[0] -sub v24.4s, v20.4s, v1.4s -sqrdmulh v8.4S, v15.4S, v7.s[1] -add v20.4s, v20.4s, v1.4s -mla v12.4S, v25.4S, v31.s[0] -sub v25.4s, v16.4s, v3.4s -sqrdmulh v1.4S, v10.4S, v7.s[2] -add v16.4s, v16.4s, v3.4s -mla v6.4S, v19.4S, v31.s[0] -sub v19.4s, v2.4s, v12.4s -sqrdmulh v3.4S, v21.4S, v7.s[3] -add v2.4s, v2.4s, v12.4s -mul v30.4S, v30.4S,v23.s[0] -sub v12.4s, v26.4s, v6.4s -mul v15.4S, v15.4S,v23.s[1] -add v26.4s, v26.4s, v6.4s -mla v30.4S, v18.4S, v31.s[0] -str q24, [x0, #368] -mla v15.4S, v8.4S, v31.s[0] -str q20, [x0, #304] -mul v10.4S, v10.4S,v23.s[2] -str q25, [x0, #496] -mul v21.4S, v21.4S,v23.s[3] -str q16, [x0, #432] -mla v10.4S, v1.4S, v31.s[0] -str q19, [x0, #240] -mla v21.4S, v3.4S, v31.s[0] -str q2, [x0, #176] -ldr q2, [x0, #896] -sqrdmulh v3.4S, v2.4S, v28.s[0] -str q12, [x0, #112] -mul v2.4S, v2.4S,v29.s[0] -str q26, [x0, #48] -ldr q26, [x0, #960] -sqrdmulh v12.4S, v26.4S, v28.s[0] -sub v19.4s, v27.4s, v30.4s -str q19, [x0, #624] -mul v26.4S, v26.4S,v29.s[0] -add v27.4s, v27.4s, v30.4s -ldr q30, [x0, #768] -sqrdmulh v19.4S, v30.4S, v28.s[0] -sub v1.4s, v13.4s, v15.4s -str q27, [x0, #560] -mul v30.4S, v30.4S,v29.s[0] -add v13.4s, v13.4s, v15.4s -ldr q15, [x0, #832] -sqrdmulh v27.4S, v15.4S, v28.s[0] -sub v16.4s, v14.4s, v10.4s -str q1, [x0, #752] -mul v15.4S, v15.4S,v29.s[0] -add v14.4s, v14.4s, v10.4s -ldr q10, [x0, #512] -mla v2.4S, v3.4S, v31.s[0] -sub v3.4s, v22.4s, v21.4s -str q13, [x0, #688] -sqrdmulh v13.4S, v10.4S, v28.s[0] -add v22.4s, v22.4s, v21.4s -ldr q21, [x0, #576] -mla v26.4S, v12.4S, v31.s[0] -str q16, [x0, #880] -sqrdmulh v16.4S, v21.4S, v28.s[0] -ldr q12, [x0, #640] -mla v30.4S, v19.4S, v31.s[0] -str q14, [x0, #816] -sqrdmulh v14.4S, v12.4S, v28.s[0] -ldr q19, [x0, #704] -mla v15.4S, v27.4S, v31.s[0] -str q3, [x0, #1008] -sqrdmulh v3.4S, v19.4S, v28.s[0] -ldr q27, [x0, #384] -ldr q1, [x0, #448] -mul v10.4S, v10.4S,v29.s[0] -sub v25.4s, v27.4s, v2.4s -str q22, [x0, #944] -mul v21.4S, v21.4S,v29.s[0] -add v27.4s, v27.4s, v2.4s -ldr q2, [x0, #256] -ldr q22, [x0, #320] -mla v10.4S, v13.4S, v31.s[0] -sub v13.4s, v1.4s, v26.4s -mla v21.4S, v16.4S, v31.s[0] -add v1.4s, v1.4s, v26.4s -ldr q26, [x0, #0] -ldr q16, [x0, #64] -mul v12.4S, v12.4S,v29.s[0] -sub v20.4s, v2.4s, v30.4s -mul v19.4S, v19.4S,v29.s[0] -add v2.4s, v2.4s, v30.4s -ldr q30, [x0, #128] -ldr q8, [x0, #192] -mla v12.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v15.4s -mla v19.4S, v3.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -sqrdmulh v15.4S, v27.4S, v28.s[1] -mul v27.4S, v27.4S,v29.s[1] -sqrdmulh v3.4S, v1.4S, v28.s[1] -sub v24.4s, v26.4s, v10.4s -mul v1.4S, v1.4S,v29.s[1] -add v26.4s, v26.4s, v10.4s -sqrdmulh v10.4S, v2.4S, v28.s[1] -sub v18.4s, v16.4s, v21.4s -mul v2.4S, v2.4S,v29.s[1] -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v28.s[1] -sub v6.4s, v30.4s, v12.4s -mul v22.4S, v22.4S,v29.s[1] -add v30.4s, v30.4s, v12.4s -mla v27.4S, v15.4S, v31.s[0] -sub v15.4s, v8.4s, v19.4s -sqrdmulh v12.4S, v25.4S, v28.s[2] -add v8.4s, v8.4s, v19.4s -mla v1.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v13.4S, v28.s[2] -mla v2.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v20.4S, v28.s[2] -mla v22.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v14.4S, v28.s[2] -mul v25.4S, v25.4S,v29.s[2] -sub v19.4s, v30.4s, v27.4s -mul v13.4S, v13.4S,v29.s[2] -add v30.4s, v30.4s, v27.4s -mla v25.4S, v12.4S, v31.s[0] -sub v12.4s, v8.4s, v1.4s -mla v13.4S, v3.4S, v31.s[0] -add v8.4s, v8.4s, v1.4s -mul v20.4S, v20.4S,v29.s[2] -sub v1.4s, v26.4s, v2.4s -mul v14.4S, v14.4S,v29.s[2] -add v26.4s, v26.4s, v2.4s -mla v20.4S, v10.4S, v31.s[0] -sub v10.4s, v16.4s, v22.4s -mla v14.4S, v21.4S, v31.s[0] -add v16.4s, v16.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v11.s[1] -mul v19.4S, v19.4S,v17.s[1] -sqrdmulh v21.4S, v12.4S, v11.s[1] -sub v2.4s, v6.4s, v25.4s -mul v12.4S, v12.4S,v17.s[1] -add v6.4s, v6.4s, v25.4s -sqrdmulh v25.4S, v30.4S, v11.s[0] -sub v3.4s, v15.4s, v13.4s -mul v30.4S, v30.4S,v17.s[0] -add v15.4s, v15.4s, v13.4s -sqrdmulh v13.4S, v8.4S, v11.s[0] -sub v27.4s, v24.4s, v20.4s -mul v8.4S, v8.4S,v17.s[0] -add v24.4s, v24.4s, v20.4s -mla v19.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v14.4s -sqrdmulh v20.4S, v6.4S, v11.s[2] -add v18.4s, v18.4s, v14.4s -mla v12.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v15.4S, v11.s[2] -mla v30.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v2.4S, v11.s[3] -mla v8.4S, v13.4S, v31.s[0] -sqrdmulh v13.4S, v3.4S, v11.s[3] -mul v6.4S, v6.4S,v17.s[2] -sub v14.4s, v1.4s, v19.4s -mul v15.4S, v15.4S,v17.s[2] -add v1.4s, v1.4s, v19.4s -mla v6.4S, v20.4S, v31.s[0] -sub v20.4s, v10.4s, v12.4s -mla v15.4S, v21.4S, v31.s[0] -add v10.4s, v10.4s, v12.4s -mul v2.4S, v2.4S,v17.s[3] -sub v12.4s, v26.4s, v30.4s -mul v3.4S, v3.4S,v17.s[3] -add v26.4s, v26.4s, v30.4s -mla v2.4S, v25.4S, v31.s[0] -sub v25.4s, v16.4s, v8.4s -mla v3.4S, v13.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v10.4S, v9.s[2] -mul v10.4S, v10.4S,v0.s[2] -sqrdmulh v13.4S, v20.4S, v9.s[3] -sub v30.4s, v24.4s, v6.4s -mul v20.4S, v20.4S,v0.s[3] -add v24.4s, v24.4s, v6.4s -sqrdmulh v6.4S, v25.4S, v9.s[1] -sub v21.4s, v18.4s, v15.4s -mul v25.4S, v25.4S,v0.s[1] -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v9.s[0] -sub v19.4s, v27.4s, v2.4s -mul v16.4S, v16.4S,v0.s[0] -add v27.4s, v27.4s, v2.4s -mla v10.4S, v8.4S, v31.s[0] -sub v8.4s, v22.4s, v3.4s -sqrdmulh v2.4S, v18.4S, v7.s[0] -add v22.4s, v22.4s, v3.4s -mla v20.4S, v13.4S, v31.s[0] -sub v13.4s, v1.4s, v10.4s -sqrdmulh v3.4S, v21.4S, v7.s[1] -add v1.4s, v1.4s, v10.4s -mla v25.4S, v6.4S, v31.s[0] -sub v6.4s, v14.4s, v20.4s -sqrdmulh v10.4S, v22.4S, v7.s[2] -add v14.4s, v14.4s, v20.4s -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v12.4s, v25.4s -sqrdmulh v20.4S, v8.4S, v7.s[3] -add v12.4s, v12.4s, v25.4s -mul v18.4S, v18.4S,v23.s[0] -sub v25.4s, v26.4s, v16.4s -mul v21.4S, v21.4S,v23.s[1] -add v26.4s, v26.4s, v16.4s -mla v18.4S, v2.4S, v31.s[0] -str q13, [x0, #320] -mla v21.4S, v3.4S, v31.s[0] -str q1, [x0, #256] -mul v22.4S, v22.4S,v23.s[2] -str q6, [x0, #448] -mul v8.4S, v8.4S,v23.s[3] -str q14, [x0, #384] -mla v22.4S, v10.4S, v31.s[0] -str q15, [x0, #192] -mla v8.4S, v20.4S, v31.s[0] -str q12, [x0, #128] -ldr q12, [x0, #912] -sqrdmulh v20.4S, v12.4S, v28.s[0] -str q25, [x0, #64] -mul v12.4S, v12.4S,v29.s[0] -str q26, [x0, #0] -ldr q26, [x0, #976] -sqrdmulh v25.4S, v26.4S, v28.s[0] -sub v15.4s, v24.4s, v18.4s -str q15, [x0, #576] -mul v26.4S, v26.4S,v29.s[0] -add v24.4s, v24.4s, v18.4s -ldr q18, [x0, #784] -sqrdmulh v15.4S, v18.4S, v28.s[0] -sub v10.4s, v30.4s, v21.4s -str q24, [x0, #512] -mul v18.4S, v18.4S,v29.s[0] -add v30.4s, v30.4s, v21.4s -ldr q21, [x0, #848] -sqrdmulh v24.4S, v21.4S, v28.s[0] -sub v14.4s, v27.4s, v22.4s -str q10, [x0, #704] -mul v21.4S, v21.4S,v29.s[0] -add v27.4s, v27.4s, v22.4s -ldr q22, [x0, #528] -mla v12.4S, v20.4S, v31.s[0] -sub v20.4s, v19.4s, v8.4s -str q30, [x0, #640] -sqrdmulh v30.4S, v22.4S, v28.s[0] -add v19.4s, v19.4s, v8.4s -ldr q8, [x0, #592] -mla v26.4S, v25.4S, v31.s[0] -str q14, [x0, #832] -sqrdmulh v14.4S, v8.4S, v28.s[0] -ldr q25, [x0, #656] -mla v18.4S, v15.4S, v31.s[0] -str q27, [x0, #768] -sqrdmulh v27.4S, v25.4S, v28.s[0] -ldr q15, [x0, #720] -mla v21.4S, v24.4S, v31.s[0] -str q20, [x0, #960] -sqrdmulh v20.4S, v15.4S, v28.s[0] -ldr q24, [x0, #400] -ldr q10, [x0, #464] -mul v22.4S, v22.4S,v29.s[0] -sub v6.4s, v24.4s, v12.4s -str q19, [x0, #896] -mul v8.4S, v8.4S,v29.s[0] -add v24.4s, v24.4s, v12.4s -ldr q12, [x0, #272] -ldr q19, [x0, #336] -mla v22.4S, v30.4S, v31.s[0] -sub v30.4s, v10.4s, v26.4s -mla v8.4S, v14.4S, v31.s[0] -add v10.4s, v10.4s, v26.4s -ldr q26, [x0, #16] -ldr q14, [x0, #80] -mul v25.4S, v25.4S,v29.s[0] -sub v1.4s, v12.4s, v18.4s -mul v15.4S, v15.4S,v29.s[0] -add v12.4s, v12.4s, v18.4s -ldr q18, [x0, #144] -ldr q3, [x0, #208] -mla v25.4S, v27.4S, v31.s[0] -sub v27.4s, v19.4s, v21.4s -mla v15.4S, v20.4S, v31.s[0] -add v19.4s, v19.4s, v21.4s -sqrdmulh v21.4S, v24.4S, v28.s[1] -mul v24.4S, v24.4S,v29.s[1] -sqrdmulh v20.4S, v10.4S, v28.s[1] -sub v13.4s, v26.4s, v22.4s -mul v10.4S, v10.4S,v29.s[1] -add v26.4s, v26.4s, v22.4s -sqrdmulh v22.4S, v12.4S, v28.s[1] -sub v2.4s, v14.4s, v8.4s -mul v12.4S, v12.4S,v29.s[1] -add v14.4s, v14.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v28.s[1] -sub v16.4s, v18.4s, v25.4s -mul v19.4S, v19.4S,v29.s[1] -add v18.4s, v18.4s, v25.4s -mla v24.4S, v21.4S, v31.s[0] -sub v21.4s, v3.4s, v15.4s -sqrdmulh v25.4S, v6.4S, v28.s[2] -add v3.4s, v3.4s, v15.4s -mla v10.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v30.4S, v28.s[2] -mla v12.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v1.4S, v28.s[2] -mla v19.4S, v8.4S, v31.s[0] -sqrdmulh v8.4S, v27.4S, v28.s[2] -mul v6.4S, v6.4S,v29.s[2] -sub v15.4s, v18.4s, v24.4s -mul v30.4S, v30.4S,v29.s[2] -add v18.4s, v18.4s, v24.4s -mla v6.4S, v25.4S, v31.s[0] -sub v25.4s, v3.4s, v10.4s -mla v30.4S, v20.4S, v31.s[0] -add v3.4s, v3.4s, v10.4s -mul v1.4S, v1.4S,v29.s[2] -sub v10.4s, v26.4s, v12.4s -mul v27.4S, v27.4S,v29.s[2] -add v26.4s, v26.4s, v12.4s -mla v1.4S, v22.4S, v31.s[0] -sub v22.4s, v14.4s, v19.4s -mla v27.4S, v8.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -sqrdmulh v28.4S, v15.4S, v11.s[1] -mul v15.4S, v15.4S,v17.s[1] -sqrdmulh v29.4S, v25.4S, v11.s[1] -sub v19.4s, v16.4s, v6.4s -mul v25.4S, v25.4S,v17.s[1] -add v16.4s, v16.4s, v6.4s -sqrdmulh v6.4S, v18.4S, v11.s[0] -sub v8.4s, v21.4s, v30.4s -mul v18.4S, v18.4S,v17.s[0] -add v21.4s, v21.4s, v30.4s -sqrdmulh v30.4S, v3.4S, v11.s[0] -sub v12.4s, v13.4s, v1.4s -mul v3.4S, v3.4S,v17.s[0] -add v13.4s, v13.4s, v1.4s -mla v15.4S, v28.4S, v31.s[0] -sub v28.4s, v2.4s, v27.4s -sqrdmulh v1.4S, v16.4S, v11.s[2] -add v2.4s, v2.4s, v27.4s -mla v25.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v21.4S, v11.s[2] -mla v18.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v19.4S, v11.s[3] -mla v3.4S, v30.4S, v31.s[0] -sqrdmulh v30.4S, v8.4S, v11.s[3] -mul v16.4S, v16.4S,v17.s[2] -sub v27.4s, v10.4s, v15.4s -mul v21.4S, v21.4S,v17.s[2] -add v10.4s, v10.4s, v15.4s -mla v16.4S, v1.4S, v31.s[0] -sub v1.4s, v22.4s, v25.4s -mla v21.4S, v29.4S, v31.s[0] -add v22.4s, v22.4s, v25.4s -mul v19.4S, v19.4S,v17.s[3] -sub v25.4s, v26.4s, v18.4s -mul v8.4S, v8.4S,v17.s[3] -add v26.4s, v26.4s, v18.4s -mla v19.4S, v6.4S, v31.s[0] -sub v6.4s, v14.4s, v3.4s -mla v8.4S, v30.4S, v31.s[0] -add v14.4s, v14.4s, v3.4s -sqrdmulh v11.4S, v22.4S, v9.s[2] -mul v22.4S, v22.4S,v0.s[2] -sqrdmulh v17.4S, v1.4S, v9.s[3] -sub v3.4s, v13.4s, v16.4s -mul v1.4S, v1.4S,v0.s[3] -add v13.4s, v13.4s, v16.4s -sqrdmulh v16.4S, v6.4S, v9.s[1] -sub v30.4s, v2.4s, v21.4s -mul v6.4S, v6.4S,v0.s[1] -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v9.s[0] -sub v18.4s, v12.4s, v19.4s -mul v14.4S, v14.4S,v0.s[0] -add v12.4s, v12.4s, v19.4s -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v28.4s, v8.4s -sqrdmulh v9.4S, v2.4S, v7.s[0] -add v28.4s, v28.4s, v8.4s -mla v1.4S, v17.4S, v31.s[0] -sub v17.4s, v10.4s, v22.4s -sqrdmulh v8.4S, v30.4S, v7.s[1] -add v10.4s, v10.4s, v22.4s -mla v6.4S, v16.4S, v31.s[0] -sub v16.4s, v27.4s, v1.4s -sqrdmulh v22.4S, v28.4S, v7.s[2] -add v27.4s, v27.4s, v1.4s -mla v14.4S, v21.4S, v31.s[0] -sub v21.4s, v25.4s, v6.4s -sqrdmulh v1.4S, v11.4S, v7.s[3] -add v25.4s, v25.4s, v6.4s -mul v2.4S, v2.4S,v23.s[0] -sub v6.4s, v26.4s, v14.4s -mul v30.4S, v30.4S,v23.s[1] -add v26.4s, v26.4s, v14.4s -mla v2.4S, v9.4S, v31.s[0] -str q17, [x0, #336] -mla v30.4S, v8.4S, v31.s[0] -str q10, [x0, #272] -mul v28.4S, v28.4S,v23.s[2] -str q16, [x0, #464] -mul v11.4S, v11.4S,v23.s[3] -str q27, [x0, #400] -mla v28.4S, v22.4S, v31.s[0] -str q21, [x0, #208] -mla v11.4S, v1.4S, v31.s[0] -str q25, [x0, #144] -str q6, [x0, #80] -str q26, [x0, #16] -sub v26.4s, v13.4s, v2.4s -str q26, [x0, #592] -add v13.4s, v13.4s, v2.4s -sub v2.4s, v3.4s, v30.4s -str q13, [x0, #528] -add v3.4s, v3.4s, v30.4s -sub v30.4s, v12.4s, v28.4s -str q2, [x0, #720] -add v12.4s, v12.4s, v28.4s -sub v28.4s, v18.4s, v11.4s -str q3, [x0, #656] -add v18.4s, v18.4s, v11.4s -str q30, [x0, #848] -str q12, [x0, #784] -str q28, [x0, #976] -str q18, [x0, #912] -ldr q4, [x17, #+128] -ldr q5, [x17, #+144] -ldr q24, [x17, #+160] -ldr q20, [x17, #+176] -ldr q15, [x17, #+192] -ldr q29, [x17, #+208] -ldr q19, [x17, #+224] -ldr q0, [x17, #+240] -ldr q14, [x0, #32] -ldr q9, [x0, #48] -ldr q17, [x0, #0] -ldr q8, [x0, #16] -sqrdmulh v10.4S, v14.4S, v5.s[0] -mul v14.4S, v14.4S,v4.s[0] -mla v14.4S, v10.4S, v31.s[0] -sub v10.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -sqrdmulh v14.4S, v9.4S, v5.s[0] -mul v9.4S, v9.4S,v4.s[0] -mla v9.4S, v14.4S, v31.s[0] -sub v14.4s, v8.4s, v9.4s -add v8.4s, v8.4s, v9.4s -sqrdmulh v9.4S, v8.4S, v5.s[1] -mul v8.4S, v8.4S,v4.s[1] -mla v8.4S, v9.4S, v31.s[0] -sub v9.4s, v17.4s, v8.4s -add v17.4s, v17.4s, v8.4s -sqrdmulh v8.4S, v14.4S, v5.s[2] -mul v14.4S, v14.4S,v4.s[2] -mla v14.4S, v8.4S, v31.s[0] -sub v8.4s, v10.4s, v14.4s -add v10.4s, v10.4s, v14.4s -trn1 v14.4S, v17.4S, v9.4S -trn2 v16.4S, v17.4S, v9.4S -trn1 v27.4S, v10.4S, v8.4S -trn2 v22.4S, v10.4S, v8.4S -trn2 v10.2D, v14.2D, v27.2D -trn2 v8.2D, v16.2D, v22.2D -trn1 v17.2D, v14.2D, v27.2D -trn1 v9.2D, v16.2D, v22.2D -sqrdmulh v22.4S, v10.4S, v20.4S -mul v10.4S, v10.4S,v24.4S -mla v10.4S, v22.4S, v31.s[0] -sub v22.4s, v17.4s, v10.4s -add v17.4s, v17.4s, v10.4s -sqrdmulh v10.4S, v8.4S, v20.4S -mul v8.4S, v8.4S,v24.4S -mla v8.4S, v10.4S, v31.s[0] -sub v10.4s, v9.4s, v8.4s -add v9.4s, v9.4s, v8.4s -sqrdmulh v8.4S, v9.4S, v29.4S -mul v9.4S, v9.4S,v15.4S -mla v9.4S, v8.4S, v31.s[0] -sub v8.4s, v17.4s, v9.4s -add v17.4s, v17.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v0.4S -mul v10.4S, v10.4S,v19.4S -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v22.4s, v10.4s -add v22.4s, v22.4s, v10.4s -str q17, [x0, #0] -str q8, [x0, #16] -str q22, [x0, #32] -str q9, [x0, #48] -ldr q9, [x17, #+256] -ldr q22, [x17, #+272] -ldr q8, [x17, #+288] -ldr q17, [x17, #+304] -ldr q10, [x17, #+320] -ldr q16, [x17, #+336] -ldr q27, [x17, #+352] -ldr q14, [x17, #+368] -ldr q0, [x0, #96] -ldr q19, [x0, #112] -ldr q29, [x0, #64] -ldr q15, [x0, #80] -sqrdmulh v20.4S, v0.4S, v22.s[0] -mul v0.4S, v0.4S,v9.s[0] -mla v0.4S, v20.4S, v31.s[0] -sub v20.4s, v29.4s, v0.4s -add v29.4s, v29.4s, v0.4s -sqrdmulh v0.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v9.s[0] -mla v19.4S, v0.4S, v31.s[0] -sub v0.4s, v15.4s, v19.4s -add v15.4s, v15.4s, v19.4s -sqrdmulh v19.4S, v15.4S, v22.s[1] -mul v15.4S, v15.4S,v9.s[1] -mla v15.4S, v19.4S, v31.s[0] -sub v19.4s, v29.4s, v15.4s -add v29.4s, v29.4s, v15.4s -sqrdmulh v15.4S, v0.4S, v22.s[2] -mul v0.4S, v0.4S,v9.s[2] -mla v0.4S, v15.4S, v31.s[0] -sub v15.4s, v20.4s, v0.4s -add v20.4s, v20.4s, v0.4s -trn1 v0.4S, v29.4S, v19.4S -trn2 v24.4S, v29.4S, v19.4S -trn1 v5.4S, v20.4S, v15.4S -trn2 v4.4S, v20.4S, v15.4S -trn2 v20.2D, v0.2D, v5.2D -trn2 v15.2D, v24.2D, v4.2D -trn1 v29.2D, v0.2D, v5.2D -trn1 v19.2D, v24.2D, v4.2D -sqrdmulh v4.4S, v20.4S, v17.4S -mul v20.4S, v20.4S,v8.4S -mla v20.4S, v4.4S, v31.s[0] -sub v4.4s, v29.4s, v20.4s -add v29.4s, v29.4s, v20.4s -sqrdmulh v20.4S, v15.4S, v17.4S -mul v15.4S, v15.4S,v8.4S -mla v15.4S, v20.4S, v31.s[0] -sub v20.4s, v19.4s, v15.4s -add v19.4s, v19.4s, v15.4s -sqrdmulh v15.4S, v19.4S, v16.4S -mul v19.4S, v19.4S,v10.4S -mla v19.4S, v15.4S, v31.s[0] -sub v15.4s, v29.4s, v19.4s -add v29.4s, v29.4s, v19.4s -sqrdmulh v19.4S, v20.4S, v14.4S -mul v20.4S, v20.4S,v27.4S -mla v20.4S, v19.4S, v31.s[0] -sub v19.4s, v4.4s, v20.4s -add v4.4s, v4.4s, v20.4s -str q29, [x0, #64] -str q15, [x0, #80] -str q4, [x0, #96] -str q19, [x0, #112] -ldr q19, [x17, #+384] -ldr q4, [x17, #+400] -ldr q15, [x17, #+416] -ldr q29, [x17, #+432] -ldr q20, [x17, #+448] -ldr q24, [x17, #+464] -ldr q5, [x17, #+480] -ldr q0, [x17, #+496] -ldr q14, [x0, #160] -ldr q27, [x0, #176] -ldr q16, [x0, #128] -ldr q10, [x0, #144] -sqrdmulh v17.4S, v14.4S, v4.s[0] -mul v14.4S, v14.4S,v19.s[0] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v16.4s, v14.4s -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v27.4S, v4.s[0] -mul v27.4S, v27.4S,v19.s[0] -mla v27.4S, v14.4S, v31.s[0] -sub v14.4s, v10.4s, v27.4s -add v10.4s, v10.4s, v27.4s -sqrdmulh v27.4S, v10.4S, v4.s[1] -mul v10.4S, v10.4S,v19.s[1] -mla v10.4S, v27.4S, v31.s[0] -sub v27.4s, v16.4s, v10.4s -add v16.4s, v16.4s, v10.4s -sqrdmulh v10.4S, v14.4S, v4.s[2] -mul v14.4S, v14.4S,v19.s[2] -mla v14.4S, v10.4S, v31.s[0] -sub v10.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -trn1 v14.4S, v16.4S, v27.4S -trn2 v8.4S, v16.4S, v27.4S -trn1 v22.4S, v17.4S, v10.4S -trn2 v9.4S, v17.4S, v10.4S -trn2 v17.2D, v14.2D, v22.2D -trn2 v10.2D, v8.2D, v9.2D -trn1 v16.2D, v14.2D, v22.2D -trn1 v27.2D, v8.2D, v9.2D -sqrdmulh v9.4S, v17.4S, v29.4S -mul v17.4S, v17.4S,v15.4S -mla v17.4S, v9.4S, v31.s[0] -sub v9.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -sqrdmulh v17.4S, v10.4S, v29.4S -mul v10.4S, v10.4S,v15.4S -mla v10.4S, v17.4S, v31.s[0] -sub v17.4s, v27.4s, v10.4s -add v27.4s, v27.4s, v10.4s -sqrdmulh v10.4S, v27.4S, v24.4S -mul v27.4S, v27.4S,v20.4S -mla v27.4S, v10.4S, v31.s[0] -sub v10.4s, v16.4s, v27.4s -add v16.4s, v16.4s, v27.4s -sqrdmulh v27.4S, v17.4S, v0.4S -mul v17.4S, v17.4S,v5.4S -mla v17.4S, v27.4S, v31.s[0] -sub v27.4s, v9.4s, v17.4s -add v9.4s, v9.4s, v17.4s -str q16, [x0, #128] -str q10, [x0, #144] -str q9, [x0, #160] -str q27, [x0, #176] -ldr q27, [x17, #+512] -ldr q9, [x17, #+528] -ldr q10, [x17, #+544] -ldr q16, [x17, #+560] -ldr q17, [x17, #+576] -ldr q8, [x17, #+592] -ldr q22, [x17, #+608] -ldr q14, [x17, #+624] -ldr q0, [x0, #224] -ldr q5, [x0, #240] -ldr q24, [x0, #192] -ldr q20, [x0, #208] -sqrdmulh v29.4S, v0.4S, v9.s[0] -mul v0.4S, v0.4S,v27.s[0] -mla v0.4S, v29.4S, v31.s[0] -sub v29.4s, v24.4s, v0.4s -add v24.4s, v24.4s, v0.4s -sqrdmulh v0.4S, v5.4S, v9.s[0] -mul v5.4S, v5.4S,v27.s[0] -mla v5.4S, v0.4S, v31.s[0] -sub v0.4s, v20.4s, v5.4s -add v20.4s, v20.4s, v5.4s -sqrdmulh v5.4S, v20.4S, v9.s[1] -mul v20.4S, v20.4S,v27.s[1] -mla v20.4S, v5.4S, v31.s[0] -sub v5.4s, v24.4s, v20.4s -add v24.4s, v24.4s, v20.4s -sqrdmulh v20.4S, v0.4S, v9.s[2] -mul v0.4S, v0.4S,v27.s[2] -mla v0.4S, v20.4S, v31.s[0] -sub v20.4s, v29.4s, v0.4s -add v29.4s, v29.4s, v0.4s -trn1 v0.4S, v24.4S, v5.4S -trn2 v15.4S, v24.4S, v5.4S -trn1 v4.4S, v29.4S, v20.4S -trn2 v19.4S, v29.4S, v20.4S -trn2 v29.2D, v0.2D, v4.2D -trn2 v20.2D, v15.2D, v19.2D -trn1 v24.2D, v0.2D, v4.2D -trn1 v5.2D, v15.2D, v19.2D -sqrdmulh v19.4S, v29.4S, v16.4S -mul v29.4S, v29.4S,v10.4S -mla v29.4S, v19.4S, v31.s[0] -sub v19.4s, v24.4s, v29.4s -add v24.4s, v24.4s, v29.4s -sqrdmulh v29.4S, v20.4S, v16.4S -mul v20.4S, v20.4S,v10.4S -mla v20.4S, v29.4S, v31.s[0] -sub v29.4s, v5.4s, v20.4s -add v5.4s, v5.4s, v20.4s -sqrdmulh v20.4S, v5.4S, v8.4S -mul v5.4S, v5.4S,v17.4S -mla v5.4S, v20.4S, v31.s[0] -sub v20.4s, v24.4s, v5.4s -add v24.4s, v24.4s, v5.4s -sqrdmulh v5.4S, v29.4S, v14.4S -mul v29.4S, v29.4S,v22.4S -mla v29.4S, v5.4S, v31.s[0] -sub v5.4s, v19.4s, v29.4s -add v19.4s, v19.4s, v29.4s -str q24, [x0, #192] -str q20, [x0, #208] -str q19, [x0, #224] -str q5, [x0, #240] -ldr q5, [x17, #+640] -ldr q19, [x17, #+656] -ldr q20, [x17, #+672] -ldr q24, [x17, #+688] -ldr q29, [x17, #+704] -ldr q15, [x17, #+720] -ldr q4, [x17, #+736] -ldr q0, [x17, #+752] -ldr q14, [x0, #288] -ldr q22, [x0, #304] -ldr q8, [x0, #256] -ldr q17, [x0, #272] -sqrdmulh v16.4S, v14.4S, v19.s[0] -mul v14.4S, v14.4S,v5.s[0] -mla v14.4S, v16.4S, v31.s[0] -sub v16.4s, v8.4s, v14.4s -add v8.4s, v8.4s, v14.4s -sqrdmulh v14.4S, v22.4S, v19.s[0] -mul v22.4S, v22.4S,v5.s[0] -mla v22.4S, v14.4S, v31.s[0] -sub v14.4s, v17.4s, v22.4s -add v17.4s, v17.4s, v22.4s -sqrdmulh v22.4S, v17.4S, v19.s[1] -mul v17.4S, v17.4S,v5.s[1] -mla v17.4S, v22.4S, v31.s[0] -sub v22.4s, v8.4s, v17.4s -add v8.4s, v8.4s, v17.4s -sqrdmulh v17.4S, v14.4S, v19.s[2] -mul v14.4S, v14.4S,v5.s[2] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v16.4s, v14.4s -add v16.4s, v16.4s, v14.4s -trn1 v14.4S, v8.4S, v22.4S -trn2 v10.4S, v8.4S, v22.4S -trn1 v9.4S, v16.4S, v17.4S -trn2 v27.4S, v16.4S, v17.4S -trn2 v16.2D, v14.2D, v9.2D -trn2 v17.2D, v10.2D, v27.2D -trn1 v8.2D, v14.2D, v9.2D -trn1 v22.2D, v10.2D, v27.2D -sqrdmulh v27.4S, v16.4S, v24.4S -mul v16.4S, v16.4S,v20.4S -mla v16.4S, v27.4S, v31.s[0] -sub v27.4s, v8.4s, v16.4s -add v8.4s, v8.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v24.4S -mul v17.4S, v17.4S,v20.4S -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v22.4s, v17.4s -add v22.4s, v22.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v15.4S -mul v22.4S, v22.4S,v29.4S -mla v22.4S, v17.4S, v31.s[0] -sub v17.4s, v8.4s, v22.4s -add v8.4s, v8.4s, v22.4s -sqrdmulh v22.4S, v16.4S, v0.4S -mul v16.4S, v16.4S,v4.4S -mla v16.4S, v22.4S, v31.s[0] -sub v22.4s, v27.4s, v16.4s -add v27.4s, v27.4s, v16.4s -str q8, [x0, #256] -str q17, [x0, #272] -str q27, [x0, #288] -str q22, [x0, #304] -ldr q22, [x17, #+768] -ldr q27, [x17, #+784] -ldr q17, [x17, #+800] -ldr q8, [x17, #+816] -ldr q16, [x17, #+832] -ldr q10, [x17, #+848] -ldr q9, [x17, #+864] -ldr q14, [x17, #+880] -ldr q0, [x0, #352] -ldr q4, [x0, #368] -ldr q15, [x0, #320] -ldr q29, [x0, #336] -sqrdmulh v24.4S, v0.4S, v27.s[0] -mul v0.4S, v0.4S,v22.s[0] -mla v0.4S, v24.4S, v31.s[0] -sub v24.4s, v15.4s, v0.4s -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v4.4S, v27.s[0] -mul v4.4S, v4.4S,v22.s[0] -mla v4.4S, v0.4S, v31.s[0] -sub v0.4s, v29.4s, v4.4s -add v29.4s, v29.4s, v4.4s -sqrdmulh v4.4S, v29.4S, v27.s[1] -mul v29.4S, v29.4S,v22.s[1] -mla v29.4S, v4.4S, v31.s[0] -sub v4.4s, v15.4s, v29.4s -add v15.4s, v15.4s, v29.4s -sqrdmulh v29.4S, v0.4S, v27.s[2] -mul v0.4S, v0.4S,v22.s[2] -mla v0.4S, v29.4S, v31.s[0] -sub v29.4s, v24.4s, v0.4s -add v24.4s, v24.4s, v0.4s -trn1 v0.4S, v15.4S, v4.4S -trn2 v20.4S, v15.4S, v4.4S -trn1 v19.4S, v24.4S, v29.4S -trn2 v5.4S, v24.4S, v29.4S -trn2 v24.2D, v0.2D, v19.2D -trn2 v29.2D, v20.2D, v5.2D -trn1 v15.2D, v0.2D, v19.2D -trn1 v4.2D, v20.2D, v5.2D -sqrdmulh v5.4S, v24.4S, v8.4S -mul v24.4S, v24.4S,v17.4S -mla v24.4S, v5.4S, v31.s[0] -sub v5.4s, v15.4s, v24.4s -add v15.4s, v15.4s, v24.4s -sqrdmulh v24.4S, v29.4S, v8.4S -mul v29.4S, v29.4S,v17.4S -mla v29.4S, v24.4S, v31.s[0] -sub v24.4s, v4.4s, v29.4s -add v4.4s, v4.4s, v29.4s -sqrdmulh v29.4S, v4.4S, v10.4S -mul v4.4S, v4.4S,v16.4S -mla v4.4S, v29.4S, v31.s[0] -sub v29.4s, v15.4s, v4.4s -add v15.4s, v15.4s, v4.4s -sqrdmulh v4.4S, v24.4S, v14.4S -mul v24.4S, v24.4S,v9.4S -mla v24.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v24.4s -add v5.4s, v5.4s, v24.4s -str q15, [x0, #320] -str q29, [x0, #336] -str q5, [x0, #352] -str q4, [x0, #368] -ldr q4, [x17, #+896] -ldr q5, [x17, #+912] -ldr q29, [x17, #+928] -ldr q15, [x17, #+944] -ldr q24, [x17, #+960] -ldr q20, [x17, #+976] -ldr q19, [x17, #+992] -ldr q0, [x17, #+1008] -ldr q14, [x0, #416] -ldr q9, [x0, #432] -ldr q10, [x0, #384] -ldr q16, [x0, #400] -sqrdmulh v8.4S, v14.4S, v5.s[0] -mul v14.4S, v14.4S,v4.s[0] -mla v14.4S, v8.4S, v31.s[0] -sub v8.4s, v10.4s, v14.4s -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v9.4S, v5.s[0] -mul v9.4S, v9.4S,v4.s[0] -mla v9.4S, v14.4S, v31.s[0] -sub v14.4s, v16.4s, v9.4s -add v16.4s, v16.4s, v9.4s -sqrdmulh v9.4S, v16.4S, v5.s[1] -mul v16.4S, v16.4S,v4.s[1] -mla v16.4S, v9.4S, v31.s[0] -sub v9.4s, v10.4s, v16.4s -add v10.4s, v10.4s, v16.4s -sqrdmulh v16.4S, v14.4S, v5.s[2] -mul v14.4S, v14.4S,v4.s[2] -mla v14.4S, v16.4S, v31.s[0] -sub v16.4s, v8.4s, v14.4s -add v8.4s, v8.4s, v14.4s -trn1 v14.4S, v10.4S, v9.4S -trn2 v17.4S, v10.4S, v9.4S -trn1 v27.4S, v8.4S, v16.4S -trn2 v22.4S, v8.4S, v16.4S -trn2 v8.2D, v14.2D, v27.2D -trn2 v16.2D, v17.2D, v22.2D -trn1 v10.2D, v14.2D, v27.2D -trn1 v9.2D, v17.2D, v22.2D -sqrdmulh v22.4S, v8.4S, v15.4S -mul v8.4S, v8.4S,v29.4S -mla v8.4S, v22.4S, v31.s[0] -sub v22.4s, v10.4s, v8.4s -add v10.4s, v10.4s, v8.4s -sqrdmulh v8.4S, v16.4S, v15.4S -mul v16.4S, v16.4S,v29.4S -mla v16.4S, v8.4S, v31.s[0] -sub v8.4s, v9.4s, v16.4s -add v9.4s, v9.4s, v16.4s -sqrdmulh v16.4S, v9.4S, v20.4S -mul v9.4S, v9.4S,v24.4S -mla v9.4S, v16.4S, v31.s[0] -sub v16.4s, v10.4s, v9.4s -add v10.4s, v10.4s, v9.4s -sqrdmulh v9.4S, v8.4S, v0.4S -mul v8.4S, v8.4S,v19.4S -mla v8.4S, v9.4S, v31.s[0] -sub v9.4s, v22.4s, v8.4s -add v22.4s, v22.4s, v8.4s -str q10, [x0, #384] -str q16, [x0, #400] -str q22, [x0, #416] -str q9, [x0, #432] -ldr q9, [x17, #+1024] -ldr q22, [x17, #+1040] -ldr q16, [x17, #+1056] -ldr q10, [x17, #+1072] -ldr q8, [x17, #+1088] -ldr q17, [x17, #+1104] -ldr q27, [x17, #+1120] -ldr q14, [x17, #+1136] -ldr q0, [x0, #480] -ldr q19, [x0, #496] -ldr q20, [x0, #448] -ldr q24, [x0, #464] -sqrdmulh v15.4S, v0.4S, v22.s[0] -mul v0.4S, v0.4S,v9.s[0] -mla v0.4S, v15.4S, v31.s[0] -sub v15.4s, v20.4s, v0.4s -add v20.4s, v20.4s, v0.4s -sqrdmulh v0.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v9.s[0] -mla v19.4S, v0.4S, v31.s[0] -sub v0.4s, v24.4s, v19.4s -add v24.4s, v24.4s, v19.4s -sqrdmulh v19.4S, v24.4S, v22.s[1] -mul v24.4S, v24.4S,v9.s[1] -mla v24.4S, v19.4S, v31.s[0] -sub v19.4s, v20.4s, v24.4s -add v20.4s, v20.4s, v24.4s -sqrdmulh v24.4S, v0.4S, v22.s[2] -mul v0.4S, v0.4S,v9.s[2] -mla v0.4S, v24.4S, v31.s[0] -sub v24.4s, v15.4s, v0.4s -add v15.4s, v15.4s, v0.4s -trn1 v0.4S, v20.4S, v19.4S -trn2 v29.4S, v20.4S, v19.4S -trn1 v5.4S, v15.4S, v24.4S -trn2 v4.4S, v15.4S, v24.4S -trn2 v15.2D, v0.2D, v5.2D -trn2 v24.2D, v29.2D, v4.2D -trn1 v20.2D, v0.2D, v5.2D -trn1 v19.2D, v29.2D, v4.2D -sqrdmulh v4.4S, v15.4S, v10.4S -mul v15.4S, v15.4S,v16.4S -mla v15.4S, v4.4S, v31.s[0] -sub v4.4s, v20.4s, v15.4s -add v20.4s, v20.4s, v15.4s -sqrdmulh v15.4S, v24.4S, v10.4S -mul v24.4S, v24.4S,v16.4S -mla v24.4S, v15.4S, v31.s[0] -sub v15.4s, v19.4s, v24.4s -add v19.4s, v19.4s, v24.4s -sqrdmulh v24.4S, v19.4S, v17.4S -mul v19.4S, v19.4S,v8.4S -mla v19.4S, v24.4S, v31.s[0] -sub v24.4s, v20.4s, v19.4s -add v20.4s, v20.4s, v19.4s -sqrdmulh v19.4S, v15.4S, v14.4S -mul v15.4S, v15.4S,v27.4S -mla v15.4S, v19.4S, v31.s[0] -sub v19.4s, v4.4s, v15.4s -add v4.4s, v4.4s, v15.4s -str q20, [x0, #448] -str q24, [x0, #464] -str q4, [x0, #480] -str q19, [x0, #496] -ldr q19, [x17, #+1152] -ldr q4, [x17, #+1168] -ldr q24, [x17, #+1184] -ldr q20, [x17, #+1200] -ldr q15, [x17, #+1216] -ldr q29, [x17, #+1232] -ldr q5, [x17, #+1248] -ldr q0, [x17, #+1264] -ldr q14, [x0, #544] -ldr q27, [x0, #560] -ldr q17, [x0, #512] -ldr q8, [x0, #528] -sqrdmulh v10.4S, v14.4S, v4.s[0] -mul v14.4S, v14.4S,v19.s[0] -mla v14.4S, v10.4S, v31.s[0] -sub v10.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -sqrdmulh v14.4S, v27.4S, v4.s[0] -mul v27.4S, v27.4S,v19.s[0] -mla v27.4S, v14.4S, v31.s[0] -sub v14.4s, v8.4s, v27.4s -add v8.4s, v8.4s, v27.4s -sqrdmulh v27.4S, v8.4S, v4.s[1] -mul v8.4S, v8.4S,v19.s[1] -mla v8.4S, v27.4S, v31.s[0] -sub v27.4s, v17.4s, v8.4s -add v17.4s, v17.4s, v8.4s -sqrdmulh v8.4S, v14.4S, v4.s[2] -mul v14.4S, v14.4S,v19.s[2] -mla v14.4S, v8.4S, v31.s[0] -sub v8.4s, v10.4s, v14.4s -add v10.4s, v10.4s, v14.4s -trn1 v14.4S, v17.4S, v27.4S -trn2 v16.4S, v17.4S, v27.4S -trn1 v22.4S, v10.4S, v8.4S -trn2 v9.4S, v10.4S, v8.4S -trn2 v10.2D, v14.2D, v22.2D -trn2 v8.2D, v16.2D, v9.2D -trn1 v17.2D, v14.2D, v22.2D -trn1 v27.2D, v16.2D, v9.2D -sqrdmulh v9.4S, v10.4S, v20.4S -mul v10.4S, v10.4S,v24.4S -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v17.4s, v10.4s -add v17.4s, v17.4s, v10.4s -sqrdmulh v10.4S, v8.4S, v20.4S -mul v8.4S, v8.4S,v24.4S -mla v8.4S, v10.4S, v31.s[0] -sub v10.4s, v27.4s, v8.4s -add v27.4s, v27.4s, v8.4s -sqrdmulh v8.4S, v27.4S, v29.4S -mul v27.4S, v27.4S,v15.4S -mla v27.4S, v8.4S, v31.s[0] -sub v8.4s, v17.4s, v27.4s -add v17.4s, v17.4s, v27.4s -sqrdmulh v27.4S, v10.4S, v0.4S -mul v10.4S, v10.4S,v5.4S -mla v10.4S, v27.4S, v31.s[0] -sub v27.4s, v9.4s, v10.4s -add v9.4s, v9.4s, v10.4s -str q17, [x0, #512] -str q8, [x0, #528] -str q9, [x0, #544] -str q27, [x0, #560] -ldr q27, [x17, #+1280] -ldr q9, [x17, #+1296] -ldr q8, [x17, #+1312] -ldr q17, [x17, #+1328] -ldr q10, [x17, #+1344] -ldr q16, [x17, #+1360] -ldr q22, [x17, #+1376] -ldr q14, [x17, #+1392] -ldr q0, [x0, #608] -ldr q5, [x0, #624] -ldr q29, [x0, #576] -ldr q15, [x0, #592] -sqrdmulh v20.4S, v0.4S, v9.s[0] -mul v0.4S, v0.4S,v27.s[0] -mla v0.4S, v20.4S, v31.s[0] -sub v20.4s, v29.4s, v0.4s -add v29.4s, v29.4s, v0.4s -sqrdmulh v0.4S, v5.4S, v9.s[0] -mul v5.4S, v5.4S,v27.s[0] -mla v5.4S, v0.4S, v31.s[0] -sub v0.4s, v15.4s, v5.4s -add v15.4s, v15.4s, v5.4s -sqrdmulh v5.4S, v15.4S, v9.s[1] -mul v15.4S, v15.4S,v27.s[1] -mla v15.4S, v5.4S, v31.s[0] -sub v5.4s, v29.4s, v15.4s -add v29.4s, v29.4s, v15.4s -sqrdmulh v15.4S, v0.4S, v9.s[2] -mul v0.4S, v0.4S,v27.s[2] -mla v0.4S, v15.4S, v31.s[0] -sub v15.4s, v20.4s, v0.4s -add v20.4s, v20.4s, v0.4s -trn1 v0.4S, v29.4S, v5.4S -trn2 v24.4S, v29.4S, v5.4S -trn1 v4.4S, v20.4S, v15.4S -trn2 v19.4S, v20.4S, v15.4S -trn2 v20.2D, v0.2D, v4.2D -trn2 v15.2D, v24.2D, v19.2D -trn1 v29.2D, v0.2D, v4.2D -trn1 v5.2D, v24.2D, v19.2D -sqrdmulh v19.4S, v20.4S, v17.4S -mul v20.4S, v20.4S,v8.4S -mla v20.4S, v19.4S, v31.s[0] -sub v19.4s, v29.4s, v20.4s -add v29.4s, v29.4s, v20.4s -sqrdmulh v20.4S, v15.4S, v17.4S -mul v15.4S, v15.4S,v8.4S -mla v15.4S, v20.4S, v31.s[0] -sub v20.4s, v5.4s, v15.4s -add v5.4s, v5.4s, v15.4s -sqrdmulh v15.4S, v5.4S, v16.4S -mul v5.4S, v5.4S,v10.4S -mla v5.4S, v15.4S, v31.s[0] -sub v15.4s, v29.4s, v5.4s -add v29.4s, v29.4s, v5.4s -sqrdmulh v5.4S, v20.4S, v14.4S -mul v20.4S, v20.4S,v22.4S -mla v20.4S, v5.4S, v31.s[0] -sub v5.4s, v19.4s, v20.4s -add v19.4s, v19.4s, v20.4s -str q29, [x0, #576] -str q15, [x0, #592] -str q19, [x0, #608] -str q5, [x0, #624] -ldr q5, [x17, #+1408] -ldr q19, [x17, #+1424] -ldr q15, [x17, #+1440] -ldr q29, [x17, #+1456] -ldr q20, [x17, #+1472] -ldr q24, [x17, #+1488] -ldr q4, [x17, #+1504] -ldr q0, [x17, #+1520] -ldr q14, [x0, #672] -ldr q22, [x0, #688] -ldr q16, [x0, #640] -ldr q10, [x0, #656] -sqrdmulh v17.4S, v14.4S, v19.s[0] -mul v14.4S, v14.4S,v5.s[0] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v16.4s, v14.4s -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v22.4S, v19.s[0] -mul v22.4S, v22.4S,v5.s[0] -mla v22.4S, v14.4S, v31.s[0] -sub v14.4s, v10.4s, v22.4s -add v10.4s, v10.4s, v22.4s -sqrdmulh v22.4S, v10.4S, v19.s[1] -mul v10.4S, v10.4S,v5.s[1] -mla v10.4S, v22.4S, v31.s[0] -sub v22.4s, v16.4s, v10.4s -add v16.4s, v16.4s, v10.4s -sqrdmulh v10.4S, v14.4S, v19.s[2] -mul v14.4S, v14.4S,v5.s[2] -mla v14.4S, v10.4S, v31.s[0] -sub v10.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -trn1 v14.4S, v16.4S, v22.4S -trn2 v8.4S, v16.4S, v22.4S -trn1 v9.4S, v17.4S, v10.4S -trn2 v27.4S, v17.4S, v10.4S -trn2 v17.2D, v14.2D, v9.2D -trn2 v10.2D, v8.2D, v27.2D -trn1 v16.2D, v14.2D, v9.2D -trn1 v22.2D, v8.2D, v27.2D -sqrdmulh v27.4S, v17.4S, v29.4S -mul v17.4S, v17.4S,v15.4S -mla v17.4S, v27.4S, v31.s[0] -sub v27.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -sqrdmulh v17.4S, v10.4S, v29.4S -mul v10.4S, v10.4S,v15.4S -mla v10.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v10.4s -add v22.4s, v22.4s, v10.4s -sqrdmulh v10.4S, v22.4S, v24.4S -mul v22.4S, v22.4S,v20.4S -mla v22.4S, v10.4S, v31.s[0] -sub v10.4s, v16.4s, v22.4s -add v16.4s, v16.4s, v22.4s -sqrdmulh v22.4S, v17.4S, v0.4S -mul v17.4S, v17.4S,v4.4S -mla v17.4S, v22.4S, v31.s[0] -sub v22.4s, v27.4s, v17.4s -add v27.4s, v27.4s, v17.4s -str q16, [x0, #640] -str q10, [x0, #656] -str q27, [x0, #672] -str q22, [x0, #688] -ldr q22, [x17, #+1536] -ldr q27, [x17, #+1552] -ldr q10, [x17, #+1568] -ldr q16, [x17, #+1584] -ldr q17, [x17, #+1600] -ldr q8, [x17, #+1616] -ldr q9, [x17, #+1632] -ldr q14, [x17, #+1648] -ldr q0, [x0, #736] -ldr q4, [x0, #752] -ldr q24, [x0, #704] -ldr q20, [x0, #720] -sqrdmulh v29.4S, v0.4S, v27.s[0] -mul v0.4S, v0.4S,v22.s[0] -mla v0.4S, v29.4S, v31.s[0] -sub v29.4s, v24.4s, v0.4s -add v24.4s, v24.4s, v0.4s -sqrdmulh v0.4S, v4.4S, v27.s[0] -mul v4.4S, v4.4S,v22.s[0] -mla v4.4S, v0.4S, v31.s[0] -sub v0.4s, v20.4s, v4.4s -add v20.4s, v20.4s, v4.4s -sqrdmulh v4.4S, v20.4S, v27.s[1] -mul v20.4S, v20.4S,v22.s[1] -mla v20.4S, v4.4S, v31.s[0] -sub v4.4s, v24.4s, v20.4s -add v24.4s, v24.4s, v20.4s -sqrdmulh v20.4S, v0.4S, v27.s[2] -mul v0.4S, v0.4S,v22.s[2] -mla v0.4S, v20.4S, v31.s[0] -sub v20.4s, v29.4s, v0.4s -add v29.4s, v29.4s, v0.4s -trn1 v0.4S, v24.4S, v4.4S -trn2 v15.4S, v24.4S, v4.4S -trn1 v19.4S, v29.4S, v20.4S -trn2 v5.4S, v29.4S, v20.4S -trn2 v29.2D, v0.2D, v19.2D -trn2 v20.2D, v15.2D, v5.2D -trn1 v24.2D, v0.2D, v19.2D -trn1 v4.2D, v15.2D, v5.2D -sqrdmulh v5.4S, v29.4S, v16.4S -mul v29.4S, v29.4S,v10.4S -mla v29.4S, v5.4S, v31.s[0] -sub v5.4s, v24.4s, v29.4s -add v24.4s, v24.4s, v29.4s -sqrdmulh v29.4S, v20.4S, v16.4S -mul v20.4S, v20.4S,v10.4S -mla v20.4S, v29.4S, v31.s[0] -sub v29.4s, v4.4s, v20.4s -add v4.4s, v4.4s, v20.4s -sqrdmulh v20.4S, v4.4S, v8.4S -mul v4.4S, v4.4S,v17.4S -mla v4.4S, v20.4S, v31.s[0] -sub v20.4s, v24.4s, v4.4s -add v24.4s, v24.4s, v4.4s -sqrdmulh v4.4S, v29.4S, v14.4S -mul v29.4S, v29.4S,v9.4S -mla v29.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v29.4s -add v5.4s, v5.4s, v29.4s -str q24, [x0, #704] -str q20, [x0, #720] -str q5, [x0, #736] -str q4, [x0, #752] -ldr q4, [x17, #+1664] -ldr q5, [x17, #+1680] -ldr q20, [x17, #+1696] -ldr q24, [x17, #+1712] -ldr q29, [x17, #+1728] -ldr q15, [x17, #+1744] -ldr q19, [x17, #+1760] -ldr q0, [x17, #+1776] -ldr q14, [x0, #800] -ldr q9, [x0, #816] -ldr q8, [x0, #768] -ldr q17, [x0, #784] -sqrdmulh v16.4S, v14.4S, v5.s[0] -mul v14.4S, v14.4S,v4.s[0] -mla v14.4S, v16.4S, v31.s[0] -sub v16.4s, v8.4s, v14.4s -add v8.4s, v8.4s, v14.4s -sqrdmulh v14.4S, v9.4S, v5.s[0] -mul v9.4S, v9.4S,v4.s[0] -mla v9.4S, v14.4S, v31.s[0] -sub v14.4s, v17.4s, v9.4s -add v17.4s, v17.4s, v9.4s -sqrdmulh v9.4S, v17.4S, v5.s[1] -mul v17.4S, v17.4S,v4.s[1] -mla v17.4S, v9.4S, v31.s[0] -sub v9.4s, v8.4s, v17.4s -add v8.4s, v8.4s, v17.4s -sqrdmulh v17.4S, v14.4S, v5.s[2] -mul v14.4S, v14.4S,v4.s[2] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v16.4s, v14.4s -add v16.4s, v16.4s, v14.4s -trn1 v14.4S, v8.4S, v9.4S -trn2 v10.4S, v8.4S, v9.4S -trn1 v27.4S, v16.4S, v17.4S -trn2 v22.4S, v16.4S, v17.4S -trn2 v16.2D, v14.2D, v27.2D -trn2 v17.2D, v10.2D, v22.2D -trn1 v8.2D, v14.2D, v27.2D -trn1 v9.2D, v10.2D, v22.2D -sqrdmulh v22.4S, v16.4S, v24.4S -mul v16.4S, v16.4S,v20.4S -mla v16.4S, v22.4S, v31.s[0] -sub v22.4s, v8.4s, v16.4s -add v8.4s, v8.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v24.4S -mul v17.4S, v17.4S,v20.4S -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v9.4s, v17.4s -add v9.4s, v9.4s, v17.4s -sqrdmulh v17.4S, v9.4S, v15.4S -mul v9.4S, v9.4S,v29.4S -mla v9.4S, v17.4S, v31.s[0] -sub v17.4s, v8.4s, v9.4s -add v8.4s, v8.4s, v9.4s -sqrdmulh v9.4S, v16.4S, v0.4S -mul v16.4S, v16.4S,v19.4S -mla v16.4S, v9.4S, v31.s[0] -sub v9.4s, v22.4s, v16.4s -add v22.4s, v22.4s, v16.4s -str q8, [x0, #768] -str q17, [x0, #784] -str q22, [x0, #800] -str q9, [x0, #816] -ldr q9, [x17, #+1792] -ldr q22, [x17, #+1808] -ldr q17, [x17, #+1824] -ldr q8, [x17, #+1840] -ldr q16, [x17, #+1856] -ldr q10, [x17, #+1872] -ldr q27, [x17, #+1888] -ldr q14, [x17, #+1904] -ldr q0, [x0, #864] -ldr q19, [x0, #880] -ldr q15, [x0, #832] -ldr q29, [x0, #848] -sqrdmulh v24.4S, v0.4S, v22.s[0] -mul v0.4S, v0.4S,v9.s[0] -mla v0.4S, v24.4S, v31.s[0] -sub v24.4s, v15.4s, v0.4s -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v9.s[0] -mla v19.4S, v0.4S, v31.s[0] -sub v0.4s, v29.4s, v19.4s -add v29.4s, v29.4s, v19.4s -sqrdmulh v19.4S, v29.4S, v22.s[1] -mul v29.4S, v29.4S,v9.s[1] -mla v29.4S, v19.4S, v31.s[0] -sub v19.4s, v15.4s, v29.4s -add v15.4s, v15.4s, v29.4s -sqrdmulh v29.4S, v0.4S, v22.s[2] -mul v0.4S, v0.4S,v9.s[2] -mla v0.4S, v29.4S, v31.s[0] -sub v29.4s, v24.4s, v0.4s -add v24.4s, v24.4s, v0.4s -trn1 v0.4S, v15.4S, v19.4S -trn2 v20.4S, v15.4S, v19.4S -trn1 v5.4S, v24.4S, v29.4S -trn2 v4.4S, v24.4S, v29.4S -trn2 v24.2D, v0.2D, v5.2D -trn2 v29.2D, v20.2D, v4.2D -trn1 v15.2D, v0.2D, v5.2D -trn1 v19.2D, v20.2D, v4.2D -sqrdmulh v4.4S, v24.4S, v8.4S -mul v24.4S, v24.4S,v17.4S -mla v24.4S, v4.4S, v31.s[0] -sub v4.4s, v15.4s, v24.4s -add v15.4s, v15.4s, v24.4s -sqrdmulh v24.4S, v29.4S, v8.4S -mul v29.4S, v29.4S,v17.4S -mla v29.4S, v24.4S, v31.s[0] -sub v24.4s, v19.4s, v29.4s -add v19.4s, v19.4s, v29.4s -sqrdmulh v29.4S, v19.4S, v10.4S -mul v19.4S, v19.4S,v16.4S -mla v19.4S, v29.4S, v31.s[0] -sub v29.4s, v15.4s, v19.4s -add v15.4s, v15.4s, v19.4s -sqrdmulh v19.4S, v24.4S, v14.4S -mul v24.4S, v24.4S,v27.4S -mla v24.4S, v19.4S, v31.s[0] -sub v19.4s, v4.4s, v24.4s -add v4.4s, v4.4s, v24.4s -str q15, [x0, #832] -str q29, [x0, #848] -str q4, [x0, #864] -str q19, [x0, #880] -ldr q19, [x17, #+1920] -ldr q4, [x17, #+1936] -ldr q29, [x17, #+1952] -ldr q15, [x17, #+1968] -ldr q24, [x17, #+1984] -ldr q20, [x17, #+2000] -ldr q5, [x17, #+2016] -ldr q0, [x17, #+2032] -ldr q14, [x0, #928] -ldr q27, [x0, #944] -ldr q10, [x0, #896] -ldr q16, [x0, #912] -sqrdmulh v8.4S, v14.4S, v4.s[0] -mul v14.4S, v14.4S,v19.s[0] -mla v14.4S, v8.4S, v31.s[0] -sub v8.4s, v10.4s, v14.4s -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v27.4S, v4.s[0] -mul v27.4S, v27.4S,v19.s[0] -mla v27.4S, v14.4S, v31.s[0] -sub v14.4s, v16.4s, v27.4s -add v16.4s, v16.4s, v27.4s -sqrdmulh v27.4S, v16.4S, v4.s[1] -mul v16.4S, v16.4S,v19.s[1] -mla v16.4S, v27.4S, v31.s[0] -sub v27.4s, v10.4s, v16.4s -add v10.4s, v10.4s, v16.4s -sqrdmulh v16.4S, v14.4S, v4.s[2] -mul v14.4S, v14.4S,v19.s[2] -mla v14.4S, v16.4S, v31.s[0] -sub v16.4s, v8.4s, v14.4s -add v8.4s, v8.4s, v14.4s -trn1 v14.4S, v10.4S, v27.4S -trn2 v17.4S, v10.4S, v27.4S -trn1 v22.4S, v8.4S, v16.4S -trn2 v9.4S, v8.4S, v16.4S -trn2 v8.2D, v14.2D, v22.2D -trn2 v16.2D, v17.2D, v9.2D -trn1 v10.2D, v14.2D, v22.2D -trn1 v27.2D, v17.2D, v9.2D -sqrdmulh v9.4S, v8.4S, v15.4S -mul v8.4S, v8.4S,v29.4S -mla v8.4S, v9.4S, v31.s[0] -sub v9.4s, v10.4s, v8.4s -add v10.4s, v10.4s, v8.4s -sqrdmulh v8.4S, v16.4S, v15.4S -mul v16.4S, v16.4S,v29.4S -mla v16.4S, v8.4S, v31.s[0] -sub v8.4s, v27.4s, v16.4s -add v27.4s, v27.4s, v16.4s -sqrdmulh v16.4S, v27.4S, v20.4S -mul v27.4S, v27.4S,v24.4S -mla v27.4S, v16.4S, v31.s[0] -sub v16.4s, v10.4s, v27.4s -add v10.4s, v10.4s, v27.4s -sqrdmulh v27.4S, v8.4S, v0.4S -mul v8.4S, v8.4S,v5.4S -mla v8.4S, v27.4S, v31.s[0] -sub v27.4s, v9.4s, v8.4s -add v9.4s, v9.4s, v8.4s -str q10, [x0, #896] -str q16, [x0, #912] -str q9, [x0, #928] -str q27, [x0, #944] -ldr q27, [x17, #+2048] -ldr q9, [x17, #+2064] -ldr q16, [x17, #+2080] -ldr q10, [x17, #+2096] -ldr q8, [x17, #+2112] -ldr q17, [x17, #+2128] -ldr q22, [x17, #+2144] -ldr q14, [x17, #+2160] -ldr q0, [x0, #992] -ldr q5, [x0, #1008] -ldr q20, [x0, #960] -ldr q24, [x0, #976] -sqrdmulh v15.4S, v0.4S, v9.s[0] -mul v0.4S, v0.4S,v27.s[0] -mla v0.4S, v15.4S, v31.s[0] -sub v15.4s, v20.4s, v0.4s -add v20.4s, v20.4s, v0.4s -sqrdmulh v0.4S, v5.4S, v9.s[0] -mul v5.4S, v5.4S,v27.s[0] -mla v5.4S, v0.4S, v31.s[0] -sub v0.4s, v24.4s, v5.4s -add v24.4s, v24.4s, v5.4s -sqrdmulh v5.4S, v24.4S, v9.s[1] -mul v24.4S, v24.4S,v27.s[1] -mla v24.4S, v5.4S, v31.s[0] -sub v5.4s, v20.4s, v24.4s -add v20.4s, v20.4s, v24.4s -sqrdmulh v24.4S, v0.4S, v9.s[2] -mul v0.4S, v0.4S,v27.s[2] -mla v0.4S, v24.4S, v31.s[0] -sub v24.4s, v15.4s, v0.4s -add v15.4s, v15.4s, v0.4s -trn1 v0.4S, v20.4S, v5.4S -trn2 v29.4S, v20.4S, v5.4S -trn1 v4.4S, v15.4S, v24.4S -trn2 v19.4S, v15.4S, v24.4S -trn2 v15.2D, v0.2D, v4.2D -trn2 v24.2D, v29.2D, v19.2D -trn1 v20.2D, v0.2D, v4.2D -trn1 v5.2D, v29.2D, v19.2D -sqrdmulh v19.4S, v15.4S, v10.4S -mul v15.4S, v15.4S,v16.4S -mla v15.4S, v19.4S, v31.s[0] -sub v19.4s, v20.4s, v15.4s -add v20.4s, v20.4s, v15.4s -sqrdmulh v15.4S, v24.4S, v10.4S -mul v24.4S, v24.4S,v16.4S -mla v24.4S, v15.4S, v31.s[0] -sub v15.4s, v5.4s, v24.4s -add v5.4s, v5.4s, v24.4s -sqrdmulh v24.4S, v5.4S, v17.4S -mul v5.4S, v5.4S,v8.4S -mla v5.4S, v24.4S, v31.s[0] -sub v24.4s, v20.4s, v5.4s -add v20.4s, v20.4s, v5.4s -sqrdmulh v5.4S, v15.4S, v14.4S -mul v15.4S, v15.4S,v22.4S -mla v15.4S, v5.4S, v31.s[0] -sub v5.4s, v19.4s, v15.4s -add v19.4s, v19.4s, v15.4s -str q20, [x0, #960] -str q24, [x0, #976] -str q19, [x0, #992] -str q5, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2392 -// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_13_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_13_0.s deleted file mode 100644 index 34599c2..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_13_0.s +++ /dev/null @@ -1,2422 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_13_0 -.global _ntt_u32_full_neon_asm_var_4_4_13_0 -ntt_u32_full_neon_asm_var_4_4_13_0: -_ntt_u32_full_neon_asm_var_4_4_13_0: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #928] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #992] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q18, [x0, #800] -sqrdmulh v17.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -ldr q16, [x0, #864] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -ldr q2, [x0, #544] -mla v22.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v2.4S, v29.s[0] -ldr q1, [x0, #608] -mla v20.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v1.4S, v29.s[0] -ldr q0, [x0, #672] -mla v18.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v0.4S, v29.s[0] -ldr q15, [x0, #736] -mla v16.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v15.4S, v29.s[0] -ldr q14, [x0, #416] -ldr q13, [x0, #480] -mul v2.4S, v2.4S,v30.s[0] -sub v12.4s, v14.4s, v22.4s -mul v1.4S, v1.4S,v30.s[0] -add v14.4s, v14.4s, v22.4s -ldr q22, [x0, #288] -ldr q11, [x0, #352] -mla v2.4S, v21.4S, v31.s[0] -sub v21.4s, v13.4s, v20.4s -mla v1.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v20.4s -ldr q20, [x0, #32] -ldr q19, [x0, #96] -mul v0.4S, v0.4S,v30.s[0] -sub v10.4s, v22.4s, v18.4s -mul v15.4S, v15.4S,v30.s[0] -add v22.4s, v22.4s, v18.4s -ldr q18, [x0, #160] -ldr q9, [x0, #224] -mla v0.4S, v17.4S, v31.s[0] -sub v17.4s, v11.4s, v16.4s -mla v15.4S, v3.4S, v31.s[0] -add v11.4s, v11.4s, v16.4s -sqrdmulh v16.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -sqrdmulh v3.4S, v13.4S, v29.s[1] -sub v8.4s, v20.4s, v2.4s -mul v13.4S, v13.4S,v30.s[1] -add v20.4s, v20.4s, v2.4s -sqrdmulh v2.4S, v22.4S, v29.s[1] -sub v7.4s, v19.4s, v1.4s -mul v22.4S, v22.4S,v30.s[1] -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v11.4S, v29.s[1] -sub v6.4s, v18.4s, v0.4s -mul v11.4S, v11.4S,v30.s[1] -add v18.4s, v18.4s, v0.4s -mla v14.4S, v16.4S, v31.s[0] -sub v16.4s, v9.4s, v15.4s -sqrdmulh v0.4S, v12.4S, v29.s[2] -add v9.4s, v9.4s, v15.4s -mla v13.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v21.4S, v29.s[2] -mla v22.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v10.4S, v29.s[2] -mla v11.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v17.4S, v29.s[2] -mul v12.4S, v12.4S,v30.s[2] -sub v15.4s, v18.4s, v14.4s -mul v21.4S, v21.4S,v30.s[2] -add v18.4s, v18.4s, v14.4s -mla v12.4S, v0.4S, v31.s[0] -sub v0.4s, v9.4s, v13.4s -mla v21.4S, v3.4S, v31.s[0] -add v9.4s, v9.4s, v13.4s -mul v10.4S, v10.4S,v30.s[2] -sub v13.4s, v20.4s, v22.4s -mul v17.4S, v17.4S,v30.s[2] -add v20.4s, v20.4s, v22.4s -mla v10.4S, v2.4S, v31.s[0] -sub v2.4s, v19.4s, v11.4s -mla v17.4S, v1.4S, v31.s[0] -add v19.4s, v19.4s, v11.4s -sqrdmulh v11.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -sqrdmulh v1.4S, v0.4S, v27.s[1] -sub v22.4s, v6.4s, v12.4s -mul v0.4S, v0.4S,v28.s[1] -add v6.4s, v6.4s, v12.4s -sqrdmulh v12.4S, v18.4S, v27.s[0] -sub v3.4s, v16.4s, v21.4s -mul v18.4S, v18.4S,v28.s[0] -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v9.4S, v27.s[0] -sub v14.4s, v8.4s, v10.4s -mul v9.4S, v9.4S,v28.s[0] -add v8.4s, v8.4s, v10.4s -mla v15.4S, v11.4S, v31.s[0] -sub v11.4s, v7.4s, v17.4s -sqrdmulh v10.4S, v6.4S, v27.s[2] -add v7.4s, v7.4s, v17.4s -mla v0.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v16.4S, v27.s[2] -mla v18.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v22.4S, v27.s[3] -mla v9.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v3.4S, v27.s[3] -mul v6.4S, v6.4S,v28.s[2] -sub v17.4s, v13.4s, v15.4s -mul v16.4S, v16.4S,v28.s[2] -add v13.4s, v13.4s, v15.4s -mla v6.4S, v10.4S, v31.s[0] -sub v10.4s, v2.4s, v0.4s -mla v16.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v0.4s -mul v22.4S, v22.4S,v28.s[3] -sub v0.4s, v20.4s, v18.4s -mul v3.4S, v3.4S,v28.s[3] -add v20.4s, v20.4s, v18.4s -mla v22.4S, v12.4S, v31.s[0] -sub v12.4s, v19.4s, v9.4s -mla v3.4S, v21.4S, v31.s[0] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v2.4S, v25.s[2] -mul v2.4S, v2.4S,v26.s[2] -sqrdmulh v21.4S, v10.4S, v25.s[3] -sub v18.4s, v8.4s, v6.4s -mul v10.4S, v10.4S,v26.s[3] -add v8.4s, v8.4s, v6.4s -sqrdmulh v6.4S, v12.4S, v25.s[1] -sub v1.4s, v7.4s, v16.4s -mul v12.4S, v12.4S,v26.s[1] -add v7.4s, v7.4s, v16.4s -sqrdmulh v16.4S, v19.4S, v25.s[0] -sub v15.4s, v14.4s, v22.4s -mul v19.4S, v19.4S,v26.s[0] -add v14.4s, v14.4s, v22.4s -mla v2.4S, v9.4S, v31.s[0] -sub v9.4s, v11.4s, v3.4s -sqrdmulh v22.4S, v7.4S, v23.s[0] -add v11.4s, v11.4s, v3.4s -mla v10.4S, v21.4S, v31.s[0] -sub v21.4s, v13.4s, v2.4s -sqrdmulh v3.4S, v1.4S, v23.s[1] -add v13.4s, v13.4s, v2.4s -mla v12.4S, v6.4S, v31.s[0] -sub v6.4s, v17.4s, v10.4s -sqrdmulh v2.4S, v11.4S, v23.s[2] -add v17.4s, v17.4s, v10.4s -mla v19.4S, v16.4S, v31.s[0] -sub v16.4s, v0.4s, v12.4s -sqrdmulh v10.4S, v9.4S, v23.s[3] -add v0.4s, v0.4s, v12.4s -mul v7.4S, v7.4S,v24.s[0] -sub v12.4s, v20.4s, v19.4s -mul v1.4S, v1.4S,v24.s[1] -add v20.4s, v20.4s, v19.4s -mla v7.4S, v22.4S, v31.s[0] -str q21, [x0, #352] -mla v1.4S, v3.4S, v31.s[0] -str q13, [x0, #288] -mul v11.4S, v11.4S,v24.s[2] -str q6, [x0, #480] -mul v9.4S, v9.4S,v24.s[3] -str q17, [x0, #416] -mla v11.4S, v2.4S, v31.s[0] -str q16, [x0, #224] -mla v9.4S, v10.4S, v31.s[0] -str q0, [x0, #160] -ldr q0, [x0, #944] -sqrdmulh v10.4S, v0.4S, v29.s[0] -str q12, [x0, #96] -mul v0.4S, v0.4S,v30.s[0] -str q20, [x0, #32] -ldr q20, [x0, #1008] -sqrdmulh v12.4S, v20.4S, v29.s[0] -sub v16.4s, v8.4s, v7.4s -str q16, [x0, #608] -mul v20.4S, v20.4S,v30.s[0] -add v8.4s, v8.4s, v7.4s -ldr q7, [x0, #816] -sqrdmulh v16.4S, v7.4S, v29.s[0] -sub v2.4s, v18.4s, v1.4s -str q8, [x0, #544] -mul v7.4S, v7.4S,v30.s[0] -add v18.4s, v18.4s, v1.4s -ldr q1, [x0, #880] -sqrdmulh v8.4S, v1.4S, v29.s[0] -sub v17.4s, v14.4s, v11.4s -str q2, [x0, #736] -mul v1.4S, v1.4S,v30.s[0] -add v14.4s, v14.4s, v11.4s -ldr q11, [x0, #560] -mla v0.4S, v10.4S, v31.s[0] -sub v10.4s, v15.4s, v9.4s -str q18, [x0, #672] -sqrdmulh v18.4S, v11.4S, v29.s[0] -add v15.4s, v15.4s, v9.4s -ldr q9, [x0, #624] -mla v20.4S, v12.4S, v31.s[0] -str q17, [x0, #864] -sqrdmulh v17.4S, v9.4S, v29.s[0] -ldr q12, [x0, #688] -mla v7.4S, v16.4S, v31.s[0] -str q14, [x0, #800] -sqrdmulh v14.4S, v12.4S, v29.s[0] -ldr q16, [x0, #752] -mla v1.4S, v8.4S, v31.s[0] -str q10, [x0, #992] -sqrdmulh v10.4S, v16.4S, v29.s[0] -ldr q8, [x0, #432] -ldr q2, [x0, #496] -mul v11.4S, v11.4S,v30.s[0] -sub v6.4s, v8.4s, v0.4s -str q15, [x0, #928] -mul v9.4S, v9.4S,v30.s[0] -add v8.4s, v8.4s, v0.4s -ldr q0, [x0, #304] -ldr q15, [x0, #368] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v2.4s, v20.4s -mla v9.4S, v17.4S, v31.s[0] -add v2.4s, v2.4s, v20.4s -ldr q20, [x0, #48] -ldr q17, [x0, #112] -mul v12.4S, v12.4S,v30.s[0] -sub v13.4s, v0.4s, v7.4s -mul v16.4S, v16.4S,v30.s[0] -add v0.4s, v0.4s, v7.4s -ldr q7, [x0, #176] -ldr q3, [x0, #240] -mla v12.4S, v14.4S, v31.s[0] -sub v14.4s, v15.4s, v1.4s -mla v16.4S, v10.4S, v31.s[0] -add v15.4s, v15.4s, v1.4s -sqrdmulh v1.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sqrdmulh v10.4S, v2.4S, v29.s[1] -sub v21.4s, v20.4s, v11.4s -mul v2.4S, v2.4S,v30.s[1] -add v20.4s, v20.4s, v11.4s -sqrdmulh v11.4S, v0.4S, v29.s[1] -sub v22.4s, v17.4s, v9.4s -mul v0.4S, v0.4S,v30.s[1] -add v17.4s, v17.4s, v9.4s -sqrdmulh v9.4S, v15.4S, v29.s[1] -sub v19.4s, v7.4s, v12.4s -mul v15.4S, v15.4S,v30.s[1] -add v7.4s, v7.4s, v12.4s -mla v8.4S, v1.4S, v31.s[0] -sub v1.4s, v3.4s, v16.4s -sqrdmulh v12.4S, v6.4S, v29.s[2] -add v3.4s, v3.4s, v16.4s -mla v2.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v18.4S, v29.s[2] -mla v0.4S, v11.4S, v31.s[0] -sqrdmulh v11.4S, v13.4S, v29.s[2] -mla v15.4S, v9.4S, v31.s[0] -sqrdmulh v9.4S, v14.4S, v29.s[2] -mul v6.4S, v6.4S,v30.s[2] -sub v16.4s, v7.4s, v8.4s -mul v18.4S, v18.4S,v30.s[2] -add v7.4s, v7.4s, v8.4s -mla v6.4S, v12.4S, v31.s[0] -sub v12.4s, v3.4s, v2.4s -mla v18.4S, v10.4S, v31.s[0] -add v3.4s, v3.4s, v2.4s -mul v13.4S, v13.4S,v30.s[2] -sub v2.4s, v20.4s, v0.4s -mul v14.4S, v14.4S,v30.s[2] -add v20.4s, v20.4s, v0.4s -mla v13.4S, v11.4S, v31.s[0] -sub v11.4s, v17.4s, v15.4s -mla v14.4S, v9.4S, v31.s[0] -add v17.4s, v17.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v27.s[1] -mul v16.4S, v16.4S,v28.s[1] -sqrdmulh v9.4S, v12.4S, v27.s[1] -sub v0.4s, v19.4s, v6.4s -mul v12.4S, v12.4S,v28.s[1] -add v19.4s, v19.4s, v6.4s -sqrdmulh v6.4S, v7.4S, v27.s[0] -sub v10.4s, v1.4s, v18.4s -mul v7.4S, v7.4S,v28.s[0] -add v1.4s, v1.4s, v18.4s -sqrdmulh v18.4S, v3.4S, v27.s[0] -sub v8.4s, v21.4s, v13.4s -mul v3.4S, v3.4S,v28.s[0] -add v21.4s, v21.4s, v13.4s -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v22.4s, v14.4s -sqrdmulh v13.4S, v19.4S, v27.s[2] -add v22.4s, v22.4s, v14.4s -mla v12.4S, v9.4S, v31.s[0] -sqrdmulh v9.4S, v1.4S, v27.s[2] -mla v7.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v0.4S, v27.s[3] -mla v3.4S, v18.4S, v31.s[0] -sqrdmulh v18.4S, v10.4S, v27.s[3] -mul v19.4S, v19.4S,v28.s[2] -sub v14.4s, v2.4s, v16.4s -mul v1.4S, v1.4S,v28.s[2] -add v2.4s, v2.4s, v16.4s -mla v19.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v12.4s -mla v1.4S, v9.4S, v31.s[0] -add v11.4s, v11.4s, v12.4s -mul v0.4S, v0.4S,v28.s[3] -sub v12.4s, v20.4s, v7.4s -mul v10.4S, v10.4S,v28.s[3] -add v20.4s, v20.4s, v7.4s -mla v0.4S, v6.4S, v31.s[0] -sub v6.4s, v17.4s, v3.4s -mla v10.4S, v18.4S, v31.s[0] -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v11.4S, v25.s[2] -mul v11.4S, v11.4S,v26.s[2] -sqrdmulh v18.4S, v13.4S, v25.s[3] -sub v7.4s, v21.4s, v19.4s -mul v13.4S, v13.4S,v26.s[3] -add v21.4s, v21.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v25.s[1] -sub v9.4s, v22.4s, v1.4s -mul v6.4S, v6.4S,v26.s[1] -add v22.4s, v22.4s, v1.4s -sqrdmulh v1.4S, v17.4S, v25.s[0] -sub v16.4s, v8.4s, v0.4s -mul v17.4S, v17.4S,v26.s[0] -add v8.4s, v8.4s, v0.4s -mla v11.4S, v3.4S, v31.s[0] -sub v3.4s, v15.4s, v10.4s -sqrdmulh v0.4S, v22.4S, v23.s[0] -add v15.4s, v15.4s, v10.4s -mla v13.4S, v18.4S, v31.s[0] -sub v18.4s, v2.4s, v11.4s -sqrdmulh v10.4S, v9.4S, v23.s[1] -add v2.4s, v2.4s, v11.4s -mla v6.4S, v19.4S, v31.s[0] -sub v19.4s, v14.4s, v13.4s -sqrdmulh v11.4S, v15.4S, v23.s[2] -add v14.4s, v14.4s, v13.4s -mla v17.4S, v1.4S, v31.s[0] -sub v1.4s, v12.4s, v6.4s -sqrdmulh v13.4S, v3.4S, v23.s[3] -add v12.4s, v12.4s, v6.4s -mul v22.4S, v22.4S,v24.s[0] -sub v6.4s, v20.4s, v17.4s -mul v9.4S, v9.4S,v24.s[1] -add v20.4s, v20.4s, v17.4s -mla v22.4S, v0.4S, v31.s[0] -str q18, [x0, #368] -mla v9.4S, v10.4S, v31.s[0] -str q2, [x0, #304] -mul v15.4S, v15.4S,v24.s[2] -str q19, [x0, #496] -mul v3.4S, v3.4S,v24.s[3] -str q14, [x0, #432] -mla v15.4S, v11.4S, v31.s[0] -str q1, [x0, #240] -mla v3.4S, v13.4S, v31.s[0] -str q12, [x0, #176] -ldr q12, [x0, #896] -sqrdmulh v13.4S, v12.4S, v29.s[0] -str q6, [x0, #112] -mul v12.4S, v12.4S,v30.s[0] -str q20, [x0, #48] -ldr q20, [x0, #960] -sqrdmulh v6.4S, v20.4S, v29.s[0] -sub v1.4s, v21.4s, v22.4s -str q1, [x0, #624] -mul v20.4S, v20.4S,v30.s[0] -add v21.4s, v21.4s, v22.4s -ldr q22, [x0, #768] -sqrdmulh v1.4S, v22.4S, v29.s[0] -sub v11.4s, v7.4s, v9.4s -str q21, [x0, #560] -mul v22.4S, v22.4S,v30.s[0] -add v7.4s, v7.4s, v9.4s -ldr q9, [x0, #832] -sqrdmulh v21.4S, v9.4S, v29.s[0] -sub v14.4s, v8.4s, v15.4s -str q11, [x0, #752] -mul v9.4S, v9.4S,v30.s[0] -add v8.4s, v8.4s, v15.4s -ldr q15, [x0, #512] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v16.4s, v3.4s -str q7, [x0, #688] -sqrdmulh v7.4S, v15.4S, v29.s[0] -add v16.4s, v16.4s, v3.4s -ldr q3, [x0, #576] -mla v20.4S, v6.4S, v31.s[0] -str q14, [x0, #880] -sqrdmulh v14.4S, v3.4S, v29.s[0] -ldr q6, [x0, #640] -mla v22.4S, v1.4S, v31.s[0] -str q8, [x0, #816] -sqrdmulh v8.4S, v6.4S, v29.s[0] -ldr q1, [x0, #704] -mla v9.4S, v21.4S, v31.s[0] -str q13, [x0, #1008] -sqrdmulh v13.4S, v1.4S, v29.s[0] -ldr q21, [x0, #384] -ldr q11, [x0, #448] -mul v15.4S, v15.4S,v30.s[0] -sub v19.4s, v21.4s, v12.4s -str q16, [x0, #944] -mul v3.4S, v3.4S,v30.s[0] -add v21.4s, v21.4s, v12.4s -ldr q12, [x0, #256] -ldr q16, [x0, #320] -mla v15.4S, v7.4S, v31.s[0] -sub v7.4s, v11.4s, v20.4s -mla v3.4S, v14.4S, v31.s[0] -add v11.4s, v11.4s, v20.4s -ldr q20, [x0, #0] -ldr q14, [x0, #64] -mul v6.4S, v6.4S,v30.s[0] -sub v2.4s, v12.4s, v22.4s -mul v1.4S, v1.4S,v30.s[0] -add v12.4s, v12.4s, v22.4s -ldr q22, [x0, #128] -ldr q10, [x0, #192] -mla v6.4S, v8.4S, v31.s[0] -sub v8.4s, v16.4s, v9.4s -mla v1.4S, v13.4S, v31.s[0] -add v16.4s, v16.4s, v9.4s -sqrdmulh v9.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sqrdmulh v13.4S, v11.4S, v29.s[1] -sub v18.4s, v20.4s, v15.4s -mul v11.4S, v11.4S,v30.s[1] -add v20.4s, v20.4s, v15.4s -sqrdmulh v15.4S, v12.4S, v29.s[1] -sub v0.4s, v14.4s, v3.4s -mul v12.4S, v12.4S,v30.s[1] -add v14.4s, v14.4s, v3.4s -sqrdmulh v3.4S, v16.4S, v29.s[1] -sub v17.4s, v22.4s, v6.4s -mul v16.4S, v16.4S,v30.s[1] -add v22.4s, v22.4s, v6.4s -mla v21.4S, v9.4S, v31.s[0] -sub v9.4s, v10.4s, v1.4s -sqrdmulh v6.4S, v19.4S, v29.s[2] -add v10.4s, v10.4s, v1.4s -mla v11.4S, v13.4S, v31.s[0] -sqrdmulh v13.4S, v7.4S, v29.s[2] -mla v12.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v2.4S, v29.s[2] -mla v16.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v8.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v1.4s, v22.4s, v21.4s -mul v7.4S, v7.4S,v30.s[2] -add v22.4s, v22.4s, v21.4s -mla v19.4S, v6.4S, v31.s[0] -sub v6.4s, v10.4s, v11.4s -mla v7.4S, v13.4S, v31.s[0] -add v10.4s, v10.4s, v11.4s -mul v2.4S, v2.4S,v30.s[2] -sub v11.4s, v20.4s, v12.4s -mul v8.4S, v8.4S,v30.s[2] -add v20.4s, v20.4s, v12.4s -mla v2.4S, v15.4S, v31.s[0] -sub v15.4s, v14.4s, v16.4s -mla v8.4S, v3.4S, v31.s[0] -add v14.4s, v14.4s, v16.4s -sqrdmulh v16.4S, v1.4S, v27.s[1] -mul v1.4S, v1.4S,v28.s[1] -sqrdmulh v3.4S, v6.4S, v27.s[1] -sub v12.4s, v17.4s, v19.4s -mul v6.4S, v6.4S,v28.s[1] -add v17.4s, v17.4s, v19.4s -sqrdmulh v19.4S, v22.4S, v27.s[0] -sub v13.4s, v9.4s, v7.4s -mul v22.4S, v22.4S,v28.s[0] -add v9.4s, v9.4s, v7.4s -sqrdmulh v7.4S, v10.4S, v27.s[0] -sub v21.4s, v18.4s, v2.4s -mul v10.4S, v10.4S,v28.s[0] -add v18.4s, v18.4s, v2.4s -mla v1.4S, v16.4S, v31.s[0] -sub v16.4s, v0.4s, v8.4s -sqrdmulh v2.4S, v17.4S, v27.s[2] -add v0.4s, v0.4s, v8.4s -mla v6.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v9.4S, v27.s[2] -mla v22.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v12.4S, v27.s[3] -mla v10.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v13.4S, v27.s[3] -mul v17.4S, v17.4S,v28.s[2] -sub v8.4s, v11.4s, v1.4s -mul v9.4S, v9.4S,v28.s[2] -add v11.4s, v11.4s, v1.4s -mla v17.4S, v2.4S, v31.s[0] -sub v2.4s, v15.4s, v6.4s -mla v9.4S, v3.4S, v31.s[0] -add v15.4s, v15.4s, v6.4s -mul v12.4S, v12.4S,v28.s[3] -sub v6.4s, v20.4s, v22.4s -mul v13.4S, v13.4S,v28.s[3] -add v20.4s, v20.4s, v22.4s -mla v12.4S, v19.4S, v31.s[0] -sub v19.4s, v14.4s, v10.4s -mla v13.4S, v7.4S, v31.s[0] -add v14.4s, v14.4s, v10.4s -sqrdmulh v10.4S, v15.4S, v25.s[2] -mul v15.4S, v15.4S,v26.s[2] -sqrdmulh v7.4S, v2.4S, v25.s[3] -sub v22.4s, v18.4s, v17.4s -mul v2.4S, v2.4S,v26.s[3] -add v18.4s, v18.4s, v17.4s -sqrdmulh v17.4S, v19.4S, v25.s[1] -sub v3.4s, v0.4s, v9.4s -mul v19.4S, v19.4S,v26.s[1] -add v0.4s, v0.4s, v9.4s -sqrdmulh v9.4S, v14.4S, v25.s[0] -sub v1.4s, v21.4s, v12.4s -mul v14.4S, v14.4S,v26.s[0] -add v21.4s, v21.4s, v12.4s -mla v15.4S, v10.4S, v31.s[0] -sub v10.4s, v16.4s, v13.4s -sqrdmulh v12.4S, v0.4S, v23.s[0] -add v16.4s, v16.4s, v13.4s -mla v2.4S, v7.4S, v31.s[0] -sub v7.4s, v11.4s, v15.4s -sqrdmulh v13.4S, v3.4S, v23.s[1] -add v11.4s, v11.4s, v15.4s -mla v19.4S, v17.4S, v31.s[0] -sub v17.4s, v8.4s, v2.4s -sqrdmulh v15.4S, v16.4S, v23.s[2] -add v8.4s, v8.4s, v2.4s -mla v14.4S, v9.4S, v31.s[0] -sub v9.4s, v6.4s, v19.4s -sqrdmulh v2.4S, v10.4S, v23.s[3] -add v6.4s, v6.4s, v19.4s -mul v0.4S, v0.4S,v24.s[0] -sub v19.4s, v20.4s, v14.4s -mul v3.4S, v3.4S,v24.s[1] -add v20.4s, v20.4s, v14.4s -mla v0.4S, v12.4S, v31.s[0] -str q7, [x0, #320] -mla v3.4S, v13.4S, v31.s[0] -str q11, [x0, #256] -mul v16.4S, v16.4S,v24.s[2] -str q17, [x0, #448] -mul v10.4S, v10.4S,v24.s[3] -str q8, [x0, #384] -mla v16.4S, v15.4S, v31.s[0] -str q9, [x0, #192] -mla v10.4S, v2.4S, v31.s[0] -str q6, [x0, #128] -ldr q6, [x0, #912] -sqrdmulh v2.4S, v6.4S, v29.s[0] -str q19, [x0, #64] -mul v6.4S, v6.4S,v30.s[0] -str q20, [x0, #0] -ldr q20, [x0, #976] -sqrdmulh v19.4S, v20.4S, v29.s[0] -sub v9.4s, v18.4s, v0.4s -str q9, [x0, #576] -mul v20.4S, v20.4S,v30.s[0] -add v18.4s, v18.4s, v0.4s -ldr q0, [x0, #784] -sqrdmulh v9.4S, v0.4S, v29.s[0] -sub v15.4s, v22.4s, v3.4s -str q18, [x0, #512] -mul v0.4S, v0.4S,v30.s[0] -add v22.4s, v22.4s, v3.4s -ldr q3, [x0, #848] -sqrdmulh v18.4S, v3.4S, v29.s[0] -sub v8.4s, v21.4s, v16.4s -str q15, [x0, #704] -mul v3.4S, v3.4S,v30.s[0] -add v21.4s, v21.4s, v16.4s -ldr q16, [x0, #528] -mla v6.4S, v2.4S, v31.s[0] -sub v2.4s, v1.4s, v10.4s -str q22, [x0, #640] -sqrdmulh v22.4S, v16.4S, v29.s[0] -add v1.4s, v1.4s, v10.4s -ldr q10, [x0, #592] -mla v20.4S, v19.4S, v31.s[0] -str q8, [x0, #832] -sqrdmulh v8.4S, v10.4S, v29.s[0] -ldr q19, [x0, #656] -mla v0.4S, v9.4S, v31.s[0] -str q21, [x0, #768] -sqrdmulh v21.4S, v19.4S, v29.s[0] -ldr q9, [x0, #720] -mla v3.4S, v18.4S, v31.s[0] -str q2, [x0, #960] -sqrdmulh v2.4S, v9.4S, v29.s[0] -ldr q18, [x0, #400] -ldr q15, [x0, #464] -mul v16.4S, v16.4S,v30.s[0] -sub v17.4s, v18.4s, v6.4s -str q1, [x0, #896] -mul v10.4S, v10.4S,v30.s[0] -add v18.4s, v18.4s, v6.4s -ldr q6, [x0, #272] -ldr q1, [x0, #336] -mla v16.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v20.4s -mla v10.4S, v8.4S, v31.s[0] -add v15.4s, v15.4s, v20.4s -ldr q20, [x0, #16] -ldr q8, [x0, #80] -mul v19.4S, v19.4S,v30.s[0] -sub v11.4s, v6.4s, v0.4s -mul v9.4S, v9.4S,v30.s[0] -add v6.4s, v6.4s, v0.4s -ldr q0, [x0, #144] -ldr q13, [x0, #208] -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v1.4s, v3.4s -mla v9.4S, v2.4S, v31.s[0] -add v1.4s, v1.4s, v3.4s -sqrdmulh v3.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sqrdmulh v2.4S, v15.4S, v29.s[1] -sub v7.4s, v20.4s, v16.4s -mul v15.4S, v15.4S,v30.s[1] -add v20.4s, v20.4s, v16.4s -sqrdmulh v16.4S, v6.4S, v29.s[1] -sub v12.4s, v8.4s, v10.4s -mul v6.4S, v6.4S,v30.s[1] -add v8.4s, v8.4s, v10.4s -sqrdmulh v10.4S, v1.4S, v29.s[1] -sub v14.4s, v0.4s, v19.4s -mul v1.4S, v1.4S,v30.s[1] -add v0.4s, v0.4s, v19.4s -mla v18.4S, v3.4S, v31.s[0] -sub v3.4s, v13.4s, v9.4s -sqrdmulh v19.4S, v17.4S, v29.s[2] -add v13.4s, v13.4s, v9.4s -mla v15.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v22.4S, v29.s[2] -mla v6.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v11.4S, v29.s[2] -mla v1.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v21.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v9.4s, v0.4s, v18.4s -mul v22.4S, v22.4S,v30.s[2] -add v0.4s, v0.4s, v18.4s -mla v17.4S, v19.4S, v31.s[0] -sub v19.4s, v13.4s, v15.4s -mla v22.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v15.4s -mul v11.4S, v11.4S,v30.s[2] -sub v15.4s, v20.4s, v6.4s -mul v21.4S, v21.4S,v30.s[2] -add v20.4s, v20.4s, v6.4s -mla v11.4S, v16.4S, v31.s[0] -sub v16.4s, v8.4s, v1.4s -mla v21.4S, v10.4S, v31.s[0] -add v8.4s, v8.4s, v1.4s -sqrdmulh v29.4S, v9.4S, v27.s[1] -mul v9.4S, v9.4S,v28.s[1] -sqrdmulh v30.4S, v19.4S, v27.s[1] -sub v1.4s, v14.4s, v17.4s -mul v19.4S, v19.4S,v28.s[1] -add v14.4s, v14.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v27.s[0] -sub v10.4s, v3.4s, v22.4s -mul v0.4S, v0.4S,v28.s[0] -add v3.4s, v3.4s, v22.4s -sqrdmulh v22.4S, v13.4S, v27.s[0] -sub v6.4s, v7.4s, v11.4s -mul v13.4S, v13.4S,v28.s[0] -add v7.4s, v7.4s, v11.4s -mla v9.4S, v29.4S, v31.s[0] -sub v29.4s, v12.4s, v21.4s -sqrdmulh v11.4S, v14.4S, v27.s[2] -add v12.4s, v12.4s, v21.4s -mla v19.4S, v30.4S, v31.s[0] -sqrdmulh v30.4S, v3.4S, v27.s[2] -mla v0.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v1.4S, v27.s[3] -mla v13.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v10.4S, v27.s[3] -mul v14.4S, v14.4S,v28.s[2] -sub v21.4s, v15.4s, v9.4s -mul v3.4S, v3.4S,v28.s[2] -add v15.4s, v15.4s, v9.4s -mla v14.4S, v11.4S, v31.s[0] -sub v11.4s, v16.4s, v19.4s -mla v3.4S, v30.4S, v31.s[0] -add v16.4s, v16.4s, v19.4s -mul v1.4S, v1.4S,v28.s[3] -sub v19.4s, v20.4s, v0.4s -mul v10.4S, v10.4S,v28.s[3] -add v20.4s, v20.4s, v0.4s -mla v1.4S, v17.4S, v31.s[0] -sub v17.4s, v8.4s, v13.4s -mla v10.4S, v22.4S, v31.s[0] -add v8.4s, v8.4s, v13.4s -sqrdmulh v27.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sqrdmulh v28.4S, v11.4S, v25.s[3] -sub v13.4s, v7.4s, v14.4s -mul v11.4S, v11.4S,v26.s[3] -add v7.4s, v7.4s, v14.4s -sqrdmulh v14.4S, v17.4S, v25.s[1] -sub v22.4s, v12.4s, v3.4s -mul v17.4S, v17.4S,v26.s[1] -add v12.4s, v12.4s, v3.4s -sqrdmulh v3.4S, v8.4S, v25.s[0] -sub v0.4s, v6.4s, v1.4s -mul v8.4S, v8.4S,v26.s[0] -add v6.4s, v6.4s, v1.4s -mla v16.4S, v27.4S, v31.s[0] -sub v27.4s, v29.4s, v10.4s -sqrdmulh v25.4S, v12.4S, v23.s[0] -add v29.4s, v29.4s, v10.4s -mla v11.4S, v28.4S, v31.s[0] -sub v28.4s, v15.4s, v16.4s -sqrdmulh v10.4S, v22.4S, v23.s[1] -add v15.4s, v15.4s, v16.4s -mla v17.4S, v14.4S, v31.s[0] -sub v14.4s, v21.4s, v11.4s -sqrdmulh v16.4S, v29.4S, v23.s[2] -add v21.4s, v21.4s, v11.4s -mla v8.4S, v3.4S, v31.s[0] -sub v3.4s, v19.4s, v17.4s -sqrdmulh v11.4S, v27.4S, v23.s[3] -add v19.4s, v19.4s, v17.4s -mul v12.4S, v12.4S,v24.s[0] -sub v17.4s, v20.4s, v8.4s -mul v22.4S, v22.4S,v24.s[1] -add v20.4s, v20.4s, v8.4s -mla v12.4S, v25.4S, v31.s[0] -str q28, [x0, #336] -mla v22.4S, v10.4S, v31.s[0] -str q15, [x0, #272] -mul v29.4S, v29.4S,v24.s[2] -str q14, [x0, #464] -mul v27.4S, v27.4S,v24.s[3] -str q21, [x0, #400] -mla v29.4S, v16.4S, v31.s[0] -str q3, [x0, #208] -mla v27.4S, v11.4S, v31.s[0] -str q19, [x0, #144] -str q17, [x0, #80] -str q20, [x0, #16] -sub v20.4s, v7.4s, v12.4s -str q20, [x0, #592] -add v7.4s, v7.4s, v12.4s -sub v12.4s, v13.4s, v22.4s -str q7, [x0, #528] -add v13.4s, v13.4s, v22.4s -sub v22.4s, v6.4s, v29.4s -str q12, [x0, #720] -add v6.4s, v6.4s, v29.4s -sub v29.4s, v0.4s, v27.4s -str q13, [x0, #656] -add v0.4s, v0.4s, v27.4s -str q22, [x0, #848] -str q6, [x0, #784] -str q29, [x0, #976] -str q0, [x0, #912] -ldr q4, [x17, #+128] -ldr q5, [x17, #+144] -ldr q18, [x17, #+160] -ldr q2, [x17, #+176] -ldr q9, [x17, #+192] -ldr q30, [x17, #+208] -ldr q1, [x17, #+224] -ldr q26, [x17, #+240] -ldr q8, [x0, #32] -ldr q25, [x0, #48] -ldr q28, [x0, #0] -ldr q10, [x0, #16] -sqrdmulh v15.4S, v8.4S, v5.s[0] -mul v8.4S, v8.4S,v4.s[0] -mla v8.4S, v15.4S, v31.s[0] -sub v15.4s, v28.4s, v8.4s -add v28.4s, v28.4s, v8.4s -sqrdmulh v8.4S, v25.4S, v5.s[0] -mul v25.4S, v25.4S,v4.s[0] -mla v25.4S, v8.4S, v31.s[0] -sub v8.4s, v10.4s, v25.4s -add v10.4s, v10.4s, v25.4s -sqrdmulh v25.4S, v10.4S, v5.s[1] -mul v10.4S, v10.4S,v4.s[1] -mla v10.4S, v25.4S, v31.s[0] -sub v25.4s, v28.4s, v10.4s -add v28.4s, v28.4s, v10.4s -sqrdmulh v10.4S, v8.4S, v5.s[2] -mul v8.4S, v8.4S,v4.s[2] -mla v8.4S, v10.4S, v31.s[0] -sub v10.4s, v15.4s, v8.4s -add v15.4s, v15.4s, v8.4s -trn1 v8.4S, v28.4S, v25.4S -trn2 v14.4S, v28.4S, v25.4S -trn1 v21.4S, v15.4S, v10.4S -trn2 v16.4S, v15.4S, v10.4S -trn2 v15.2D, v8.2D, v21.2D -trn2 v10.2D, v14.2D, v16.2D -trn1 v28.2D, v8.2D, v21.2D -trn1 v25.2D, v14.2D, v16.2D -sqrdmulh v16.4S, v15.4S, v2.4S -mul v15.4S, v15.4S,v18.4S -mla v15.4S, v16.4S, v31.s[0] -sub v16.4s, v28.4s, v15.4s -add v28.4s, v28.4s, v15.4s -sqrdmulh v15.4S, v10.4S, v2.4S -mul v10.4S, v10.4S,v18.4S -mla v10.4S, v15.4S, v31.s[0] -sub v15.4s, v25.4s, v10.4s -add v25.4s, v25.4s, v10.4s -sqrdmulh v10.4S, v25.4S, v30.4S -mul v25.4S, v25.4S,v9.4S -mla v25.4S, v10.4S, v31.s[0] -sub v10.4s, v28.4s, v25.4s -add v28.4s, v28.4s, v25.4s -sqrdmulh v25.4S, v15.4S, v26.4S -mul v15.4S, v15.4S,v1.4S -mla v15.4S, v25.4S, v31.s[0] -sub v25.4s, v16.4s, v15.4s -add v16.4s, v16.4s, v15.4s -str q28, [x0, #0] -str q10, [x0, #16] -str q16, [x0, #32] -str q25, [x0, #48] -ldr q25, [x17, #+256] -ldr q16, [x17, #+272] -ldr q10, [x17, #+288] -ldr q28, [x17, #+304] -ldr q15, [x17, #+320] -ldr q14, [x17, #+336] -ldr q21, [x17, #+352] -ldr q8, [x17, #+368] -ldr q26, [x0, #96] -ldr q1, [x0, #112] -ldr q30, [x0, #64] -ldr q9, [x0, #80] -sqrdmulh v2.4S, v26.4S, v16.s[0] -mul v26.4S, v26.4S,v25.s[0] -mla v26.4S, v2.4S, v31.s[0] -sub v2.4s, v30.4s, v26.4s -add v30.4s, v30.4s, v26.4s -sqrdmulh v26.4S, v1.4S, v16.s[0] -mul v1.4S, v1.4S,v25.s[0] -mla v1.4S, v26.4S, v31.s[0] -sub v26.4s, v9.4s, v1.4s -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v9.4S, v16.s[1] -mul v9.4S, v9.4S,v25.s[1] -mla v9.4S, v1.4S, v31.s[0] -sub v1.4s, v30.4s, v9.4s -add v30.4s, v30.4s, v9.4s -sqrdmulh v9.4S, v26.4S, v16.s[2] -mul v26.4S, v26.4S,v25.s[2] -mla v26.4S, v9.4S, v31.s[0] -sub v9.4s, v2.4s, v26.4s -add v2.4s, v2.4s, v26.4s -trn1 v26.4S, v30.4S, v1.4S -trn2 v18.4S, v30.4S, v1.4S -trn1 v5.4S, v2.4S, v9.4S -trn2 v4.4S, v2.4S, v9.4S -trn2 v2.2D, v26.2D, v5.2D -trn2 v9.2D, v18.2D, v4.2D -trn1 v30.2D, v26.2D, v5.2D -trn1 v1.2D, v18.2D, v4.2D -sqrdmulh v4.4S, v2.4S, v28.4S -mul v2.4S, v2.4S,v10.4S -mla v2.4S, v4.4S, v31.s[0] -sub v4.4s, v30.4s, v2.4s -add v30.4s, v30.4s, v2.4s -sqrdmulh v2.4S, v9.4S, v28.4S -mul v9.4S, v9.4S,v10.4S -mla v9.4S, v2.4S, v31.s[0] -sub v2.4s, v1.4s, v9.4s -add v1.4s, v1.4s, v9.4s -sqrdmulh v9.4S, v1.4S, v14.4S -mul v1.4S, v1.4S,v15.4S -mla v1.4S, v9.4S, v31.s[0] -sub v9.4s, v30.4s, v1.4s -add v30.4s, v30.4s, v1.4s -sqrdmulh v1.4S, v2.4S, v8.4S -mul v2.4S, v2.4S,v21.4S -mla v2.4S, v1.4S, v31.s[0] -sub v1.4s, v4.4s, v2.4s -add v4.4s, v4.4s, v2.4s -str q30, [x0, #64] -str q9, [x0, #80] -str q4, [x0, #96] -str q1, [x0, #112] -ldr q1, [x17, #+384] -ldr q4, [x17, #+400] -ldr q9, [x17, #+416] -ldr q30, [x17, #+432] -ldr q2, [x17, #+448] -ldr q18, [x17, #+464] -ldr q5, [x17, #+480] -ldr q26, [x17, #+496] -ldr q8, [x0, #160] -ldr q21, [x0, #176] -ldr q14, [x0, #128] -ldr q15, [x0, #144] -sqrdmulh v28.4S, v8.4S, v4.s[0] -mul v8.4S, v8.4S,v1.s[0] -mla v8.4S, v28.4S, v31.s[0] -sub v28.4s, v14.4s, v8.4s -add v14.4s, v14.4s, v8.4s -sqrdmulh v8.4S, v21.4S, v4.s[0] -mul v21.4S, v21.4S,v1.s[0] -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v15.4s, v21.4s -add v15.4s, v15.4s, v21.4s -sqrdmulh v21.4S, v15.4S, v4.s[1] -mul v15.4S, v15.4S,v1.s[1] -mla v15.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v15.4s -add v14.4s, v14.4s, v15.4s -sqrdmulh v15.4S, v8.4S, v4.s[2] -mul v8.4S, v8.4S,v1.s[2] -mla v8.4S, v15.4S, v31.s[0] -sub v15.4s, v28.4s, v8.4s -add v28.4s, v28.4s, v8.4s -trn1 v8.4S, v14.4S, v21.4S -trn2 v10.4S, v14.4S, v21.4S -trn1 v16.4S, v28.4S, v15.4S -trn2 v25.4S, v28.4S, v15.4S -trn2 v28.2D, v8.2D, v16.2D -trn2 v15.2D, v10.2D, v25.2D -trn1 v14.2D, v8.2D, v16.2D -trn1 v21.2D, v10.2D, v25.2D -sqrdmulh v25.4S, v28.4S, v30.4S -mul v28.4S, v28.4S,v9.4S -mla v28.4S, v25.4S, v31.s[0] -sub v25.4s, v14.4s, v28.4s -add v14.4s, v14.4s, v28.4s -sqrdmulh v28.4S, v15.4S, v30.4S -mul v15.4S, v15.4S,v9.4S -mla v15.4S, v28.4S, v31.s[0] -sub v28.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -sqrdmulh v15.4S, v21.4S, v18.4S -mul v21.4S, v21.4S,v2.4S -mla v21.4S, v15.4S, v31.s[0] -sub v15.4s, v14.4s, v21.4s -add v14.4s, v14.4s, v21.4s -sqrdmulh v21.4S, v28.4S, v26.4S -mul v28.4S, v28.4S,v5.4S -mla v28.4S, v21.4S, v31.s[0] -sub v21.4s, v25.4s, v28.4s -add v25.4s, v25.4s, v28.4s -str q14, [x0, #128] -str q15, [x0, #144] -str q25, [x0, #160] -str q21, [x0, #176] -ldr q21, [x17, #+512] -ldr q25, [x17, #+528] -ldr q15, [x17, #+544] -ldr q14, [x17, #+560] -ldr q28, [x17, #+576] -ldr q10, [x17, #+592] -ldr q16, [x17, #+608] -ldr q8, [x17, #+624] -ldr q26, [x0, #224] -ldr q5, [x0, #240] -ldr q18, [x0, #192] -ldr q2, [x0, #208] -sqrdmulh v30.4S, v26.4S, v25.s[0] -mul v26.4S, v26.4S,v21.s[0] -mla v26.4S, v30.4S, v31.s[0] -sub v30.4s, v18.4s, v26.4s -add v18.4s, v18.4s, v26.4s -sqrdmulh v26.4S, v5.4S, v25.s[0] -mul v5.4S, v5.4S,v21.s[0] -mla v5.4S, v26.4S, v31.s[0] -sub v26.4s, v2.4s, v5.4s -add v2.4s, v2.4s, v5.4s -sqrdmulh v5.4S, v2.4S, v25.s[1] -mul v2.4S, v2.4S,v21.s[1] -mla v2.4S, v5.4S, v31.s[0] -sub v5.4s, v18.4s, v2.4s -add v18.4s, v18.4s, v2.4s -sqrdmulh v2.4S, v26.4S, v25.s[2] -mul v26.4S, v26.4S,v21.s[2] -mla v26.4S, v2.4S, v31.s[0] -sub v2.4s, v30.4s, v26.4s -add v30.4s, v30.4s, v26.4s -trn1 v26.4S, v18.4S, v5.4S -trn2 v9.4S, v18.4S, v5.4S -trn1 v4.4S, v30.4S, v2.4S -trn2 v1.4S, v30.4S, v2.4S -trn2 v30.2D, v26.2D, v4.2D -trn2 v2.2D, v9.2D, v1.2D -trn1 v18.2D, v26.2D, v4.2D -trn1 v5.2D, v9.2D, v1.2D -sqrdmulh v1.4S, v30.4S, v14.4S -mul v30.4S, v30.4S,v15.4S -mla v30.4S, v1.4S, v31.s[0] -sub v1.4s, v18.4s, v30.4s -add v18.4s, v18.4s, v30.4s -sqrdmulh v30.4S, v2.4S, v14.4S -mul v2.4S, v2.4S,v15.4S -mla v2.4S, v30.4S, v31.s[0] -sub v30.4s, v5.4s, v2.4s -add v5.4s, v5.4s, v2.4s -sqrdmulh v2.4S, v5.4S, v10.4S -mul v5.4S, v5.4S,v28.4S -mla v5.4S, v2.4S, v31.s[0] -sub v2.4s, v18.4s, v5.4s -add v18.4s, v18.4s, v5.4s -sqrdmulh v5.4S, v30.4S, v8.4S -mul v30.4S, v30.4S,v16.4S -mla v30.4S, v5.4S, v31.s[0] -sub v5.4s, v1.4s, v30.4s -add v1.4s, v1.4s, v30.4s -str q18, [x0, #192] -str q2, [x0, #208] -str q1, [x0, #224] -str q5, [x0, #240] -ldr q5, [x17, #+640] -ldr q1, [x17, #+656] -ldr q2, [x17, #+672] -ldr q18, [x17, #+688] -ldr q30, [x17, #+704] -ldr q9, [x17, #+720] -ldr q4, [x17, #+736] -ldr q26, [x17, #+752] -ldr q8, [x0, #288] -ldr q16, [x0, #304] -ldr q10, [x0, #256] -ldr q28, [x0, #272] -sqrdmulh v14.4S, v8.4S, v1.s[0] -mul v8.4S, v8.4S,v5.s[0] -mla v8.4S, v14.4S, v31.s[0] -sub v14.4s, v10.4s, v8.4s -add v10.4s, v10.4s, v8.4s -sqrdmulh v8.4S, v16.4S, v1.s[0] -mul v16.4S, v16.4S,v5.s[0] -mla v16.4S, v8.4S, v31.s[0] -sub v8.4s, v28.4s, v16.4s -add v28.4s, v28.4s, v16.4s -sqrdmulh v16.4S, v28.4S, v1.s[1] -mul v28.4S, v28.4S,v5.s[1] -mla v28.4S, v16.4S, v31.s[0] -sub v16.4s, v10.4s, v28.4s -add v10.4s, v10.4s, v28.4s -sqrdmulh v28.4S, v8.4S, v1.s[2] -mul v8.4S, v8.4S,v5.s[2] -mla v8.4S, v28.4S, v31.s[0] -sub v28.4s, v14.4s, v8.4s -add v14.4s, v14.4s, v8.4s -trn1 v8.4S, v10.4S, v16.4S -trn2 v15.4S, v10.4S, v16.4S -trn1 v25.4S, v14.4S, v28.4S -trn2 v21.4S, v14.4S, v28.4S -trn2 v14.2D, v8.2D, v25.2D -trn2 v28.2D, v15.2D, v21.2D -trn1 v10.2D, v8.2D, v25.2D -trn1 v16.2D, v15.2D, v21.2D -sqrdmulh v21.4S, v14.4S, v18.4S -mul v14.4S, v14.4S,v2.4S -mla v14.4S, v21.4S, v31.s[0] -sub v21.4s, v10.4s, v14.4s -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v28.4S, v18.4S -mul v28.4S, v28.4S,v2.4S -mla v28.4S, v14.4S, v31.s[0] -sub v14.4s, v16.4s, v28.4s -add v16.4s, v16.4s, v28.4s -sqrdmulh v28.4S, v16.4S, v9.4S -mul v16.4S, v16.4S,v30.4S -mla v16.4S, v28.4S, v31.s[0] -sub v28.4s, v10.4s, v16.4s -add v10.4s, v10.4s, v16.4s -sqrdmulh v16.4S, v14.4S, v26.4S -mul v14.4S, v14.4S,v4.4S -mla v14.4S, v16.4S, v31.s[0] -sub v16.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -str q10, [x0, #256] -str q28, [x0, #272] -str q21, [x0, #288] -str q16, [x0, #304] -ldr q16, [x17, #+768] -ldr q21, [x17, #+784] -ldr q28, [x17, #+800] -ldr q10, [x17, #+816] -ldr q14, [x17, #+832] -ldr q15, [x17, #+848] -ldr q25, [x17, #+864] -ldr q8, [x17, #+880] -ldr q26, [x0, #352] -ldr q4, [x0, #368] -ldr q9, [x0, #320] -ldr q30, [x0, #336] -sqrdmulh v18.4S, v26.4S, v21.s[0] -mul v26.4S, v26.4S,v16.s[0] -mla v26.4S, v18.4S, v31.s[0] -sub v18.4s, v9.4s, v26.4s -add v9.4s, v9.4s, v26.4s -sqrdmulh v26.4S, v4.4S, v21.s[0] -mul v4.4S, v4.4S,v16.s[0] -mla v4.4S, v26.4S, v31.s[0] -sub v26.4s, v30.4s, v4.4s -add v30.4s, v30.4s, v4.4s -sqrdmulh v4.4S, v30.4S, v21.s[1] -mul v30.4S, v30.4S,v16.s[1] -mla v30.4S, v4.4S, v31.s[0] -sub v4.4s, v9.4s, v30.4s -add v9.4s, v9.4s, v30.4s -sqrdmulh v30.4S, v26.4S, v21.s[2] -mul v26.4S, v26.4S,v16.s[2] -mla v26.4S, v30.4S, v31.s[0] -sub v30.4s, v18.4s, v26.4s -add v18.4s, v18.4s, v26.4s -trn1 v26.4S, v9.4S, v4.4S -trn2 v2.4S, v9.4S, v4.4S -trn1 v1.4S, v18.4S, v30.4S -trn2 v5.4S, v18.4S, v30.4S -trn2 v18.2D, v26.2D, v1.2D -trn2 v30.2D, v2.2D, v5.2D -trn1 v9.2D, v26.2D, v1.2D -trn1 v4.2D, v2.2D, v5.2D -sqrdmulh v5.4S, v18.4S, v10.4S -mul v18.4S, v18.4S,v28.4S -mla v18.4S, v5.4S, v31.s[0] -sub v5.4s, v9.4s, v18.4s -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v30.4S, v10.4S -mul v30.4S, v30.4S,v28.4S -mla v30.4S, v18.4S, v31.s[0] -sub v18.4s, v4.4s, v30.4s -add v4.4s, v4.4s, v30.4s -sqrdmulh v30.4S, v4.4S, v15.4S -mul v4.4S, v4.4S,v14.4S -mla v4.4S, v30.4S, v31.s[0] -sub v30.4s, v9.4s, v4.4s -add v9.4s, v9.4s, v4.4s -sqrdmulh v4.4S, v18.4S, v8.4S -mul v18.4S, v18.4S,v25.4S -mla v18.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v18.4s -add v5.4s, v5.4s, v18.4s -str q9, [x0, #320] -str q30, [x0, #336] -str q5, [x0, #352] -str q4, [x0, #368] -ldr q4, [x17, #+896] -ldr q5, [x17, #+912] -ldr q30, [x17, #+928] -ldr q9, [x17, #+944] -ldr q18, [x17, #+960] -ldr q2, [x17, #+976] -ldr q1, [x17, #+992] -ldr q26, [x17, #+1008] -ldr q8, [x0, #416] -ldr q25, [x0, #432] -ldr q15, [x0, #384] -ldr q14, [x0, #400] -sqrdmulh v10.4S, v8.4S, v5.s[0] -mul v8.4S, v8.4S,v4.s[0] -mla v8.4S, v10.4S, v31.s[0] -sub v10.4s, v15.4s, v8.4s -add v15.4s, v15.4s, v8.4s -sqrdmulh v8.4S, v25.4S, v5.s[0] -mul v25.4S, v25.4S,v4.s[0] -mla v25.4S, v8.4S, v31.s[0] -sub v8.4s, v14.4s, v25.4s -add v14.4s, v14.4s, v25.4s -sqrdmulh v25.4S, v14.4S, v5.s[1] -mul v14.4S, v14.4S,v4.s[1] -mla v14.4S, v25.4S, v31.s[0] -sub v25.4s, v15.4s, v14.4s -add v15.4s, v15.4s, v14.4s -sqrdmulh v14.4S, v8.4S, v5.s[2] -mul v8.4S, v8.4S,v4.s[2] -mla v8.4S, v14.4S, v31.s[0] -sub v14.4s, v10.4s, v8.4s -add v10.4s, v10.4s, v8.4s -trn1 v8.4S, v15.4S, v25.4S -trn2 v28.4S, v15.4S, v25.4S -trn1 v21.4S, v10.4S, v14.4S -trn2 v16.4S, v10.4S, v14.4S -trn2 v10.2D, v8.2D, v21.2D -trn2 v14.2D, v28.2D, v16.2D -trn1 v15.2D, v8.2D, v21.2D -trn1 v25.2D, v28.2D, v16.2D -sqrdmulh v16.4S, v10.4S, v9.4S -mul v10.4S, v10.4S,v30.4S -mla v10.4S, v16.4S, v31.s[0] -sub v16.4s, v15.4s, v10.4s -add v15.4s, v15.4s, v10.4s -sqrdmulh v10.4S, v14.4S, v9.4S -mul v14.4S, v14.4S,v30.4S -mla v14.4S, v10.4S, v31.s[0] -sub v10.4s, v25.4s, v14.4s -add v25.4s, v25.4s, v14.4s -sqrdmulh v14.4S, v25.4S, v2.4S -mul v25.4S, v25.4S,v18.4S -mla v25.4S, v14.4S, v31.s[0] -sub v14.4s, v15.4s, v25.4s -add v15.4s, v15.4s, v25.4s -sqrdmulh v25.4S, v10.4S, v26.4S -mul v10.4S, v10.4S,v1.4S -mla v10.4S, v25.4S, v31.s[0] -sub v25.4s, v16.4s, v10.4s -add v16.4s, v16.4s, v10.4s -str q15, [x0, #384] -str q14, [x0, #400] -str q16, [x0, #416] -str q25, [x0, #432] -ldr q25, [x17, #+1024] -ldr q16, [x17, #+1040] -ldr q14, [x17, #+1056] -ldr q15, [x17, #+1072] -ldr q10, [x17, #+1088] -ldr q28, [x17, #+1104] -ldr q21, [x17, #+1120] -ldr q8, [x17, #+1136] -ldr q26, [x0, #480] -ldr q1, [x0, #496] -ldr q2, [x0, #448] -ldr q18, [x0, #464] -sqrdmulh v9.4S, v26.4S, v16.s[0] -mul v26.4S, v26.4S,v25.s[0] -mla v26.4S, v9.4S, v31.s[0] -sub v9.4s, v2.4s, v26.4s -add v2.4s, v2.4s, v26.4s -sqrdmulh v26.4S, v1.4S, v16.s[0] -mul v1.4S, v1.4S,v25.s[0] -mla v1.4S, v26.4S, v31.s[0] -sub v26.4s, v18.4s, v1.4s -add v18.4s, v18.4s, v1.4s -sqrdmulh v1.4S, v18.4S, v16.s[1] -mul v18.4S, v18.4S,v25.s[1] -mla v18.4S, v1.4S, v31.s[0] -sub v1.4s, v2.4s, v18.4s -add v2.4s, v2.4s, v18.4s -sqrdmulh v18.4S, v26.4S, v16.s[2] -mul v26.4S, v26.4S,v25.s[2] -mla v26.4S, v18.4S, v31.s[0] -sub v18.4s, v9.4s, v26.4s -add v9.4s, v9.4s, v26.4s -trn1 v26.4S, v2.4S, v1.4S -trn2 v30.4S, v2.4S, v1.4S -trn1 v5.4S, v9.4S, v18.4S -trn2 v4.4S, v9.4S, v18.4S -trn2 v9.2D, v26.2D, v5.2D -trn2 v18.2D, v30.2D, v4.2D -trn1 v2.2D, v26.2D, v5.2D -trn1 v1.2D, v30.2D, v4.2D -sqrdmulh v4.4S, v9.4S, v15.4S -mul v9.4S, v9.4S,v14.4S -mla v9.4S, v4.4S, v31.s[0] -sub v4.4s, v2.4s, v9.4s -add v2.4s, v2.4s, v9.4s -sqrdmulh v9.4S, v18.4S, v15.4S -mul v18.4S, v18.4S,v14.4S -mla v18.4S, v9.4S, v31.s[0] -sub v9.4s, v1.4s, v18.4s -add v1.4s, v1.4s, v18.4s -sqrdmulh v18.4S, v1.4S, v28.4S -mul v1.4S, v1.4S,v10.4S -mla v1.4S, v18.4S, v31.s[0] -sub v18.4s, v2.4s, v1.4s -add v2.4s, v2.4s, v1.4s -sqrdmulh v1.4S, v9.4S, v8.4S -mul v9.4S, v9.4S,v21.4S -mla v9.4S, v1.4S, v31.s[0] -sub v1.4s, v4.4s, v9.4s -add v4.4s, v4.4s, v9.4s -str q2, [x0, #448] -str q18, [x0, #464] -str q4, [x0, #480] -str q1, [x0, #496] -ldr q1, [x17, #+1152] -ldr q4, [x17, #+1168] -ldr q18, [x17, #+1184] -ldr q2, [x17, #+1200] -ldr q9, [x17, #+1216] -ldr q30, [x17, #+1232] -ldr q5, [x17, #+1248] -ldr q26, [x17, #+1264] -ldr q8, [x0, #544] -ldr q21, [x0, #560] -ldr q28, [x0, #512] -ldr q10, [x0, #528] -sqrdmulh v15.4S, v8.4S, v4.s[0] -mul v8.4S, v8.4S,v1.s[0] -mla v8.4S, v15.4S, v31.s[0] -sub v15.4s, v28.4s, v8.4s -add v28.4s, v28.4s, v8.4s -sqrdmulh v8.4S, v21.4S, v4.s[0] -mul v21.4S, v21.4S,v1.s[0] -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v10.4s, v21.4s -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v10.4S, v4.s[1] -mul v10.4S, v10.4S,v1.s[1] -mla v10.4S, v21.4S, v31.s[0] -sub v21.4s, v28.4s, v10.4s -add v28.4s, v28.4s, v10.4s -sqrdmulh v10.4S, v8.4S, v4.s[2] -mul v8.4S, v8.4S,v1.s[2] -mla v8.4S, v10.4S, v31.s[0] -sub v10.4s, v15.4s, v8.4s -add v15.4s, v15.4s, v8.4s -trn1 v8.4S, v28.4S, v21.4S -trn2 v14.4S, v28.4S, v21.4S -trn1 v16.4S, v15.4S, v10.4S -trn2 v25.4S, v15.4S, v10.4S -trn2 v15.2D, v8.2D, v16.2D -trn2 v10.2D, v14.2D, v25.2D -trn1 v28.2D, v8.2D, v16.2D -trn1 v21.2D, v14.2D, v25.2D -sqrdmulh v25.4S, v15.4S, v2.4S -mul v15.4S, v15.4S,v18.4S -mla v15.4S, v25.4S, v31.s[0] -sub v25.4s, v28.4s, v15.4s -add v28.4s, v28.4s, v15.4s -sqrdmulh v15.4S, v10.4S, v2.4S -mul v10.4S, v10.4S,v18.4S -mla v10.4S, v15.4S, v31.s[0] -sub v15.4s, v21.4s, v10.4s -add v21.4s, v21.4s, v10.4s -sqrdmulh v10.4S, v21.4S, v30.4S -mul v21.4S, v21.4S,v9.4S -mla v21.4S, v10.4S, v31.s[0] -sub v10.4s, v28.4s, v21.4s -add v28.4s, v28.4s, v21.4s -sqrdmulh v21.4S, v15.4S, v26.4S -mul v15.4S, v15.4S,v5.4S -mla v15.4S, v21.4S, v31.s[0] -sub v21.4s, v25.4s, v15.4s -add v25.4s, v25.4s, v15.4s -str q28, [x0, #512] -str q10, [x0, #528] -str q25, [x0, #544] -str q21, [x0, #560] -ldr q21, [x17, #+1280] -ldr q25, [x17, #+1296] -ldr q10, [x17, #+1312] -ldr q28, [x17, #+1328] -ldr q15, [x17, #+1344] -ldr q14, [x17, #+1360] -ldr q16, [x17, #+1376] -ldr q8, [x17, #+1392] -ldr q26, [x0, #608] -ldr q5, [x0, #624] -ldr q30, [x0, #576] -ldr q9, [x0, #592] -sqrdmulh v2.4S, v26.4S, v25.s[0] -mul v26.4S, v26.4S,v21.s[0] -mla v26.4S, v2.4S, v31.s[0] -sub v2.4s, v30.4s, v26.4s -add v30.4s, v30.4s, v26.4s -sqrdmulh v26.4S, v5.4S, v25.s[0] -mul v5.4S, v5.4S,v21.s[0] -mla v5.4S, v26.4S, v31.s[0] -sub v26.4s, v9.4s, v5.4s -add v9.4s, v9.4s, v5.4s -sqrdmulh v5.4S, v9.4S, v25.s[1] -mul v9.4S, v9.4S,v21.s[1] -mla v9.4S, v5.4S, v31.s[0] -sub v5.4s, v30.4s, v9.4s -add v30.4s, v30.4s, v9.4s -sqrdmulh v9.4S, v26.4S, v25.s[2] -mul v26.4S, v26.4S,v21.s[2] -mla v26.4S, v9.4S, v31.s[0] -sub v9.4s, v2.4s, v26.4s -add v2.4s, v2.4s, v26.4s -trn1 v26.4S, v30.4S, v5.4S -trn2 v18.4S, v30.4S, v5.4S -trn1 v4.4S, v2.4S, v9.4S -trn2 v1.4S, v2.4S, v9.4S -trn2 v2.2D, v26.2D, v4.2D -trn2 v9.2D, v18.2D, v1.2D -trn1 v30.2D, v26.2D, v4.2D -trn1 v5.2D, v18.2D, v1.2D -sqrdmulh v1.4S, v2.4S, v28.4S -mul v2.4S, v2.4S,v10.4S -mla v2.4S, v1.4S, v31.s[0] -sub v1.4s, v30.4s, v2.4s -add v30.4s, v30.4s, v2.4s -sqrdmulh v2.4S, v9.4S, v28.4S -mul v9.4S, v9.4S,v10.4S -mla v9.4S, v2.4S, v31.s[0] -sub v2.4s, v5.4s, v9.4s -add v5.4s, v5.4s, v9.4s -sqrdmulh v9.4S, v5.4S, v14.4S -mul v5.4S, v5.4S,v15.4S -mla v5.4S, v9.4S, v31.s[0] -sub v9.4s, v30.4s, v5.4s -add v30.4s, v30.4s, v5.4s -sqrdmulh v5.4S, v2.4S, v8.4S -mul v2.4S, v2.4S,v16.4S -mla v2.4S, v5.4S, v31.s[0] -sub v5.4s, v1.4s, v2.4s -add v1.4s, v1.4s, v2.4s -str q30, [x0, #576] -str q9, [x0, #592] -str q1, [x0, #608] -str q5, [x0, #624] -ldr q5, [x17, #+1408] -ldr q1, [x17, #+1424] -ldr q9, [x17, #+1440] -ldr q30, [x17, #+1456] -ldr q2, [x17, #+1472] -ldr q18, [x17, #+1488] -ldr q4, [x17, #+1504] -ldr q26, [x17, #+1520] -ldr q8, [x0, #672] -ldr q16, [x0, #688] -ldr q14, [x0, #640] -ldr q15, [x0, #656] -sqrdmulh v28.4S, v8.4S, v1.s[0] -mul v8.4S, v8.4S,v5.s[0] -mla v8.4S, v28.4S, v31.s[0] -sub v28.4s, v14.4s, v8.4s -add v14.4s, v14.4s, v8.4s -sqrdmulh v8.4S, v16.4S, v1.s[0] -mul v16.4S, v16.4S,v5.s[0] -mla v16.4S, v8.4S, v31.s[0] -sub v8.4s, v15.4s, v16.4s -add v15.4s, v15.4s, v16.4s -sqrdmulh v16.4S, v15.4S, v1.s[1] -mul v15.4S, v15.4S,v5.s[1] -mla v15.4S, v16.4S, v31.s[0] -sub v16.4s, v14.4s, v15.4s -add v14.4s, v14.4s, v15.4s -sqrdmulh v15.4S, v8.4S, v1.s[2] -mul v8.4S, v8.4S,v5.s[2] -mla v8.4S, v15.4S, v31.s[0] -sub v15.4s, v28.4s, v8.4s -add v28.4s, v28.4s, v8.4s -trn1 v8.4S, v14.4S, v16.4S -trn2 v10.4S, v14.4S, v16.4S -trn1 v25.4S, v28.4S, v15.4S -trn2 v21.4S, v28.4S, v15.4S -trn2 v28.2D, v8.2D, v25.2D -trn2 v15.2D, v10.2D, v21.2D -trn1 v14.2D, v8.2D, v25.2D -trn1 v16.2D, v10.2D, v21.2D -sqrdmulh v21.4S, v28.4S, v30.4S -mul v28.4S, v28.4S,v9.4S -mla v28.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v28.4s -add v14.4s, v14.4s, v28.4s -sqrdmulh v28.4S, v15.4S, v30.4S -mul v15.4S, v15.4S,v9.4S -mla v15.4S, v28.4S, v31.s[0] -sub v28.4s, v16.4s, v15.4s -add v16.4s, v16.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v18.4S -mul v16.4S, v16.4S,v2.4S -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v14.4s, v16.4s -add v14.4s, v14.4s, v16.4s -sqrdmulh v16.4S, v28.4S, v26.4S -mul v28.4S, v28.4S,v4.4S -mla v28.4S, v16.4S, v31.s[0] -sub v16.4s, v21.4s, v28.4s -add v21.4s, v21.4s, v28.4s -str q14, [x0, #640] -str q15, [x0, #656] -str q21, [x0, #672] -str q16, [x0, #688] -ldr q16, [x17, #+1536] -ldr q21, [x17, #+1552] -ldr q15, [x17, #+1568] -ldr q14, [x17, #+1584] -ldr q28, [x17, #+1600] -ldr q10, [x17, #+1616] -ldr q25, [x17, #+1632] -ldr q8, [x17, #+1648] -ldr q26, [x0, #736] -ldr q4, [x0, #752] -ldr q18, [x0, #704] -ldr q2, [x0, #720] -sqrdmulh v30.4S, v26.4S, v21.s[0] -mul v26.4S, v26.4S,v16.s[0] -mla v26.4S, v30.4S, v31.s[0] -sub v30.4s, v18.4s, v26.4s -add v18.4s, v18.4s, v26.4s -sqrdmulh v26.4S, v4.4S, v21.s[0] -mul v4.4S, v4.4S,v16.s[0] -mla v4.4S, v26.4S, v31.s[0] -sub v26.4s, v2.4s, v4.4s -add v2.4s, v2.4s, v4.4s -sqrdmulh v4.4S, v2.4S, v21.s[1] -mul v2.4S, v2.4S,v16.s[1] -mla v2.4S, v4.4S, v31.s[0] -sub v4.4s, v18.4s, v2.4s -add v18.4s, v18.4s, v2.4s -sqrdmulh v2.4S, v26.4S, v21.s[2] -mul v26.4S, v26.4S,v16.s[2] -mla v26.4S, v2.4S, v31.s[0] -sub v2.4s, v30.4s, v26.4s -add v30.4s, v30.4s, v26.4s -trn1 v26.4S, v18.4S, v4.4S -trn2 v9.4S, v18.4S, v4.4S -trn1 v1.4S, v30.4S, v2.4S -trn2 v5.4S, v30.4S, v2.4S -trn2 v30.2D, v26.2D, v1.2D -trn2 v2.2D, v9.2D, v5.2D -trn1 v18.2D, v26.2D, v1.2D -trn1 v4.2D, v9.2D, v5.2D -sqrdmulh v5.4S, v30.4S, v14.4S -mul v30.4S, v30.4S,v15.4S -mla v30.4S, v5.4S, v31.s[0] -sub v5.4s, v18.4s, v30.4s -add v18.4s, v18.4s, v30.4s -sqrdmulh v30.4S, v2.4S, v14.4S -mul v2.4S, v2.4S,v15.4S -mla v2.4S, v30.4S, v31.s[0] -sub v30.4s, v4.4s, v2.4s -add v4.4s, v4.4s, v2.4s -sqrdmulh v2.4S, v4.4S, v10.4S -mul v4.4S, v4.4S,v28.4S -mla v4.4S, v2.4S, v31.s[0] -sub v2.4s, v18.4s, v4.4s -add v18.4s, v18.4s, v4.4s -sqrdmulh v4.4S, v30.4S, v8.4S -mul v30.4S, v30.4S,v25.4S -mla v30.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v30.4s -add v5.4s, v5.4s, v30.4s -str q18, [x0, #704] -str q2, [x0, #720] -str q5, [x0, #736] -str q4, [x0, #752] -ldr q4, [x17, #+1664] -ldr q5, [x17, #+1680] -ldr q2, [x17, #+1696] -ldr q18, [x17, #+1712] -ldr q30, [x17, #+1728] -ldr q9, [x17, #+1744] -ldr q1, [x17, #+1760] -ldr q26, [x17, #+1776] -ldr q8, [x0, #800] -ldr q25, [x0, #816] -ldr q10, [x0, #768] -ldr q28, [x0, #784] -sqrdmulh v14.4S, v8.4S, v5.s[0] -mul v8.4S, v8.4S,v4.s[0] -mla v8.4S, v14.4S, v31.s[0] -sub v14.4s, v10.4s, v8.4s -add v10.4s, v10.4s, v8.4s -sqrdmulh v8.4S, v25.4S, v5.s[0] -mul v25.4S, v25.4S,v4.s[0] -mla v25.4S, v8.4S, v31.s[0] -sub v8.4s, v28.4s, v25.4s -add v28.4s, v28.4s, v25.4s -sqrdmulh v25.4S, v28.4S, v5.s[1] -mul v28.4S, v28.4S,v4.s[1] -mla v28.4S, v25.4S, v31.s[0] -sub v25.4s, v10.4s, v28.4s -add v10.4s, v10.4s, v28.4s -sqrdmulh v28.4S, v8.4S, v5.s[2] -mul v8.4S, v8.4S,v4.s[2] -mla v8.4S, v28.4S, v31.s[0] -sub v28.4s, v14.4s, v8.4s -add v14.4s, v14.4s, v8.4s -trn1 v8.4S, v10.4S, v25.4S -trn2 v15.4S, v10.4S, v25.4S -trn1 v21.4S, v14.4S, v28.4S -trn2 v16.4S, v14.4S, v28.4S -trn2 v14.2D, v8.2D, v21.2D -trn2 v28.2D, v15.2D, v16.2D -trn1 v10.2D, v8.2D, v21.2D -trn1 v25.2D, v15.2D, v16.2D -sqrdmulh v16.4S, v14.4S, v18.4S -mul v14.4S, v14.4S,v2.4S -mla v14.4S, v16.4S, v31.s[0] -sub v16.4s, v10.4s, v14.4s -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v28.4S, v18.4S -mul v28.4S, v28.4S,v2.4S -mla v28.4S, v14.4S, v31.s[0] -sub v14.4s, v25.4s, v28.4s -add v25.4s, v25.4s, v28.4s -sqrdmulh v28.4S, v25.4S, v9.4S -mul v25.4S, v25.4S,v30.4S -mla v25.4S, v28.4S, v31.s[0] -sub v28.4s, v10.4s, v25.4s -add v10.4s, v10.4s, v25.4s -sqrdmulh v25.4S, v14.4S, v26.4S -mul v14.4S, v14.4S,v1.4S -mla v14.4S, v25.4S, v31.s[0] -sub v25.4s, v16.4s, v14.4s -add v16.4s, v16.4s, v14.4s -str q10, [x0, #768] -str q28, [x0, #784] -str q16, [x0, #800] -str q25, [x0, #816] -ldr q25, [x17, #+1792] -ldr q16, [x17, #+1808] -ldr q28, [x17, #+1824] -ldr q10, [x17, #+1840] -ldr q14, [x17, #+1856] -ldr q15, [x17, #+1872] -ldr q21, [x17, #+1888] -ldr q8, [x17, #+1904] -ldr q26, [x0, #864] -ldr q1, [x0, #880] -ldr q9, [x0, #832] -ldr q30, [x0, #848] -sqrdmulh v18.4S, v26.4S, v16.s[0] -mul v26.4S, v26.4S,v25.s[0] -mla v26.4S, v18.4S, v31.s[0] -sub v18.4s, v9.4s, v26.4s -add v9.4s, v9.4s, v26.4s -sqrdmulh v26.4S, v1.4S, v16.s[0] -mul v1.4S, v1.4S,v25.s[0] -mla v1.4S, v26.4S, v31.s[0] -sub v26.4s, v30.4s, v1.4s -add v30.4s, v30.4s, v1.4s -sqrdmulh v1.4S, v30.4S, v16.s[1] -mul v30.4S, v30.4S,v25.s[1] -mla v30.4S, v1.4S, v31.s[0] -sub v1.4s, v9.4s, v30.4s -add v9.4s, v9.4s, v30.4s -sqrdmulh v30.4S, v26.4S, v16.s[2] -mul v26.4S, v26.4S,v25.s[2] -mla v26.4S, v30.4S, v31.s[0] -sub v30.4s, v18.4s, v26.4s -add v18.4s, v18.4s, v26.4s -trn1 v26.4S, v9.4S, v1.4S -trn2 v2.4S, v9.4S, v1.4S -trn1 v5.4S, v18.4S, v30.4S -trn2 v4.4S, v18.4S, v30.4S -trn2 v18.2D, v26.2D, v5.2D -trn2 v30.2D, v2.2D, v4.2D -trn1 v9.2D, v26.2D, v5.2D -trn1 v1.2D, v2.2D, v4.2D -sqrdmulh v4.4S, v18.4S, v10.4S -mul v18.4S, v18.4S,v28.4S -mla v18.4S, v4.4S, v31.s[0] -sub v4.4s, v9.4s, v18.4s -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v30.4S, v10.4S -mul v30.4S, v30.4S,v28.4S -mla v30.4S, v18.4S, v31.s[0] -sub v18.4s, v1.4s, v30.4s -add v1.4s, v1.4s, v30.4s -sqrdmulh v30.4S, v1.4S, v15.4S -mul v1.4S, v1.4S,v14.4S -mla v1.4S, v30.4S, v31.s[0] -sub v30.4s, v9.4s, v1.4s -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v18.4S, v8.4S -mul v18.4S, v18.4S,v21.4S -mla v18.4S, v1.4S, v31.s[0] -sub v1.4s, v4.4s, v18.4s -add v4.4s, v4.4s, v18.4s -str q9, [x0, #832] -str q30, [x0, #848] -str q4, [x0, #864] -str q1, [x0, #880] -ldr q1, [x17, #+1920] -ldr q4, [x17, #+1936] -ldr q30, [x17, #+1952] -ldr q9, [x17, #+1968] -ldr q18, [x17, #+1984] -ldr q2, [x17, #+2000] -ldr q5, [x17, #+2016] -ldr q26, [x17, #+2032] -ldr q8, [x0, #928] -ldr q21, [x0, #944] -ldr q15, [x0, #896] -ldr q14, [x0, #912] -sqrdmulh v10.4S, v8.4S, v4.s[0] -mul v8.4S, v8.4S,v1.s[0] -mla v8.4S, v10.4S, v31.s[0] -sub v10.4s, v15.4s, v8.4s -add v15.4s, v15.4s, v8.4s -sqrdmulh v8.4S, v21.4S, v4.s[0] -mul v21.4S, v21.4S,v1.s[0] -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v14.4s, v21.4s -add v14.4s, v14.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v4.s[1] -mul v14.4S, v14.4S,v1.s[1] -mla v14.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v14.4s -add v15.4s, v15.4s, v14.4s -sqrdmulh v14.4S, v8.4S, v4.s[2] -mul v8.4S, v8.4S,v1.s[2] -mla v8.4S, v14.4S, v31.s[0] -sub v14.4s, v10.4s, v8.4s -add v10.4s, v10.4s, v8.4s -trn1 v8.4S, v15.4S, v21.4S -trn2 v28.4S, v15.4S, v21.4S -trn1 v16.4S, v10.4S, v14.4S -trn2 v25.4S, v10.4S, v14.4S -trn2 v10.2D, v8.2D, v16.2D -trn2 v14.2D, v28.2D, v25.2D -trn1 v15.2D, v8.2D, v16.2D -trn1 v21.2D, v28.2D, v25.2D -sqrdmulh v25.4S, v10.4S, v9.4S -mul v10.4S, v10.4S,v30.4S -mla v10.4S, v25.4S, v31.s[0] -sub v25.4s, v15.4s, v10.4s -add v15.4s, v15.4s, v10.4s -sqrdmulh v10.4S, v14.4S, v9.4S -mul v14.4S, v14.4S,v30.4S -mla v14.4S, v10.4S, v31.s[0] -sub v10.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v2.4S -mul v21.4S, v21.4S,v18.4S -mla v21.4S, v14.4S, v31.s[0] -sub v14.4s, v15.4s, v21.4s -add v15.4s, v15.4s, v21.4s -sqrdmulh v21.4S, v10.4S, v26.4S -mul v10.4S, v10.4S,v5.4S -mla v10.4S, v21.4S, v31.s[0] -sub v21.4s, v25.4s, v10.4s -add v25.4s, v25.4s, v10.4s -str q15, [x0, #896] -str q14, [x0, #912] -str q25, [x0, #928] -str q21, [x0, #944] -ldr q21, [x17, #+2048] -ldr q25, [x17, #+2064] -ldr q14, [x17, #+2080] -ldr q15, [x17, #+2096] -ldr q10, [x17, #+2112] -ldr q28, [x17, #+2128] -ldr q16, [x17, #+2144] -ldr q8, [x17, #+2160] -ldr q26, [x0, #992] -ldr q5, [x0, #1008] -ldr q2, [x0, #960] -ldr q18, [x0, #976] -sqrdmulh v9.4S, v26.4S, v25.s[0] -mul v26.4S, v26.4S,v21.s[0] -mla v26.4S, v9.4S, v31.s[0] -sub v9.4s, v2.4s, v26.4s -add v2.4s, v2.4s, v26.4s -sqrdmulh v26.4S, v5.4S, v25.s[0] -mul v5.4S, v5.4S,v21.s[0] -mla v5.4S, v26.4S, v31.s[0] -sub v26.4s, v18.4s, v5.4s -add v18.4s, v18.4s, v5.4s -sqrdmulh v5.4S, v18.4S, v25.s[1] -mul v18.4S, v18.4S,v21.s[1] -mla v18.4S, v5.4S, v31.s[0] -sub v5.4s, v2.4s, v18.4s -add v2.4s, v2.4s, v18.4s -sqrdmulh v18.4S, v26.4S, v25.s[2] -mul v26.4S, v26.4S,v21.s[2] -mla v26.4S, v18.4S, v31.s[0] -sub v18.4s, v9.4s, v26.4s -add v9.4s, v9.4s, v26.4s -trn1 v26.4S, v2.4S, v5.4S -trn2 v30.4S, v2.4S, v5.4S -trn1 v4.4S, v9.4S, v18.4S -trn2 v1.4S, v9.4S, v18.4S -trn2 v9.2D, v26.2D, v4.2D -trn2 v18.2D, v30.2D, v1.2D -trn1 v2.2D, v26.2D, v4.2D -trn1 v5.2D, v30.2D, v1.2D -sqrdmulh v1.4S, v9.4S, v15.4S -mul v9.4S, v9.4S,v14.4S -mla v9.4S, v1.4S, v31.s[0] -sub v1.4s, v2.4s, v9.4s -add v2.4s, v2.4s, v9.4s -sqrdmulh v9.4S, v18.4S, v15.4S -mul v18.4S, v18.4S,v14.4S -mla v18.4S, v9.4S, v31.s[0] -sub v9.4s, v5.4s, v18.4s -add v5.4s, v5.4s, v18.4s -sqrdmulh v18.4S, v5.4S, v28.4S -mul v5.4S, v5.4S,v10.4S -mla v5.4S, v18.4S, v31.s[0] -sub v18.4s, v2.4s, v5.4s -add v2.4s, v2.4s, v5.4s -sqrdmulh v5.4S, v9.4S, v8.4S -mul v9.4S, v9.4S,v16.4S -mla v9.4S, v5.4S, v31.s[0] -sub v5.4s, v1.4s, v9.4s -add v1.4s, v1.4s, v9.4s -str q2, [x0, #960] -str q18, [x0, #976] -str q1, [x0, #992] -str q5, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2392 -// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_14_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_14_0.s deleted file mode 100644 index dae0130..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_14_0.s +++ /dev/null @@ -1,2506 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_14_0 -.global _ntt_u32_full_neon_asm_var_4_4_14_0 -ntt_u32_full_neon_asm_var_4_4_14_0: -_ntt_u32_full_neon_asm_var_4_4_14_0: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x0, #928] -ldr q29, [x17, #+0] -ldr q28, [x17, #+16] -sqrdmulh v27.4S, v30.4S, v28.s[0] -mul v30.4S, v30.4S,v29.s[0] -ldr q26, [x0, #992] -sqrdmulh v25.4S, v26.4S, v28.s[0] -mul v26.4S, v26.4S,v29.s[0] -ldr q24, [x0, #800] -sqrdmulh v23.4S, v24.4S, v28.s[0] -mul v24.4S, v24.4S,v29.s[0] -ldr q22, [x0, #864] -sqrdmulh v21.4S, v22.4S, v28.s[0] -mul v22.4S, v22.4S,v29.s[0] -ldr q20, [x0, #544] -mla v30.4S, v27.4S, v31.s[0] -sqrdmulh v27.4S, v20.4S, v28.s[0] -ldr q19, [x0, #608] -mla v26.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v19.4S, v28.s[0] -nop -ldr q18, [x0, #672] -mla v24.4S, v23.4S, v31.s[0] -sqrdmulh v23.4S, v18.4S, v28.s[0] -nop -ldr q17, [x0, #736] -mla v22.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v17.4S, v28.s[0] -nop -ldr q16, [x0, #416] -ldr q3, [x0, #480] -mul v20.4S, v20.4S,v29.s[0] -sub v2.4s, v16.4s, v30.4s -mul v19.4S, v19.4S,v29.s[0] -add v16.4s, v16.4s, v30.4s -ldr q30, [x0, #288] -ldr q1, [x0, #352] -mla v20.4S, v27.4S, v31.s[0] -sub v27.4s, v3.4s, v26.4s -mla v19.4S, v25.4S, v31.s[0] -add v3.4s, v3.4s, v26.4s -ldr q26, [x0, #32] -ldr q25, [x0, #96] -mul v18.4S, v18.4S,v29.s[0] -sub v0.4s, v30.4s, v24.4s -mul v17.4S, v17.4S,v29.s[0] -add v30.4s, v30.4s, v24.4s -ldr q24, [x0, #160] -ldr q15, [x0, #224] -mla v18.4S, v23.4S, v31.s[0] -sub v23.4s, v1.4s, v22.4s -mla v17.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v16.4S, v28.s[1] -nop -mul v16.4S, v16.4S,v29.s[1] -nop -sqrdmulh v21.4S, v3.4S, v28.s[1] -sub v14.4s, v26.4s, v20.4s -mul v3.4S, v3.4S,v29.s[1] -add v26.4s, v26.4s, v20.4s -sqrdmulh v20.4S, v30.4S, v28.s[1] -sub v13.4s, v25.4s, v19.4s -mul v30.4S, v30.4S,v29.s[1] -add v25.4s, v25.4s, v19.4s -sqrdmulh v19.4S, v1.4S, v28.s[1] -sub v12.4s, v24.4s, v18.4s -mul v1.4S, v1.4S,v29.s[1] -add v24.4s, v24.4s, v18.4s -mla v16.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v17.4s -sqrdmulh v18.4S, v2.4S, v28.s[2] -add v15.4s, v15.4s, v17.4s -mla v3.4S, v21.4S, v31.s[0] -nop -sqrdmulh v21.4S, v27.4S, v28.s[2] -nop -mla v30.4S, v20.4S, v31.s[0] -nop -sqrdmulh v20.4S, v0.4S, v28.s[2] -nop -mla v1.4S, v19.4S, v31.s[0] -nop -sqrdmulh v19.4S, v23.4S, v28.s[2] -nop -ldr q17, [x17, #+32] -ldr q11, [x17, #+48] -mul v2.4S, v2.4S,v29.s[2] -sub v10.4s, v24.4s, v16.4s -mul v27.4S, v27.4S,v29.s[2] -add v24.4s, v24.4s, v16.4s -mla v2.4S, v18.4S, v31.s[0] -sub v18.4s, v15.4s, v3.4s -mla v27.4S, v21.4S, v31.s[0] -add v15.4s, v15.4s, v3.4s -mul v0.4S, v0.4S,v29.s[2] -sub v3.4s, v26.4s, v30.4s -mul v23.4S, v23.4S,v29.s[2] -add v26.4s, v26.4s, v30.4s -mla v0.4S, v20.4S, v31.s[0] -sub v20.4s, v25.4s, v1.4s -mla v23.4S, v19.4S, v31.s[0] -add v25.4s, v25.4s, v1.4s -sqrdmulh v1.4S, v10.4S, v11.s[1] -nop -mul v10.4S, v10.4S,v17.s[1] -nop -sqrdmulh v19.4S, v18.4S, v11.s[1] -sub v30.4s, v12.4s, v2.4s -mul v18.4S, v18.4S,v17.s[1] -add v12.4s, v12.4s, v2.4s -sqrdmulh v2.4S, v24.4S, v11.s[0] -sub v21.4s, v22.4s, v27.4s -mul v24.4S, v24.4S,v17.s[0] -add v22.4s, v22.4s, v27.4s -sqrdmulh v27.4S, v15.4S, v11.s[0] -sub v16.4s, v14.4s, v0.4s -mul v15.4S, v15.4S,v17.s[0] -add v14.4s, v14.4s, v0.4s -ldr q0, [x17, #+64] -ldr q9, [x17, #+80] -mla v10.4S, v1.4S, v31.s[0] -sub v1.4s, v13.4s, v23.4s -sqrdmulh v8.4S, v12.4S, v11.s[2] -add v13.4s, v13.4s, v23.4s -mla v18.4S, v19.4S, v31.s[0] -nop -sqrdmulh v19.4S, v22.4S, v11.s[2] -nop -mla v24.4S, v2.4S, v31.s[0] -nop -sqrdmulh v2.4S, v30.4S, v11.s[3] -nop -mla v15.4S, v27.4S, v31.s[0] -nop -sqrdmulh v27.4S, v21.4S, v11.s[3] -nop -ldr q23, [x17, #+96] -ldr q7, [x17, #+112] -mul v12.4S, v12.4S,v17.s[2] -sub v6.4s, v3.4s, v10.4s -mul v22.4S, v22.4S,v17.s[2] -add v3.4s, v3.4s, v10.4s -mla v12.4S, v8.4S, v31.s[0] -sub v8.4s, v20.4s, v18.4s -mla v22.4S, v19.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -mul v30.4S, v30.4S,v17.s[3] -sub v18.4s, v26.4s, v24.4s -mul v21.4S, v21.4S,v17.s[3] -add v26.4s, v26.4s, v24.4s -mla v30.4S, v2.4S, v31.s[0] -sub v2.4s, v25.4s, v15.4s -mla v21.4S, v27.4S, v31.s[0] -add v25.4s, v25.4s, v15.4s -sqrdmulh v15.4S, v8.4S, v9.s[3] -nop -mul v8.4S, v8.4S,v0.s[3] -nop -sqrdmulh v27.4S, v20.4S, v9.s[2] -sub v24.4s, v14.4s, v12.4s -mul v20.4S, v20.4S,v0.s[2] -add v14.4s, v14.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v9.s[1] -sub v19.4s, v13.4s, v22.4s -mul v2.4S, v2.4S,v0.s[1] -add v13.4s, v13.4s, v22.4s -sqrdmulh v22.4S, v25.4S, v9.s[0] -sub v10.4s, v16.4s, v30.4s -mul v25.4S, v25.4S,v0.s[0] -add v16.4s, v16.4s, v30.4s -mla v8.4S, v15.4S, v31.s[0] -sub v15.4s, v1.4s, v21.4s -sqrdmulh v30.4S, v13.4S, v7.s[0] -add v1.4s, v1.4s, v21.4s -mla v20.4S, v27.4S, v31.s[0] -sub v27.4s, v6.4s, v8.4s -sqrdmulh v21.4S, v19.4S, v7.s[1] -add v6.4s, v6.4s, v8.4s -mla v2.4S, v12.4S, v31.s[0] -sub v12.4s, v3.4s, v20.4s -sqrdmulh v8.4S, v1.4S, v7.s[2] -add v3.4s, v3.4s, v20.4s -mla v25.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v2.4s -sqrdmulh v20.4S, v15.4S, v7.s[3] -add v18.4s, v18.4s, v2.4s -mul v13.4S, v13.4S,v23.s[0] -sub v2.4s, v26.4s, v25.4s -mul v19.4S, v19.4S,v23.s[1] -add v26.4s, v26.4s, v25.4s -mla v13.4S, v30.4S, v31.s[0] -str q12, [x0, #352] -mla v19.4S, v21.4S, v31.s[0] -str q3, [x0, #288] -mul v1.4S, v1.4S,v23.s[2] -str q27, [x0, #480] -mul v15.4S, v15.4S,v23.s[3] -str q6, [x0, #416] -mla v1.4S, v8.4S, v31.s[0] -str q22, [x0, #224] -mla v15.4S, v20.4S, v31.s[0] -str q18, [x0, #160] -ldr q18, [x0, #944] -sqrdmulh v20.4S, v18.4S, v28.s[0] -str q2, [x0, #96] -mul v18.4S, v18.4S,v29.s[0] -str q26, [x0, #32] -ldr q26, [x0, #1008] -sqrdmulh v2.4S, v26.4S, v28.s[0] -sub v22.4s, v14.4s, v13.4s -str q22, [x0, #608] -mul v26.4S, v26.4S,v29.s[0] -add v14.4s, v14.4s, v13.4s -ldr q13, [x0, #816] -sqrdmulh v22.4S, v13.4S, v28.s[0] -sub v8.4s, v24.4s, v19.4s -str q14, [x0, #544] -mul v13.4S, v13.4S,v29.s[0] -add v24.4s, v24.4s, v19.4s -ldr q19, [x0, #880] -sqrdmulh v14.4S, v19.4S, v28.s[0] -sub v6.4s, v16.4s, v1.4s -str q8, [x0, #736] -mul v19.4S, v19.4S,v29.s[0] -add v16.4s, v16.4s, v1.4s -ldr q1, [x0, #560] -mla v18.4S, v20.4S, v31.s[0] -sub v20.4s, v10.4s, v15.4s -str q24, [x0, #672] -sqrdmulh v24.4S, v1.4S, v28.s[0] -add v10.4s, v10.4s, v15.4s -ldr q15, [x0, #624] -mla v26.4S, v2.4S, v31.s[0] -str q6, [x0, #864] -sqrdmulh v6.4S, v15.4S, v28.s[0] -nop -ldr q2, [x0, #688] -mla v13.4S, v22.4S, v31.s[0] -str q16, [x0, #800] -sqrdmulh v16.4S, v2.4S, v28.s[0] -nop -ldr q22, [x0, #752] -mla v19.4S, v14.4S, v31.s[0] -str q20, [x0, #992] -sqrdmulh v20.4S, v22.4S, v28.s[0] -nop -ldr q14, [x0, #432] -ldr q8, [x0, #496] -mul v1.4S, v1.4S,v29.s[0] -sub v27.4s, v14.4s, v18.4s -str q10, [x0, #928] -mul v15.4S, v15.4S,v29.s[0] -add v14.4s, v14.4s, v18.4s -ldr q18, [x0, #304] -ldr q10, [x0, #368] -mla v1.4S, v24.4S, v31.s[0] -sub v24.4s, v8.4s, v26.4s -mla v15.4S, v6.4S, v31.s[0] -add v8.4s, v8.4s, v26.4s -ldr q26, [x0, #48] -ldr q6, [x0, #112] -mul v2.4S, v2.4S,v29.s[0] -sub v3.4s, v18.4s, v13.4s -mul v22.4S, v22.4S,v29.s[0] -add v18.4s, v18.4s, v13.4s -ldr q13, [x0, #176] -ldr q21, [x0, #240] -mla v2.4S, v16.4S, v31.s[0] -sub v16.4s, v10.4s, v19.4s -mla v22.4S, v20.4S, v31.s[0] -add v10.4s, v10.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v28.s[1] -nop -mul v14.4S, v14.4S,v29.s[1] -nop -sqrdmulh v20.4S, v8.4S, v28.s[1] -sub v12.4s, v26.4s, v1.4s -mul v8.4S, v8.4S,v29.s[1] -add v26.4s, v26.4s, v1.4s -sqrdmulh v1.4S, v18.4S, v28.s[1] -sub v30.4s, v6.4s, v15.4s -mul v18.4S, v18.4S,v29.s[1] -add v6.4s, v6.4s, v15.4s -sqrdmulh v15.4S, v10.4S, v28.s[1] -sub v25.4s, v13.4s, v2.4s -mul v10.4S, v10.4S,v29.s[1] -add v13.4s, v13.4s, v2.4s -mla v14.4S, v19.4S, v31.s[0] -sub v19.4s, v21.4s, v22.4s -sqrdmulh v2.4S, v27.4S, v28.s[2] -add v21.4s, v21.4s, v22.4s -mla v8.4S, v20.4S, v31.s[0] -nop -sqrdmulh v20.4S, v24.4S, v28.s[2] -nop -mla v18.4S, v1.4S, v31.s[0] -nop -sqrdmulh v1.4S, v3.4S, v28.s[2] -nop -mla v10.4S, v15.4S, v31.s[0] -nop -sqrdmulh v15.4S, v16.4S, v28.s[2] -nop -mul v27.4S, v27.4S,v29.s[2] -sub v22.4s, v13.4s, v14.4s -mul v24.4S, v24.4S,v29.s[2] -add v13.4s, v13.4s, v14.4s -mla v27.4S, v2.4S, v31.s[0] -sub v2.4s, v21.4s, v8.4s -mla v24.4S, v20.4S, v31.s[0] -add v21.4s, v21.4s, v8.4s -mul v3.4S, v3.4S,v29.s[2] -sub v8.4s, v26.4s, v18.4s -mul v16.4S, v16.4S,v29.s[2] -add v26.4s, v26.4s, v18.4s -mla v3.4S, v1.4S, v31.s[0] -sub v1.4s, v6.4s, v10.4s -mla v16.4S, v15.4S, v31.s[0] -add v6.4s, v6.4s, v10.4s -sqrdmulh v10.4S, v22.4S, v11.s[1] -nop -mul v22.4S, v22.4S,v17.s[1] -nop -sqrdmulh v15.4S, v2.4S, v11.s[1] -sub v18.4s, v25.4s, v27.4s -mul v2.4S, v2.4S,v17.s[1] -add v25.4s, v25.4s, v27.4s -sqrdmulh v27.4S, v13.4S, v11.s[0] -sub v20.4s, v19.4s, v24.4s -mul v13.4S, v13.4S,v17.s[0] -add v19.4s, v19.4s, v24.4s -sqrdmulh v24.4S, v21.4S, v11.s[0] -sub v14.4s, v12.4s, v3.4s -mul v21.4S, v21.4S,v17.s[0] -add v12.4s, v12.4s, v3.4s -mla v22.4S, v10.4S, v31.s[0] -sub v10.4s, v30.4s, v16.4s -sqrdmulh v3.4S, v25.4S, v11.s[2] -add v30.4s, v30.4s, v16.4s -mla v2.4S, v15.4S, v31.s[0] -nop -sqrdmulh v15.4S, v19.4S, v11.s[2] -nop -mla v13.4S, v27.4S, v31.s[0] -nop -sqrdmulh v27.4S, v18.4S, v11.s[3] -nop -mla v21.4S, v24.4S, v31.s[0] -nop -sqrdmulh v24.4S, v20.4S, v11.s[3] -nop -mul v25.4S, v25.4S,v17.s[2] -sub v16.4s, v8.4s, v22.4s -mul v19.4S, v19.4S,v17.s[2] -add v8.4s, v8.4s, v22.4s -mla v25.4S, v3.4S, v31.s[0] -sub v3.4s, v1.4s, v2.4s -mla v19.4S, v15.4S, v31.s[0] -add v1.4s, v1.4s, v2.4s -mul v18.4S, v18.4S,v17.s[3] -sub v2.4s, v26.4s, v13.4s -mul v20.4S, v20.4S,v17.s[3] -add v26.4s, v26.4s, v13.4s -mla v18.4S, v27.4S, v31.s[0] -sub v27.4s, v6.4s, v21.4s -mla v20.4S, v24.4S, v31.s[0] -add v6.4s, v6.4s, v21.4s -sqrdmulh v21.4S, v3.4S, v9.s[3] -nop -mul v3.4S, v3.4S,v0.s[3] -nop -sqrdmulh v24.4S, v1.4S, v9.s[2] -sub v13.4s, v12.4s, v25.4s -mul v1.4S, v1.4S,v0.s[2] -add v12.4s, v12.4s, v25.4s -sqrdmulh v25.4S, v27.4S, v9.s[1] -sub v15.4s, v30.4s, v19.4s -mul v27.4S, v27.4S,v0.s[1] -add v30.4s, v30.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v9.s[0] -sub v22.4s, v14.4s, v18.4s -mul v6.4S, v6.4S,v0.s[0] -add v14.4s, v14.4s, v18.4s -mla v3.4S, v21.4S, v31.s[0] -sub v21.4s, v10.4s, v20.4s -sqrdmulh v18.4S, v30.4S, v7.s[0] -add v10.4s, v10.4s, v20.4s -mla v1.4S, v24.4S, v31.s[0] -sub v24.4s, v16.4s, v3.4s -sqrdmulh v20.4S, v15.4S, v7.s[1] -add v16.4s, v16.4s, v3.4s -mla v27.4S, v25.4S, v31.s[0] -sub v25.4s, v8.4s, v1.4s -sqrdmulh v3.4S, v10.4S, v7.s[2] -add v8.4s, v8.4s, v1.4s -mla v6.4S, v19.4S, v31.s[0] -sub v19.4s, v2.4s, v27.4s -sqrdmulh v1.4S, v21.4S, v7.s[3] -add v2.4s, v2.4s, v27.4s -mul v30.4S, v30.4S,v23.s[0] -sub v27.4s, v26.4s, v6.4s -mul v15.4S, v15.4S,v23.s[1] -add v26.4s, v26.4s, v6.4s -mla v30.4S, v18.4S, v31.s[0] -str q25, [x0, #368] -mla v15.4S, v20.4S, v31.s[0] -str q8, [x0, #304] -mul v10.4S, v10.4S,v23.s[2] -str q24, [x0, #496] -mul v21.4S, v21.4S,v23.s[3] -str q16, [x0, #432] -mla v10.4S, v3.4S, v31.s[0] -str q19, [x0, #240] -mla v21.4S, v1.4S, v31.s[0] -str q2, [x0, #176] -ldr q2, [x0, #896] -sqrdmulh v1.4S, v2.4S, v28.s[0] -str q27, [x0, #112] -mul v2.4S, v2.4S,v29.s[0] -str q26, [x0, #48] -ldr q26, [x0, #960] -sqrdmulh v27.4S, v26.4S, v28.s[0] -sub v19.4s, v12.4s, v30.4s -str q19, [x0, #624] -mul v26.4S, v26.4S,v29.s[0] -add v12.4s, v12.4s, v30.4s -ldr q30, [x0, #768] -sqrdmulh v19.4S, v30.4S, v28.s[0] -sub v3.4s, v13.4s, v15.4s -str q12, [x0, #560] -mul v30.4S, v30.4S,v29.s[0] -add v13.4s, v13.4s, v15.4s -ldr q15, [x0, #832] -sqrdmulh v12.4S, v15.4S, v28.s[0] -sub v16.4s, v14.4s, v10.4s -str q3, [x0, #752] -mul v15.4S, v15.4S,v29.s[0] -add v14.4s, v14.4s, v10.4s -ldr q10, [x0, #512] -mla v2.4S, v1.4S, v31.s[0] -sub v1.4s, v22.4s, v21.4s -str q13, [x0, #688] -sqrdmulh v13.4S, v10.4S, v28.s[0] -add v22.4s, v22.4s, v21.4s -ldr q21, [x0, #576] -mla v26.4S, v27.4S, v31.s[0] -str q16, [x0, #880] -sqrdmulh v16.4S, v21.4S, v28.s[0] -nop -ldr q27, [x0, #640] -mla v30.4S, v19.4S, v31.s[0] -str q14, [x0, #816] -sqrdmulh v14.4S, v27.4S, v28.s[0] -nop -ldr q19, [x0, #704] -mla v15.4S, v12.4S, v31.s[0] -str q1, [x0, #1008] -sqrdmulh v1.4S, v19.4S, v28.s[0] -nop -ldr q12, [x0, #384] -ldr q3, [x0, #448] -mul v10.4S, v10.4S,v29.s[0] -sub v24.4s, v12.4s, v2.4s -str q22, [x0, #944] -mul v21.4S, v21.4S,v29.s[0] -add v12.4s, v12.4s, v2.4s -ldr q2, [x0, #256] -ldr q22, [x0, #320] -mla v10.4S, v13.4S, v31.s[0] -sub v13.4s, v3.4s, v26.4s -mla v21.4S, v16.4S, v31.s[0] -add v3.4s, v3.4s, v26.4s -ldr q26, [x0, #0] -ldr q16, [x0, #64] -mul v27.4S, v27.4S,v29.s[0] -sub v8.4s, v2.4s, v30.4s -mul v19.4S, v19.4S,v29.s[0] -add v2.4s, v2.4s, v30.4s -ldr q30, [x0, #128] -ldr q20, [x0, #192] -mla v27.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v15.4s -mla v19.4S, v1.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -sqrdmulh v15.4S, v12.4S, v28.s[1] -nop -mul v12.4S, v12.4S,v29.s[1] -nop -sqrdmulh v1.4S, v3.4S, v28.s[1] -sub v25.4s, v26.4s, v10.4s -mul v3.4S, v3.4S,v29.s[1] -add v26.4s, v26.4s, v10.4s -sqrdmulh v10.4S, v2.4S, v28.s[1] -sub v18.4s, v16.4s, v21.4s -mul v2.4S, v2.4S,v29.s[1] -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v28.s[1] -sub v6.4s, v30.4s, v27.4s -mul v22.4S, v22.4S,v29.s[1] -add v30.4s, v30.4s, v27.4s -mla v12.4S, v15.4S, v31.s[0] -sub v15.4s, v20.4s, v19.4s -sqrdmulh v27.4S, v24.4S, v28.s[2] -add v20.4s, v20.4s, v19.4s -mla v3.4S, v1.4S, v31.s[0] -nop -sqrdmulh v1.4S, v13.4S, v28.s[2] -nop -mla v2.4S, v10.4S, v31.s[0] -nop -sqrdmulh v10.4S, v8.4S, v28.s[2] -nop -mla v22.4S, v21.4S, v31.s[0] -nop -sqrdmulh v21.4S, v14.4S, v28.s[2] -nop -mul v24.4S, v24.4S,v29.s[2] -sub v19.4s, v30.4s, v12.4s -mul v13.4S, v13.4S,v29.s[2] -add v30.4s, v30.4s, v12.4s -mla v24.4S, v27.4S, v31.s[0] -sub v27.4s, v20.4s, v3.4s -mla v13.4S, v1.4S, v31.s[0] -add v20.4s, v20.4s, v3.4s -mul v8.4S, v8.4S,v29.s[2] -sub v3.4s, v26.4s, v2.4s -mul v14.4S, v14.4S,v29.s[2] -add v26.4s, v26.4s, v2.4s -mla v8.4S, v10.4S, v31.s[0] -sub v10.4s, v16.4s, v22.4s -mla v14.4S, v21.4S, v31.s[0] -add v16.4s, v16.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v11.s[1] -nop -mul v19.4S, v19.4S,v17.s[1] -nop -sqrdmulh v21.4S, v27.4S, v11.s[1] -sub v2.4s, v6.4s, v24.4s -mul v27.4S, v27.4S,v17.s[1] -add v6.4s, v6.4s, v24.4s -sqrdmulh v24.4S, v30.4S, v11.s[0] -sub v1.4s, v15.4s, v13.4s -mul v30.4S, v30.4S,v17.s[0] -add v15.4s, v15.4s, v13.4s -sqrdmulh v13.4S, v20.4S, v11.s[0] -sub v12.4s, v25.4s, v8.4s -mul v20.4S, v20.4S,v17.s[0] -add v25.4s, v25.4s, v8.4s -mla v19.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v14.4s -sqrdmulh v8.4S, v6.4S, v11.s[2] -add v18.4s, v18.4s, v14.4s -mla v27.4S, v21.4S, v31.s[0] -nop -sqrdmulh v21.4S, v15.4S, v11.s[2] -nop -mla v30.4S, v24.4S, v31.s[0] -nop -sqrdmulh v24.4S, v2.4S, v11.s[3] -nop -mla v20.4S, v13.4S, v31.s[0] -nop -sqrdmulh v13.4S, v1.4S, v11.s[3] -nop -mul v6.4S, v6.4S,v17.s[2] -sub v14.4s, v3.4s, v19.4s -mul v15.4S, v15.4S,v17.s[2] -add v3.4s, v3.4s, v19.4s -mla v6.4S, v8.4S, v31.s[0] -sub v8.4s, v10.4s, v27.4s -mla v15.4S, v21.4S, v31.s[0] -add v10.4s, v10.4s, v27.4s -mul v2.4S, v2.4S,v17.s[3] -sub v27.4s, v26.4s, v30.4s -mul v1.4S, v1.4S,v17.s[3] -add v26.4s, v26.4s, v30.4s -mla v2.4S, v24.4S, v31.s[0] -sub v24.4s, v16.4s, v20.4s -mla v1.4S, v13.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v9.s[3] -nop -mul v8.4S, v8.4S,v0.s[3] -nop -sqrdmulh v13.4S, v10.4S, v9.s[2] -sub v30.4s, v25.4s, v6.4s -mul v10.4S, v10.4S,v0.s[2] -add v25.4s, v25.4s, v6.4s -sqrdmulh v6.4S, v24.4S, v9.s[1] -sub v21.4s, v18.4s, v15.4s -mul v24.4S, v24.4S,v0.s[1] -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v9.s[0] -sub v19.4s, v12.4s, v2.4s -mul v16.4S, v16.4S,v0.s[0] -add v12.4s, v12.4s, v2.4s -mla v8.4S, v20.4S, v31.s[0] -sub v20.4s, v22.4s, v1.4s -sqrdmulh v2.4S, v18.4S, v7.s[0] -add v22.4s, v22.4s, v1.4s -mla v10.4S, v13.4S, v31.s[0] -sub v13.4s, v14.4s, v8.4s -sqrdmulh v1.4S, v21.4S, v7.s[1] -add v14.4s, v14.4s, v8.4s -mla v24.4S, v6.4S, v31.s[0] -sub v6.4s, v3.4s, v10.4s -sqrdmulh v8.4S, v22.4S, v7.s[2] -add v3.4s, v3.4s, v10.4s -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v27.4s, v24.4s -sqrdmulh v10.4S, v20.4S, v7.s[3] -add v27.4s, v27.4s, v24.4s -mul v18.4S, v18.4S,v23.s[0] -sub v24.4s, v26.4s, v16.4s -mul v21.4S, v21.4S,v23.s[1] -add v26.4s, v26.4s, v16.4s -mla v18.4S, v2.4S, v31.s[0] -str q6, [x0, #320] -mla v21.4S, v1.4S, v31.s[0] -str q3, [x0, #256] -mul v22.4S, v22.4S,v23.s[2] -str q13, [x0, #448] -mul v20.4S, v20.4S,v23.s[3] -str q14, [x0, #384] -mla v22.4S, v8.4S, v31.s[0] -str q15, [x0, #192] -mla v20.4S, v10.4S, v31.s[0] -str q27, [x0, #128] -ldr q27, [x0, #912] -sqrdmulh v10.4S, v27.4S, v28.s[0] -str q24, [x0, #64] -mul v27.4S, v27.4S,v29.s[0] -str q26, [x0, #0] -ldr q26, [x0, #976] -sqrdmulh v24.4S, v26.4S, v28.s[0] -sub v15.4s, v25.4s, v18.4s -str q15, [x0, #576] -mul v26.4S, v26.4S,v29.s[0] -add v25.4s, v25.4s, v18.4s -ldr q18, [x0, #784] -sqrdmulh v15.4S, v18.4S, v28.s[0] -sub v8.4s, v30.4s, v21.4s -str q25, [x0, #512] -mul v18.4S, v18.4S,v29.s[0] -add v30.4s, v30.4s, v21.4s -ldr q21, [x0, #848] -sqrdmulh v25.4S, v21.4S, v28.s[0] -sub v14.4s, v12.4s, v22.4s -str q8, [x0, #704] -mul v21.4S, v21.4S,v29.s[0] -add v12.4s, v12.4s, v22.4s -ldr q22, [x0, #528] -mla v27.4S, v10.4S, v31.s[0] -sub v10.4s, v19.4s, v20.4s -str q30, [x0, #640] -sqrdmulh v30.4S, v22.4S, v28.s[0] -add v19.4s, v19.4s, v20.4s -ldr q20, [x0, #592] -mla v26.4S, v24.4S, v31.s[0] -str q14, [x0, #832] -sqrdmulh v14.4S, v20.4S, v28.s[0] -nop -ldr q24, [x0, #656] -mla v18.4S, v15.4S, v31.s[0] -str q12, [x0, #768] -sqrdmulh v12.4S, v24.4S, v28.s[0] -nop -ldr q15, [x0, #720] -mla v21.4S, v25.4S, v31.s[0] -str q10, [x0, #960] -sqrdmulh v10.4S, v15.4S, v28.s[0] -nop -ldr q25, [x0, #400] -ldr q8, [x0, #464] -mul v22.4S, v22.4S,v29.s[0] -sub v13.4s, v25.4s, v27.4s -str q19, [x0, #896] -mul v20.4S, v20.4S,v29.s[0] -add v25.4s, v25.4s, v27.4s -ldr q27, [x0, #272] -ldr q19, [x0, #336] -mla v22.4S, v30.4S, v31.s[0] -sub v30.4s, v8.4s, v26.4s -mla v20.4S, v14.4S, v31.s[0] -add v8.4s, v8.4s, v26.4s -ldr q26, [x0, #16] -ldr q14, [x0, #80] -mul v24.4S, v24.4S,v29.s[0] -sub v3.4s, v27.4s, v18.4s -mul v15.4S, v15.4S,v29.s[0] -add v27.4s, v27.4s, v18.4s -ldr q18, [x0, #144] -ldr q1, [x0, #208] -mla v24.4S, v12.4S, v31.s[0] -sub v12.4s, v19.4s, v21.4s -mla v15.4S, v10.4S, v31.s[0] -add v19.4s, v19.4s, v21.4s -sqrdmulh v21.4S, v25.4S, v28.s[1] -nop -mul v25.4S, v25.4S,v29.s[1] -nop -sqrdmulh v10.4S, v8.4S, v28.s[1] -sub v6.4s, v26.4s, v22.4s -mul v8.4S, v8.4S,v29.s[1] -add v26.4s, v26.4s, v22.4s -sqrdmulh v22.4S, v27.4S, v28.s[1] -sub v2.4s, v14.4s, v20.4s -mul v27.4S, v27.4S,v29.s[1] -add v14.4s, v14.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v28.s[1] -sub v16.4s, v18.4s, v24.4s -mul v19.4S, v19.4S,v29.s[1] -add v18.4s, v18.4s, v24.4s -mla v25.4S, v21.4S, v31.s[0] -sub v21.4s, v1.4s, v15.4s -sqrdmulh v24.4S, v13.4S, v28.s[2] -add v1.4s, v1.4s, v15.4s -mla v8.4S, v10.4S, v31.s[0] -nop -sqrdmulh v10.4S, v30.4S, v28.s[2] -nop -mla v27.4S, v22.4S, v31.s[0] -nop -sqrdmulh v22.4S, v3.4S, v28.s[2] -nop -mla v19.4S, v20.4S, v31.s[0] -nop -sqrdmulh v20.4S, v12.4S, v28.s[2] -nop -mul v13.4S, v13.4S,v29.s[2] -sub v15.4s, v18.4s, v25.4s -mul v30.4S, v30.4S,v29.s[2] -add v18.4s, v18.4s, v25.4s -mla v13.4S, v24.4S, v31.s[0] -sub v24.4s, v1.4s, v8.4s -mla v30.4S, v10.4S, v31.s[0] -add v1.4s, v1.4s, v8.4s -mul v3.4S, v3.4S,v29.s[2] -sub v8.4s, v26.4s, v27.4s -mul v12.4S, v12.4S,v29.s[2] -add v26.4s, v26.4s, v27.4s -mla v3.4S, v22.4S, v31.s[0] -sub v22.4s, v14.4s, v19.4s -mla v12.4S, v20.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -sqrdmulh v28.4S, v15.4S, v11.s[1] -nop -mul v15.4S, v15.4S,v17.s[1] -nop -sqrdmulh v29.4S, v24.4S, v11.s[1] -sub v19.4s, v16.4s, v13.4s -mul v24.4S, v24.4S,v17.s[1] -add v16.4s, v16.4s, v13.4s -sqrdmulh v13.4S, v18.4S, v11.s[0] -sub v20.4s, v21.4s, v30.4s -mul v18.4S, v18.4S,v17.s[0] -add v21.4s, v21.4s, v30.4s -sqrdmulh v30.4S, v1.4S, v11.s[0] -sub v27.4s, v6.4s, v3.4s -mul v1.4S, v1.4S,v17.s[0] -add v6.4s, v6.4s, v3.4s -mla v15.4S, v28.4S, v31.s[0] -sub v28.4s, v2.4s, v12.4s -sqrdmulh v3.4S, v16.4S, v11.s[2] -add v2.4s, v2.4s, v12.4s -mla v24.4S, v29.4S, v31.s[0] -nop -sqrdmulh v29.4S, v21.4S, v11.s[2] -nop -mla v18.4S, v13.4S, v31.s[0] -nop -sqrdmulh v13.4S, v19.4S, v11.s[3] -nop -mla v1.4S, v30.4S, v31.s[0] -nop -sqrdmulh v30.4S, v20.4S, v11.s[3] -nop -mul v16.4S, v16.4S,v17.s[2] -sub v12.4s, v8.4s, v15.4s -mul v21.4S, v21.4S,v17.s[2] -add v8.4s, v8.4s, v15.4s -mla v16.4S, v3.4S, v31.s[0] -sub v3.4s, v22.4s, v24.4s -mla v21.4S, v29.4S, v31.s[0] -add v22.4s, v22.4s, v24.4s -mul v19.4S, v19.4S,v17.s[3] -sub v24.4s, v26.4s, v18.4s -mul v20.4S, v20.4S,v17.s[3] -add v26.4s, v26.4s, v18.4s -mla v19.4S, v13.4S, v31.s[0] -sub v13.4s, v14.4s, v1.4s -mla v20.4S, v30.4S, v31.s[0] -add v14.4s, v14.4s, v1.4s -sqrdmulh v11.4S, v3.4S, v9.s[3] -nop -mul v3.4S, v3.4S,v0.s[3] -nop -sqrdmulh v17.4S, v22.4S, v9.s[2] -sub v1.4s, v6.4s, v16.4s -mul v22.4S, v22.4S,v0.s[2] -add v6.4s, v6.4s, v16.4s -sqrdmulh v16.4S, v13.4S, v9.s[1] -sub v30.4s, v2.4s, v21.4s -mul v13.4S, v13.4S,v0.s[1] -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v9.s[0] -sub v18.4s, v27.4s, v19.4s -mul v14.4S, v14.4S,v0.s[0] -add v27.4s, v27.4s, v19.4s -mla v3.4S, v11.4S, v31.s[0] -sub v11.4s, v28.4s, v20.4s -sqrdmulh v9.4S, v2.4S, v7.s[0] -add v28.4s, v28.4s, v20.4s -mla v22.4S, v17.4S, v31.s[0] -sub v17.4s, v12.4s, v3.4s -sqrdmulh v20.4S, v30.4S, v7.s[1] -add v12.4s, v12.4s, v3.4s -mla v13.4S, v16.4S, v31.s[0] -sub v16.4s, v8.4s, v22.4s -sqrdmulh v3.4S, v28.4S, v7.s[2] -add v8.4s, v8.4s, v22.4s -mla v14.4S, v21.4S, v31.s[0] -sub v21.4s, v24.4s, v13.4s -sqrdmulh v22.4S, v11.4S, v7.s[3] -add v24.4s, v24.4s, v13.4s -mul v2.4S, v2.4S,v23.s[0] -sub v13.4s, v26.4s, v14.4s -mul v30.4S, v30.4S,v23.s[1] -add v26.4s, v26.4s, v14.4s -mla v2.4S, v9.4S, v31.s[0] -str q16, [x0, #336] -mla v30.4S, v20.4S, v31.s[0] -str q8, [x0, #272] -mul v28.4S, v28.4S,v23.s[2] -str q17, [x0, #464] -mul v11.4S, v11.4S,v23.s[3] -str q12, [x0, #400] -mla v28.4S, v3.4S, v31.s[0] -str q21, [x0, #208] -mla v11.4S, v22.4S, v31.s[0] -str q24, [x0, #144] -str q13, [x0, #80] -str q26, [x0, #16] -sub v26.4s, v6.4s, v2.4s -str q26, [x0, #592] -add v6.4s, v6.4s, v2.4s -sub v2.4s, v1.4s, v30.4s -str q6, [x0, #528] -add v1.4s, v1.4s, v30.4s -sub v30.4s, v27.4s, v28.4s -str q2, [x0, #720] -add v27.4s, v27.4s, v28.4s -sub v28.4s, v18.4s, v11.4s -str q1, [x0, #656] -add v18.4s, v18.4s, v11.4s -str q30, [x0, #848] -str q27, [x0, #784] -str q28, [x0, #976] -str q18, [x0, #912] -ldr q4, [x17, #+128] -ldr q5, [x17, #+144] -ldr q25, [x17, #+160] -ldr q10, [x17, #+176] -ldr q15, [x17, #+192] -ldr q29, [x17, #+208] -ldr q19, [x17, #+224] -ldr q0, [x17, #+240] -ldr q14, [x0, #32] -ldr q9, [x0, #48] -ldr q16, [x0, #0] -ldr q20, [x0, #16] -sqrdmulh v8.4S, v14.4S, v5.s[0] -mul v14.4S, v14.4S,v4.s[0] -mla v14.4S, v8.4S, v31.s[0] -sub v8.4s, v16.4s, v14.4s -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v9.4S, v5.s[0] -mul v9.4S, v9.4S,v4.s[0] -mla v9.4S, v14.4S, v31.s[0] -sub v14.4s, v20.4s, v9.4s -add v20.4s, v20.4s, v9.4s -sqrdmulh v9.4S, v20.4S, v5.s[1] -mul v20.4S, v20.4S,v4.s[1] -mla v20.4S, v9.4S, v31.s[0] -sub v9.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v14.4S, v5.s[2] -mul v14.4S, v14.4S,v4.s[2] -mla v14.4S, v20.4S, v31.s[0] -sub v20.4s, v8.4s, v14.4s -add v8.4s, v8.4s, v14.4s -trn1 v14.4S, v16.4S, v9.4S -trn2 v17.4S, v16.4S, v9.4S -trn1 v12.4S, v8.4S, v20.4S -trn2 v3.4S, v8.4S, v20.4S -trn2 v8.2D, v14.2D, v12.2D -trn2 v20.2D, v17.2D, v3.2D -trn1 v16.2D, v14.2D, v12.2D -trn1 v9.2D, v17.2D, v3.2D -sqrdmulh v3.4S, v8.4S, v10.4S -mul v8.4S, v8.4S,v25.4S -mla v8.4S, v3.4S, v31.s[0] -sub v3.4s, v16.4s, v8.4s -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v20.4S, v10.4S -mul v20.4S, v20.4S,v25.4S -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v9.4s, v20.4s -add v9.4s, v9.4s, v20.4s -sqrdmulh v20.4S, v9.4S, v29.4S -mul v9.4S, v9.4S,v15.4S -mla v9.4S, v20.4S, v31.s[0] -sub v20.4s, v16.4s, v9.4s -add v16.4s, v16.4s, v9.4s -sqrdmulh v9.4S, v8.4S, v0.4S -mul v8.4S, v8.4S,v19.4S -mla v8.4S, v9.4S, v31.s[0] -sub v9.4s, v3.4s, v8.4s -add v3.4s, v3.4s, v8.4s -str q16, [x0, #0] -str q20, [x0, #16] -str q3, [x0, #32] -str q9, [x0, #48] -ldr q9, [x17, #+256] -ldr q3, [x17, #+272] -ldr q20, [x17, #+288] -ldr q16, [x17, #+304] -ldr q8, [x17, #+320] -ldr q17, [x17, #+336] -ldr q12, [x17, #+352] -ldr q14, [x17, #+368] -ldr q0, [x0, #96] -ldr q19, [x0, #112] -ldr q29, [x0, #64] -ldr q15, [x0, #80] -sqrdmulh v10.4S, v0.4S, v3.s[0] -mul v0.4S, v0.4S,v9.s[0] -mla v0.4S, v10.4S, v31.s[0] -sub v10.4s, v29.4s, v0.4s -add v29.4s, v29.4s, v0.4s -sqrdmulh v0.4S, v19.4S, v3.s[0] -mul v19.4S, v19.4S,v9.s[0] -mla v19.4S, v0.4S, v31.s[0] -sub v0.4s, v15.4s, v19.4s -add v15.4s, v15.4s, v19.4s -sqrdmulh v19.4S, v15.4S, v3.s[1] -mul v15.4S, v15.4S,v9.s[1] -mla v15.4S, v19.4S, v31.s[0] -sub v19.4s, v29.4s, v15.4s -add v29.4s, v29.4s, v15.4s -sqrdmulh v15.4S, v0.4S, v3.s[2] -mul v0.4S, v0.4S,v9.s[2] -mla v0.4S, v15.4S, v31.s[0] -sub v15.4s, v10.4s, v0.4s -add v10.4s, v10.4s, v0.4s -trn1 v0.4S, v29.4S, v19.4S -trn2 v25.4S, v29.4S, v19.4S -trn1 v5.4S, v10.4S, v15.4S -trn2 v4.4S, v10.4S, v15.4S -trn2 v10.2D, v0.2D, v5.2D -trn2 v15.2D, v25.2D, v4.2D -trn1 v29.2D, v0.2D, v5.2D -trn1 v19.2D, v25.2D, v4.2D -sqrdmulh v4.4S, v10.4S, v16.4S -mul v10.4S, v10.4S,v20.4S -mla v10.4S, v4.4S, v31.s[0] -sub v4.4s, v29.4s, v10.4s -add v29.4s, v29.4s, v10.4s -sqrdmulh v10.4S, v15.4S, v16.4S -mul v15.4S, v15.4S,v20.4S -mla v15.4S, v10.4S, v31.s[0] -sub v10.4s, v19.4s, v15.4s -add v19.4s, v19.4s, v15.4s -sqrdmulh v15.4S, v19.4S, v17.4S -mul v19.4S, v19.4S,v8.4S -mla v19.4S, v15.4S, v31.s[0] -sub v15.4s, v29.4s, v19.4s -add v29.4s, v29.4s, v19.4s -sqrdmulh v19.4S, v10.4S, v14.4S -mul v10.4S, v10.4S,v12.4S -mla v10.4S, v19.4S, v31.s[0] -sub v19.4s, v4.4s, v10.4s -add v4.4s, v4.4s, v10.4s -str q29, [x0, #64] -str q15, [x0, #80] -str q4, [x0, #96] -str q19, [x0, #112] -ldr q19, [x17, #+384] -ldr q4, [x17, #+400] -ldr q15, [x17, #+416] -ldr q29, [x17, #+432] -ldr q10, [x17, #+448] -ldr q25, [x17, #+464] -ldr q5, [x17, #+480] -ldr q0, [x17, #+496] -ldr q14, [x0, #160] -ldr q12, [x0, #176] -ldr q17, [x0, #128] -ldr q8, [x0, #144] -sqrdmulh v16.4S, v14.4S, v4.s[0] -mul v14.4S, v14.4S,v19.s[0] -mla v14.4S, v16.4S, v31.s[0] -sub v16.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -sqrdmulh v14.4S, v12.4S, v4.s[0] -mul v12.4S, v12.4S,v19.s[0] -mla v12.4S, v14.4S, v31.s[0] -sub v14.4s, v8.4s, v12.4s -add v8.4s, v8.4s, v12.4s -sqrdmulh v12.4S, v8.4S, v4.s[1] -mul v8.4S, v8.4S,v19.s[1] -mla v8.4S, v12.4S, v31.s[0] -sub v12.4s, v17.4s, v8.4s -add v17.4s, v17.4s, v8.4s -sqrdmulh v8.4S, v14.4S, v4.s[2] -mul v14.4S, v14.4S,v19.s[2] -mla v14.4S, v8.4S, v31.s[0] -sub v8.4s, v16.4s, v14.4s -add v16.4s, v16.4s, v14.4s -trn1 v14.4S, v17.4S, v12.4S -trn2 v20.4S, v17.4S, v12.4S -trn1 v3.4S, v16.4S, v8.4S -trn2 v9.4S, v16.4S, v8.4S -trn2 v16.2D, v14.2D, v3.2D -trn2 v8.2D, v20.2D, v9.2D -trn1 v17.2D, v14.2D, v3.2D -trn1 v12.2D, v20.2D, v9.2D -sqrdmulh v9.4S, v16.4S, v29.4S -mul v16.4S, v16.4S,v15.4S -mla v16.4S, v9.4S, v31.s[0] -sub v9.4s, v17.4s, v16.4s -add v17.4s, v17.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v29.4S -mul v8.4S, v8.4S,v15.4S -mla v8.4S, v16.4S, v31.s[0] -sub v16.4s, v12.4s, v8.4s -add v12.4s, v12.4s, v8.4s -sqrdmulh v8.4S, v12.4S, v25.4S -mul v12.4S, v12.4S,v10.4S -mla v12.4S, v8.4S, v31.s[0] -sub v8.4s, v17.4s, v12.4s -add v17.4s, v17.4s, v12.4s -sqrdmulh v12.4S, v16.4S, v0.4S -mul v16.4S, v16.4S,v5.4S -mla v16.4S, v12.4S, v31.s[0] -sub v12.4s, v9.4s, v16.4s -add v9.4s, v9.4s, v16.4s -str q17, [x0, #128] -str q8, [x0, #144] -str q9, [x0, #160] -str q12, [x0, #176] -ldr q12, [x17, #+512] -ldr q9, [x17, #+528] -ldr q8, [x17, #+544] -ldr q17, [x17, #+560] -ldr q16, [x17, #+576] -ldr q20, [x17, #+592] -ldr q3, [x17, #+608] -ldr q14, [x17, #+624] -ldr q0, [x0, #224] -ldr q5, [x0, #240] -ldr q25, [x0, #192] -ldr q10, [x0, #208] -sqrdmulh v29.4S, v0.4S, v9.s[0] -mul v0.4S, v0.4S,v12.s[0] -mla v0.4S, v29.4S, v31.s[0] -sub v29.4s, v25.4s, v0.4s -add v25.4s, v25.4s, v0.4s -sqrdmulh v0.4S, v5.4S, v9.s[0] -mul v5.4S, v5.4S,v12.s[0] -mla v5.4S, v0.4S, v31.s[0] -sub v0.4s, v10.4s, v5.4s -add v10.4s, v10.4s, v5.4s -sqrdmulh v5.4S, v10.4S, v9.s[1] -mul v10.4S, v10.4S,v12.s[1] -mla v10.4S, v5.4S, v31.s[0] -sub v5.4s, v25.4s, v10.4s -add v25.4s, v25.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v9.s[2] -mul v0.4S, v0.4S,v12.s[2] -mla v0.4S, v10.4S, v31.s[0] -sub v10.4s, v29.4s, v0.4s -add v29.4s, v29.4s, v0.4s -trn1 v0.4S, v25.4S, v5.4S -trn2 v15.4S, v25.4S, v5.4S -trn1 v4.4S, v29.4S, v10.4S -trn2 v19.4S, v29.4S, v10.4S -trn2 v29.2D, v0.2D, v4.2D -trn2 v10.2D, v15.2D, v19.2D -trn1 v25.2D, v0.2D, v4.2D -trn1 v5.2D, v15.2D, v19.2D -sqrdmulh v19.4S, v29.4S, v17.4S -mul v29.4S, v29.4S,v8.4S -mla v29.4S, v19.4S, v31.s[0] -sub v19.4s, v25.4s, v29.4s -add v25.4s, v25.4s, v29.4s -sqrdmulh v29.4S, v10.4S, v17.4S -mul v10.4S, v10.4S,v8.4S -mla v10.4S, v29.4S, v31.s[0] -sub v29.4s, v5.4s, v10.4s -add v5.4s, v5.4s, v10.4s -sqrdmulh v10.4S, v5.4S, v20.4S -mul v5.4S, v5.4S,v16.4S -mla v5.4S, v10.4S, v31.s[0] -sub v10.4s, v25.4s, v5.4s -add v25.4s, v25.4s, v5.4s -sqrdmulh v5.4S, v29.4S, v14.4S -mul v29.4S, v29.4S,v3.4S -mla v29.4S, v5.4S, v31.s[0] -sub v5.4s, v19.4s, v29.4s -add v19.4s, v19.4s, v29.4s -str q25, [x0, #192] -str q10, [x0, #208] -str q19, [x0, #224] -str q5, [x0, #240] -ldr q5, [x17, #+640] -ldr q19, [x17, #+656] -ldr q10, [x17, #+672] -ldr q25, [x17, #+688] -ldr q29, [x17, #+704] -ldr q15, [x17, #+720] -ldr q4, [x17, #+736] -ldr q0, [x17, #+752] -ldr q14, [x0, #288] -ldr q3, [x0, #304] -ldr q20, [x0, #256] -ldr q16, [x0, #272] -sqrdmulh v17.4S, v14.4S, v19.s[0] -mul v14.4S, v14.4S,v5.s[0] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v20.4s, v14.4s -add v20.4s, v20.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v19.s[0] -mul v3.4S, v3.4S,v5.s[0] -mla v3.4S, v14.4S, v31.s[0] -sub v14.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v16.4S, v19.s[1] -mul v16.4S, v16.4S,v5.s[1] -mla v16.4S, v3.4S, v31.s[0] -sub v3.4s, v20.4s, v16.4s -add v20.4s, v20.4s, v16.4s -sqrdmulh v16.4S, v14.4S, v19.s[2] -mul v14.4S, v14.4S,v5.s[2] -mla v14.4S, v16.4S, v31.s[0] -sub v16.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -trn1 v14.4S, v20.4S, v3.4S -trn2 v8.4S, v20.4S, v3.4S -trn1 v9.4S, v17.4S, v16.4S -trn2 v12.4S, v17.4S, v16.4S -trn2 v17.2D, v14.2D, v9.2D -trn2 v16.2D, v8.2D, v12.2D -trn1 v20.2D, v14.2D, v9.2D -trn1 v3.2D, v8.2D, v12.2D -sqrdmulh v12.4S, v17.4S, v25.4S -mul v17.4S, v17.4S,v10.4S -mla v17.4S, v12.4S, v31.s[0] -sub v12.4s, v20.4s, v17.4s -add v20.4s, v20.4s, v17.4s -sqrdmulh v17.4S, v16.4S, v25.4S -mul v16.4S, v16.4S,v10.4S -mla v16.4S, v17.4S, v31.s[0] -sub v17.4s, v3.4s, v16.4s -add v3.4s, v3.4s, v16.4s -sqrdmulh v16.4S, v3.4S, v15.4S -mul v3.4S, v3.4S,v29.4S -mla v3.4S, v16.4S, v31.s[0] -sub v16.4s, v20.4s, v3.4s -add v20.4s, v20.4s, v3.4s -sqrdmulh v3.4S, v17.4S, v0.4S -mul v17.4S, v17.4S,v4.4S -mla v17.4S, v3.4S, v31.s[0] -sub v3.4s, v12.4s, v17.4s -add v12.4s, v12.4s, v17.4s -str q20, [x0, #256] -str q16, [x0, #272] -str q12, [x0, #288] -str q3, [x0, #304] -ldr q3, [x17, #+768] -ldr q12, [x17, #+784] -ldr q16, [x17, #+800] -ldr q20, [x17, #+816] -ldr q17, [x17, #+832] -ldr q8, [x17, #+848] -ldr q9, [x17, #+864] -ldr q14, [x17, #+880] -ldr q0, [x0, #352] -ldr q4, [x0, #368] -ldr q15, [x0, #320] -ldr q29, [x0, #336] -sqrdmulh v25.4S, v0.4S, v12.s[0] -mul v0.4S, v0.4S,v3.s[0] -mla v0.4S, v25.4S, v31.s[0] -sub v25.4s, v15.4s, v0.4s -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v4.4S, v12.s[0] -mul v4.4S, v4.4S,v3.s[0] -mla v4.4S, v0.4S, v31.s[0] -sub v0.4s, v29.4s, v4.4s -add v29.4s, v29.4s, v4.4s -sqrdmulh v4.4S, v29.4S, v12.s[1] -mul v29.4S, v29.4S,v3.s[1] -mla v29.4S, v4.4S, v31.s[0] -sub v4.4s, v15.4s, v29.4s -add v15.4s, v15.4s, v29.4s -sqrdmulh v29.4S, v0.4S, v12.s[2] -mul v0.4S, v0.4S,v3.s[2] -mla v0.4S, v29.4S, v31.s[0] -sub v29.4s, v25.4s, v0.4s -add v25.4s, v25.4s, v0.4s -trn1 v0.4S, v15.4S, v4.4S -trn2 v10.4S, v15.4S, v4.4S -trn1 v19.4S, v25.4S, v29.4S -trn2 v5.4S, v25.4S, v29.4S -trn2 v25.2D, v0.2D, v19.2D -trn2 v29.2D, v10.2D, v5.2D -trn1 v15.2D, v0.2D, v19.2D -trn1 v4.2D, v10.2D, v5.2D -sqrdmulh v5.4S, v25.4S, v20.4S -mul v25.4S, v25.4S,v16.4S -mla v25.4S, v5.4S, v31.s[0] -sub v5.4s, v15.4s, v25.4s -add v15.4s, v15.4s, v25.4s -sqrdmulh v25.4S, v29.4S, v20.4S -mul v29.4S, v29.4S,v16.4S -mla v29.4S, v25.4S, v31.s[0] -sub v25.4s, v4.4s, v29.4s -add v4.4s, v4.4s, v29.4s -sqrdmulh v29.4S, v4.4S, v8.4S -mul v4.4S, v4.4S,v17.4S -mla v4.4S, v29.4S, v31.s[0] -sub v29.4s, v15.4s, v4.4s -add v15.4s, v15.4s, v4.4s -sqrdmulh v4.4S, v25.4S, v14.4S -mul v25.4S, v25.4S,v9.4S -mla v25.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v25.4s -add v5.4s, v5.4s, v25.4s -str q15, [x0, #320] -str q29, [x0, #336] -str q5, [x0, #352] -str q4, [x0, #368] -ldr q4, [x17, #+896] -ldr q5, [x17, #+912] -ldr q29, [x17, #+928] -ldr q15, [x17, #+944] -ldr q25, [x17, #+960] -ldr q10, [x17, #+976] -ldr q19, [x17, #+992] -ldr q0, [x17, #+1008] -ldr q14, [x0, #416] -ldr q9, [x0, #432] -ldr q8, [x0, #384] -ldr q17, [x0, #400] -sqrdmulh v20.4S, v14.4S, v5.s[0] -mul v14.4S, v14.4S,v4.s[0] -mla v14.4S, v20.4S, v31.s[0] -sub v20.4s, v8.4s, v14.4s -add v8.4s, v8.4s, v14.4s -sqrdmulh v14.4S, v9.4S, v5.s[0] -mul v9.4S, v9.4S,v4.s[0] -mla v9.4S, v14.4S, v31.s[0] -sub v14.4s, v17.4s, v9.4s -add v17.4s, v17.4s, v9.4s -sqrdmulh v9.4S, v17.4S, v5.s[1] -mul v17.4S, v17.4S,v4.s[1] -mla v17.4S, v9.4S, v31.s[0] -sub v9.4s, v8.4s, v17.4s -add v8.4s, v8.4s, v17.4s -sqrdmulh v17.4S, v14.4S, v5.s[2] -mul v14.4S, v14.4S,v4.s[2] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v20.4s, v14.4s -add v20.4s, v20.4s, v14.4s -trn1 v14.4S, v8.4S, v9.4S -trn2 v16.4S, v8.4S, v9.4S -trn1 v12.4S, v20.4S, v17.4S -trn2 v3.4S, v20.4S, v17.4S -trn2 v20.2D, v14.2D, v12.2D -trn2 v17.2D, v16.2D, v3.2D -trn1 v8.2D, v14.2D, v12.2D -trn1 v9.2D, v16.2D, v3.2D -sqrdmulh v3.4S, v20.4S, v15.4S -mul v20.4S, v20.4S,v29.4S -mla v20.4S, v3.4S, v31.s[0] -sub v3.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v15.4S -mul v17.4S, v17.4S,v29.4S -mla v17.4S, v20.4S, v31.s[0] -sub v20.4s, v9.4s, v17.4s -add v9.4s, v9.4s, v17.4s -sqrdmulh v17.4S, v9.4S, v10.4S -mul v9.4S, v9.4S,v25.4S -mla v9.4S, v17.4S, v31.s[0] -sub v17.4s, v8.4s, v9.4s -add v8.4s, v8.4s, v9.4s -sqrdmulh v9.4S, v20.4S, v0.4S -mul v20.4S, v20.4S,v19.4S -mla v20.4S, v9.4S, v31.s[0] -sub v9.4s, v3.4s, v20.4s -add v3.4s, v3.4s, v20.4s -str q8, [x0, #384] -str q17, [x0, #400] -str q3, [x0, #416] -str q9, [x0, #432] -ldr q9, [x17, #+1024] -ldr q3, [x17, #+1040] -ldr q17, [x17, #+1056] -ldr q8, [x17, #+1072] -ldr q20, [x17, #+1088] -ldr q16, [x17, #+1104] -ldr q12, [x17, #+1120] -ldr q14, [x17, #+1136] -ldr q0, [x0, #480] -ldr q19, [x0, #496] -ldr q10, [x0, #448] -ldr q25, [x0, #464] -sqrdmulh v15.4S, v0.4S, v3.s[0] -mul v0.4S, v0.4S,v9.s[0] -mla v0.4S, v15.4S, v31.s[0] -sub v15.4s, v10.4s, v0.4s -add v10.4s, v10.4s, v0.4s -sqrdmulh v0.4S, v19.4S, v3.s[0] -mul v19.4S, v19.4S,v9.s[0] -mla v19.4S, v0.4S, v31.s[0] -sub v0.4s, v25.4s, v19.4s -add v25.4s, v25.4s, v19.4s -sqrdmulh v19.4S, v25.4S, v3.s[1] -mul v25.4S, v25.4S,v9.s[1] -mla v25.4S, v19.4S, v31.s[0] -sub v19.4s, v10.4s, v25.4s -add v10.4s, v10.4s, v25.4s -sqrdmulh v25.4S, v0.4S, v3.s[2] -mul v0.4S, v0.4S,v9.s[2] -mla v0.4S, v25.4S, v31.s[0] -sub v25.4s, v15.4s, v0.4s -add v15.4s, v15.4s, v0.4s -trn1 v0.4S, v10.4S, v19.4S -trn2 v29.4S, v10.4S, v19.4S -trn1 v5.4S, v15.4S, v25.4S -trn2 v4.4S, v15.4S, v25.4S -trn2 v15.2D, v0.2D, v5.2D -trn2 v25.2D, v29.2D, v4.2D -trn1 v10.2D, v0.2D, v5.2D -trn1 v19.2D, v29.2D, v4.2D -sqrdmulh v4.4S, v15.4S, v8.4S -mul v15.4S, v15.4S,v17.4S -mla v15.4S, v4.4S, v31.s[0] -sub v4.4s, v10.4s, v15.4s -add v10.4s, v10.4s, v15.4s -sqrdmulh v15.4S, v25.4S, v8.4S -mul v25.4S, v25.4S,v17.4S -mla v25.4S, v15.4S, v31.s[0] -sub v15.4s, v19.4s, v25.4s -add v19.4s, v19.4s, v25.4s -sqrdmulh v25.4S, v19.4S, v16.4S -mul v19.4S, v19.4S,v20.4S -mla v19.4S, v25.4S, v31.s[0] -sub v25.4s, v10.4s, v19.4s -add v10.4s, v10.4s, v19.4s -sqrdmulh v19.4S, v15.4S, v14.4S -mul v15.4S, v15.4S,v12.4S -mla v15.4S, v19.4S, v31.s[0] -sub v19.4s, v4.4s, v15.4s -add v4.4s, v4.4s, v15.4s -str q10, [x0, #448] -str q25, [x0, #464] -str q4, [x0, #480] -str q19, [x0, #496] -ldr q19, [x17, #+1152] -ldr q4, [x17, #+1168] -ldr q25, [x17, #+1184] -ldr q10, [x17, #+1200] -ldr q15, [x17, #+1216] -ldr q29, [x17, #+1232] -ldr q5, [x17, #+1248] -ldr q0, [x17, #+1264] -ldr q14, [x0, #544] -ldr q12, [x0, #560] -ldr q16, [x0, #512] -ldr q20, [x0, #528] -sqrdmulh v8.4S, v14.4S, v4.s[0] -mul v14.4S, v14.4S,v19.s[0] -mla v14.4S, v8.4S, v31.s[0] -sub v8.4s, v16.4s, v14.4s -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v12.4S, v4.s[0] -mul v12.4S, v12.4S,v19.s[0] -mla v12.4S, v14.4S, v31.s[0] -sub v14.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -sqrdmulh v12.4S, v20.4S, v4.s[1] -mul v20.4S, v20.4S,v19.s[1] -mla v20.4S, v12.4S, v31.s[0] -sub v12.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v14.4S, v4.s[2] -mul v14.4S, v14.4S,v19.s[2] -mla v14.4S, v20.4S, v31.s[0] -sub v20.4s, v8.4s, v14.4s -add v8.4s, v8.4s, v14.4s -trn1 v14.4S, v16.4S, v12.4S -trn2 v17.4S, v16.4S, v12.4S -trn1 v3.4S, v8.4S, v20.4S -trn2 v9.4S, v8.4S, v20.4S -trn2 v8.2D, v14.2D, v3.2D -trn2 v20.2D, v17.2D, v9.2D -trn1 v16.2D, v14.2D, v3.2D -trn1 v12.2D, v17.2D, v9.2D -sqrdmulh v9.4S, v8.4S, v10.4S -mul v8.4S, v8.4S,v25.4S -mla v8.4S, v9.4S, v31.s[0] -sub v9.4s, v16.4s, v8.4s -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v20.4S, v10.4S -mul v20.4S, v20.4S,v25.4S -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v12.4s, v20.4s -add v12.4s, v12.4s, v20.4s -sqrdmulh v20.4S, v12.4S, v29.4S -mul v12.4S, v12.4S,v15.4S -mla v12.4S, v20.4S, v31.s[0] -sub v20.4s, v16.4s, v12.4s -add v16.4s, v16.4s, v12.4s -sqrdmulh v12.4S, v8.4S, v0.4S -mul v8.4S, v8.4S,v5.4S -mla v8.4S, v12.4S, v31.s[0] -sub v12.4s, v9.4s, v8.4s -add v9.4s, v9.4s, v8.4s -str q16, [x0, #512] -str q20, [x0, #528] -str q9, [x0, #544] -str q12, [x0, #560] -ldr q12, [x17, #+1280] -ldr q9, [x17, #+1296] -ldr q20, [x17, #+1312] -ldr q16, [x17, #+1328] -ldr q8, [x17, #+1344] -ldr q17, [x17, #+1360] -ldr q3, [x17, #+1376] -ldr q14, [x17, #+1392] -ldr q0, [x0, #608] -ldr q5, [x0, #624] -ldr q29, [x0, #576] -ldr q15, [x0, #592] -sqrdmulh v10.4S, v0.4S, v9.s[0] -mul v0.4S, v0.4S,v12.s[0] -mla v0.4S, v10.4S, v31.s[0] -sub v10.4s, v29.4s, v0.4s -add v29.4s, v29.4s, v0.4s -sqrdmulh v0.4S, v5.4S, v9.s[0] -mul v5.4S, v5.4S,v12.s[0] -mla v5.4S, v0.4S, v31.s[0] -sub v0.4s, v15.4s, v5.4s -add v15.4s, v15.4s, v5.4s -sqrdmulh v5.4S, v15.4S, v9.s[1] -mul v15.4S, v15.4S,v12.s[1] -mla v15.4S, v5.4S, v31.s[0] -sub v5.4s, v29.4s, v15.4s -add v29.4s, v29.4s, v15.4s -sqrdmulh v15.4S, v0.4S, v9.s[2] -mul v0.4S, v0.4S,v12.s[2] -mla v0.4S, v15.4S, v31.s[0] -sub v15.4s, v10.4s, v0.4s -add v10.4s, v10.4s, v0.4s -trn1 v0.4S, v29.4S, v5.4S -trn2 v25.4S, v29.4S, v5.4S -trn1 v4.4S, v10.4S, v15.4S -trn2 v19.4S, v10.4S, v15.4S -trn2 v10.2D, v0.2D, v4.2D -trn2 v15.2D, v25.2D, v19.2D -trn1 v29.2D, v0.2D, v4.2D -trn1 v5.2D, v25.2D, v19.2D -sqrdmulh v19.4S, v10.4S, v16.4S -mul v10.4S, v10.4S,v20.4S -mla v10.4S, v19.4S, v31.s[0] -sub v19.4s, v29.4s, v10.4s -add v29.4s, v29.4s, v10.4s -sqrdmulh v10.4S, v15.4S, v16.4S -mul v15.4S, v15.4S,v20.4S -mla v15.4S, v10.4S, v31.s[0] -sub v10.4s, v5.4s, v15.4s -add v5.4s, v5.4s, v15.4s -sqrdmulh v15.4S, v5.4S, v17.4S -mul v5.4S, v5.4S,v8.4S -mla v5.4S, v15.4S, v31.s[0] -sub v15.4s, v29.4s, v5.4s -add v29.4s, v29.4s, v5.4s -sqrdmulh v5.4S, v10.4S, v14.4S -mul v10.4S, v10.4S,v3.4S -mla v10.4S, v5.4S, v31.s[0] -sub v5.4s, v19.4s, v10.4s -add v19.4s, v19.4s, v10.4s -str q29, [x0, #576] -str q15, [x0, #592] -str q19, [x0, #608] -str q5, [x0, #624] -ldr q5, [x17, #+1408] -ldr q19, [x17, #+1424] -ldr q15, [x17, #+1440] -ldr q29, [x17, #+1456] -ldr q10, [x17, #+1472] -ldr q25, [x17, #+1488] -ldr q4, [x17, #+1504] -ldr q0, [x17, #+1520] -ldr q14, [x0, #672] -ldr q3, [x0, #688] -ldr q17, [x0, #640] -ldr q8, [x0, #656] -sqrdmulh v16.4S, v14.4S, v19.s[0] -mul v14.4S, v14.4S,v5.s[0] -mla v14.4S, v16.4S, v31.s[0] -sub v16.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v19.s[0] -mul v3.4S, v3.4S,v5.s[0] -mla v3.4S, v14.4S, v31.s[0] -sub v14.4s, v8.4s, v3.4s -add v8.4s, v8.4s, v3.4s -sqrdmulh v3.4S, v8.4S, v19.s[1] -mul v8.4S, v8.4S,v5.s[1] -mla v8.4S, v3.4S, v31.s[0] -sub v3.4s, v17.4s, v8.4s -add v17.4s, v17.4s, v8.4s -sqrdmulh v8.4S, v14.4S, v19.s[2] -mul v14.4S, v14.4S,v5.s[2] -mla v14.4S, v8.4S, v31.s[0] -sub v8.4s, v16.4s, v14.4s -add v16.4s, v16.4s, v14.4s -trn1 v14.4S, v17.4S, v3.4S -trn2 v20.4S, v17.4S, v3.4S -trn1 v9.4S, v16.4S, v8.4S -trn2 v12.4S, v16.4S, v8.4S -trn2 v16.2D, v14.2D, v9.2D -trn2 v8.2D, v20.2D, v12.2D -trn1 v17.2D, v14.2D, v9.2D -trn1 v3.2D, v20.2D, v12.2D -sqrdmulh v12.4S, v16.4S, v29.4S -mul v16.4S, v16.4S,v15.4S -mla v16.4S, v12.4S, v31.s[0] -sub v12.4s, v17.4s, v16.4s -add v17.4s, v17.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v29.4S -mul v8.4S, v8.4S,v15.4S -mla v8.4S, v16.4S, v31.s[0] -sub v16.4s, v3.4s, v8.4s -add v3.4s, v3.4s, v8.4s -sqrdmulh v8.4S, v3.4S, v25.4S -mul v3.4S, v3.4S,v10.4S -mla v3.4S, v8.4S, v31.s[0] -sub v8.4s, v17.4s, v3.4s -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v16.4S, v0.4S -mul v16.4S, v16.4S,v4.4S -mla v16.4S, v3.4S, v31.s[0] -sub v3.4s, v12.4s, v16.4s -add v12.4s, v12.4s, v16.4s -str q17, [x0, #640] -str q8, [x0, #656] -str q12, [x0, #672] -str q3, [x0, #688] -ldr q3, [x17, #+1536] -ldr q12, [x17, #+1552] -ldr q8, [x17, #+1568] -ldr q17, [x17, #+1584] -ldr q16, [x17, #+1600] -ldr q20, [x17, #+1616] -ldr q9, [x17, #+1632] -ldr q14, [x17, #+1648] -ldr q0, [x0, #736] -ldr q4, [x0, #752] -ldr q25, [x0, #704] -ldr q10, [x0, #720] -sqrdmulh v29.4S, v0.4S, v12.s[0] -mul v0.4S, v0.4S,v3.s[0] -mla v0.4S, v29.4S, v31.s[0] -sub v29.4s, v25.4s, v0.4s -add v25.4s, v25.4s, v0.4s -sqrdmulh v0.4S, v4.4S, v12.s[0] -mul v4.4S, v4.4S,v3.s[0] -mla v4.4S, v0.4S, v31.s[0] -sub v0.4s, v10.4s, v4.4s -add v10.4s, v10.4s, v4.4s -sqrdmulh v4.4S, v10.4S, v12.s[1] -mul v10.4S, v10.4S,v3.s[1] -mla v10.4S, v4.4S, v31.s[0] -sub v4.4s, v25.4s, v10.4s -add v25.4s, v25.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v12.s[2] -mul v0.4S, v0.4S,v3.s[2] -mla v0.4S, v10.4S, v31.s[0] -sub v10.4s, v29.4s, v0.4s -add v29.4s, v29.4s, v0.4s -trn1 v0.4S, v25.4S, v4.4S -trn2 v15.4S, v25.4S, v4.4S -trn1 v19.4S, v29.4S, v10.4S -trn2 v5.4S, v29.4S, v10.4S -trn2 v29.2D, v0.2D, v19.2D -trn2 v10.2D, v15.2D, v5.2D -trn1 v25.2D, v0.2D, v19.2D -trn1 v4.2D, v15.2D, v5.2D -sqrdmulh v5.4S, v29.4S, v17.4S -mul v29.4S, v29.4S,v8.4S -mla v29.4S, v5.4S, v31.s[0] -sub v5.4s, v25.4s, v29.4s -add v25.4s, v25.4s, v29.4s -sqrdmulh v29.4S, v10.4S, v17.4S -mul v10.4S, v10.4S,v8.4S -mla v10.4S, v29.4S, v31.s[0] -sub v29.4s, v4.4s, v10.4s -add v4.4s, v4.4s, v10.4s -sqrdmulh v10.4S, v4.4S, v20.4S -mul v4.4S, v4.4S,v16.4S -mla v4.4S, v10.4S, v31.s[0] -sub v10.4s, v25.4s, v4.4s -add v25.4s, v25.4s, v4.4s -sqrdmulh v4.4S, v29.4S, v14.4S -mul v29.4S, v29.4S,v9.4S -mla v29.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v29.4s -add v5.4s, v5.4s, v29.4s -str q25, [x0, #704] -str q10, [x0, #720] -str q5, [x0, #736] -str q4, [x0, #752] -ldr q4, [x17, #+1664] -ldr q5, [x17, #+1680] -ldr q10, [x17, #+1696] -ldr q25, [x17, #+1712] -ldr q29, [x17, #+1728] -ldr q15, [x17, #+1744] -ldr q19, [x17, #+1760] -ldr q0, [x17, #+1776] -ldr q14, [x0, #800] -ldr q9, [x0, #816] -ldr q20, [x0, #768] -ldr q16, [x0, #784] -sqrdmulh v17.4S, v14.4S, v5.s[0] -mul v14.4S, v14.4S,v4.s[0] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v20.4s, v14.4s -add v20.4s, v20.4s, v14.4s -sqrdmulh v14.4S, v9.4S, v5.s[0] -mul v9.4S, v9.4S,v4.s[0] -mla v9.4S, v14.4S, v31.s[0] -sub v14.4s, v16.4s, v9.4s -add v16.4s, v16.4s, v9.4s -sqrdmulh v9.4S, v16.4S, v5.s[1] -mul v16.4S, v16.4S,v4.s[1] -mla v16.4S, v9.4S, v31.s[0] -sub v9.4s, v20.4s, v16.4s -add v20.4s, v20.4s, v16.4s -sqrdmulh v16.4S, v14.4S, v5.s[2] -mul v14.4S, v14.4S,v4.s[2] -mla v14.4S, v16.4S, v31.s[0] -sub v16.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -trn1 v14.4S, v20.4S, v9.4S -trn2 v8.4S, v20.4S, v9.4S -trn1 v12.4S, v17.4S, v16.4S -trn2 v3.4S, v17.4S, v16.4S -trn2 v17.2D, v14.2D, v12.2D -trn2 v16.2D, v8.2D, v3.2D -trn1 v20.2D, v14.2D, v12.2D -trn1 v9.2D, v8.2D, v3.2D -sqrdmulh v3.4S, v17.4S, v25.4S -mul v17.4S, v17.4S,v10.4S -mla v17.4S, v3.4S, v31.s[0] -sub v3.4s, v20.4s, v17.4s -add v20.4s, v20.4s, v17.4s -sqrdmulh v17.4S, v16.4S, v25.4S -mul v16.4S, v16.4S,v10.4S -mla v16.4S, v17.4S, v31.s[0] -sub v17.4s, v9.4s, v16.4s -add v9.4s, v9.4s, v16.4s -sqrdmulh v16.4S, v9.4S, v15.4S -mul v9.4S, v9.4S,v29.4S -mla v9.4S, v16.4S, v31.s[0] -sub v16.4s, v20.4s, v9.4s -add v20.4s, v20.4s, v9.4s -sqrdmulh v9.4S, v17.4S, v0.4S -mul v17.4S, v17.4S,v19.4S -mla v17.4S, v9.4S, v31.s[0] -sub v9.4s, v3.4s, v17.4s -add v3.4s, v3.4s, v17.4s -str q20, [x0, #768] -str q16, [x0, #784] -str q3, [x0, #800] -str q9, [x0, #816] -ldr q9, [x17, #+1792] -ldr q3, [x17, #+1808] -ldr q16, [x17, #+1824] -ldr q20, [x17, #+1840] -ldr q17, [x17, #+1856] -ldr q8, [x17, #+1872] -ldr q12, [x17, #+1888] -ldr q14, [x17, #+1904] -ldr q0, [x0, #864] -ldr q19, [x0, #880] -ldr q15, [x0, #832] -ldr q29, [x0, #848] -sqrdmulh v25.4S, v0.4S, v3.s[0] -mul v0.4S, v0.4S,v9.s[0] -mla v0.4S, v25.4S, v31.s[0] -sub v25.4s, v15.4s, v0.4s -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v19.4S, v3.s[0] -mul v19.4S, v19.4S,v9.s[0] -mla v19.4S, v0.4S, v31.s[0] -sub v0.4s, v29.4s, v19.4s -add v29.4s, v29.4s, v19.4s -sqrdmulh v19.4S, v29.4S, v3.s[1] -mul v29.4S, v29.4S,v9.s[1] -mla v29.4S, v19.4S, v31.s[0] -sub v19.4s, v15.4s, v29.4s -add v15.4s, v15.4s, v29.4s -sqrdmulh v29.4S, v0.4S, v3.s[2] -mul v0.4S, v0.4S,v9.s[2] -mla v0.4S, v29.4S, v31.s[0] -sub v29.4s, v25.4s, v0.4s -add v25.4s, v25.4s, v0.4s -trn1 v0.4S, v15.4S, v19.4S -trn2 v10.4S, v15.4S, v19.4S -trn1 v5.4S, v25.4S, v29.4S -trn2 v4.4S, v25.4S, v29.4S -trn2 v25.2D, v0.2D, v5.2D -trn2 v29.2D, v10.2D, v4.2D -trn1 v15.2D, v0.2D, v5.2D -trn1 v19.2D, v10.2D, v4.2D -sqrdmulh v4.4S, v25.4S, v20.4S -mul v25.4S, v25.4S,v16.4S -mla v25.4S, v4.4S, v31.s[0] -sub v4.4s, v15.4s, v25.4s -add v15.4s, v15.4s, v25.4s -sqrdmulh v25.4S, v29.4S, v20.4S -mul v29.4S, v29.4S,v16.4S -mla v29.4S, v25.4S, v31.s[0] -sub v25.4s, v19.4s, v29.4s -add v19.4s, v19.4s, v29.4s -sqrdmulh v29.4S, v19.4S, v8.4S -mul v19.4S, v19.4S,v17.4S -mla v19.4S, v29.4S, v31.s[0] -sub v29.4s, v15.4s, v19.4s -add v15.4s, v15.4s, v19.4s -sqrdmulh v19.4S, v25.4S, v14.4S -mul v25.4S, v25.4S,v12.4S -mla v25.4S, v19.4S, v31.s[0] -sub v19.4s, v4.4s, v25.4s -add v4.4s, v4.4s, v25.4s -str q15, [x0, #832] -str q29, [x0, #848] -str q4, [x0, #864] -str q19, [x0, #880] -ldr q19, [x17, #+1920] -ldr q4, [x17, #+1936] -ldr q29, [x17, #+1952] -ldr q15, [x17, #+1968] -ldr q25, [x17, #+1984] -ldr q10, [x17, #+2000] -ldr q5, [x17, #+2016] -ldr q0, [x17, #+2032] -ldr q14, [x0, #928] -ldr q12, [x0, #944] -ldr q8, [x0, #896] -ldr q17, [x0, #912] -sqrdmulh v20.4S, v14.4S, v4.s[0] -mul v14.4S, v14.4S,v19.s[0] -mla v14.4S, v20.4S, v31.s[0] -sub v20.4s, v8.4s, v14.4s -add v8.4s, v8.4s, v14.4s -sqrdmulh v14.4S, v12.4S, v4.s[0] -mul v12.4S, v12.4S,v19.s[0] -mla v12.4S, v14.4S, v31.s[0] -sub v14.4s, v17.4s, v12.4s -add v17.4s, v17.4s, v12.4s -sqrdmulh v12.4S, v17.4S, v4.s[1] -mul v17.4S, v17.4S,v19.s[1] -mla v17.4S, v12.4S, v31.s[0] -sub v12.4s, v8.4s, v17.4s -add v8.4s, v8.4s, v17.4s -sqrdmulh v17.4S, v14.4S, v4.s[2] -mul v14.4S, v14.4S,v19.s[2] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v20.4s, v14.4s -add v20.4s, v20.4s, v14.4s -trn1 v14.4S, v8.4S, v12.4S -trn2 v16.4S, v8.4S, v12.4S -trn1 v3.4S, v20.4S, v17.4S -trn2 v9.4S, v20.4S, v17.4S -trn2 v20.2D, v14.2D, v3.2D -trn2 v17.2D, v16.2D, v9.2D -trn1 v8.2D, v14.2D, v3.2D -trn1 v12.2D, v16.2D, v9.2D -sqrdmulh v9.4S, v20.4S, v15.4S -mul v20.4S, v20.4S,v29.4S -mla v20.4S, v9.4S, v31.s[0] -sub v9.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v15.4S -mul v17.4S, v17.4S,v29.4S -mla v17.4S, v20.4S, v31.s[0] -sub v20.4s, v12.4s, v17.4s -add v12.4s, v12.4s, v17.4s -sqrdmulh v17.4S, v12.4S, v10.4S -mul v12.4S, v12.4S,v25.4S -mla v12.4S, v17.4S, v31.s[0] -sub v17.4s, v8.4s, v12.4s -add v8.4s, v8.4s, v12.4s -sqrdmulh v12.4S, v20.4S, v0.4S -mul v20.4S, v20.4S,v5.4S -mla v20.4S, v12.4S, v31.s[0] -sub v12.4s, v9.4s, v20.4s -add v9.4s, v9.4s, v20.4s -str q8, [x0, #896] -str q17, [x0, #912] -str q9, [x0, #928] -str q12, [x0, #944] -ldr q12, [x17, #+2048] -ldr q9, [x17, #+2064] -ldr q17, [x17, #+2080] -ldr q8, [x17, #+2096] -ldr q20, [x17, #+2112] -ldr q16, [x17, #+2128] -ldr q3, [x17, #+2144] -ldr q14, [x17, #+2160] -ldr q0, [x0, #992] -ldr q5, [x0, #1008] -ldr q10, [x0, #960] -ldr q25, [x0, #976] -sqrdmulh v15.4S, v0.4S, v9.s[0] -mul v0.4S, v0.4S,v12.s[0] -mla v0.4S, v15.4S, v31.s[0] -sub v15.4s, v10.4s, v0.4s -add v10.4s, v10.4s, v0.4s -sqrdmulh v0.4S, v5.4S, v9.s[0] -mul v5.4S, v5.4S,v12.s[0] -mla v5.4S, v0.4S, v31.s[0] -sub v0.4s, v25.4s, v5.4s -add v25.4s, v25.4s, v5.4s -sqrdmulh v5.4S, v25.4S, v9.s[1] -mul v25.4S, v25.4S,v12.s[1] -mla v25.4S, v5.4S, v31.s[0] -sub v5.4s, v10.4s, v25.4s -add v10.4s, v10.4s, v25.4s -sqrdmulh v25.4S, v0.4S, v9.s[2] -mul v0.4S, v0.4S,v12.s[2] -mla v0.4S, v25.4S, v31.s[0] -sub v25.4s, v15.4s, v0.4s -add v15.4s, v15.4s, v0.4s -trn1 v0.4S, v10.4S, v5.4S -trn2 v29.4S, v10.4S, v5.4S -trn1 v4.4S, v15.4S, v25.4S -trn2 v19.4S, v15.4S, v25.4S -trn2 v15.2D, v0.2D, v4.2D -trn2 v25.2D, v29.2D, v19.2D -trn1 v10.2D, v0.2D, v4.2D -trn1 v5.2D, v29.2D, v19.2D -sqrdmulh v19.4S, v15.4S, v8.4S -mul v15.4S, v15.4S,v17.4S -mla v15.4S, v19.4S, v31.s[0] -sub v19.4s, v10.4s, v15.4s -add v10.4s, v10.4s, v15.4s -sqrdmulh v15.4S, v25.4S, v8.4S -mul v25.4S, v25.4S,v17.4S -mla v25.4S, v15.4S, v31.s[0] -sub v15.4s, v5.4s, v25.4s -add v5.4s, v5.4s, v25.4s -sqrdmulh v25.4S, v5.4S, v16.4S -mul v5.4S, v5.4S,v20.4S -mla v5.4S, v25.4S, v31.s[0] -sub v25.4s, v10.4s, v5.4s -add v10.4s, v10.4s, v5.4s -sqrdmulh v5.4S, v15.4S, v14.4S -mul v15.4S, v15.4S,v3.4S -mla v15.4S, v5.4S, v31.s[0] -sub v5.4s, v19.4s, v15.4s -add v19.4s, v19.4s, v15.4s -str q10, [x0, #960] -str q25, [x0, #976] -str q19, [x0, #992] -str q5, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2476 -// Instruction count: 2472 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_15_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_15_0.s deleted file mode 100644 index eda1068..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_15_0.s +++ /dev/null @@ -1,2506 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_15_0 -.global _ntt_u32_full_neon_asm_var_4_4_15_0 -ntt_u32_full_neon_asm_var_4_4_15_0: -_ntt_u32_full_neon_asm_var_4_4_15_0: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x0, #992] -ldr q29, [x17, #+0] -ldr q28, [x17, #+16] -sqrdmulh v27.4S, v30.4S, v28.s[0] -mul v30.4S, v30.4S,v29.s[0] -ldr q26, [x0, #928] -sqrdmulh v25.4S, v26.4S, v28.s[0] -mul v26.4S, v26.4S,v29.s[0] -ldr q24, [x0, #864] -sqrdmulh v23.4S, v24.4S, v28.s[0] -mul v24.4S, v24.4S,v29.s[0] -ldr q22, [x0, #800] -sqrdmulh v21.4S, v22.4S, v28.s[0] -mul v22.4S, v22.4S,v29.s[0] -ldr q20, [x0, #736] -mla v30.4S, v27.4S, v31.s[0] -sqrdmulh v27.4S, v20.4S, v28.s[0] -ldr q19, [x0, #672] -mla v26.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v19.4S, v28.s[0] -nop -ldr q18, [x0, #608] -mla v24.4S, v23.4S, v31.s[0] -sqrdmulh v23.4S, v18.4S, v28.s[0] -nop -ldr q17, [x0, #544] -mla v22.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v17.4S, v28.s[0] -nop -ldr q16, [x0, #480] -ldr q3, [x0, #416] -mul v20.4S, v20.4S,v29.s[0] -sub v2.4s, v16.4s, v30.4s -mul v19.4S, v19.4S,v29.s[0] -add v16.4s, v16.4s, v30.4s -ldr q30, [x0, #352] -ldr q1, [x0, #288] -mla v20.4S, v27.4S, v31.s[0] -sub v27.4s, v3.4s, v26.4s -mla v19.4S, v25.4S, v31.s[0] -add v3.4s, v3.4s, v26.4s -ldr q26, [x0, #224] -ldr q25, [x0, #160] -mul v18.4S, v18.4S,v29.s[0] -sub v0.4s, v30.4s, v24.4s -mul v17.4S, v17.4S,v29.s[0] -add v30.4s, v30.4s, v24.4s -ldr q24, [x0, #96] -ldr q15, [x0, #32] -mla v18.4S, v23.4S, v31.s[0] -sub v23.4s, v1.4s, v22.4s -mla v17.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v2.4S, v28.s[2] -nop -mul v2.4S, v2.4S,v29.s[2] -nop -sqrdmulh v21.4S, v27.4S, v28.s[2] -sub v14.4s, v26.4s, v20.4s -mul v27.4S, v27.4S,v29.s[2] -add v26.4s, v26.4s, v20.4s -sqrdmulh v20.4S, v0.4S, v28.s[2] -sub v13.4s, v25.4s, v19.4s -mul v0.4S, v0.4S,v29.s[2] -add v25.4s, v25.4s, v19.4s -sqrdmulh v19.4S, v23.4S, v28.s[2] -sub v12.4s, v24.4s, v18.4s -mul v23.4S, v23.4S,v29.s[2] -add v24.4s, v24.4s, v18.4s -mla v2.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v17.4s -sqrdmulh v18.4S, v16.4S, v28.s[1] -add v15.4s, v15.4s, v17.4s -mla v27.4S, v21.4S, v31.s[0] -nop -sqrdmulh v21.4S, v3.4S, v28.s[1] -nop -mla v0.4S, v20.4S, v31.s[0] -nop -sqrdmulh v20.4S, v30.4S, v28.s[1] -nop -mla v23.4S, v19.4S, v31.s[0] -nop -sqrdmulh v19.4S, v1.4S, v28.s[1] -nop -ldr q17, [x17, #+32] -ldr q11, [x17, #+48] -mul v16.4S, v16.4S,v29.s[1] -sub v10.4s, v14.4s, v2.4s -mul v3.4S, v3.4S,v29.s[1] -add v14.4s, v14.4s, v2.4s -mla v16.4S, v18.4S, v31.s[0] -sub v18.4s, v13.4s, v27.4s -mla v3.4S, v21.4S, v31.s[0] -add v13.4s, v13.4s, v27.4s -mul v30.4S, v30.4S,v29.s[1] -sub v27.4s, v12.4s, v0.4s -mul v1.4S, v1.4S,v29.s[1] -add v12.4s, v12.4s, v0.4s -mla v30.4S, v20.4S, v31.s[0] -sub v20.4s, v22.4s, v23.4s -mla v1.4S, v19.4S, v31.s[0] -add v22.4s, v22.4s, v23.4s -sqrdmulh v23.4S, v10.4S, v11.s[3] -nop -mul v10.4S, v10.4S,v17.s[3] -nop -sqrdmulh v19.4S, v18.4S, v11.s[3] -sub v0.4s, v26.4s, v16.4s -mul v18.4S, v18.4S,v17.s[3] -add v26.4s, v26.4s, v16.4s -sqrdmulh v16.4S, v14.4S, v11.s[2] -sub v21.4s, v25.4s, v3.4s -mul v14.4S, v14.4S,v17.s[2] -add v25.4s, v25.4s, v3.4s -sqrdmulh v3.4S, v13.4S, v11.s[2] -sub v2.4s, v24.4s, v30.4s -mul v13.4S, v13.4S,v17.s[2] -add v24.4s, v24.4s, v30.4s -ldr q30, [x17, #+96] -ldr q9, [x17, #+112] -mla v10.4S, v23.4S, v31.s[0] -sub v23.4s, v15.4s, v1.4s -sqrdmulh v8.4S, v0.4S, v11.s[1] -add v15.4s, v15.4s, v1.4s -mla v18.4S, v19.4S, v31.s[0] -nop -sqrdmulh v19.4S, v21.4S, v11.s[1] -nop -mla v14.4S, v16.4S, v31.s[0] -nop -sqrdmulh v16.4S, v26.4S, v11.s[0] -nop -mla v13.4S, v3.4S, v31.s[0] -nop -sqrdmulh v3.4S, v25.4S, v11.s[0] -nop -ldr q1, [x17, #+64] -ldr q7, [x17, #+80] -mul v0.4S, v0.4S,v17.s[1] -sub v6.4s, v27.4s, v10.4s -mul v21.4S, v21.4S,v17.s[1] -add v27.4s, v27.4s, v10.4s -mla v0.4S, v8.4S, v31.s[0] -sub v8.4s, v20.4s, v18.4s -mla v21.4S, v19.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -mul v26.4S, v26.4S,v17.s[0] -sub v18.4s, v12.4s, v14.4s -mul v25.4S, v25.4S,v17.s[0] -add v12.4s, v12.4s, v14.4s -mla v26.4S, v16.4S, v31.s[0] -sub v16.4s, v22.4s, v13.4s -mla v25.4S, v3.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v6.4S, v9.s[3] -nop -mul v6.4S, v6.4S,v30.s[3] -nop -sqrdmulh v3.4S, v27.4S, v9.s[2] -sub v14.4s, v2.4s, v0.4s -mul v27.4S, v27.4S,v30.s[2] -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v18.4S, v9.s[1] -sub v19.4s, v23.4s, v21.4s -mul v18.4S, v18.4S,v30.s[1] -add v23.4s, v23.4s, v21.4s -sqrdmulh v21.4S, v12.4S, v9.s[0] -sub v10.4s, v24.4s, v26.4s -mul v12.4S, v12.4S,v30.s[0] -add v24.4s, v24.4s, v26.4s -mla v6.4S, v13.4S, v31.s[0] -sub v13.4s, v15.4s, v25.4s -sqrdmulh v26.4S, v14.4S, v7.s[3] -add v15.4s, v15.4s, v25.4s -mla v27.4S, v3.4S, v31.s[0] -sub v3.4s, v8.4s, v6.4s -sqrdmulh v25.4S, v2.4S, v7.s[2] -add v8.4s, v8.4s, v6.4s -mla v18.4S, v0.4S, v31.s[0] -sub v0.4s, v20.4s, v27.4s -sqrdmulh v6.4S, v10.4S, v7.s[1] -add v20.4s, v20.4s, v27.4s -mla v12.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v18.4s -sqrdmulh v27.4S, v24.4S, v7.s[0] -add v16.4s, v16.4s, v18.4s -mul v14.4S, v14.4S,v1.s[3] -sub v18.4s, v22.4s, v12.4s -mul v2.4S, v2.4S,v1.s[2] -add v22.4s, v22.4s, v12.4s -mla v14.4S, v26.4S, v31.s[0] -str q3, [x0, #992] -mla v2.4S, v25.4S, v31.s[0] -str q8, [x0, #928] -mul v10.4S, v10.4S,v1.s[1] -str q0, [x0, #864] -mul v24.4S, v24.4S,v1.s[0] -str q20, [x0, #800] -mla v10.4S, v6.4S, v31.s[0] -str q21, [x0, #736] -mla v24.4S, v27.4S, v31.s[0] -str q16, [x0, #672] -ldr q16, [x0, #1008] -sqrdmulh v27.4S, v16.4S, v28.s[0] -str q18, [x0, #608] -mul v16.4S, v16.4S,v29.s[0] -str q22, [x0, #544] -ldr q22, [x0, #944] -sqrdmulh v18.4S, v22.4S, v28.s[0] -sub v21.4s, v19.4s, v14.4s -str q21, [x0, #480] -mul v22.4S, v22.4S,v29.s[0] -add v19.4s, v19.4s, v14.4s -ldr q14, [x0, #880] -sqrdmulh v21.4S, v14.4S, v28.s[0] -sub v6.4s, v23.4s, v2.4s -str q19, [x0, #416] -mul v14.4S, v14.4S,v29.s[0] -add v23.4s, v23.4s, v2.4s -ldr q2, [x0, #816] -sqrdmulh v19.4S, v2.4S, v28.s[0] -sub v20.4s, v13.4s, v10.4s -str q6, [x0, #352] -mul v2.4S, v2.4S,v29.s[0] -add v13.4s, v13.4s, v10.4s -ldr q10, [x0, #752] -mla v16.4S, v27.4S, v31.s[0] -sub v27.4s, v15.4s, v24.4s -str q23, [x0, #288] -sqrdmulh v23.4S, v10.4S, v28.s[0] -add v15.4s, v15.4s, v24.4s -ldr q24, [x0, #688] -mla v22.4S, v18.4S, v31.s[0] -str q20, [x0, #224] -sqrdmulh v20.4S, v24.4S, v28.s[0] -nop -ldr q18, [x0, #624] -mla v14.4S, v21.4S, v31.s[0] -str q13, [x0, #160] -sqrdmulh v13.4S, v18.4S, v28.s[0] -nop -ldr q21, [x0, #560] -mla v2.4S, v19.4S, v31.s[0] -str q27, [x0, #96] -sqrdmulh v27.4S, v21.4S, v28.s[0] -nop -ldr q19, [x0, #496] -ldr q6, [x0, #432] -mul v10.4S, v10.4S,v29.s[0] -sub v0.4s, v19.4s, v16.4s -str q15, [x0, #32] -mul v24.4S, v24.4S,v29.s[0] -add v19.4s, v19.4s, v16.4s -ldr q16, [x0, #368] -ldr q15, [x0, #304] -mla v10.4S, v23.4S, v31.s[0] -sub v23.4s, v6.4s, v22.4s -mla v24.4S, v20.4S, v31.s[0] -add v6.4s, v6.4s, v22.4s -ldr q22, [x0, #240] -ldr q20, [x0, #176] -mul v18.4S, v18.4S,v29.s[0] -sub v8.4s, v16.4s, v14.4s -mul v21.4S, v21.4S,v29.s[0] -add v16.4s, v16.4s, v14.4s -ldr q14, [x0, #112] -ldr q25, [x0, #48] -mla v18.4S, v13.4S, v31.s[0] -sub v13.4s, v15.4s, v2.4s -mla v21.4S, v27.4S, v31.s[0] -add v15.4s, v15.4s, v2.4s -sqrdmulh v2.4S, v0.4S, v28.s[2] -nop -mul v0.4S, v0.4S,v29.s[2] -nop -sqrdmulh v27.4S, v23.4S, v28.s[2] -sub v3.4s, v22.4s, v10.4s -mul v23.4S, v23.4S,v29.s[2] -add v22.4s, v22.4s, v10.4s -sqrdmulh v10.4S, v8.4S, v28.s[2] -sub v26.4s, v20.4s, v24.4s -mul v8.4S, v8.4S,v29.s[2] -add v20.4s, v20.4s, v24.4s -sqrdmulh v24.4S, v13.4S, v28.s[2] -sub v12.4s, v14.4s, v18.4s -mul v13.4S, v13.4S,v29.s[2] -add v14.4s, v14.4s, v18.4s -mla v0.4S, v2.4S, v31.s[0] -sub v2.4s, v25.4s, v21.4s -sqrdmulh v18.4S, v19.4S, v28.s[1] -add v25.4s, v25.4s, v21.4s -mla v23.4S, v27.4S, v31.s[0] -nop -sqrdmulh v27.4S, v6.4S, v28.s[1] -nop -mla v8.4S, v10.4S, v31.s[0] -nop -sqrdmulh v10.4S, v16.4S, v28.s[1] -nop -mla v13.4S, v24.4S, v31.s[0] -nop -sqrdmulh v24.4S, v15.4S, v28.s[1] -nop -mul v19.4S, v19.4S,v29.s[1] -sub v21.4s, v3.4s, v0.4s -mul v6.4S, v6.4S,v29.s[1] -add v3.4s, v3.4s, v0.4s -mla v19.4S, v18.4S, v31.s[0] -sub v18.4s, v26.4s, v23.4s -mla v6.4S, v27.4S, v31.s[0] -add v26.4s, v26.4s, v23.4s -mul v16.4S, v16.4S,v29.s[1] -sub v23.4s, v12.4s, v8.4s -mul v15.4S, v15.4S,v29.s[1] -add v12.4s, v12.4s, v8.4s -mla v16.4S, v10.4S, v31.s[0] -sub v10.4s, v2.4s, v13.4s -mla v15.4S, v24.4S, v31.s[0] -add v2.4s, v2.4s, v13.4s -sqrdmulh v13.4S, v21.4S, v11.s[3] -nop -mul v21.4S, v21.4S,v17.s[3] -nop -sqrdmulh v24.4S, v18.4S, v11.s[3] -sub v8.4s, v22.4s, v19.4s -mul v18.4S, v18.4S,v17.s[3] -add v22.4s, v22.4s, v19.4s -sqrdmulh v19.4S, v3.4S, v11.s[2] -sub v27.4s, v20.4s, v6.4s -mul v3.4S, v3.4S,v17.s[2] -add v20.4s, v20.4s, v6.4s -sqrdmulh v6.4S, v26.4S, v11.s[2] -sub v0.4s, v14.4s, v16.4s -mul v26.4S, v26.4S,v17.s[2] -add v14.4s, v14.4s, v16.4s -mla v21.4S, v13.4S, v31.s[0] -sub v13.4s, v25.4s, v15.4s -sqrdmulh v16.4S, v8.4S, v11.s[1] -add v25.4s, v25.4s, v15.4s -mla v18.4S, v24.4S, v31.s[0] -nop -sqrdmulh v24.4S, v27.4S, v11.s[1] -nop -mla v3.4S, v19.4S, v31.s[0] -nop -sqrdmulh v19.4S, v22.4S, v11.s[0] -nop -mla v26.4S, v6.4S, v31.s[0] -nop -sqrdmulh v6.4S, v20.4S, v11.s[0] -nop -mul v8.4S, v8.4S,v17.s[1] -sub v15.4s, v23.4s, v21.4s -mul v27.4S, v27.4S,v17.s[1] -add v23.4s, v23.4s, v21.4s -mla v8.4S, v16.4S, v31.s[0] -sub v16.4s, v10.4s, v18.4s -mla v27.4S, v24.4S, v31.s[0] -add v10.4s, v10.4s, v18.4s -mul v22.4S, v22.4S,v17.s[0] -sub v18.4s, v12.4s, v3.4s -mul v20.4S, v20.4S,v17.s[0] -add v12.4s, v12.4s, v3.4s -mla v22.4S, v19.4S, v31.s[0] -sub v19.4s, v2.4s, v26.4s -mla v20.4S, v6.4S, v31.s[0] -add v2.4s, v2.4s, v26.4s -sqrdmulh v26.4S, v15.4S, v9.s[3] -nop -mul v15.4S, v15.4S,v30.s[3] -nop -sqrdmulh v6.4S, v23.4S, v9.s[2] -sub v3.4s, v0.4s, v8.4s -mul v23.4S, v23.4S,v30.s[2] -add v0.4s, v0.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v9.s[1] -sub v24.4s, v13.4s, v27.4s -mul v18.4S, v18.4S,v30.s[1] -add v13.4s, v13.4s, v27.4s -sqrdmulh v27.4S, v12.4S, v9.s[0] -sub v21.4s, v14.4s, v22.4s -mul v12.4S, v12.4S,v30.s[0] -add v14.4s, v14.4s, v22.4s -mla v15.4S, v26.4S, v31.s[0] -sub v26.4s, v25.4s, v20.4s -sqrdmulh v22.4S, v3.4S, v7.s[3] -add v25.4s, v25.4s, v20.4s -mla v23.4S, v6.4S, v31.s[0] -sub v6.4s, v16.4s, v15.4s -sqrdmulh v20.4S, v0.4S, v7.s[2] -add v16.4s, v16.4s, v15.4s -mla v18.4S, v8.4S, v31.s[0] -sub v8.4s, v10.4s, v23.4s -sqrdmulh v15.4S, v21.4S, v7.s[1] -add v10.4s, v10.4s, v23.4s -mla v12.4S, v27.4S, v31.s[0] -sub v27.4s, v19.4s, v18.4s -sqrdmulh v23.4S, v14.4S, v7.s[0] -add v19.4s, v19.4s, v18.4s -mul v3.4S, v3.4S,v1.s[3] -sub v18.4s, v2.4s, v12.4s -mul v0.4S, v0.4S,v1.s[2] -add v2.4s, v2.4s, v12.4s -mla v3.4S, v22.4S, v31.s[0] -str q6, [x0, #1008] -mla v0.4S, v20.4S, v31.s[0] -str q16, [x0, #944] -mul v21.4S, v21.4S,v1.s[1] -str q8, [x0, #880] -mul v14.4S, v14.4S,v1.s[0] -str q10, [x0, #816] -mla v21.4S, v15.4S, v31.s[0] -str q27, [x0, #752] -mla v14.4S, v23.4S, v31.s[0] -str q19, [x0, #688] -ldr q19, [x0, #960] -sqrdmulh v23.4S, v19.4S, v28.s[0] -str q18, [x0, #624] -mul v19.4S, v19.4S,v29.s[0] -str q2, [x0, #560] -ldr q2, [x0, #896] -sqrdmulh v18.4S, v2.4S, v28.s[0] -sub v27.4s, v24.4s, v3.4s -str q27, [x0, #496] -mul v2.4S, v2.4S,v29.s[0] -add v24.4s, v24.4s, v3.4s -ldr q3, [x0, #832] -sqrdmulh v27.4S, v3.4S, v28.s[0] -sub v15.4s, v13.4s, v0.4s -str q24, [x0, #432] -mul v3.4S, v3.4S,v29.s[0] -add v13.4s, v13.4s, v0.4s -ldr q0, [x0, #768] -sqrdmulh v24.4S, v0.4S, v28.s[0] -sub v10.4s, v26.4s, v21.4s -str q15, [x0, #368] -mul v0.4S, v0.4S,v29.s[0] -add v26.4s, v26.4s, v21.4s -ldr q21, [x0, #704] -mla v19.4S, v23.4S, v31.s[0] -sub v23.4s, v25.4s, v14.4s -str q13, [x0, #304] -sqrdmulh v13.4S, v21.4S, v28.s[0] -add v25.4s, v25.4s, v14.4s -ldr q14, [x0, #640] -mla v2.4S, v18.4S, v31.s[0] -str q10, [x0, #240] -sqrdmulh v10.4S, v14.4S, v28.s[0] -nop -ldr q18, [x0, #576] -mla v3.4S, v27.4S, v31.s[0] -str q26, [x0, #176] -sqrdmulh v26.4S, v18.4S, v28.s[0] -nop -ldr q27, [x0, #512] -mla v0.4S, v24.4S, v31.s[0] -str q23, [x0, #112] -sqrdmulh v23.4S, v27.4S, v28.s[0] -nop -ldr q24, [x0, #448] -ldr q15, [x0, #384] -mul v21.4S, v21.4S,v29.s[0] -sub v8.4s, v24.4s, v19.4s -str q25, [x0, #48] -mul v14.4S, v14.4S,v29.s[0] -add v24.4s, v24.4s, v19.4s -ldr q19, [x0, #320] -ldr q25, [x0, #256] -mla v21.4S, v13.4S, v31.s[0] -sub v13.4s, v15.4s, v2.4s -mla v14.4S, v10.4S, v31.s[0] -add v15.4s, v15.4s, v2.4s -ldr q2, [x0, #192] -ldr q10, [x0, #128] -mul v18.4S, v18.4S,v29.s[0] -sub v16.4s, v19.4s, v3.4s -mul v27.4S, v27.4S,v29.s[0] -add v19.4s, v19.4s, v3.4s -ldr q3, [x0, #64] -ldr q20, [x0, #0] -mla v18.4S, v26.4S, v31.s[0] -sub v26.4s, v25.4s, v0.4s -mla v27.4S, v23.4S, v31.s[0] -add v25.4s, v25.4s, v0.4s -sqrdmulh v0.4S, v8.4S, v28.s[2] -nop -mul v8.4S, v8.4S,v29.s[2] -nop -sqrdmulh v23.4S, v13.4S, v28.s[2] -sub v6.4s, v2.4s, v21.4s -mul v13.4S, v13.4S,v29.s[2] -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v16.4S, v28.s[2] -sub v22.4s, v10.4s, v14.4s -mul v16.4S, v16.4S,v29.s[2] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v26.4S, v28.s[2] -sub v12.4s, v3.4s, v18.4s -mul v26.4S, v26.4S,v29.s[2] -add v3.4s, v3.4s, v18.4s -mla v8.4S, v0.4S, v31.s[0] -sub v0.4s, v20.4s, v27.4s -sqrdmulh v18.4S, v24.4S, v28.s[1] -add v20.4s, v20.4s, v27.4s -mla v13.4S, v23.4S, v31.s[0] -nop -sqrdmulh v23.4S, v15.4S, v28.s[1] -nop -mla v16.4S, v21.4S, v31.s[0] -nop -sqrdmulh v21.4S, v19.4S, v28.s[1] -nop -mla v26.4S, v14.4S, v31.s[0] -nop -sqrdmulh v14.4S, v25.4S, v28.s[1] -nop -mul v24.4S, v24.4S,v29.s[1] -sub v27.4s, v6.4s, v8.4s -mul v15.4S, v15.4S,v29.s[1] -add v6.4s, v6.4s, v8.4s -mla v24.4S, v18.4S, v31.s[0] -sub v18.4s, v22.4s, v13.4s -mla v15.4S, v23.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -mul v19.4S, v19.4S,v29.s[1] -sub v13.4s, v12.4s, v16.4s -mul v25.4S, v25.4S,v29.s[1] -add v12.4s, v12.4s, v16.4s -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v26.4s -mla v25.4S, v14.4S, v31.s[0] -add v0.4s, v0.4s, v26.4s -sqrdmulh v26.4S, v27.4S, v11.s[3] -nop -mul v27.4S, v27.4S,v17.s[3] -nop -sqrdmulh v14.4S, v18.4S, v11.s[3] -sub v16.4s, v2.4s, v24.4s -mul v18.4S, v18.4S,v17.s[3] -add v2.4s, v2.4s, v24.4s -sqrdmulh v24.4S, v6.4S, v11.s[2] -sub v23.4s, v10.4s, v15.4s -mul v6.4S, v6.4S,v17.s[2] -add v10.4s, v10.4s, v15.4s -sqrdmulh v15.4S, v22.4S, v11.s[2] -sub v8.4s, v3.4s, v19.4s -mul v22.4S, v22.4S,v17.s[2] -add v3.4s, v3.4s, v19.4s -mla v27.4S, v26.4S, v31.s[0] -sub v26.4s, v20.4s, v25.4s -sqrdmulh v19.4S, v16.4S, v11.s[1] -add v20.4s, v20.4s, v25.4s -mla v18.4S, v14.4S, v31.s[0] -nop -sqrdmulh v14.4S, v23.4S, v11.s[1] -nop -mla v6.4S, v24.4S, v31.s[0] -nop -sqrdmulh v24.4S, v2.4S, v11.s[0] -nop -mla v22.4S, v15.4S, v31.s[0] -nop -sqrdmulh v15.4S, v10.4S, v11.s[0] -nop -mul v16.4S, v16.4S,v17.s[1] -sub v25.4s, v13.4s, v27.4s -mul v23.4S, v23.4S,v17.s[1] -add v13.4s, v13.4s, v27.4s -mla v16.4S, v19.4S, v31.s[0] -sub v19.4s, v21.4s, v18.4s -mla v23.4S, v14.4S, v31.s[0] -add v21.4s, v21.4s, v18.4s -mul v2.4S, v2.4S,v17.s[0] -sub v18.4s, v12.4s, v6.4s -mul v10.4S, v10.4S,v17.s[0] -add v12.4s, v12.4s, v6.4s -mla v2.4S, v24.4S, v31.s[0] -sub v24.4s, v0.4s, v22.4s -mla v10.4S, v15.4S, v31.s[0] -add v0.4s, v0.4s, v22.4s -sqrdmulh v22.4S, v25.4S, v9.s[3] -nop -mul v25.4S, v25.4S,v30.s[3] -nop -sqrdmulh v15.4S, v13.4S, v9.s[2] -sub v6.4s, v8.4s, v16.4s -mul v13.4S, v13.4S,v30.s[2] -add v8.4s, v8.4s, v16.4s -sqrdmulh v16.4S, v18.4S, v9.s[1] -sub v14.4s, v26.4s, v23.4s -mul v18.4S, v18.4S,v30.s[1] -add v26.4s, v26.4s, v23.4s -sqrdmulh v23.4S, v12.4S, v9.s[0] -sub v27.4s, v3.4s, v2.4s -mul v12.4S, v12.4S,v30.s[0] -add v3.4s, v3.4s, v2.4s -mla v25.4S, v22.4S, v31.s[0] -sub v22.4s, v20.4s, v10.4s -sqrdmulh v2.4S, v6.4S, v7.s[3] -add v20.4s, v20.4s, v10.4s -mla v13.4S, v15.4S, v31.s[0] -sub v15.4s, v19.4s, v25.4s -sqrdmulh v10.4S, v8.4S, v7.s[2] -add v19.4s, v19.4s, v25.4s -mla v18.4S, v16.4S, v31.s[0] -sub v16.4s, v21.4s, v13.4s -sqrdmulh v25.4S, v27.4S, v7.s[1] -add v21.4s, v21.4s, v13.4s -mla v12.4S, v23.4S, v31.s[0] -sub v23.4s, v24.4s, v18.4s -sqrdmulh v13.4S, v3.4S, v7.s[0] -add v24.4s, v24.4s, v18.4s -mul v6.4S, v6.4S,v1.s[3] -sub v18.4s, v0.4s, v12.4s -mul v8.4S, v8.4S,v1.s[2] -add v0.4s, v0.4s, v12.4s -mla v6.4S, v2.4S, v31.s[0] -str q15, [x0, #960] -mla v8.4S, v10.4S, v31.s[0] -str q19, [x0, #896] -mul v27.4S, v27.4S,v1.s[1] -str q16, [x0, #832] -mul v3.4S, v3.4S,v1.s[0] -str q21, [x0, #768] -mla v27.4S, v25.4S, v31.s[0] -str q23, [x0, #704] -mla v3.4S, v13.4S, v31.s[0] -str q24, [x0, #640] -ldr q24, [x0, #976] -sqrdmulh v13.4S, v24.4S, v28.s[0] -str q18, [x0, #576] -mul v24.4S, v24.4S,v29.s[0] -str q0, [x0, #512] -ldr q0, [x0, #912] -sqrdmulh v18.4S, v0.4S, v28.s[0] -sub v23.4s, v14.4s, v6.4s -str q23, [x0, #448] -mul v0.4S, v0.4S,v29.s[0] -add v14.4s, v14.4s, v6.4s -ldr q6, [x0, #848] -sqrdmulh v23.4S, v6.4S, v28.s[0] -sub v25.4s, v26.4s, v8.4s -str q14, [x0, #384] -mul v6.4S, v6.4S,v29.s[0] -add v26.4s, v26.4s, v8.4s -ldr q8, [x0, #784] -sqrdmulh v14.4S, v8.4S, v28.s[0] -sub v21.4s, v22.4s, v27.4s -str q25, [x0, #320] -mul v8.4S, v8.4S,v29.s[0] -add v22.4s, v22.4s, v27.4s -ldr q27, [x0, #720] -mla v24.4S, v13.4S, v31.s[0] -sub v13.4s, v20.4s, v3.4s -str q26, [x0, #256] -sqrdmulh v26.4S, v27.4S, v28.s[0] -add v20.4s, v20.4s, v3.4s -ldr q3, [x0, #656] -mla v0.4S, v18.4S, v31.s[0] -str q21, [x0, #192] -sqrdmulh v21.4S, v3.4S, v28.s[0] -nop -ldr q18, [x0, #592] -mla v6.4S, v23.4S, v31.s[0] -str q22, [x0, #128] -sqrdmulh v22.4S, v18.4S, v28.s[0] -nop -ldr q23, [x0, #528] -mla v8.4S, v14.4S, v31.s[0] -str q13, [x0, #64] -sqrdmulh v13.4S, v23.4S, v28.s[0] -nop -ldr q14, [x0, #464] -ldr q25, [x0, #400] -mul v27.4S, v27.4S,v29.s[0] -sub v16.4s, v14.4s, v24.4s -str q20, [x0, #0] -mul v3.4S, v3.4S,v29.s[0] -add v14.4s, v14.4s, v24.4s -ldr q24, [x0, #336] -ldr q20, [x0, #272] -mla v27.4S, v26.4S, v31.s[0] -sub v26.4s, v25.4s, v0.4s -mla v3.4S, v21.4S, v31.s[0] -add v25.4s, v25.4s, v0.4s -ldr q0, [x0, #208] -ldr q21, [x0, #144] -mul v18.4S, v18.4S,v29.s[0] -sub v19.4s, v24.4s, v6.4s -mul v23.4S, v23.4S,v29.s[0] -add v24.4s, v24.4s, v6.4s -ldr q6, [x0, #80] -ldr q10, [x0, #16] -mla v18.4S, v22.4S, v31.s[0] -sub v22.4s, v20.4s, v8.4s -mla v23.4S, v13.4S, v31.s[0] -add v20.4s, v20.4s, v8.4s -sqrdmulh v8.4S, v16.4S, v28.s[2] -nop -mul v16.4S, v16.4S,v29.s[2] -nop -sqrdmulh v13.4S, v26.4S, v28.s[2] -sub v15.4s, v0.4s, v27.4s -mul v26.4S, v26.4S,v29.s[2] -add v0.4s, v0.4s, v27.4s -sqrdmulh v27.4S, v19.4S, v28.s[2] -sub v2.4s, v21.4s, v3.4s -mul v19.4S, v19.4S,v29.s[2] -add v21.4s, v21.4s, v3.4s -sqrdmulh v3.4S, v22.4S, v28.s[2] -sub v12.4s, v6.4s, v18.4s -mul v22.4S, v22.4S,v29.s[2] -add v6.4s, v6.4s, v18.4s -mla v16.4S, v8.4S, v31.s[0] -sub v8.4s, v10.4s, v23.4s -sqrdmulh v18.4S, v14.4S, v28.s[1] -add v10.4s, v10.4s, v23.4s -mla v26.4S, v13.4S, v31.s[0] -nop -sqrdmulh v13.4S, v25.4S, v28.s[1] -nop -mla v19.4S, v27.4S, v31.s[0] -nop -sqrdmulh v27.4S, v24.4S, v28.s[1] -nop -mla v22.4S, v3.4S, v31.s[0] -nop -sqrdmulh v3.4S, v20.4S, v28.s[1] -nop -mul v14.4S, v14.4S,v29.s[1] -sub v23.4s, v15.4s, v16.4s -mul v25.4S, v25.4S,v29.s[1] -add v15.4s, v15.4s, v16.4s -mla v14.4S, v18.4S, v31.s[0] -sub v18.4s, v2.4s, v26.4s -mla v25.4S, v13.4S, v31.s[0] -add v2.4s, v2.4s, v26.4s -mul v24.4S, v24.4S,v29.s[1] -sub v26.4s, v12.4s, v19.4s -mul v20.4S, v20.4S,v29.s[1] -add v12.4s, v12.4s, v19.4s -mla v24.4S, v27.4S, v31.s[0] -sub v27.4s, v8.4s, v22.4s -mla v20.4S, v3.4S, v31.s[0] -add v8.4s, v8.4s, v22.4s -sqrdmulh v28.4S, v23.4S, v11.s[3] -nop -mul v23.4S, v23.4S,v17.s[3] -nop -sqrdmulh v29.4S, v18.4S, v11.s[3] -sub v22.4s, v0.4s, v14.4s -mul v18.4S, v18.4S,v17.s[3] -add v0.4s, v0.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v11.s[2] -sub v3.4s, v21.4s, v25.4s -mul v15.4S, v15.4S,v17.s[2] -add v21.4s, v21.4s, v25.4s -sqrdmulh v25.4S, v2.4S, v11.s[2] -sub v19.4s, v6.4s, v24.4s -mul v2.4S, v2.4S,v17.s[2] -add v6.4s, v6.4s, v24.4s -mla v23.4S, v28.4S, v31.s[0] -sub v28.4s, v10.4s, v20.4s -sqrdmulh v24.4S, v22.4S, v11.s[1] -add v10.4s, v10.4s, v20.4s -mla v18.4S, v29.4S, v31.s[0] -nop -sqrdmulh v29.4S, v3.4S, v11.s[1] -nop -mla v15.4S, v14.4S, v31.s[0] -nop -sqrdmulh v14.4S, v0.4S, v11.s[0] -nop -mla v2.4S, v25.4S, v31.s[0] -nop -sqrdmulh v25.4S, v21.4S, v11.s[0] -nop -mul v22.4S, v22.4S,v17.s[1] -sub v20.4s, v26.4s, v23.4s -mul v3.4S, v3.4S,v17.s[1] -add v26.4s, v26.4s, v23.4s -mla v22.4S, v24.4S, v31.s[0] -sub v24.4s, v27.4s, v18.4s -mla v3.4S, v29.4S, v31.s[0] -add v27.4s, v27.4s, v18.4s -mul v0.4S, v0.4S,v17.s[0] -sub v18.4s, v12.4s, v15.4s -mul v21.4S, v21.4S,v17.s[0] -add v12.4s, v12.4s, v15.4s -mla v0.4S, v14.4S, v31.s[0] -sub v14.4s, v8.4s, v2.4s -mla v21.4S, v25.4S, v31.s[0] -add v8.4s, v8.4s, v2.4s -sqrdmulh v11.4S, v20.4S, v9.s[3] -nop -mul v20.4S, v20.4S,v30.s[3] -nop -sqrdmulh v17.4S, v26.4S, v9.s[2] -sub v2.4s, v19.4s, v22.4s -mul v26.4S, v26.4S,v30.s[2] -add v19.4s, v19.4s, v22.4s -sqrdmulh v22.4S, v18.4S, v9.s[1] -sub v25.4s, v28.4s, v3.4s -mul v18.4S, v18.4S,v30.s[1] -add v28.4s, v28.4s, v3.4s -sqrdmulh v3.4S, v12.4S, v9.s[0] -sub v15.4s, v6.4s, v0.4s -mul v12.4S, v12.4S,v30.s[0] -add v6.4s, v6.4s, v0.4s -mla v20.4S, v11.4S, v31.s[0] -sub v11.4s, v10.4s, v21.4s -sqrdmulh v9.4S, v2.4S, v7.s[3] -add v10.4s, v10.4s, v21.4s -mla v26.4S, v17.4S, v31.s[0] -sub v17.4s, v24.4s, v20.4s -sqrdmulh v21.4S, v19.4S, v7.s[2] -add v24.4s, v24.4s, v20.4s -mla v18.4S, v22.4S, v31.s[0] -sub v22.4s, v27.4s, v26.4s -sqrdmulh v20.4S, v15.4S, v7.s[1] -add v27.4s, v27.4s, v26.4s -mla v12.4S, v3.4S, v31.s[0] -sub v3.4s, v14.4s, v18.4s -sqrdmulh v26.4S, v6.4S, v7.s[0] -add v14.4s, v14.4s, v18.4s -mul v2.4S, v2.4S,v1.s[3] -sub v18.4s, v8.4s, v12.4s -mul v19.4S, v19.4S,v1.s[2] -add v8.4s, v8.4s, v12.4s -mla v2.4S, v9.4S, v31.s[0] -str q17, [x0, #976] -mla v19.4S, v21.4S, v31.s[0] -str q24, [x0, #912] -mul v15.4S, v15.4S,v1.s[1] -str q22, [x0, #848] -mul v6.4S, v6.4S,v1.s[0] -str q27, [x0, #784] -mla v15.4S, v20.4S, v31.s[0] -str q3, [x0, #720] -mla v6.4S, v26.4S, v31.s[0] -str q14, [x0, #656] -str q18, [x0, #592] -str q8, [x0, #528] -sub v8.4s, v25.4s, v2.4s -str q8, [x0, #464] -add v25.4s, v25.4s, v2.4s -sub v2.4s, v28.4s, v19.4s -str q25, [x0, #400] -add v28.4s, v28.4s, v19.4s -sub v19.4s, v11.4s, v15.4s -str q2, [x0, #336] -add v11.4s, v11.4s, v15.4s -sub v15.4s, v10.4s, v6.4s -str q28, [x0, #272] -add v10.4s, v10.4s, v6.4s -str q19, [x0, #208] -str q11, [x0, #144] -str q15, [x0, #80] -str q10, [x0, #16] -ldr q4, [x17, #+128] -ldr q5, [x17, #+144] -ldr q16, [x17, #+160] -ldr q13, [x17, #+176] -ldr q23, [x17, #+192] -ldr q29, [x17, #+208] -ldr q0, [x17, #+224] -ldr q30, [x17, #+240] -ldr q12, [x0, #32] -ldr q9, [x0, #48] -ldr q17, [x0, #0] -ldr q21, [x0, #16] -sqrdmulh v24.4S, v12.4S, v5.s[0] -mul v12.4S, v12.4S,v4.s[0] -mla v12.4S, v24.4S, v31.s[0] -sub v24.4s, v17.4s, v12.4s -add v17.4s, v17.4s, v12.4s -sqrdmulh v12.4S, v9.4S, v5.s[0] -mul v9.4S, v9.4S,v4.s[0] -mla v9.4S, v12.4S, v31.s[0] -sub v12.4s, v21.4s, v9.4s -add v21.4s, v21.4s, v9.4s -sqrdmulh v9.4S, v21.4S, v5.s[1] -mul v21.4S, v21.4S,v4.s[1] -mla v21.4S, v9.4S, v31.s[0] -sub v9.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v12.4S, v5.s[2] -mul v12.4S, v12.4S,v4.s[2] -mla v12.4S, v21.4S, v31.s[0] -sub v21.4s, v24.4s, v12.4s -add v24.4s, v24.4s, v12.4s -trn1 v12.4S, v17.4S, v9.4S -trn2 v22.4S, v17.4S, v9.4S -trn1 v27.4S, v24.4S, v21.4S -trn2 v20.4S, v24.4S, v21.4S -trn2 v24.2D, v12.2D, v27.2D -trn2 v21.2D, v22.2D, v20.2D -trn1 v17.2D, v12.2D, v27.2D -trn1 v9.2D, v22.2D, v20.2D -sqrdmulh v20.4S, v24.4S, v13.4S -mul v24.4S, v24.4S,v16.4S -mla v24.4S, v20.4S, v31.s[0] -sub v20.4s, v17.4s, v24.4s -add v17.4s, v17.4s, v24.4s -sqrdmulh v24.4S, v21.4S, v13.4S -mul v21.4S, v21.4S,v16.4S -mla v21.4S, v24.4S, v31.s[0] -sub v24.4s, v9.4s, v21.4s -add v9.4s, v9.4s, v21.4s -sqrdmulh v21.4S, v9.4S, v29.4S -mul v9.4S, v9.4S,v23.4S -mla v9.4S, v21.4S, v31.s[0] -sub v21.4s, v17.4s, v9.4s -add v17.4s, v17.4s, v9.4s -sqrdmulh v9.4S, v24.4S, v30.4S -mul v24.4S, v24.4S,v0.4S -mla v24.4S, v9.4S, v31.s[0] -sub v9.4s, v20.4s, v24.4s -add v20.4s, v20.4s, v24.4s -str q17, [x0, #0] -str q21, [x0, #16] -str q20, [x0, #32] -str q9, [x0, #48] -ldr q9, [x17, #+256] -ldr q20, [x17, #+272] -ldr q21, [x17, #+288] -ldr q17, [x17, #+304] -ldr q24, [x17, #+320] -ldr q22, [x17, #+336] -ldr q27, [x17, #+352] -ldr q12, [x17, #+368] -ldr q30, [x0, #96] -ldr q0, [x0, #112] -ldr q29, [x0, #64] -ldr q23, [x0, #80] -sqrdmulh v13.4S, v30.4S, v20.s[0] -mul v30.4S, v30.4S,v9.s[0] -mla v30.4S, v13.4S, v31.s[0] -sub v13.4s, v29.4s, v30.4s -add v29.4s, v29.4s, v30.4s -sqrdmulh v30.4S, v0.4S, v20.s[0] -mul v0.4S, v0.4S,v9.s[0] -mla v0.4S, v30.4S, v31.s[0] -sub v30.4s, v23.4s, v0.4s -add v23.4s, v23.4s, v0.4s -sqrdmulh v0.4S, v23.4S, v20.s[1] -mul v23.4S, v23.4S,v9.s[1] -mla v23.4S, v0.4S, v31.s[0] -sub v0.4s, v29.4s, v23.4s -add v29.4s, v29.4s, v23.4s -sqrdmulh v23.4S, v30.4S, v20.s[2] -mul v30.4S, v30.4S,v9.s[2] -mla v30.4S, v23.4S, v31.s[0] -sub v23.4s, v13.4s, v30.4s -add v13.4s, v13.4s, v30.4s -trn1 v30.4S, v29.4S, v0.4S -trn2 v16.4S, v29.4S, v0.4S -trn1 v5.4S, v13.4S, v23.4S -trn2 v4.4S, v13.4S, v23.4S -trn2 v13.2D, v30.2D, v5.2D -trn2 v23.2D, v16.2D, v4.2D -trn1 v29.2D, v30.2D, v5.2D -trn1 v0.2D, v16.2D, v4.2D -sqrdmulh v4.4S, v13.4S, v17.4S -mul v13.4S, v13.4S,v21.4S -mla v13.4S, v4.4S, v31.s[0] -sub v4.4s, v29.4s, v13.4s -add v29.4s, v29.4s, v13.4s -sqrdmulh v13.4S, v23.4S, v17.4S -mul v23.4S, v23.4S,v21.4S -mla v23.4S, v13.4S, v31.s[0] -sub v13.4s, v0.4s, v23.4s -add v0.4s, v0.4s, v23.4s -sqrdmulh v23.4S, v0.4S, v22.4S -mul v0.4S, v0.4S,v24.4S -mla v0.4S, v23.4S, v31.s[0] -sub v23.4s, v29.4s, v0.4s -add v29.4s, v29.4s, v0.4s -sqrdmulh v0.4S, v13.4S, v12.4S -mul v13.4S, v13.4S,v27.4S -mla v13.4S, v0.4S, v31.s[0] -sub v0.4s, v4.4s, v13.4s -add v4.4s, v4.4s, v13.4s -str q29, [x0, #64] -str q23, [x0, #80] -str q4, [x0, #96] -str q0, [x0, #112] -ldr q0, [x17, #+384] -ldr q4, [x17, #+400] -ldr q23, [x17, #+416] -ldr q29, [x17, #+432] -ldr q13, [x17, #+448] -ldr q16, [x17, #+464] -ldr q5, [x17, #+480] -ldr q30, [x17, #+496] -ldr q12, [x0, #160] -ldr q27, [x0, #176] -ldr q22, [x0, #128] -ldr q24, [x0, #144] -sqrdmulh v17.4S, v12.4S, v4.s[0] -mul v12.4S, v12.4S,v0.s[0] -mla v12.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v27.4S, v4.s[0] -mul v27.4S, v27.4S,v0.s[0] -mla v27.4S, v12.4S, v31.s[0] -sub v12.4s, v24.4s, v27.4s -add v24.4s, v24.4s, v27.4s -sqrdmulh v27.4S, v24.4S, v4.s[1] -mul v24.4S, v24.4S,v0.s[1] -mla v24.4S, v27.4S, v31.s[0] -sub v27.4s, v22.4s, v24.4s -add v22.4s, v22.4s, v24.4s -sqrdmulh v24.4S, v12.4S, v4.s[2] -mul v12.4S, v12.4S,v0.s[2] -mla v12.4S, v24.4S, v31.s[0] -sub v24.4s, v17.4s, v12.4s -add v17.4s, v17.4s, v12.4s -trn1 v12.4S, v22.4S, v27.4S -trn2 v21.4S, v22.4S, v27.4S -trn1 v20.4S, v17.4S, v24.4S -trn2 v9.4S, v17.4S, v24.4S -trn2 v17.2D, v12.2D, v20.2D -trn2 v24.2D, v21.2D, v9.2D -trn1 v22.2D, v12.2D, v20.2D -trn1 v27.2D, v21.2D, v9.2D -sqrdmulh v9.4S, v17.4S, v29.4S -mul v17.4S, v17.4S,v23.4S -mla v17.4S, v9.4S, v31.s[0] -sub v9.4s, v22.4s, v17.4s -add v22.4s, v22.4s, v17.4s -sqrdmulh v17.4S, v24.4S, v29.4S -mul v24.4S, v24.4S,v23.4S -mla v24.4S, v17.4S, v31.s[0] -sub v17.4s, v27.4s, v24.4s -add v27.4s, v27.4s, v24.4s -sqrdmulh v24.4S, v27.4S, v16.4S -mul v27.4S, v27.4S,v13.4S -mla v27.4S, v24.4S, v31.s[0] -sub v24.4s, v22.4s, v27.4s -add v22.4s, v22.4s, v27.4s -sqrdmulh v27.4S, v17.4S, v30.4S -mul v17.4S, v17.4S,v5.4S -mla v17.4S, v27.4S, v31.s[0] -sub v27.4s, v9.4s, v17.4s -add v9.4s, v9.4s, v17.4s -str q22, [x0, #128] -str q24, [x0, #144] -str q9, [x0, #160] -str q27, [x0, #176] -ldr q27, [x17, #+512] -ldr q9, [x17, #+528] -ldr q24, [x17, #+544] -ldr q22, [x17, #+560] -ldr q17, [x17, #+576] -ldr q21, [x17, #+592] -ldr q20, [x17, #+608] -ldr q12, [x17, #+624] -ldr q30, [x0, #224] -ldr q5, [x0, #240] -ldr q16, [x0, #192] -ldr q13, [x0, #208] -sqrdmulh v29.4S, v30.4S, v9.s[0] -mul v30.4S, v30.4S,v27.s[0] -mla v30.4S, v29.4S, v31.s[0] -sub v29.4s, v16.4s, v30.4s -add v16.4s, v16.4s, v30.4s -sqrdmulh v30.4S, v5.4S, v9.s[0] -mul v5.4S, v5.4S,v27.s[0] -mla v5.4S, v30.4S, v31.s[0] -sub v30.4s, v13.4s, v5.4s -add v13.4s, v13.4s, v5.4s -sqrdmulh v5.4S, v13.4S, v9.s[1] -mul v13.4S, v13.4S,v27.s[1] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v16.4s, v13.4s -add v16.4s, v16.4s, v13.4s -sqrdmulh v13.4S, v30.4S, v9.s[2] -mul v30.4S, v30.4S,v27.s[2] -mla v30.4S, v13.4S, v31.s[0] -sub v13.4s, v29.4s, v30.4s -add v29.4s, v29.4s, v30.4s -trn1 v30.4S, v16.4S, v5.4S -trn2 v23.4S, v16.4S, v5.4S -trn1 v4.4S, v29.4S, v13.4S -trn2 v0.4S, v29.4S, v13.4S -trn2 v29.2D, v30.2D, v4.2D -trn2 v13.2D, v23.2D, v0.2D -trn1 v16.2D, v30.2D, v4.2D -trn1 v5.2D, v23.2D, v0.2D -sqrdmulh v0.4S, v29.4S, v22.4S -mul v29.4S, v29.4S,v24.4S -mla v29.4S, v0.4S, v31.s[0] -sub v0.4s, v16.4s, v29.4s -add v16.4s, v16.4s, v29.4s -sqrdmulh v29.4S, v13.4S, v22.4S -mul v13.4S, v13.4S,v24.4S -mla v13.4S, v29.4S, v31.s[0] -sub v29.4s, v5.4s, v13.4s -add v5.4s, v5.4s, v13.4s -sqrdmulh v13.4S, v5.4S, v21.4S -mul v5.4S, v5.4S,v17.4S -mla v5.4S, v13.4S, v31.s[0] -sub v13.4s, v16.4s, v5.4s -add v16.4s, v16.4s, v5.4s -sqrdmulh v5.4S, v29.4S, v12.4S -mul v29.4S, v29.4S,v20.4S -mla v29.4S, v5.4S, v31.s[0] -sub v5.4s, v0.4s, v29.4s -add v0.4s, v0.4s, v29.4s -str q16, [x0, #192] -str q13, [x0, #208] -str q0, [x0, #224] -str q5, [x0, #240] -ldr q5, [x17, #+640] -ldr q0, [x17, #+656] -ldr q13, [x17, #+672] -ldr q16, [x17, #+688] -ldr q29, [x17, #+704] -ldr q23, [x17, #+720] -ldr q4, [x17, #+736] -ldr q30, [x17, #+752] -ldr q12, [x0, #288] -ldr q20, [x0, #304] -ldr q21, [x0, #256] -ldr q17, [x0, #272] -sqrdmulh v22.4S, v12.4S, v0.s[0] -mul v12.4S, v12.4S,v5.s[0] -mla v12.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v20.4S, v0.s[0] -mul v20.4S, v20.4S,v5.s[0] -mla v20.4S, v12.4S, v31.s[0] -sub v12.4s, v17.4s, v20.4s -add v17.4s, v17.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v0.s[1] -mul v17.4S, v17.4S,v5.s[1] -mla v17.4S, v20.4S, v31.s[0] -sub v20.4s, v21.4s, v17.4s -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v12.4S, v0.s[2] -mul v12.4S, v12.4S,v5.s[2] -mla v12.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -trn1 v12.4S, v21.4S, v20.4S -trn2 v24.4S, v21.4S, v20.4S -trn1 v9.4S, v22.4S, v17.4S -trn2 v27.4S, v22.4S, v17.4S -trn2 v22.2D, v12.2D, v9.2D -trn2 v17.2D, v24.2D, v27.2D -trn1 v21.2D, v12.2D, v9.2D -trn1 v20.2D, v24.2D, v27.2D -sqrdmulh v27.4S, v22.4S, v16.4S -mul v22.4S, v22.4S,v13.4S -mla v22.4S, v27.4S, v31.s[0] -sub v27.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v17.4S, v16.4S -mul v17.4S, v17.4S,v13.4S -mla v17.4S, v22.4S, v31.s[0] -sub v22.4s, v20.4s, v17.4s -add v20.4s, v20.4s, v17.4s -sqrdmulh v17.4S, v20.4S, v23.4S -mul v20.4S, v20.4S,v29.4S -mla v20.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v20.4s -add v21.4s, v21.4s, v20.4s -sqrdmulh v20.4S, v22.4S, v30.4S -mul v22.4S, v22.4S,v4.4S -mla v22.4S, v20.4S, v31.s[0] -sub v20.4s, v27.4s, v22.4s -add v27.4s, v27.4s, v22.4s -str q21, [x0, #256] -str q17, [x0, #272] -str q27, [x0, #288] -str q20, [x0, #304] -ldr q20, [x17, #+768] -ldr q27, [x17, #+784] -ldr q17, [x17, #+800] -ldr q21, [x17, #+816] -ldr q22, [x17, #+832] -ldr q24, [x17, #+848] -ldr q9, [x17, #+864] -ldr q12, [x17, #+880] -ldr q30, [x0, #352] -ldr q4, [x0, #368] -ldr q23, [x0, #320] -ldr q29, [x0, #336] -sqrdmulh v16.4S, v30.4S, v27.s[0] -mul v30.4S, v30.4S,v20.s[0] -mla v30.4S, v16.4S, v31.s[0] -sub v16.4s, v23.4s, v30.4s -add v23.4s, v23.4s, v30.4s -sqrdmulh v30.4S, v4.4S, v27.s[0] -mul v4.4S, v4.4S,v20.s[0] -mla v4.4S, v30.4S, v31.s[0] -sub v30.4s, v29.4s, v4.4s -add v29.4s, v29.4s, v4.4s -sqrdmulh v4.4S, v29.4S, v27.s[1] -mul v29.4S, v29.4S,v20.s[1] -mla v29.4S, v4.4S, v31.s[0] -sub v4.4s, v23.4s, v29.4s -add v23.4s, v23.4s, v29.4s -sqrdmulh v29.4S, v30.4S, v27.s[2] -mul v30.4S, v30.4S,v20.s[2] -mla v30.4S, v29.4S, v31.s[0] -sub v29.4s, v16.4s, v30.4s -add v16.4s, v16.4s, v30.4s -trn1 v30.4S, v23.4S, v4.4S -trn2 v13.4S, v23.4S, v4.4S -trn1 v0.4S, v16.4S, v29.4S -trn2 v5.4S, v16.4S, v29.4S -trn2 v16.2D, v30.2D, v0.2D -trn2 v29.2D, v13.2D, v5.2D -trn1 v23.2D, v30.2D, v0.2D -trn1 v4.2D, v13.2D, v5.2D -sqrdmulh v5.4S, v16.4S, v21.4S -mul v16.4S, v16.4S,v17.4S -mla v16.4S, v5.4S, v31.s[0] -sub v5.4s, v23.4s, v16.4s -add v23.4s, v23.4s, v16.4s -sqrdmulh v16.4S, v29.4S, v21.4S -mul v29.4S, v29.4S,v17.4S -mla v29.4S, v16.4S, v31.s[0] -sub v16.4s, v4.4s, v29.4s -add v4.4s, v4.4s, v29.4s -sqrdmulh v29.4S, v4.4S, v24.4S -mul v4.4S, v4.4S,v22.4S -mla v4.4S, v29.4S, v31.s[0] -sub v29.4s, v23.4s, v4.4s -add v23.4s, v23.4s, v4.4s -sqrdmulh v4.4S, v16.4S, v12.4S -mul v16.4S, v16.4S,v9.4S -mla v16.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v16.4s -add v5.4s, v5.4s, v16.4s -str q23, [x0, #320] -str q29, [x0, #336] -str q5, [x0, #352] -str q4, [x0, #368] -ldr q4, [x17, #+896] -ldr q5, [x17, #+912] -ldr q29, [x17, #+928] -ldr q23, [x17, #+944] -ldr q16, [x17, #+960] -ldr q13, [x17, #+976] -ldr q0, [x17, #+992] -ldr q30, [x17, #+1008] -ldr q12, [x0, #416] -ldr q9, [x0, #432] -ldr q24, [x0, #384] -ldr q22, [x0, #400] -sqrdmulh v21.4S, v12.4S, v5.s[0] -mul v12.4S, v12.4S,v4.s[0] -mla v12.4S, v21.4S, v31.s[0] -sub v21.4s, v24.4s, v12.4s -add v24.4s, v24.4s, v12.4s -sqrdmulh v12.4S, v9.4S, v5.s[0] -mul v9.4S, v9.4S,v4.s[0] -mla v9.4S, v12.4S, v31.s[0] -sub v12.4s, v22.4s, v9.4s -add v22.4s, v22.4s, v9.4s -sqrdmulh v9.4S, v22.4S, v5.s[1] -mul v22.4S, v22.4S,v4.s[1] -mla v22.4S, v9.4S, v31.s[0] -sub v9.4s, v24.4s, v22.4s -add v24.4s, v24.4s, v22.4s -sqrdmulh v22.4S, v12.4S, v5.s[2] -mul v12.4S, v12.4S,v4.s[2] -mla v12.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -trn1 v12.4S, v24.4S, v9.4S -trn2 v17.4S, v24.4S, v9.4S -trn1 v27.4S, v21.4S, v22.4S -trn2 v20.4S, v21.4S, v22.4S -trn2 v21.2D, v12.2D, v27.2D -trn2 v22.2D, v17.2D, v20.2D -trn1 v24.2D, v12.2D, v27.2D -trn1 v9.2D, v17.2D, v20.2D -sqrdmulh v20.4S, v21.4S, v23.4S -mul v21.4S, v21.4S,v29.4S -mla v21.4S, v20.4S, v31.s[0] -sub v20.4s, v24.4s, v21.4s -add v24.4s, v24.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v23.4S -mul v22.4S, v22.4S,v29.4S -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v9.4s, v22.4s -add v9.4s, v9.4s, v22.4s -sqrdmulh v22.4S, v9.4S, v13.4S -mul v9.4S, v9.4S,v16.4S -mla v9.4S, v22.4S, v31.s[0] -sub v22.4s, v24.4s, v9.4s -add v24.4s, v24.4s, v9.4s -sqrdmulh v9.4S, v21.4S, v30.4S -mul v21.4S, v21.4S,v0.4S -mla v21.4S, v9.4S, v31.s[0] -sub v9.4s, v20.4s, v21.4s -add v20.4s, v20.4s, v21.4s -str q24, [x0, #384] -str q22, [x0, #400] -str q20, [x0, #416] -str q9, [x0, #432] -ldr q9, [x17, #+1024] -ldr q20, [x17, #+1040] -ldr q22, [x17, #+1056] -ldr q24, [x17, #+1072] -ldr q21, [x17, #+1088] -ldr q17, [x17, #+1104] -ldr q27, [x17, #+1120] -ldr q12, [x17, #+1136] -ldr q30, [x0, #480] -ldr q0, [x0, #496] -ldr q13, [x0, #448] -ldr q16, [x0, #464] -sqrdmulh v23.4S, v30.4S, v20.s[0] -mul v30.4S, v30.4S,v9.s[0] -mla v30.4S, v23.4S, v31.s[0] -sub v23.4s, v13.4s, v30.4s -add v13.4s, v13.4s, v30.4s -sqrdmulh v30.4S, v0.4S, v20.s[0] -mul v0.4S, v0.4S,v9.s[0] -mla v0.4S, v30.4S, v31.s[0] -sub v30.4s, v16.4s, v0.4s -add v16.4s, v16.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v20.s[1] -mul v16.4S, v16.4S,v9.s[1] -mla v16.4S, v0.4S, v31.s[0] -sub v0.4s, v13.4s, v16.4s -add v13.4s, v13.4s, v16.4s -sqrdmulh v16.4S, v30.4S, v20.s[2] -mul v30.4S, v30.4S,v9.s[2] -mla v30.4S, v16.4S, v31.s[0] -sub v16.4s, v23.4s, v30.4s -add v23.4s, v23.4s, v30.4s -trn1 v30.4S, v13.4S, v0.4S -trn2 v29.4S, v13.4S, v0.4S -trn1 v5.4S, v23.4S, v16.4S -trn2 v4.4S, v23.4S, v16.4S -trn2 v23.2D, v30.2D, v5.2D -trn2 v16.2D, v29.2D, v4.2D -trn1 v13.2D, v30.2D, v5.2D -trn1 v0.2D, v29.2D, v4.2D -sqrdmulh v4.4S, v23.4S, v24.4S -mul v23.4S, v23.4S,v22.4S -mla v23.4S, v4.4S, v31.s[0] -sub v4.4s, v13.4s, v23.4s -add v13.4s, v13.4s, v23.4s -sqrdmulh v23.4S, v16.4S, v24.4S -mul v16.4S, v16.4S,v22.4S -mla v16.4S, v23.4S, v31.s[0] -sub v23.4s, v0.4s, v16.4s -add v0.4s, v0.4s, v16.4s -sqrdmulh v16.4S, v0.4S, v17.4S -mul v0.4S, v0.4S,v21.4S -mla v0.4S, v16.4S, v31.s[0] -sub v16.4s, v13.4s, v0.4s -add v13.4s, v13.4s, v0.4s -sqrdmulh v0.4S, v23.4S, v12.4S -mul v23.4S, v23.4S,v27.4S -mla v23.4S, v0.4S, v31.s[0] -sub v0.4s, v4.4s, v23.4s -add v4.4s, v4.4s, v23.4s -str q13, [x0, #448] -str q16, [x0, #464] -str q4, [x0, #480] -str q0, [x0, #496] -ldr q0, [x17, #+1152] -ldr q4, [x17, #+1168] -ldr q16, [x17, #+1184] -ldr q13, [x17, #+1200] -ldr q23, [x17, #+1216] -ldr q29, [x17, #+1232] -ldr q5, [x17, #+1248] -ldr q30, [x17, #+1264] -ldr q12, [x0, #544] -ldr q27, [x0, #560] -ldr q17, [x0, #512] -ldr q21, [x0, #528] -sqrdmulh v24.4S, v12.4S, v4.s[0] -mul v12.4S, v12.4S,v0.s[0] -mla v12.4S, v24.4S, v31.s[0] -sub v24.4s, v17.4s, v12.4s -add v17.4s, v17.4s, v12.4s -sqrdmulh v12.4S, v27.4S, v4.s[0] -mul v27.4S, v27.4S,v0.s[0] -mla v27.4S, v12.4S, v31.s[0] -sub v12.4s, v21.4s, v27.4s -add v21.4s, v21.4s, v27.4s -sqrdmulh v27.4S, v21.4S, v4.s[1] -mul v21.4S, v21.4S,v0.s[1] -mla v21.4S, v27.4S, v31.s[0] -sub v27.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v12.4S, v4.s[2] -mul v12.4S, v12.4S,v0.s[2] -mla v12.4S, v21.4S, v31.s[0] -sub v21.4s, v24.4s, v12.4s -add v24.4s, v24.4s, v12.4s -trn1 v12.4S, v17.4S, v27.4S -trn2 v22.4S, v17.4S, v27.4S -trn1 v20.4S, v24.4S, v21.4S -trn2 v9.4S, v24.4S, v21.4S -trn2 v24.2D, v12.2D, v20.2D -trn2 v21.2D, v22.2D, v9.2D -trn1 v17.2D, v12.2D, v20.2D -trn1 v27.2D, v22.2D, v9.2D -sqrdmulh v9.4S, v24.4S, v13.4S -mul v24.4S, v24.4S,v16.4S -mla v24.4S, v9.4S, v31.s[0] -sub v9.4s, v17.4s, v24.4s -add v17.4s, v17.4s, v24.4s -sqrdmulh v24.4S, v21.4S, v13.4S -mul v21.4S, v21.4S,v16.4S -mla v21.4S, v24.4S, v31.s[0] -sub v24.4s, v27.4s, v21.4s -add v27.4s, v27.4s, v21.4s -sqrdmulh v21.4S, v27.4S, v29.4S -mul v27.4S, v27.4S,v23.4S -mla v27.4S, v21.4S, v31.s[0] -sub v21.4s, v17.4s, v27.4s -add v17.4s, v17.4s, v27.4s -sqrdmulh v27.4S, v24.4S, v30.4S -mul v24.4S, v24.4S,v5.4S -mla v24.4S, v27.4S, v31.s[0] -sub v27.4s, v9.4s, v24.4s -add v9.4s, v9.4s, v24.4s -str q17, [x0, #512] -str q21, [x0, #528] -str q9, [x0, #544] -str q27, [x0, #560] -ldr q27, [x17, #+1280] -ldr q9, [x17, #+1296] -ldr q21, [x17, #+1312] -ldr q17, [x17, #+1328] -ldr q24, [x17, #+1344] -ldr q22, [x17, #+1360] -ldr q20, [x17, #+1376] -ldr q12, [x17, #+1392] -ldr q30, [x0, #608] -ldr q5, [x0, #624] -ldr q29, [x0, #576] -ldr q23, [x0, #592] -sqrdmulh v13.4S, v30.4S, v9.s[0] -mul v30.4S, v30.4S,v27.s[0] -mla v30.4S, v13.4S, v31.s[0] -sub v13.4s, v29.4s, v30.4s -add v29.4s, v29.4s, v30.4s -sqrdmulh v30.4S, v5.4S, v9.s[0] -mul v5.4S, v5.4S,v27.s[0] -mla v5.4S, v30.4S, v31.s[0] -sub v30.4s, v23.4s, v5.4s -add v23.4s, v23.4s, v5.4s -sqrdmulh v5.4S, v23.4S, v9.s[1] -mul v23.4S, v23.4S,v27.s[1] -mla v23.4S, v5.4S, v31.s[0] -sub v5.4s, v29.4s, v23.4s -add v29.4s, v29.4s, v23.4s -sqrdmulh v23.4S, v30.4S, v9.s[2] -mul v30.4S, v30.4S,v27.s[2] -mla v30.4S, v23.4S, v31.s[0] -sub v23.4s, v13.4s, v30.4s -add v13.4s, v13.4s, v30.4s -trn1 v30.4S, v29.4S, v5.4S -trn2 v16.4S, v29.4S, v5.4S -trn1 v4.4S, v13.4S, v23.4S -trn2 v0.4S, v13.4S, v23.4S -trn2 v13.2D, v30.2D, v4.2D -trn2 v23.2D, v16.2D, v0.2D -trn1 v29.2D, v30.2D, v4.2D -trn1 v5.2D, v16.2D, v0.2D -sqrdmulh v0.4S, v13.4S, v17.4S -mul v13.4S, v13.4S,v21.4S -mla v13.4S, v0.4S, v31.s[0] -sub v0.4s, v29.4s, v13.4s -add v29.4s, v29.4s, v13.4s -sqrdmulh v13.4S, v23.4S, v17.4S -mul v23.4S, v23.4S,v21.4S -mla v23.4S, v13.4S, v31.s[0] -sub v13.4s, v5.4s, v23.4s -add v5.4s, v5.4s, v23.4s -sqrdmulh v23.4S, v5.4S, v22.4S -mul v5.4S, v5.4S,v24.4S -mla v5.4S, v23.4S, v31.s[0] -sub v23.4s, v29.4s, v5.4s -add v29.4s, v29.4s, v5.4s -sqrdmulh v5.4S, v13.4S, v12.4S -mul v13.4S, v13.4S,v20.4S -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v0.4s, v13.4s -add v0.4s, v0.4s, v13.4s -str q29, [x0, #576] -str q23, [x0, #592] -str q0, [x0, #608] -str q5, [x0, #624] -ldr q5, [x17, #+1408] -ldr q0, [x17, #+1424] -ldr q23, [x17, #+1440] -ldr q29, [x17, #+1456] -ldr q13, [x17, #+1472] -ldr q16, [x17, #+1488] -ldr q4, [x17, #+1504] -ldr q30, [x17, #+1520] -ldr q12, [x0, #672] -ldr q20, [x0, #688] -ldr q22, [x0, #640] -ldr q24, [x0, #656] -sqrdmulh v17.4S, v12.4S, v0.s[0] -mul v12.4S, v12.4S,v5.s[0] -mla v12.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v20.4S, v0.s[0] -mul v20.4S, v20.4S,v5.s[0] -mla v20.4S, v12.4S, v31.s[0] -sub v12.4s, v24.4s, v20.4s -add v24.4s, v24.4s, v20.4s -sqrdmulh v20.4S, v24.4S, v0.s[1] -mul v24.4S, v24.4S,v5.s[1] -mla v24.4S, v20.4S, v31.s[0] -sub v20.4s, v22.4s, v24.4s -add v22.4s, v22.4s, v24.4s -sqrdmulh v24.4S, v12.4S, v0.s[2] -mul v12.4S, v12.4S,v5.s[2] -mla v12.4S, v24.4S, v31.s[0] -sub v24.4s, v17.4s, v12.4s -add v17.4s, v17.4s, v12.4s -trn1 v12.4S, v22.4S, v20.4S -trn2 v21.4S, v22.4S, v20.4S -trn1 v9.4S, v17.4S, v24.4S -trn2 v27.4S, v17.4S, v24.4S -trn2 v17.2D, v12.2D, v9.2D -trn2 v24.2D, v21.2D, v27.2D -trn1 v22.2D, v12.2D, v9.2D -trn1 v20.2D, v21.2D, v27.2D -sqrdmulh v27.4S, v17.4S, v29.4S -mul v17.4S, v17.4S,v23.4S -mla v17.4S, v27.4S, v31.s[0] -sub v27.4s, v22.4s, v17.4s -add v22.4s, v22.4s, v17.4s -sqrdmulh v17.4S, v24.4S, v29.4S -mul v24.4S, v24.4S,v23.4S -mla v24.4S, v17.4S, v31.s[0] -sub v17.4s, v20.4s, v24.4s -add v20.4s, v20.4s, v24.4s -sqrdmulh v24.4S, v20.4S, v16.4S -mul v20.4S, v20.4S,v13.4S -mla v20.4S, v24.4S, v31.s[0] -sub v24.4s, v22.4s, v20.4s -add v22.4s, v22.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v30.4S -mul v17.4S, v17.4S,v4.4S -mla v17.4S, v20.4S, v31.s[0] -sub v20.4s, v27.4s, v17.4s -add v27.4s, v27.4s, v17.4s -str q22, [x0, #640] -str q24, [x0, #656] -str q27, [x0, #672] -str q20, [x0, #688] -ldr q20, [x17, #+1536] -ldr q27, [x17, #+1552] -ldr q24, [x17, #+1568] -ldr q22, [x17, #+1584] -ldr q17, [x17, #+1600] -ldr q21, [x17, #+1616] -ldr q9, [x17, #+1632] -ldr q12, [x17, #+1648] -ldr q30, [x0, #736] -ldr q4, [x0, #752] -ldr q16, [x0, #704] -ldr q13, [x0, #720] -sqrdmulh v29.4S, v30.4S, v27.s[0] -mul v30.4S, v30.4S,v20.s[0] -mla v30.4S, v29.4S, v31.s[0] -sub v29.4s, v16.4s, v30.4s -add v16.4s, v16.4s, v30.4s -sqrdmulh v30.4S, v4.4S, v27.s[0] -mul v4.4S, v4.4S,v20.s[0] -mla v4.4S, v30.4S, v31.s[0] -sub v30.4s, v13.4s, v4.4s -add v13.4s, v13.4s, v4.4s -sqrdmulh v4.4S, v13.4S, v27.s[1] -mul v13.4S, v13.4S,v20.s[1] -mla v13.4S, v4.4S, v31.s[0] -sub v4.4s, v16.4s, v13.4s -add v16.4s, v16.4s, v13.4s -sqrdmulh v13.4S, v30.4S, v27.s[2] -mul v30.4S, v30.4S,v20.s[2] -mla v30.4S, v13.4S, v31.s[0] -sub v13.4s, v29.4s, v30.4s -add v29.4s, v29.4s, v30.4s -trn1 v30.4S, v16.4S, v4.4S -trn2 v23.4S, v16.4S, v4.4S -trn1 v0.4S, v29.4S, v13.4S -trn2 v5.4S, v29.4S, v13.4S -trn2 v29.2D, v30.2D, v0.2D -trn2 v13.2D, v23.2D, v5.2D -trn1 v16.2D, v30.2D, v0.2D -trn1 v4.2D, v23.2D, v5.2D -sqrdmulh v5.4S, v29.4S, v22.4S -mul v29.4S, v29.4S,v24.4S -mla v29.4S, v5.4S, v31.s[0] -sub v5.4s, v16.4s, v29.4s -add v16.4s, v16.4s, v29.4s -sqrdmulh v29.4S, v13.4S, v22.4S -mul v13.4S, v13.4S,v24.4S -mla v13.4S, v29.4S, v31.s[0] -sub v29.4s, v4.4s, v13.4s -add v4.4s, v4.4s, v13.4s -sqrdmulh v13.4S, v4.4S, v21.4S -mul v4.4S, v4.4S,v17.4S -mla v4.4S, v13.4S, v31.s[0] -sub v13.4s, v16.4s, v4.4s -add v16.4s, v16.4s, v4.4s -sqrdmulh v4.4S, v29.4S, v12.4S -mul v29.4S, v29.4S,v9.4S -mla v29.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v29.4s -add v5.4s, v5.4s, v29.4s -str q16, [x0, #704] -str q13, [x0, #720] -str q5, [x0, #736] -str q4, [x0, #752] -ldr q4, [x17, #+1664] -ldr q5, [x17, #+1680] -ldr q13, [x17, #+1696] -ldr q16, [x17, #+1712] -ldr q29, [x17, #+1728] -ldr q23, [x17, #+1744] -ldr q0, [x17, #+1760] -ldr q30, [x17, #+1776] -ldr q12, [x0, #800] -ldr q9, [x0, #816] -ldr q21, [x0, #768] -ldr q17, [x0, #784] -sqrdmulh v22.4S, v12.4S, v5.s[0] -mul v12.4S, v12.4S,v4.s[0] -mla v12.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v9.4S, v5.s[0] -mul v9.4S, v9.4S,v4.s[0] -mla v9.4S, v12.4S, v31.s[0] -sub v12.4s, v17.4s, v9.4s -add v17.4s, v17.4s, v9.4s -sqrdmulh v9.4S, v17.4S, v5.s[1] -mul v17.4S, v17.4S,v4.s[1] -mla v17.4S, v9.4S, v31.s[0] -sub v9.4s, v21.4s, v17.4s -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v12.4S, v5.s[2] -mul v12.4S, v12.4S,v4.s[2] -mla v12.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -trn1 v12.4S, v21.4S, v9.4S -trn2 v24.4S, v21.4S, v9.4S -trn1 v27.4S, v22.4S, v17.4S -trn2 v20.4S, v22.4S, v17.4S -trn2 v22.2D, v12.2D, v27.2D -trn2 v17.2D, v24.2D, v20.2D -trn1 v21.2D, v12.2D, v27.2D -trn1 v9.2D, v24.2D, v20.2D -sqrdmulh v20.4S, v22.4S, v16.4S -mul v22.4S, v22.4S,v13.4S -mla v22.4S, v20.4S, v31.s[0] -sub v20.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v17.4S, v16.4S -mul v17.4S, v17.4S,v13.4S -mla v17.4S, v22.4S, v31.s[0] -sub v22.4s, v9.4s, v17.4s -add v9.4s, v9.4s, v17.4s -sqrdmulh v17.4S, v9.4S, v23.4S -mul v9.4S, v9.4S,v29.4S -mla v9.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v9.4s -add v21.4s, v21.4s, v9.4s -sqrdmulh v9.4S, v22.4S, v30.4S -mul v22.4S, v22.4S,v0.4S -mla v22.4S, v9.4S, v31.s[0] -sub v9.4s, v20.4s, v22.4s -add v20.4s, v20.4s, v22.4s -str q21, [x0, #768] -str q17, [x0, #784] -str q20, [x0, #800] -str q9, [x0, #816] -ldr q9, [x17, #+1792] -ldr q20, [x17, #+1808] -ldr q17, [x17, #+1824] -ldr q21, [x17, #+1840] -ldr q22, [x17, #+1856] -ldr q24, [x17, #+1872] -ldr q27, [x17, #+1888] -ldr q12, [x17, #+1904] -ldr q30, [x0, #864] -ldr q0, [x0, #880] -ldr q23, [x0, #832] -ldr q29, [x0, #848] -sqrdmulh v16.4S, v30.4S, v20.s[0] -mul v30.4S, v30.4S,v9.s[0] -mla v30.4S, v16.4S, v31.s[0] -sub v16.4s, v23.4s, v30.4s -add v23.4s, v23.4s, v30.4s -sqrdmulh v30.4S, v0.4S, v20.s[0] -mul v0.4S, v0.4S,v9.s[0] -mla v0.4S, v30.4S, v31.s[0] -sub v30.4s, v29.4s, v0.4s -add v29.4s, v29.4s, v0.4s -sqrdmulh v0.4S, v29.4S, v20.s[1] -mul v29.4S, v29.4S,v9.s[1] -mla v29.4S, v0.4S, v31.s[0] -sub v0.4s, v23.4s, v29.4s -add v23.4s, v23.4s, v29.4s -sqrdmulh v29.4S, v30.4S, v20.s[2] -mul v30.4S, v30.4S,v9.s[2] -mla v30.4S, v29.4S, v31.s[0] -sub v29.4s, v16.4s, v30.4s -add v16.4s, v16.4s, v30.4s -trn1 v30.4S, v23.4S, v0.4S -trn2 v13.4S, v23.4S, v0.4S -trn1 v5.4S, v16.4S, v29.4S -trn2 v4.4S, v16.4S, v29.4S -trn2 v16.2D, v30.2D, v5.2D -trn2 v29.2D, v13.2D, v4.2D -trn1 v23.2D, v30.2D, v5.2D -trn1 v0.2D, v13.2D, v4.2D -sqrdmulh v4.4S, v16.4S, v21.4S -mul v16.4S, v16.4S,v17.4S -mla v16.4S, v4.4S, v31.s[0] -sub v4.4s, v23.4s, v16.4s -add v23.4s, v23.4s, v16.4s -sqrdmulh v16.4S, v29.4S, v21.4S -mul v29.4S, v29.4S,v17.4S -mla v29.4S, v16.4S, v31.s[0] -sub v16.4s, v0.4s, v29.4s -add v0.4s, v0.4s, v29.4s -sqrdmulh v29.4S, v0.4S, v24.4S -mul v0.4S, v0.4S,v22.4S -mla v0.4S, v29.4S, v31.s[0] -sub v29.4s, v23.4s, v0.4s -add v23.4s, v23.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v12.4S -mul v16.4S, v16.4S,v27.4S -mla v16.4S, v0.4S, v31.s[0] -sub v0.4s, v4.4s, v16.4s -add v4.4s, v4.4s, v16.4s -str q23, [x0, #832] -str q29, [x0, #848] -str q4, [x0, #864] -str q0, [x0, #880] -ldr q0, [x17, #+1920] -ldr q4, [x17, #+1936] -ldr q29, [x17, #+1952] -ldr q23, [x17, #+1968] -ldr q16, [x17, #+1984] -ldr q13, [x17, #+2000] -ldr q5, [x17, #+2016] -ldr q30, [x17, #+2032] -ldr q12, [x0, #928] -ldr q27, [x0, #944] -ldr q24, [x0, #896] -ldr q22, [x0, #912] -sqrdmulh v21.4S, v12.4S, v4.s[0] -mul v12.4S, v12.4S,v0.s[0] -mla v12.4S, v21.4S, v31.s[0] -sub v21.4s, v24.4s, v12.4s -add v24.4s, v24.4s, v12.4s -sqrdmulh v12.4S, v27.4S, v4.s[0] -mul v27.4S, v27.4S,v0.s[0] -mla v27.4S, v12.4S, v31.s[0] -sub v12.4s, v22.4s, v27.4s -add v22.4s, v22.4s, v27.4s -sqrdmulh v27.4S, v22.4S, v4.s[1] -mul v22.4S, v22.4S,v0.s[1] -mla v22.4S, v27.4S, v31.s[0] -sub v27.4s, v24.4s, v22.4s -add v24.4s, v24.4s, v22.4s -sqrdmulh v22.4S, v12.4S, v4.s[2] -mul v12.4S, v12.4S,v0.s[2] -mla v12.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -trn1 v12.4S, v24.4S, v27.4S -trn2 v17.4S, v24.4S, v27.4S -trn1 v20.4S, v21.4S, v22.4S -trn2 v9.4S, v21.4S, v22.4S -trn2 v21.2D, v12.2D, v20.2D -trn2 v22.2D, v17.2D, v9.2D -trn1 v24.2D, v12.2D, v20.2D -trn1 v27.2D, v17.2D, v9.2D -sqrdmulh v9.4S, v21.4S, v23.4S -mul v21.4S, v21.4S,v29.4S -mla v21.4S, v9.4S, v31.s[0] -sub v9.4s, v24.4s, v21.4s -add v24.4s, v24.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v23.4S -mul v22.4S, v22.4S,v29.4S -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v27.4s, v22.4s -add v27.4s, v27.4s, v22.4s -sqrdmulh v22.4S, v27.4S, v13.4S -mul v27.4S, v27.4S,v16.4S -mla v27.4S, v22.4S, v31.s[0] -sub v22.4s, v24.4s, v27.4s -add v24.4s, v24.4s, v27.4s -sqrdmulh v27.4S, v21.4S, v30.4S -mul v21.4S, v21.4S,v5.4S -mla v21.4S, v27.4S, v31.s[0] -sub v27.4s, v9.4s, v21.4s -add v9.4s, v9.4s, v21.4s -str q24, [x0, #896] -str q22, [x0, #912] -str q9, [x0, #928] -str q27, [x0, #944] -ldr q27, [x17, #+2048] -ldr q9, [x17, #+2064] -ldr q22, [x17, #+2080] -ldr q24, [x17, #+2096] -ldr q21, [x17, #+2112] -ldr q17, [x17, #+2128] -ldr q20, [x17, #+2144] -ldr q12, [x17, #+2160] -ldr q30, [x0, #992] -ldr q5, [x0, #1008] -ldr q13, [x0, #960] -ldr q16, [x0, #976] -sqrdmulh v23.4S, v30.4S, v9.s[0] -mul v30.4S, v30.4S,v27.s[0] -mla v30.4S, v23.4S, v31.s[0] -sub v23.4s, v13.4s, v30.4s -add v13.4s, v13.4s, v30.4s -sqrdmulh v30.4S, v5.4S, v9.s[0] -mul v5.4S, v5.4S,v27.s[0] -mla v5.4S, v30.4S, v31.s[0] -sub v30.4s, v16.4s, v5.4s -add v16.4s, v16.4s, v5.4s -sqrdmulh v5.4S, v16.4S, v9.s[1] -mul v16.4S, v16.4S,v27.s[1] -mla v16.4S, v5.4S, v31.s[0] -sub v5.4s, v13.4s, v16.4s -add v13.4s, v13.4s, v16.4s -sqrdmulh v16.4S, v30.4S, v9.s[2] -mul v30.4S, v30.4S,v27.s[2] -mla v30.4S, v16.4S, v31.s[0] -sub v16.4s, v23.4s, v30.4s -add v23.4s, v23.4s, v30.4s -trn1 v30.4S, v13.4S, v5.4S -trn2 v29.4S, v13.4S, v5.4S -trn1 v4.4S, v23.4S, v16.4S -trn2 v0.4S, v23.4S, v16.4S -trn2 v23.2D, v30.2D, v4.2D -trn2 v16.2D, v29.2D, v0.2D -trn1 v13.2D, v30.2D, v4.2D -trn1 v5.2D, v29.2D, v0.2D -sqrdmulh v0.4S, v23.4S, v24.4S -mul v23.4S, v23.4S,v22.4S -mla v23.4S, v0.4S, v31.s[0] -sub v0.4s, v13.4s, v23.4s -add v13.4s, v13.4s, v23.4s -sqrdmulh v23.4S, v16.4S, v24.4S -mul v16.4S, v16.4S,v22.4S -mla v16.4S, v23.4S, v31.s[0] -sub v23.4s, v5.4s, v16.4s -add v5.4s, v5.4s, v16.4s -sqrdmulh v16.4S, v5.4S, v17.4S -mul v5.4S, v5.4S,v21.4S -mla v5.4S, v16.4S, v31.s[0] -sub v16.4s, v13.4s, v5.4s -add v13.4s, v13.4s, v5.4s -sqrdmulh v5.4S, v23.4S, v12.4S -mul v23.4S, v23.4S,v20.4S -mla v23.4S, v5.4S, v31.s[0] -sub v5.4s, v0.4s, v23.4s -add v0.4s, v0.4s, v23.4s -str q13, [x0, #960] -str q16, [x0, #976] -str q0, [x0, #992] -str q5, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2476 -// Instruction count: 2472 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_16_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_16_0.s deleted file mode 100644 index 2b07129..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_16_0.s +++ /dev/null @@ -1,2506 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_16_0 -.global _ntt_u32_full_neon_asm_var_4_4_16_0 -ntt_u32_full_neon_asm_var_4_4_16_0: -_ntt_u32_full_neon_asm_var_4_4_16_0: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x0, #992] -ldr q29, [x17, #+0] -ldr q28, [x17, #+16] -sqrdmulh v27.4S, v30.4S, v28.s[0] -mul v30.4S, v30.4S,v29.s[0] -ldr q26, [x0, #928] -sqrdmulh v25.4S, v26.4S, v28.s[0] -mul v26.4S, v26.4S,v29.s[0] -ldr q24, [x0, #864] -sqrdmulh v23.4S, v24.4S, v28.s[0] -mul v24.4S, v24.4S,v29.s[0] -ldr q22, [x0, #800] -sqrdmulh v21.4S, v22.4S, v28.s[0] -mul v22.4S, v22.4S,v29.s[0] -ldr q20, [x0, #736] -mla v30.4S, v27.4S, v31.s[0] -sqrdmulh v27.4S, v20.4S, v28.s[0] -ldr q19, [x0, #672] -mla v26.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v19.4S, v28.s[0] -nop -ldr q18, [x0, #608] -mla v24.4S, v23.4S, v31.s[0] -sqrdmulh v23.4S, v18.4S, v28.s[0] -nop -ldr q17, [x0, #544] -mla v22.4S, v21.4S, v31.s[0] -nop -sqrdmulh v21.4S, v17.4S, v28.s[0] -ldr q16, [x0, #480] -ldr q3, [x0, #416] -mul v20.4S, v20.4S,v29.s[0] -sub v2.4s, v16.4s, v30.4s -mul v19.4S, v19.4S,v29.s[0] -add v16.4s, v16.4s, v30.4s -ldr q30, [x0, #352] -ldr q1, [x0, #288] -mla v20.4S, v27.4S, v31.s[0] -sub v27.4s, v3.4s, v26.4s -mla v19.4S, v25.4S, v31.s[0] -add v3.4s, v3.4s, v26.4s -ldr q26, [x0, #224] -ldr q25, [x0, #160] -mul v18.4S, v18.4S,v29.s[0] -sub v0.4s, v30.4s, v24.4s -mul v17.4S, v17.4S,v29.s[0] -add v30.4s, v30.4s, v24.4s -ldr q24, [x0, #96] -ldr q15, [x0, #32] -mla v18.4S, v23.4S, v31.s[0] -sub v23.4s, v1.4s, v22.4s -mla v17.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v2.4S, v28.s[2] -nop -mul v2.4S, v2.4S,v29.s[2] -nop -sqrdmulh v21.4S, v27.4S, v28.s[2] -sub v14.4s, v26.4s, v20.4s -mul v27.4S, v27.4S,v29.s[2] -add v26.4s, v26.4s, v20.4s -sqrdmulh v20.4S, v16.4S, v28.s[1] -sub v13.4s, v25.4s, v19.4s -mul v16.4S, v16.4S,v29.s[1] -add v25.4s, v25.4s, v19.4s -sqrdmulh v19.4S, v3.4S, v28.s[1] -sub v12.4s, v24.4s, v18.4s -mul v3.4S, v3.4S,v29.s[1] -add v24.4s, v24.4s, v18.4s -mla v2.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v17.4s -sqrdmulh v18.4S, v0.4S, v28.s[2] -add v15.4s, v15.4s, v17.4s -mla v27.4S, v21.4S, v31.s[0] -nop -sqrdmulh v21.4S, v23.4S, v28.s[2] -nop -mla v16.4S, v20.4S, v31.s[0] -nop -sqrdmulh v20.4S, v30.4S, v28.s[1] -nop -mla v3.4S, v19.4S, v31.s[0] -nop -sqrdmulh v19.4S, v1.4S, v28.s[1] -nop -ldr q17, [x17, #+32] -ldr q11, [x17, #+48] -mul v0.4S, v0.4S,v29.s[2] -sub v10.4s, v14.4s, v2.4s -mul v23.4S, v23.4S,v29.s[2] -add v14.4s, v14.4s, v2.4s -mla v0.4S, v18.4S, v31.s[0] -sub v18.4s, v13.4s, v27.4s -mla v23.4S, v21.4S, v31.s[0] -add v13.4s, v13.4s, v27.4s -mul v30.4S, v30.4S,v29.s[1] -sub v27.4s, v26.4s, v16.4s -mul v1.4S, v1.4S,v29.s[1] -add v26.4s, v26.4s, v16.4s -mla v30.4S, v20.4S, v31.s[0] -sub v20.4s, v25.4s, v3.4s -mla v1.4S, v19.4S, v31.s[0] -add v25.4s, v25.4s, v3.4s -sqrdmulh v3.4S, v10.4S, v11.s[3] -nop -mul v10.4S, v10.4S,v17.s[3] -nop -sqrdmulh v19.4S, v14.4S, v11.s[2] -sub v16.4s, v12.4s, v0.4s -mul v14.4S, v14.4S,v17.s[2] -add v12.4s, v12.4s, v0.4s -sqrdmulh v0.4S, v27.4S, v11.s[1] -sub v21.4s, v22.4s, v23.4s -mul v27.4S, v27.4S,v17.s[1] -add v22.4s, v22.4s, v23.4s -sqrdmulh v23.4S, v26.4S, v11.s[0] -sub v2.4s, v24.4s, v30.4s -mul v26.4S, v26.4S,v17.s[0] -add v24.4s, v24.4s, v30.4s -ldr q30, [x17, #+96] -ldr q9, [x17, #+112] -mla v10.4S, v3.4S, v31.s[0] -sub v3.4s, v15.4s, v1.4s -sqrdmulh v8.4S, v18.4S, v11.s[3] -add v15.4s, v15.4s, v1.4s -mla v14.4S, v19.4S, v31.s[0] -nop -sqrdmulh v19.4S, v13.4S, v11.s[2] -nop -mla v27.4S, v0.4S, v31.s[0] -nop -sqrdmulh v0.4S, v20.4S, v11.s[1] -nop -mla v26.4S, v23.4S, v31.s[0] -nop -sqrdmulh v23.4S, v25.4S, v11.s[0] -nop -ldr q1, [x17, #+64] -ldr q7, [x17, #+80] -mul v18.4S, v18.4S,v17.s[3] -sub v6.4s, v16.4s, v10.4s -mul v13.4S, v13.4S,v17.s[2] -add v16.4s, v16.4s, v10.4s -mla v18.4S, v8.4S, v31.s[0] -sub v8.4s, v12.4s, v14.4s -mla v13.4S, v19.4S, v31.s[0] -add v12.4s, v12.4s, v14.4s -mul v20.4S, v20.4S,v17.s[1] -sub v14.4s, v2.4s, v27.4s -mul v25.4S, v25.4S,v17.s[0] -add v2.4s, v2.4s, v27.4s -mla v20.4S, v0.4S, v31.s[0] -sub v0.4s, v24.4s, v26.4s -mla v25.4S, v23.4S, v31.s[0] -add v24.4s, v24.4s, v26.4s -sqrdmulh v26.4S, v6.4S, v9.s[3] -nop -mul v6.4S, v6.4S,v30.s[3] -nop -sqrdmulh v23.4S, v16.4S, v9.s[2] -sub v27.4s, v21.4s, v18.4s -mul v16.4S, v16.4S,v30.s[2] -add v21.4s, v21.4s, v18.4s -sqrdmulh v18.4S, v8.4S, v9.s[1] -sub v19.4s, v22.4s, v13.4s -mul v8.4S, v8.4S,v30.s[1] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v12.4S, v9.s[0] -sub v10.4s, v3.4s, v20.4s -mul v12.4S, v12.4S,v30.s[0] -add v3.4s, v3.4s, v20.4s -mla v6.4S, v26.4S, v31.s[0] -sub v26.4s, v15.4s, v25.4s -sqrdmulh v20.4S, v14.4S, v7.s[3] -add v15.4s, v15.4s, v25.4s -mla v16.4S, v23.4S, v31.s[0] -sub v23.4s, v27.4s, v6.4s -sqrdmulh v25.4S, v2.4S, v7.s[2] -add v27.4s, v27.4s, v6.4s -mla v8.4S, v18.4S, v31.s[0] -sub v18.4s, v21.4s, v16.4s -sqrdmulh v6.4S, v0.4S, v7.s[1] -add v21.4s, v21.4s, v16.4s -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v19.4s, v8.4s -sqrdmulh v16.4S, v24.4S, v7.s[0] -add v19.4s, v19.4s, v8.4s -mul v14.4S, v14.4S,v1.s[3] -sub v8.4s, v22.4s, v12.4s -mul v2.4S, v2.4S,v1.s[2] -add v22.4s, v22.4s, v12.4s -mla v14.4S, v20.4S, v31.s[0] -str q23, [x0, #992] -mla v2.4S, v25.4S, v31.s[0] -str q27, [x0, #928] -mul v0.4S, v0.4S,v1.s[1] -str q18, [x0, #864] -mul v24.4S, v24.4S,v1.s[0] -str q21, [x0, #800] -mla v0.4S, v6.4S, v31.s[0] -str q13, [x0, #736] -mla v24.4S, v16.4S, v31.s[0] -str q19, [x0, #672] -ldr q19, [x0, #1008] -sqrdmulh v16.4S, v19.4S, v28.s[0] -str q8, [x0, #608] -mul v19.4S, v19.4S,v29.s[0] -str q22, [x0, #544] -ldr q22, [x0, #944] -sqrdmulh v8.4S, v22.4S, v28.s[0] -sub v13.4s, v10.4s, v14.4s -str q13, [x0, #480] -mul v22.4S, v22.4S,v29.s[0] -add v10.4s, v10.4s, v14.4s -ldr q14, [x0, #880] -sqrdmulh v13.4S, v14.4S, v28.s[0] -sub v6.4s, v3.4s, v2.4s -str q10, [x0, #416] -mul v14.4S, v14.4S,v29.s[0] -add v3.4s, v3.4s, v2.4s -ldr q2, [x0, #816] -sqrdmulh v10.4S, v2.4S, v28.s[0] -sub v21.4s, v26.4s, v0.4s -str q6, [x0, #352] -mul v2.4S, v2.4S,v29.s[0] -add v26.4s, v26.4s, v0.4s -ldr q0, [x0, #752] -mla v19.4S, v16.4S, v31.s[0] -sub v16.4s, v15.4s, v24.4s -str q3, [x0, #288] -sqrdmulh v3.4S, v0.4S, v28.s[0] -add v15.4s, v15.4s, v24.4s -ldr q24, [x0, #688] -mla v22.4S, v8.4S, v31.s[0] -str q21, [x0, #224] -sqrdmulh v21.4S, v24.4S, v28.s[0] -nop -ldr q8, [x0, #624] -mla v14.4S, v13.4S, v31.s[0] -str q26, [x0, #160] -sqrdmulh v26.4S, v8.4S, v28.s[0] -nop -ldr q13, [x0, #560] -mla v2.4S, v10.4S, v31.s[0] -nop -sqrdmulh v10.4S, v13.4S, v28.s[0] -str q16, [x0, #96] -ldr q16, [x0, #496] -ldr q6, [x0, #432] -mul v0.4S, v0.4S,v29.s[0] -sub v18.4s, v16.4s, v19.4s -str q15, [x0, #32] -mul v24.4S, v24.4S,v29.s[0] -add v16.4s, v16.4s, v19.4s -ldr q19, [x0, #368] -ldr q15, [x0, #304] -mla v0.4S, v3.4S, v31.s[0] -sub v3.4s, v6.4s, v22.4s -mla v24.4S, v21.4S, v31.s[0] -add v6.4s, v6.4s, v22.4s -ldr q22, [x0, #240] -ldr q21, [x0, #176] -mul v8.4S, v8.4S,v29.s[0] -sub v27.4s, v19.4s, v14.4s -mul v13.4S, v13.4S,v29.s[0] -add v19.4s, v19.4s, v14.4s -ldr q14, [x0, #112] -ldr q25, [x0, #48] -mla v8.4S, v26.4S, v31.s[0] -sub v26.4s, v15.4s, v2.4s -mla v13.4S, v10.4S, v31.s[0] -add v15.4s, v15.4s, v2.4s -sqrdmulh v2.4S, v18.4S, v28.s[2] -nop -mul v18.4S, v18.4S,v29.s[2] -nop -sqrdmulh v10.4S, v3.4S, v28.s[2] -sub v23.4s, v22.4s, v0.4s -mul v3.4S, v3.4S,v29.s[2] -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v28.s[1] -sub v20.4s, v21.4s, v24.4s -mul v16.4S, v16.4S,v29.s[1] -add v21.4s, v21.4s, v24.4s -sqrdmulh v24.4S, v6.4S, v28.s[1] -sub v12.4s, v14.4s, v8.4s -mul v6.4S, v6.4S,v29.s[1] -add v14.4s, v14.4s, v8.4s -mla v18.4S, v2.4S, v31.s[0] -sub v2.4s, v25.4s, v13.4s -sqrdmulh v8.4S, v27.4S, v28.s[2] -add v25.4s, v25.4s, v13.4s -mla v3.4S, v10.4S, v31.s[0] -nop -sqrdmulh v10.4S, v26.4S, v28.s[2] -nop -mla v16.4S, v0.4S, v31.s[0] -nop -sqrdmulh v0.4S, v19.4S, v28.s[1] -nop -mla v6.4S, v24.4S, v31.s[0] -nop -sqrdmulh v24.4S, v15.4S, v28.s[1] -nop -mul v27.4S, v27.4S,v29.s[2] -sub v13.4s, v23.4s, v18.4s -mul v26.4S, v26.4S,v29.s[2] -add v23.4s, v23.4s, v18.4s -mla v27.4S, v8.4S, v31.s[0] -sub v8.4s, v20.4s, v3.4s -mla v26.4S, v10.4S, v31.s[0] -add v20.4s, v20.4s, v3.4s -mul v19.4S, v19.4S,v29.s[1] -sub v3.4s, v22.4s, v16.4s -mul v15.4S, v15.4S,v29.s[1] -add v22.4s, v22.4s, v16.4s -mla v19.4S, v0.4S, v31.s[0] -sub v0.4s, v21.4s, v6.4s -mla v15.4S, v24.4S, v31.s[0] -add v21.4s, v21.4s, v6.4s -sqrdmulh v6.4S, v13.4S, v11.s[3] -nop -mul v13.4S, v13.4S,v17.s[3] -nop -sqrdmulh v24.4S, v23.4S, v11.s[2] -sub v16.4s, v12.4s, v27.4s -mul v23.4S, v23.4S,v17.s[2] -add v12.4s, v12.4s, v27.4s -sqrdmulh v27.4S, v3.4S, v11.s[1] -sub v10.4s, v2.4s, v26.4s -mul v3.4S, v3.4S,v17.s[1] -add v2.4s, v2.4s, v26.4s -sqrdmulh v26.4S, v22.4S, v11.s[0] -sub v18.4s, v14.4s, v19.4s -mul v22.4S, v22.4S,v17.s[0] -add v14.4s, v14.4s, v19.4s -mla v13.4S, v6.4S, v31.s[0] -sub v6.4s, v25.4s, v15.4s -sqrdmulh v19.4S, v8.4S, v11.s[3] -add v25.4s, v25.4s, v15.4s -mla v23.4S, v24.4S, v31.s[0] -nop -sqrdmulh v24.4S, v20.4S, v11.s[2] -nop -mla v3.4S, v27.4S, v31.s[0] -nop -sqrdmulh v27.4S, v0.4S, v11.s[1] -nop -mla v22.4S, v26.4S, v31.s[0] -nop -sqrdmulh v26.4S, v21.4S, v11.s[0] -nop -mul v8.4S, v8.4S,v17.s[3] -sub v15.4s, v16.4s, v13.4s -mul v20.4S, v20.4S,v17.s[2] -add v16.4s, v16.4s, v13.4s -mla v8.4S, v19.4S, v31.s[0] -sub v19.4s, v12.4s, v23.4s -mla v20.4S, v24.4S, v31.s[0] -add v12.4s, v12.4s, v23.4s -mul v0.4S, v0.4S,v17.s[1] -sub v23.4s, v18.4s, v3.4s -mul v21.4S, v21.4S,v17.s[0] -add v18.4s, v18.4s, v3.4s -mla v0.4S, v27.4S, v31.s[0] -sub v27.4s, v14.4s, v22.4s -mla v21.4S, v26.4S, v31.s[0] -add v14.4s, v14.4s, v22.4s -sqrdmulh v22.4S, v15.4S, v9.s[3] -nop -mul v15.4S, v15.4S,v30.s[3] -nop -sqrdmulh v26.4S, v16.4S, v9.s[2] -sub v3.4s, v10.4s, v8.4s -mul v16.4S, v16.4S,v30.s[2] -add v10.4s, v10.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v9.s[1] -sub v24.4s, v2.4s, v20.4s -mul v19.4S, v19.4S,v30.s[1] -add v2.4s, v2.4s, v20.4s -sqrdmulh v20.4S, v12.4S, v9.s[0] -sub v13.4s, v6.4s, v0.4s -mul v12.4S, v12.4S,v30.s[0] -add v6.4s, v6.4s, v0.4s -mla v15.4S, v22.4S, v31.s[0] -sub v22.4s, v25.4s, v21.4s -sqrdmulh v0.4S, v23.4S, v7.s[3] -add v25.4s, v25.4s, v21.4s -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v3.4s, v15.4s -sqrdmulh v21.4S, v18.4S, v7.s[2] -add v3.4s, v3.4s, v15.4s -mla v19.4S, v8.4S, v31.s[0] -sub v8.4s, v10.4s, v16.4s -sqrdmulh v15.4S, v27.4S, v7.s[1] -add v10.4s, v10.4s, v16.4s -mla v12.4S, v20.4S, v31.s[0] -sub v20.4s, v24.4s, v19.4s -sqrdmulh v16.4S, v14.4S, v7.s[0] -add v24.4s, v24.4s, v19.4s -mul v23.4S, v23.4S,v1.s[3] -sub v19.4s, v2.4s, v12.4s -mul v18.4S, v18.4S,v1.s[2] -add v2.4s, v2.4s, v12.4s -mla v23.4S, v0.4S, v31.s[0] -str q26, [x0, #1008] -mla v18.4S, v21.4S, v31.s[0] -str q3, [x0, #944] -mul v27.4S, v27.4S,v1.s[1] -str q8, [x0, #880] -mul v14.4S, v14.4S,v1.s[0] -str q10, [x0, #816] -mla v27.4S, v15.4S, v31.s[0] -str q20, [x0, #752] -mla v14.4S, v16.4S, v31.s[0] -str q24, [x0, #688] -ldr q24, [x0, #960] -sqrdmulh v16.4S, v24.4S, v28.s[0] -str q19, [x0, #624] -mul v24.4S, v24.4S,v29.s[0] -str q2, [x0, #560] -ldr q2, [x0, #896] -sqrdmulh v19.4S, v2.4S, v28.s[0] -sub v20.4s, v13.4s, v23.4s -str q20, [x0, #496] -mul v2.4S, v2.4S,v29.s[0] -add v13.4s, v13.4s, v23.4s -ldr q23, [x0, #832] -sqrdmulh v20.4S, v23.4S, v28.s[0] -sub v15.4s, v6.4s, v18.4s -str q13, [x0, #432] -mul v23.4S, v23.4S,v29.s[0] -add v6.4s, v6.4s, v18.4s -ldr q18, [x0, #768] -sqrdmulh v13.4S, v18.4S, v28.s[0] -sub v10.4s, v22.4s, v27.4s -str q15, [x0, #368] -mul v18.4S, v18.4S,v29.s[0] -add v22.4s, v22.4s, v27.4s -ldr q27, [x0, #704] -mla v24.4S, v16.4S, v31.s[0] -sub v16.4s, v25.4s, v14.4s -str q6, [x0, #304] -sqrdmulh v6.4S, v27.4S, v28.s[0] -add v25.4s, v25.4s, v14.4s -ldr q14, [x0, #640] -mla v2.4S, v19.4S, v31.s[0] -str q10, [x0, #240] -sqrdmulh v10.4S, v14.4S, v28.s[0] -nop -ldr q19, [x0, #576] -mla v23.4S, v20.4S, v31.s[0] -str q22, [x0, #176] -sqrdmulh v22.4S, v19.4S, v28.s[0] -nop -ldr q20, [x0, #512] -mla v18.4S, v13.4S, v31.s[0] -nop -sqrdmulh v13.4S, v20.4S, v28.s[0] -str q16, [x0, #112] -ldr q16, [x0, #448] -ldr q15, [x0, #384] -mul v27.4S, v27.4S,v29.s[0] -sub v8.4s, v16.4s, v24.4s -str q25, [x0, #48] -mul v14.4S, v14.4S,v29.s[0] -add v16.4s, v16.4s, v24.4s -ldr q24, [x0, #320] -ldr q25, [x0, #256] -mla v27.4S, v6.4S, v31.s[0] -sub v6.4s, v15.4s, v2.4s -mla v14.4S, v10.4S, v31.s[0] -add v15.4s, v15.4s, v2.4s -ldr q2, [x0, #192] -ldr q10, [x0, #128] -mul v19.4S, v19.4S,v29.s[0] -sub v3.4s, v24.4s, v23.4s -mul v20.4S, v20.4S,v29.s[0] -add v24.4s, v24.4s, v23.4s -ldr q23, [x0, #64] -ldr q21, [x0, #0] -mla v19.4S, v22.4S, v31.s[0] -sub v22.4s, v25.4s, v18.4s -mla v20.4S, v13.4S, v31.s[0] -add v25.4s, v25.4s, v18.4s -sqrdmulh v18.4S, v8.4S, v28.s[2] -nop -mul v8.4S, v8.4S,v29.s[2] -nop -sqrdmulh v13.4S, v6.4S, v28.s[2] -sub v26.4s, v2.4s, v27.4s -mul v6.4S, v6.4S,v29.s[2] -add v2.4s, v2.4s, v27.4s -sqrdmulh v27.4S, v16.4S, v28.s[1] -sub v0.4s, v10.4s, v14.4s -mul v16.4S, v16.4S,v29.s[1] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v28.s[1] -sub v12.4s, v23.4s, v19.4s -mul v15.4S, v15.4S,v29.s[1] -add v23.4s, v23.4s, v19.4s -mla v8.4S, v18.4S, v31.s[0] -sub v18.4s, v21.4s, v20.4s -sqrdmulh v19.4S, v3.4S, v28.s[2] -add v21.4s, v21.4s, v20.4s -mla v6.4S, v13.4S, v31.s[0] -nop -sqrdmulh v13.4S, v22.4S, v28.s[2] -nop -mla v16.4S, v27.4S, v31.s[0] -nop -sqrdmulh v27.4S, v24.4S, v28.s[1] -nop -mla v15.4S, v14.4S, v31.s[0] -nop -sqrdmulh v14.4S, v25.4S, v28.s[1] -nop -mul v3.4S, v3.4S,v29.s[2] -sub v20.4s, v26.4s, v8.4s -mul v22.4S, v22.4S,v29.s[2] -add v26.4s, v26.4s, v8.4s -mla v3.4S, v19.4S, v31.s[0] -sub v19.4s, v0.4s, v6.4s -mla v22.4S, v13.4S, v31.s[0] -add v0.4s, v0.4s, v6.4s -mul v24.4S, v24.4S,v29.s[1] -sub v6.4s, v2.4s, v16.4s -mul v25.4S, v25.4S,v29.s[1] -add v2.4s, v2.4s, v16.4s -mla v24.4S, v27.4S, v31.s[0] -sub v27.4s, v10.4s, v15.4s -mla v25.4S, v14.4S, v31.s[0] -add v10.4s, v10.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v11.s[3] -nop -mul v20.4S, v20.4S,v17.s[3] -nop -sqrdmulh v14.4S, v26.4S, v11.s[2] -sub v16.4s, v12.4s, v3.4s -mul v26.4S, v26.4S,v17.s[2] -add v12.4s, v12.4s, v3.4s -sqrdmulh v3.4S, v6.4S, v11.s[1] -sub v13.4s, v18.4s, v22.4s -mul v6.4S, v6.4S,v17.s[1] -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v2.4S, v11.s[0] -sub v8.4s, v23.4s, v24.4s -mul v2.4S, v2.4S,v17.s[0] -add v23.4s, v23.4s, v24.4s -mla v20.4S, v15.4S, v31.s[0] -sub v15.4s, v21.4s, v25.4s -sqrdmulh v24.4S, v19.4S, v11.s[3] -add v21.4s, v21.4s, v25.4s -mla v26.4S, v14.4S, v31.s[0] -nop -sqrdmulh v14.4S, v0.4S, v11.s[2] -nop -mla v6.4S, v3.4S, v31.s[0] -nop -sqrdmulh v3.4S, v27.4S, v11.s[1] -nop -mla v2.4S, v22.4S, v31.s[0] -nop -sqrdmulh v22.4S, v10.4S, v11.s[0] -nop -mul v19.4S, v19.4S,v17.s[3] -sub v25.4s, v16.4s, v20.4s -mul v0.4S, v0.4S,v17.s[2] -add v16.4s, v16.4s, v20.4s -mla v19.4S, v24.4S, v31.s[0] -sub v24.4s, v12.4s, v26.4s -mla v0.4S, v14.4S, v31.s[0] -add v12.4s, v12.4s, v26.4s -mul v27.4S, v27.4S,v17.s[1] -sub v26.4s, v8.4s, v6.4s -mul v10.4S, v10.4S,v17.s[0] -add v8.4s, v8.4s, v6.4s -mla v27.4S, v3.4S, v31.s[0] -sub v3.4s, v23.4s, v2.4s -mla v10.4S, v22.4S, v31.s[0] -add v23.4s, v23.4s, v2.4s -sqrdmulh v2.4S, v25.4S, v9.s[3] -nop -mul v25.4S, v25.4S,v30.s[3] -nop -sqrdmulh v22.4S, v16.4S, v9.s[2] -sub v6.4s, v13.4s, v19.4s -mul v16.4S, v16.4S,v30.s[2] -add v13.4s, v13.4s, v19.4s -sqrdmulh v19.4S, v24.4S, v9.s[1] -sub v14.4s, v18.4s, v0.4s -mul v24.4S, v24.4S,v30.s[1] -add v18.4s, v18.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v9.s[0] -sub v20.4s, v15.4s, v27.4s -mul v12.4S, v12.4S,v30.s[0] -add v15.4s, v15.4s, v27.4s -mla v25.4S, v2.4S, v31.s[0] -sub v2.4s, v21.4s, v10.4s -sqrdmulh v27.4S, v26.4S, v7.s[3] -add v21.4s, v21.4s, v10.4s -mla v16.4S, v22.4S, v31.s[0] -sub v22.4s, v6.4s, v25.4s -sqrdmulh v10.4S, v8.4S, v7.s[2] -add v6.4s, v6.4s, v25.4s -mla v24.4S, v19.4S, v31.s[0] -sub v19.4s, v13.4s, v16.4s -sqrdmulh v25.4S, v3.4S, v7.s[1] -add v13.4s, v13.4s, v16.4s -mla v12.4S, v0.4S, v31.s[0] -sub v0.4s, v14.4s, v24.4s -sqrdmulh v16.4S, v23.4S, v7.s[0] -add v14.4s, v14.4s, v24.4s -mul v26.4S, v26.4S,v1.s[3] -sub v24.4s, v18.4s, v12.4s -mul v8.4S, v8.4S,v1.s[2] -add v18.4s, v18.4s, v12.4s -mla v26.4S, v27.4S, v31.s[0] -str q22, [x0, #960] -mla v8.4S, v10.4S, v31.s[0] -str q6, [x0, #896] -mul v3.4S, v3.4S,v1.s[1] -str q19, [x0, #832] -mul v23.4S, v23.4S,v1.s[0] -str q13, [x0, #768] -mla v3.4S, v25.4S, v31.s[0] -str q0, [x0, #704] -mla v23.4S, v16.4S, v31.s[0] -str q14, [x0, #640] -ldr q14, [x0, #976] -sqrdmulh v16.4S, v14.4S, v28.s[0] -str q24, [x0, #576] -mul v14.4S, v14.4S,v29.s[0] -str q18, [x0, #512] -ldr q18, [x0, #912] -sqrdmulh v24.4S, v18.4S, v28.s[0] -sub v0.4s, v20.4s, v26.4s -str q0, [x0, #448] -mul v18.4S, v18.4S,v29.s[0] -add v20.4s, v20.4s, v26.4s -ldr q26, [x0, #848] -sqrdmulh v0.4S, v26.4S, v28.s[0] -sub v25.4s, v15.4s, v8.4s -str q20, [x0, #384] -mul v26.4S, v26.4S,v29.s[0] -add v15.4s, v15.4s, v8.4s -ldr q8, [x0, #784] -sqrdmulh v20.4S, v8.4S, v28.s[0] -sub v13.4s, v2.4s, v3.4s -str q25, [x0, #320] -mul v8.4S, v8.4S,v29.s[0] -add v2.4s, v2.4s, v3.4s -ldr q3, [x0, #720] -mla v14.4S, v16.4S, v31.s[0] -sub v16.4s, v21.4s, v23.4s -str q15, [x0, #256] -sqrdmulh v15.4S, v3.4S, v28.s[0] -add v21.4s, v21.4s, v23.4s -ldr q23, [x0, #656] -mla v18.4S, v24.4S, v31.s[0] -str q13, [x0, #192] -sqrdmulh v13.4S, v23.4S, v28.s[0] -nop -ldr q24, [x0, #592] -mla v26.4S, v0.4S, v31.s[0] -str q2, [x0, #128] -sqrdmulh v2.4S, v24.4S, v28.s[0] -nop -ldr q0, [x0, #528] -mla v8.4S, v20.4S, v31.s[0] -nop -sqrdmulh v20.4S, v0.4S, v28.s[0] -str q16, [x0, #64] -ldr q16, [x0, #464] -ldr q25, [x0, #400] -mul v3.4S, v3.4S,v29.s[0] -sub v19.4s, v16.4s, v14.4s -str q21, [x0, #0] -mul v23.4S, v23.4S,v29.s[0] -add v16.4s, v16.4s, v14.4s -ldr q14, [x0, #336] -ldr q21, [x0, #272] -mla v3.4S, v15.4S, v31.s[0] -sub v15.4s, v25.4s, v18.4s -mla v23.4S, v13.4S, v31.s[0] -add v25.4s, v25.4s, v18.4s -ldr q18, [x0, #208] -ldr q13, [x0, #144] -mul v24.4S, v24.4S,v29.s[0] -sub v6.4s, v14.4s, v26.4s -mul v0.4S, v0.4S,v29.s[0] -add v14.4s, v14.4s, v26.4s -ldr q26, [x0, #80] -ldr q10, [x0, #16] -mla v24.4S, v2.4S, v31.s[0] -sub v2.4s, v21.4s, v8.4s -mla v0.4S, v20.4S, v31.s[0] -add v21.4s, v21.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v28.s[2] -nop -mul v19.4S, v19.4S,v29.s[2] -nop -sqrdmulh v20.4S, v15.4S, v28.s[2] -sub v22.4s, v18.4s, v3.4s -mul v15.4S, v15.4S,v29.s[2] -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v16.4S, v28.s[1] -sub v27.4s, v13.4s, v23.4s -mul v16.4S, v16.4S,v29.s[1] -add v13.4s, v13.4s, v23.4s -sqrdmulh v23.4S, v25.4S, v28.s[1] -sub v12.4s, v26.4s, v24.4s -mul v25.4S, v25.4S,v29.s[1] -add v26.4s, v26.4s, v24.4s -mla v19.4S, v8.4S, v31.s[0] -sub v8.4s, v10.4s, v0.4s -sqrdmulh v24.4S, v6.4S, v28.s[2] -add v10.4s, v10.4s, v0.4s -mla v15.4S, v20.4S, v31.s[0] -nop -sqrdmulh v20.4S, v2.4S, v28.s[2] -nop -mla v16.4S, v3.4S, v31.s[0] -nop -sqrdmulh v3.4S, v14.4S, v28.s[1] -nop -mla v25.4S, v23.4S, v31.s[0] -nop -sqrdmulh v23.4S, v21.4S, v28.s[1] -nop -mul v6.4S, v6.4S,v29.s[2] -sub v0.4s, v22.4s, v19.4s -mul v2.4S, v2.4S,v29.s[2] -add v22.4s, v22.4s, v19.4s -mla v6.4S, v24.4S, v31.s[0] -sub v24.4s, v27.4s, v15.4s -mla v2.4S, v20.4S, v31.s[0] -add v27.4s, v27.4s, v15.4s -mul v14.4S, v14.4S,v29.s[1] -sub v15.4s, v18.4s, v16.4s -mul v21.4S, v21.4S,v29.s[1] -add v18.4s, v18.4s, v16.4s -mla v14.4S, v3.4S, v31.s[0] -sub v3.4s, v13.4s, v25.4s -mla v21.4S, v23.4S, v31.s[0] -add v13.4s, v13.4s, v25.4s -sqrdmulh v28.4S, v0.4S, v11.s[3] -nop -mul v0.4S, v0.4S,v17.s[3] -nop -sqrdmulh v29.4S, v22.4S, v11.s[2] -sub v25.4s, v12.4s, v6.4s -mul v22.4S, v22.4S,v17.s[2] -add v12.4s, v12.4s, v6.4s -sqrdmulh v6.4S, v15.4S, v11.s[1] -sub v23.4s, v8.4s, v2.4s -mul v15.4S, v15.4S,v17.s[1] -add v8.4s, v8.4s, v2.4s -sqrdmulh v2.4S, v18.4S, v11.s[0] -sub v16.4s, v26.4s, v14.4s -mul v18.4S, v18.4S,v17.s[0] -add v26.4s, v26.4s, v14.4s -mla v0.4S, v28.4S, v31.s[0] -sub v28.4s, v10.4s, v21.4s -sqrdmulh v14.4S, v24.4S, v11.s[3] -add v10.4s, v10.4s, v21.4s -mla v22.4S, v29.4S, v31.s[0] -nop -sqrdmulh v29.4S, v27.4S, v11.s[2] -nop -mla v15.4S, v6.4S, v31.s[0] -nop -sqrdmulh v6.4S, v3.4S, v11.s[1] -nop -mla v18.4S, v2.4S, v31.s[0] -nop -sqrdmulh v2.4S, v13.4S, v11.s[0] -nop -mul v24.4S, v24.4S,v17.s[3] -sub v21.4s, v25.4s, v0.4s -mul v27.4S, v27.4S,v17.s[2] -add v25.4s, v25.4s, v0.4s -mla v24.4S, v14.4S, v31.s[0] -sub v14.4s, v12.4s, v22.4s -mla v27.4S, v29.4S, v31.s[0] -add v12.4s, v12.4s, v22.4s -mul v3.4S, v3.4S,v17.s[1] -sub v22.4s, v16.4s, v15.4s -mul v13.4S, v13.4S,v17.s[0] -add v16.4s, v16.4s, v15.4s -mla v3.4S, v6.4S, v31.s[0] -sub v6.4s, v26.4s, v18.4s -mla v13.4S, v2.4S, v31.s[0] -add v26.4s, v26.4s, v18.4s -sqrdmulh v11.4S, v21.4S, v9.s[3] -nop -mul v21.4S, v21.4S,v30.s[3] -nop -sqrdmulh v17.4S, v25.4S, v9.s[2] -sub v18.4s, v23.4s, v24.4s -mul v25.4S, v25.4S,v30.s[2] -add v23.4s, v23.4s, v24.4s -sqrdmulh v24.4S, v14.4S, v9.s[1] -sub v2.4s, v8.4s, v27.4s -mul v14.4S, v14.4S,v30.s[1] -add v8.4s, v8.4s, v27.4s -sqrdmulh v27.4S, v12.4S, v9.s[0] -sub v15.4s, v28.4s, v3.4s -mul v12.4S, v12.4S,v30.s[0] -add v28.4s, v28.4s, v3.4s -mla v21.4S, v11.4S, v31.s[0] -sub v11.4s, v10.4s, v13.4s -sqrdmulh v9.4S, v22.4S, v7.s[3] -add v10.4s, v10.4s, v13.4s -mla v25.4S, v17.4S, v31.s[0] -sub v17.4s, v18.4s, v21.4s -sqrdmulh v13.4S, v16.4S, v7.s[2] -add v18.4s, v18.4s, v21.4s -mla v14.4S, v24.4S, v31.s[0] -sub v24.4s, v23.4s, v25.4s -sqrdmulh v21.4S, v6.4S, v7.s[1] -add v23.4s, v23.4s, v25.4s -mla v12.4S, v27.4S, v31.s[0] -sub v27.4s, v2.4s, v14.4s -sqrdmulh v25.4S, v26.4S, v7.s[0] -add v2.4s, v2.4s, v14.4s -mul v22.4S, v22.4S,v1.s[3] -sub v14.4s, v8.4s, v12.4s -mul v16.4S, v16.4S,v1.s[2] -add v8.4s, v8.4s, v12.4s -mla v22.4S, v9.4S, v31.s[0] -str q17, [x0, #976] -mla v16.4S, v13.4S, v31.s[0] -str q18, [x0, #912] -mul v6.4S, v6.4S,v1.s[1] -str q24, [x0, #848] -mul v26.4S, v26.4S,v1.s[0] -str q23, [x0, #784] -mla v6.4S, v21.4S, v31.s[0] -str q27, [x0, #720] -mla v26.4S, v25.4S, v31.s[0] -str q2, [x0, #656] -str q14, [x0, #592] -str q8, [x0, #528] -sub v8.4s, v15.4s, v22.4s -str q8, [x0, #464] -add v15.4s, v15.4s, v22.4s -sub v22.4s, v28.4s, v16.4s -str q15, [x0, #400] -add v28.4s, v28.4s, v16.4s -sub v16.4s, v11.4s, v6.4s -str q22, [x0, #336] -add v11.4s, v11.4s, v6.4s -sub v6.4s, v10.4s, v26.4s -str q28, [x0, #272] -add v10.4s, v10.4s, v26.4s -str q16, [x0, #208] -str q11, [x0, #144] -str q6, [x0, #80] -str q10, [x0, #16] -ldr q4, [x17, #+128] -ldr q5, [x17, #+144] -ldr q19, [x17, #+160] -ldr q20, [x17, #+176] -ldr q0, [x17, #+192] -ldr q29, [x17, #+208] -ldr q3, [x17, #+224] -ldr q30, [x17, #+240] -ldr q12, [x0, #32] -ldr q9, [x0, #48] -ldr q17, [x0, #0] -ldr q13, [x0, #16] -sqrdmulh v18.4S, v12.4S, v5.s[0] -mul v12.4S, v12.4S,v4.s[0] -mla v12.4S, v18.4S, v31.s[0] -sub v18.4s, v17.4s, v12.4s -add v17.4s, v17.4s, v12.4s -sqrdmulh v12.4S, v9.4S, v5.s[0] -mul v9.4S, v9.4S,v4.s[0] -mla v9.4S, v12.4S, v31.s[0] -sub v12.4s, v13.4s, v9.4s -add v13.4s, v13.4s, v9.4s -sqrdmulh v9.4S, v13.4S, v5.s[1] -mul v13.4S, v13.4S,v4.s[1] -mla v13.4S, v9.4S, v31.s[0] -sub v9.4s, v17.4s, v13.4s -add v17.4s, v17.4s, v13.4s -sqrdmulh v13.4S, v12.4S, v5.s[2] -mul v12.4S, v12.4S,v4.s[2] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v18.4s, v12.4s -add v18.4s, v18.4s, v12.4s -trn1 v12.4S, v17.4S, v9.4S -trn2 v24.4S, v17.4S, v9.4S -trn1 v23.4S, v18.4S, v13.4S -trn2 v21.4S, v18.4S, v13.4S -trn2 v18.2D, v12.2D, v23.2D -trn2 v13.2D, v24.2D, v21.2D -trn1 v17.2D, v12.2D, v23.2D -trn1 v9.2D, v24.2D, v21.2D -sqrdmulh v21.4S, v18.4S, v20.4S -mul v18.4S, v18.4S,v19.4S -mla v18.4S, v21.4S, v31.s[0] -sub v21.4s, v17.4s, v18.4s -add v17.4s, v17.4s, v18.4s -sqrdmulh v18.4S, v13.4S, v20.4S -mul v13.4S, v13.4S,v19.4S -mla v13.4S, v18.4S, v31.s[0] -sub v18.4s, v9.4s, v13.4s -add v9.4s, v9.4s, v13.4s -sqrdmulh v13.4S, v9.4S, v29.4S -mul v9.4S, v9.4S,v0.4S -mla v9.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v9.4s -add v17.4s, v17.4s, v9.4s -sqrdmulh v9.4S, v18.4S, v30.4S -mul v18.4S, v18.4S,v3.4S -mla v18.4S, v9.4S, v31.s[0] -sub v9.4s, v21.4s, v18.4s -add v21.4s, v21.4s, v18.4s -str q17, [x0, #0] -str q13, [x0, #16] -str q21, [x0, #32] -str q9, [x0, #48] -ldr q9, [x17, #+256] -ldr q21, [x17, #+272] -ldr q13, [x17, #+288] -ldr q17, [x17, #+304] -ldr q18, [x17, #+320] -ldr q24, [x17, #+336] -ldr q23, [x17, #+352] -ldr q12, [x17, #+368] -ldr q30, [x0, #96] -ldr q3, [x0, #112] -ldr q29, [x0, #64] -ldr q0, [x0, #80] -sqrdmulh v20.4S, v30.4S, v21.s[0] -mul v30.4S, v30.4S,v9.s[0] -mla v30.4S, v20.4S, v31.s[0] -sub v20.4s, v29.4s, v30.4s -add v29.4s, v29.4s, v30.4s -sqrdmulh v30.4S, v3.4S, v21.s[0] -mul v3.4S, v3.4S,v9.s[0] -mla v3.4S, v30.4S, v31.s[0] -sub v30.4s, v0.4s, v3.4s -add v0.4s, v0.4s, v3.4s -sqrdmulh v3.4S, v0.4S, v21.s[1] -mul v0.4S, v0.4S,v9.s[1] -mla v0.4S, v3.4S, v31.s[0] -sub v3.4s, v29.4s, v0.4s -add v29.4s, v29.4s, v0.4s -sqrdmulh v0.4S, v30.4S, v21.s[2] -mul v30.4S, v30.4S,v9.s[2] -mla v30.4S, v0.4S, v31.s[0] -sub v0.4s, v20.4s, v30.4s -add v20.4s, v20.4s, v30.4s -trn1 v30.4S, v29.4S, v3.4S -trn2 v19.4S, v29.4S, v3.4S -trn1 v5.4S, v20.4S, v0.4S -trn2 v4.4S, v20.4S, v0.4S -trn2 v20.2D, v30.2D, v5.2D -trn2 v0.2D, v19.2D, v4.2D -trn1 v29.2D, v30.2D, v5.2D -trn1 v3.2D, v19.2D, v4.2D -sqrdmulh v4.4S, v20.4S, v17.4S -mul v20.4S, v20.4S,v13.4S -mla v20.4S, v4.4S, v31.s[0] -sub v4.4s, v29.4s, v20.4s -add v29.4s, v29.4s, v20.4s -sqrdmulh v20.4S, v0.4S, v17.4S -mul v0.4S, v0.4S,v13.4S -mla v0.4S, v20.4S, v31.s[0] -sub v20.4s, v3.4s, v0.4s -add v3.4s, v3.4s, v0.4s -sqrdmulh v0.4S, v3.4S, v24.4S -mul v3.4S, v3.4S,v18.4S -mla v3.4S, v0.4S, v31.s[0] -sub v0.4s, v29.4s, v3.4s -add v29.4s, v29.4s, v3.4s -sqrdmulh v3.4S, v20.4S, v12.4S -mul v20.4S, v20.4S,v23.4S -mla v20.4S, v3.4S, v31.s[0] -sub v3.4s, v4.4s, v20.4s -add v4.4s, v4.4s, v20.4s -str q29, [x0, #64] -str q0, [x0, #80] -str q4, [x0, #96] -str q3, [x0, #112] -ldr q3, [x17, #+384] -ldr q4, [x17, #+400] -ldr q0, [x17, #+416] -ldr q29, [x17, #+432] -ldr q20, [x17, #+448] -ldr q19, [x17, #+464] -ldr q5, [x17, #+480] -ldr q30, [x17, #+496] -ldr q12, [x0, #160] -ldr q23, [x0, #176] -ldr q24, [x0, #128] -ldr q18, [x0, #144] -sqrdmulh v17.4S, v12.4S, v4.s[0] -mul v12.4S, v12.4S,v3.s[0] -mla v12.4S, v17.4S, v31.s[0] -sub v17.4s, v24.4s, v12.4s -add v24.4s, v24.4s, v12.4s -sqrdmulh v12.4S, v23.4S, v4.s[0] -mul v23.4S, v23.4S,v3.s[0] -mla v23.4S, v12.4S, v31.s[0] -sub v12.4s, v18.4s, v23.4s -add v18.4s, v18.4s, v23.4s -sqrdmulh v23.4S, v18.4S, v4.s[1] -mul v18.4S, v18.4S,v3.s[1] -mla v18.4S, v23.4S, v31.s[0] -sub v23.4s, v24.4s, v18.4s -add v24.4s, v24.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v4.s[2] -mul v12.4S, v12.4S,v3.s[2] -mla v12.4S, v18.4S, v31.s[0] -sub v18.4s, v17.4s, v12.4s -add v17.4s, v17.4s, v12.4s -trn1 v12.4S, v24.4S, v23.4S -trn2 v13.4S, v24.4S, v23.4S -trn1 v21.4S, v17.4S, v18.4S -trn2 v9.4S, v17.4S, v18.4S -trn2 v17.2D, v12.2D, v21.2D -trn2 v18.2D, v13.2D, v9.2D -trn1 v24.2D, v12.2D, v21.2D -trn1 v23.2D, v13.2D, v9.2D -sqrdmulh v9.4S, v17.4S, v29.4S -mul v17.4S, v17.4S,v0.4S -mla v17.4S, v9.4S, v31.s[0] -sub v9.4s, v24.4s, v17.4s -add v24.4s, v24.4s, v17.4s -sqrdmulh v17.4S, v18.4S, v29.4S -mul v18.4S, v18.4S,v0.4S -mla v18.4S, v17.4S, v31.s[0] -sub v17.4s, v23.4s, v18.4s -add v23.4s, v23.4s, v18.4s -sqrdmulh v18.4S, v23.4S, v19.4S -mul v23.4S, v23.4S,v20.4S -mla v23.4S, v18.4S, v31.s[0] -sub v18.4s, v24.4s, v23.4s -add v24.4s, v24.4s, v23.4s -sqrdmulh v23.4S, v17.4S, v30.4S -mul v17.4S, v17.4S,v5.4S -mla v17.4S, v23.4S, v31.s[0] -sub v23.4s, v9.4s, v17.4s -add v9.4s, v9.4s, v17.4s -str q24, [x0, #128] -str q18, [x0, #144] -str q9, [x0, #160] -str q23, [x0, #176] -ldr q23, [x17, #+512] -ldr q9, [x17, #+528] -ldr q18, [x17, #+544] -ldr q24, [x17, #+560] -ldr q17, [x17, #+576] -ldr q13, [x17, #+592] -ldr q21, [x17, #+608] -ldr q12, [x17, #+624] -ldr q30, [x0, #224] -ldr q5, [x0, #240] -ldr q19, [x0, #192] -ldr q20, [x0, #208] -sqrdmulh v29.4S, v30.4S, v9.s[0] -mul v30.4S, v30.4S,v23.s[0] -mla v30.4S, v29.4S, v31.s[0] -sub v29.4s, v19.4s, v30.4s -add v19.4s, v19.4s, v30.4s -sqrdmulh v30.4S, v5.4S, v9.s[0] -mul v5.4S, v5.4S,v23.s[0] -mla v5.4S, v30.4S, v31.s[0] -sub v30.4s, v20.4s, v5.4s -add v20.4s, v20.4s, v5.4s -sqrdmulh v5.4S, v20.4S, v9.s[1] -mul v20.4S, v20.4S,v23.s[1] -mla v20.4S, v5.4S, v31.s[0] -sub v5.4s, v19.4s, v20.4s -add v19.4s, v19.4s, v20.4s -sqrdmulh v20.4S, v30.4S, v9.s[2] -mul v30.4S, v30.4S,v23.s[2] -mla v30.4S, v20.4S, v31.s[0] -sub v20.4s, v29.4s, v30.4s -add v29.4s, v29.4s, v30.4s -trn1 v30.4S, v19.4S, v5.4S -trn2 v0.4S, v19.4S, v5.4S -trn1 v4.4S, v29.4S, v20.4S -trn2 v3.4S, v29.4S, v20.4S -trn2 v29.2D, v30.2D, v4.2D -trn2 v20.2D, v0.2D, v3.2D -trn1 v19.2D, v30.2D, v4.2D -trn1 v5.2D, v0.2D, v3.2D -sqrdmulh v3.4S, v29.4S, v24.4S -mul v29.4S, v29.4S,v18.4S -mla v29.4S, v3.4S, v31.s[0] -sub v3.4s, v19.4s, v29.4s -add v19.4s, v19.4s, v29.4s -sqrdmulh v29.4S, v20.4S, v24.4S -mul v20.4S, v20.4S,v18.4S -mla v20.4S, v29.4S, v31.s[0] -sub v29.4s, v5.4s, v20.4s -add v5.4s, v5.4s, v20.4s -sqrdmulh v20.4S, v5.4S, v13.4S -mul v5.4S, v5.4S,v17.4S -mla v5.4S, v20.4S, v31.s[0] -sub v20.4s, v19.4s, v5.4s -add v19.4s, v19.4s, v5.4s -sqrdmulh v5.4S, v29.4S, v12.4S -mul v29.4S, v29.4S,v21.4S -mla v29.4S, v5.4S, v31.s[0] -sub v5.4s, v3.4s, v29.4s -add v3.4s, v3.4s, v29.4s -str q19, [x0, #192] -str q20, [x0, #208] -str q3, [x0, #224] -str q5, [x0, #240] -ldr q5, [x17, #+640] -ldr q3, [x17, #+656] -ldr q20, [x17, #+672] -ldr q19, [x17, #+688] -ldr q29, [x17, #+704] -ldr q0, [x17, #+720] -ldr q4, [x17, #+736] -ldr q30, [x17, #+752] -ldr q12, [x0, #288] -ldr q21, [x0, #304] -ldr q13, [x0, #256] -ldr q17, [x0, #272] -sqrdmulh v24.4S, v12.4S, v3.s[0] -mul v12.4S, v12.4S,v5.s[0] -mla v12.4S, v24.4S, v31.s[0] -sub v24.4s, v13.4s, v12.4s -add v13.4s, v13.4s, v12.4s -sqrdmulh v12.4S, v21.4S, v3.s[0] -mul v21.4S, v21.4S,v5.s[0] -mla v21.4S, v12.4S, v31.s[0] -sub v12.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v3.s[1] -mul v17.4S, v17.4S,v5.s[1] -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v13.4s, v17.4s -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v12.4S, v3.s[2] -mul v12.4S, v12.4S,v5.s[2] -mla v12.4S, v17.4S, v31.s[0] -sub v17.4s, v24.4s, v12.4s -add v24.4s, v24.4s, v12.4s -trn1 v12.4S, v13.4S, v21.4S -trn2 v18.4S, v13.4S, v21.4S -trn1 v9.4S, v24.4S, v17.4S -trn2 v23.4S, v24.4S, v17.4S -trn2 v24.2D, v12.2D, v9.2D -trn2 v17.2D, v18.2D, v23.2D -trn1 v13.2D, v12.2D, v9.2D -trn1 v21.2D, v18.2D, v23.2D -sqrdmulh v23.4S, v24.4S, v19.4S -mul v24.4S, v24.4S,v20.4S -mla v24.4S, v23.4S, v31.s[0] -sub v23.4s, v13.4s, v24.4s -add v13.4s, v13.4s, v24.4s -sqrdmulh v24.4S, v17.4S, v19.4S -mul v17.4S, v17.4S,v20.4S -mla v17.4S, v24.4S, v31.s[0] -sub v24.4s, v21.4s, v17.4s -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v0.4S -mul v21.4S, v21.4S,v29.4S -mla v21.4S, v17.4S, v31.s[0] -sub v17.4s, v13.4s, v21.4s -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v24.4S, v30.4S -mul v24.4S, v24.4S,v4.4S -mla v24.4S, v21.4S, v31.s[0] -sub v21.4s, v23.4s, v24.4s -add v23.4s, v23.4s, v24.4s -str q13, [x0, #256] -str q17, [x0, #272] -str q23, [x0, #288] -str q21, [x0, #304] -ldr q21, [x17, #+768] -ldr q23, [x17, #+784] -ldr q17, [x17, #+800] -ldr q13, [x17, #+816] -ldr q24, [x17, #+832] -ldr q18, [x17, #+848] -ldr q9, [x17, #+864] -ldr q12, [x17, #+880] -ldr q30, [x0, #352] -ldr q4, [x0, #368] -ldr q0, [x0, #320] -ldr q29, [x0, #336] -sqrdmulh v19.4S, v30.4S, v23.s[0] -mul v30.4S, v30.4S,v21.s[0] -mla v30.4S, v19.4S, v31.s[0] -sub v19.4s, v0.4s, v30.4s -add v0.4s, v0.4s, v30.4s -sqrdmulh v30.4S, v4.4S, v23.s[0] -mul v4.4S, v4.4S,v21.s[0] -mla v4.4S, v30.4S, v31.s[0] -sub v30.4s, v29.4s, v4.4s -add v29.4s, v29.4s, v4.4s -sqrdmulh v4.4S, v29.4S, v23.s[1] -mul v29.4S, v29.4S,v21.s[1] -mla v29.4S, v4.4S, v31.s[0] -sub v4.4s, v0.4s, v29.4s -add v0.4s, v0.4s, v29.4s -sqrdmulh v29.4S, v30.4S, v23.s[2] -mul v30.4S, v30.4S,v21.s[2] -mla v30.4S, v29.4S, v31.s[0] -sub v29.4s, v19.4s, v30.4s -add v19.4s, v19.4s, v30.4s -trn1 v30.4S, v0.4S, v4.4S -trn2 v20.4S, v0.4S, v4.4S -trn1 v3.4S, v19.4S, v29.4S -trn2 v5.4S, v19.4S, v29.4S -trn2 v19.2D, v30.2D, v3.2D -trn2 v29.2D, v20.2D, v5.2D -trn1 v0.2D, v30.2D, v3.2D -trn1 v4.2D, v20.2D, v5.2D -sqrdmulh v5.4S, v19.4S, v13.4S -mul v19.4S, v19.4S,v17.4S -mla v19.4S, v5.4S, v31.s[0] -sub v5.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v29.4S, v13.4S -mul v29.4S, v29.4S,v17.4S -mla v29.4S, v19.4S, v31.s[0] -sub v19.4s, v4.4s, v29.4s -add v4.4s, v4.4s, v29.4s -sqrdmulh v29.4S, v4.4S, v18.4S -mul v4.4S, v4.4S,v24.4S -mla v4.4S, v29.4S, v31.s[0] -sub v29.4s, v0.4s, v4.4s -add v0.4s, v0.4s, v4.4s -sqrdmulh v4.4S, v19.4S, v12.4S -mul v19.4S, v19.4S,v9.4S -mla v19.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v19.4s -add v5.4s, v5.4s, v19.4s -str q0, [x0, #320] -str q29, [x0, #336] -str q5, [x0, #352] -str q4, [x0, #368] -ldr q4, [x17, #+896] -ldr q5, [x17, #+912] -ldr q29, [x17, #+928] -ldr q0, [x17, #+944] -ldr q19, [x17, #+960] -ldr q20, [x17, #+976] -ldr q3, [x17, #+992] -ldr q30, [x17, #+1008] -ldr q12, [x0, #416] -ldr q9, [x0, #432] -ldr q18, [x0, #384] -ldr q24, [x0, #400] -sqrdmulh v13.4S, v12.4S, v5.s[0] -mul v12.4S, v12.4S,v4.s[0] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v18.4s, v12.4s -add v18.4s, v18.4s, v12.4s -sqrdmulh v12.4S, v9.4S, v5.s[0] -mul v9.4S, v9.4S,v4.s[0] -mla v9.4S, v12.4S, v31.s[0] -sub v12.4s, v24.4s, v9.4s -add v24.4s, v24.4s, v9.4s -sqrdmulh v9.4S, v24.4S, v5.s[1] -mul v24.4S, v24.4S,v4.s[1] -mla v24.4S, v9.4S, v31.s[0] -sub v9.4s, v18.4s, v24.4s -add v18.4s, v18.4s, v24.4s -sqrdmulh v24.4S, v12.4S, v5.s[2] -mul v12.4S, v12.4S,v4.s[2] -mla v12.4S, v24.4S, v31.s[0] -sub v24.4s, v13.4s, v12.4s -add v13.4s, v13.4s, v12.4s -trn1 v12.4S, v18.4S, v9.4S -trn2 v17.4S, v18.4S, v9.4S -trn1 v23.4S, v13.4S, v24.4S -trn2 v21.4S, v13.4S, v24.4S -trn2 v13.2D, v12.2D, v23.2D -trn2 v24.2D, v17.2D, v21.2D -trn1 v18.2D, v12.2D, v23.2D -trn1 v9.2D, v17.2D, v21.2D -sqrdmulh v21.4S, v13.4S, v0.4S -mul v13.4S, v13.4S,v29.4S -mla v13.4S, v21.4S, v31.s[0] -sub v21.4s, v18.4s, v13.4s -add v18.4s, v18.4s, v13.4s -sqrdmulh v13.4S, v24.4S, v0.4S -mul v24.4S, v24.4S,v29.4S -mla v24.4S, v13.4S, v31.s[0] -sub v13.4s, v9.4s, v24.4s -add v9.4s, v9.4s, v24.4s -sqrdmulh v24.4S, v9.4S, v20.4S -mul v9.4S, v9.4S,v19.4S -mla v9.4S, v24.4S, v31.s[0] -sub v24.4s, v18.4s, v9.4s -add v18.4s, v18.4s, v9.4s -sqrdmulh v9.4S, v13.4S, v30.4S -mul v13.4S, v13.4S,v3.4S -mla v13.4S, v9.4S, v31.s[0] -sub v9.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -str q18, [x0, #384] -str q24, [x0, #400] -str q21, [x0, #416] -str q9, [x0, #432] -ldr q9, [x17, #+1024] -ldr q21, [x17, #+1040] -ldr q24, [x17, #+1056] -ldr q18, [x17, #+1072] -ldr q13, [x17, #+1088] -ldr q17, [x17, #+1104] -ldr q23, [x17, #+1120] -ldr q12, [x17, #+1136] -ldr q30, [x0, #480] -ldr q3, [x0, #496] -ldr q20, [x0, #448] -ldr q19, [x0, #464] -sqrdmulh v0.4S, v30.4S, v21.s[0] -mul v30.4S, v30.4S,v9.s[0] -mla v30.4S, v0.4S, v31.s[0] -sub v0.4s, v20.4s, v30.4s -add v20.4s, v20.4s, v30.4s -sqrdmulh v30.4S, v3.4S, v21.s[0] -mul v3.4S, v3.4S,v9.s[0] -mla v3.4S, v30.4S, v31.s[0] -sub v30.4s, v19.4s, v3.4s -add v19.4s, v19.4s, v3.4s -sqrdmulh v3.4S, v19.4S, v21.s[1] -mul v19.4S, v19.4S,v9.s[1] -mla v19.4S, v3.4S, v31.s[0] -sub v3.4s, v20.4s, v19.4s -add v20.4s, v20.4s, v19.4s -sqrdmulh v19.4S, v30.4S, v21.s[2] -mul v30.4S, v30.4S,v9.s[2] -mla v30.4S, v19.4S, v31.s[0] -sub v19.4s, v0.4s, v30.4s -add v0.4s, v0.4s, v30.4s -trn1 v30.4S, v20.4S, v3.4S -trn2 v29.4S, v20.4S, v3.4S -trn1 v5.4S, v0.4S, v19.4S -trn2 v4.4S, v0.4S, v19.4S -trn2 v0.2D, v30.2D, v5.2D -trn2 v19.2D, v29.2D, v4.2D -trn1 v20.2D, v30.2D, v5.2D -trn1 v3.2D, v29.2D, v4.2D -sqrdmulh v4.4S, v0.4S, v18.4S -mul v0.4S, v0.4S,v24.4S -mla v0.4S, v4.4S, v31.s[0] -sub v4.4s, v20.4s, v0.4s -add v20.4s, v20.4s, v0.4s -sqrdmulh v0.4S, v19.4S, v18.4S -mul v19.4S, v19.4S,v24.4S -mla v19.4S, v0.4S, v31.s[0] -sub v0.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v3.4S, v17.4S -mul v3.4S, v3.4S,v13.4S -mla v3.4S, v19.4S, v31.s[0] -sub v19.4s, v20.4s, v3.4s -add v20.4s, v20.4s, v3.4s -sqrdmulh v3.4S, v0.4S, v12.4S -mul v0.4S, v0.4S,v23.4S -mla v0.4S, v3.4S, v31.s[0] -sub v3.4s, v4.4s, v0.4s -add v4.4s, v4.4s, v0.4s -str q20, [x0, #448] -str q19, [x0, #464] -str q4, [x0, #480] -str q3, [x0, #496] -ldr q3, [x17, #+1152] -ldr q4, [x17, #+1168] -ldr q19, [x17, #+1184] -ldr q20, [x17, #+1200] -ldr q0, [x17, #+1216] -ldr q29, [x17, #+1232] -ldr q5, [x17, #+1248] -ldr q30, [x17, #+1264] -ldr q12, [x0, #544] -ldr q23, [x0, #560] -ldr q17, [x0, #512] -ldr q13, [x0, #528] -sqrdmulh v18.4S, v12.4S, v4.s[0] -mul v12.4S, v12.4S,v3.s[0] -mla v12.4S, v18.4S, v31.s[0] -sub v18.4s, v17.4s, v12.4s -add v17.4s, v17.4s, v12.4s -sqrdmulh v12.4S, v23.4S, v4.s[0] -mul v23.4S, v23.4S,v3.s[0] -mla v23.4S, v12.4S, v31.s[0] -sub v12.4s, v13.4s, v23.4s -add v13.4s, v13.4s, v23.4s -sqrdmulh v23.4S, v13.4S, v4.s[1] -mul v13.4S, v13.4S,v3.s[1] -mla v13.4S, v23.4S, v31.s[0] -sub v23.4s, v17.4s, v13.4s -add v17.4s, v17.4s, v13.4s -sqrdmulh v13.4S, v12.4S, v4.s[2] -mul v12.4S, v12.4S,v3.s[2] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v18.4s, v12.4s -add v18.4s, v18.4s, v12.4s -trn1 v12.4S, v17.4S, v23.4S -trn2 v24.4S, v17.4S, v23.4S -trn1 v21.4S, v18.4S, v13.4S -trn2 v9.4S, v18.4S, v13.4S -trn2 v18.2D, v12.2D, v21.2D -trn2 v13.2D, v24.2D, v9.2D -trn1 v17.2D, v12.2D, v21.2D -trn1 v23.2D, v24.2D, v9.2D -sqrdmulh v9.4S, v18.4S, v20.4S -mul v18.4S, v18.4S,v19.4S -mla v18.4S, v9.4S, v31.s[0] -sub v9.4s, v17.4s, v18.4s -add v17.4s, v17.4s, v18.4s -sqrdmulh v18.4S, v13.4S, v20.4S -mul v13.4S, v13.4S,v19.4S -mla v13.4S, v18.4S, v31.s[0] -sub v18.4s, v23.4s, v13.4s -add v23.4s, v23.4s, v13.4s -sqrdmulh v13.4S, v23.4S, v29.4S -mul v23.4S, v23.4S,v0.4S -mla v23.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v23.4s -add v17.4s, v17.4s, v23.4s -sqrdmulh v23.4S, v18.4S, v30.4S -mul v18.4S, v18.4S,v5.4S -mla v18.4S, v23.4S, v31.s[0] -sub v23.4s, v9.4s, v18.4s -add v9.4s, v9.4s, v18.4s -str q17, [x0, #512] -str q13, [x0, #528] -str q9, [x0, #544] -str q23, [x0, #560] -ldr q23, [x17, #+1280] -ldr q9, [x17, #+1296] -ldr q13, [x17, #+1312] -ldr q17, [x17, #+1328] -ldr q18, [x17, #+1344] -ldr q24, [x17, #+1360] -ldr q21, [x17, #+1376] -ldr q12, [x17, #+1392] -ldr q30, [x0, #608] -ldr q5, [x0, #624] -ldr q29, [x0, #576] -ldr q0, [x0, #592] -sqrdmulh v20.4S, v30.4S, v9.s[0] -mul v30.4S, v30.4S,v23.s[0] -mla v30.4S, v20.4S, v31.s[0] -sub v20.4s, v29.4s, v30.4s -add v29.4s, v29.4s, v30.4s -sqrdmulh v30.4S, v5.4S, v9.s[0] -mul v5.4S, v5.4S,v23.s[0] -mla v5.4S, v30.4S, v31.s[0] -sub v30.4s, v0.4s, v5.4s -add v0.4s, v0.4s, v5.4s -sqrdmulh v5.4S, v0.4S, v9.s[1] -mul v0.4S, v0.4S,v23.s[1] -mla v0.4S, v5.4S, v31.s[0] -sub v5.4s, v29.4s, v0.4s -add v29.4s, v29.4s, v0.4s -sqrdmulh v0.4S, v30.4S, v9.s[2] -mul v30.4S, v30.4S,v23.s[2] -mla v30.4S, v0.4S, v31.s[0] -sub v0.4s, v20.4s, v30.4s -add v20.4s, v20.4s, v30.4s -trn1 v30.4S, v29.4S, v5.4S -trn2 v19.4S, v29.4S, v5.4S -trn1 v4.4S, v20.4S, v0.4S -trn2 v3.4S, v20.4S, v0.4S -trn2 v20.2D, v30.2D, v4.2D -trn2 v0.2D, v19.2D, v3.2D -trn1 v29.2D, v30.2D, v4.2D -trn1 v5.2D, v19.2D, v3.2D -sqrdmulh v3.4S, v20.4S, v17.4S -mul v20.4S, v20.4S,v13.4S -mla v20.4S, v3.4S, v31.s[0] -sub v3.4s, v29.4s, v20.4s -add v29.4s, v29.4s, v20.4s -sqrdmulh v20.4S, v0.4S, v17.4S -mul v0.4S, v0.4S,v13.4S -mla v0.4S, v20.4S, v31.s[0] -sub v20.4s, v5.4s, v0.4s -add v5.4s, v5.4s, v0.4s -sqrdmulh v0.4S, v5.4S, v24.4S -mul v5.4S, v5.4S,v18.4S -mla v5.4S, v0.4S, v31.s[0] -sub v0.4s, v29.4s, v5.4s -add v29.4s, v29.4s, v5.4s -sqrdmulh v5.4S, v20.4S, v12.4S -mul v20.4S, v20.4S,v21.4S -mla v20.4S, v5.4S, v31.s[0] -sub v5.4s, v3.4s, v20.4s -add v3.4s, v3.4s, v20.4s -str q29, [x0, #576] -str q0, [x0, #592] -str q3, [x0, #608] -str q5, [x0, #624] -ldr q5, [x17, #+1408] -ldr q3, [x17, #+1424] -ldr q0, [x17, #+1440] -ldr q29, [x17, #+1456] -ldr q20, [x17, #+1472] -ldr q19, [x17, #+1488] -ldr q4, [x17, #+1504] -ldr q30, [x17, #+1520] -ldr q12, [x0, #672] -ldr q21, [x0, #688] -ldr q24, [x0, #640] -ldr q18, [x0, #656] -sqrdmulh v17.4S, v12.4S, v3.s[0] -mul v12.4S, v12.4S,v5.s[0] -mla v12.4S, v17.4S, v31.s[0] -sub v17.4s, v24.4s, v12.4s -add v24.4s, v24.4s, v12.4s -sqrdmulh v12.4S, v21.4S, v3.s[0] -mul v21.4S, v21.4S,v5.s[0] -mla v21.4S, v12.4S, v31.s[0] -sub v12.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v18.4S, v3.s[1] -mul v18.4S, v18.4S,v5.s[1] -mla v18.4S, v21.4S, v31.s[0] -sub v21.4s, v24.4s, v18.4s -add v24.4s, v24.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v3.s[2] -mul v12.4S, v12.4S,v5.s[2] -mla v12.4S, v18.4S, v31.s[0] -sub v18.4s, v17.4s, v12.4s -add v17.4s, v17.4s, v12.4s -trn1 v12.4S, v24.4S, v21.4S -trn2 v13.4S, v24.4S, v21.4S -trn1 v9.4S, v17.4S, v18.4S -trn2 v23.4S, v17.4S, v18.4S -trn2 v17.2D, v12.2D, v9.2D -trn2 v18.2D, v13.2D, v23.2D -trn1 v24.2D, v12.2D, v9.2D -trn1 v21.2D, v13.2D, v23.2D -sqrdmulh v23.4S, v17.4S, v29.4S -mul v17.4S, v17.4S,v0.4S -mla v17.4S, v23.4S, v31.s[0] -sub v23.4s, v24.4s, v17.4s -add v24.4s, v24.4s, v17.4s -sqrdmulh v17.4S, v18.4S, v29.4S -mul v18.4S, v18.4S,v0.4S -mla v18.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v18.4s -add v21.4s, v21.4s, v18.4s -sqrdmulh v18.4S, v21.4S, v19.4S -mul v21.4S, v21.4S,v20.4S -mla v21.4S, v18.4S, v31.s[0] -sub v18.4s, v24.4s, v21.4s -add v24.4s, v24.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v30.4S -mul v17.4S, v17.4S,v4.4S -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v23.4s, v17.4s -add v23.4s, v23.4s, v17.4s -str q24, [x0, #640] -str q18, [x0, #656] -str q23, [x0, #672] -str q21, [x0, #688] -ldr q21, [x17, #+1536] -ldr q23, [x17, #+1552] -ldr q18, [x17, #+1568] -ldr q24, [x17, #+1584] -ldr q17, [x17, #+1600] -ldr q13, [x17, #+1616] -ldr q9, [x17, #+1632] -ldr q12, [x17, #+1648] -ldr q30, [x0, #736] -ldr q4, [x0, #752] -ldr q19, [x0, #704] -ldr q20, [x0, #720] -sqrdmulh v29.4S, v30.4S, v23.s[0] -mul v30.4S, v30.4S,v21.s[0] -mla v30.4S, v29.4S, v31.s[0] -sub v29.4s, v19.4s, v30.4s -add v19.4s, v19.4s, v30.4s -sqrdmulh v30.4S, v4.4S, v23.s[0] -mul v4.4S, v4.4S,v21.s[0] -mla v4.4S, v30.4S, v31.s[0] -sub v30.4s, v20.4s, v4.4s -add v20.4s, v20.4s, v4.4s -sqrdmulh v4.4S, v20.4S, v23.s[1] -mul v20.4S, v20.4S,v21.s[1] -mla v20.4S, v4.4S, v31.s[0] -sub v4.4s, v19.4s, v20.4s -add v19.4s, v19.4s, v20.4s -sqrdmulh v20.4S, v30.4S, v23.s[2] -mul v30.4S, v30.4S,v21.s[2] -mla v30.4S, v20.4S, v31.s[0] -sub v20.4s, v29.4s, v30.4s -add v29.4s, v29.4s, v30.4s -trn1 v30.4S, v19.4S, v4.4S -trn2 v0.4S, v19.4S, v4.4S -trn1 v3.4S, v29.4S, v20.4S -trn2 v5.4S, v29.4S, v20.4S -trn2 v29.2D, v30.2D, v3.2D -trn2 v20.2D, v0.2D, v5.2D -trn1 v19.2D, v30.2D, v3.2D -trn1 v4.2D, v0.2D, v5.2D -sqrdmulh v5.4S, v29.4S, v24.4S -mul v29.4S, v29.4S,v18.4S -mla v29.4S, v5.4S, v31.s[0] -sub v5.4s, v19.4s, v29.4s -add v19.4s, v19.4s, v29.4s -sqrdmulh v29.4S, v20.4S, v24.4S -mul v20.4S, v20.4S,v18.4S -mla v20.4S, v29.4S, v31.s[0] -sub v29.4s, v4.4s, v20.4s -add v4.4s, v4.4s, v20.4s -sqrdmulh v20.4S, v4.4S, v13.4S -mul v4.4S, v4.4S,v17.4S -mla v4.4S, v20.4S, v31.s[0] -sub v20.4s, v19.4s, v4.4s -add v19.4s, v19.4s, v4.4s -sqrdmulh v4.4S, v29.4S, v12.4S -mul v29.4S, v29.4S,v9.4S -mla v29.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v29.4s -add v5.4s, v5.4s, v29.4s -str q19, [x0, #704] -str q20, [x0, #720] -str q5, [x0, #736] -str q4, [x0, #752] -ldr q4, [x17, #+1664] -ldr q5, [x17, #+1680] -ldr q20, [x17, #+1696] -ldr q19, [x17, #+1712] -ldr q29, [x17, #+1728] -ldr q0, [x17, #+1744] -ldr q3, [x17, #+1760] -ldr q30, [x17, #+1776] -ldr q12, [x0, #800] -ldr q9, [x0, #816] -ldr q13, [x0, #768] -ldr q17, [x0, #784] -sqrdmulh v24.4S, v12.4S, v5.s[0] -mul v12.4S, v12.4S,v4.s[0] -mla v12.4S, v24.4S, v31.s[0] -sub v24.4s, v13.4s, v12.4s -add v13.4s, v13.4s, v12.4s -sqrdmulh v12.4S, v9.4S, v5.s[0] -mul v9.4S, v9.4S,v4.s[0] -mla v9.4S, v12.4S, v31.s[0] -sub v12.4s, v17.4s, v9.4s -add v17.4s, v17.4s, v9.4s -sqrdmulh v9.4S, v17.4S, v5.s[1] -mul v17.4S, v17.4S,v4.s[1] -mla v17.4S, v9.4S, v31.s[0] -sub v9.4s, v13.4s, v17.4s -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v12.4S, v5.s[2] -mul v12.4S, v12.4S,v4.s[2] -mla v12.4S, v17.4S, v31.s[0] -sub v17.4s, v24.4s, v12.4s -add v24.4s, v24.4s, v12.4s -trn1 v12.4S, v13.4S, v9.4S -trn2 v18.4S, v13.4S, v9.4S -trn1 v23.4S, v24.4S, v17.4S -trn2 v21.4S, v24.4S, v17.4S -trn2 v24.2D, v12.2D, v23.2D -trn2 v17.2D, v18.2D, v21.2D -trn1 v13.2D, v12.2D, v23.2D -trn1 v9.2D, v18.2D, v21.2D -sqrdmulh v21.4S, v24.4S, v19.4S -mul v24.4S, v24.4S,v20.4S -mla v24.4S, v21.4S, v31.s[0] -sub v21.4s, v13.4s, v24.4s -add v13.4s, v13.4s, v24.4s -sqrdmulh v24.4S, v17.4S, v19.4S -mul v17.4S, v17.4S,v20.4S -mla v17.4S, v24.4S, v31.s[0] -sub v24.4s, v9.4s, v17.4s -add v9.4s, v9.4s, v17.4s -sqrdmulh v17.4S, v9.4S, v0.4S -mul v9.4S, v9.4S,v29.4S -mla v9.4S, v17.4S, v31.s[0] -sub v17.4s, v13.4s, v9.4s -add v13.4s, v13.4s, v9.4s -sqrdmulh v9.4S, v24.4S, v30.4S -mul v24.4S, v24.4S,v3.4S -mla v24.4S, v9.4S, v31.s[0] -sub v9.4s, v21.4s, v24.4s -add v21.4s, v21.4s, v24.4s -str q13, [x0, #768] -str q17, [x0, #784] -str q21, [x0, #800] -str q9, [x0, #816] -ldr q9, [x17, #+1792] -ldr q21, [x17, #+1808] -ldr q17, [x17, #+1824] -ldr q13, [x17, #+1840] -ldr q24, [x17, #+1856] -ldr q18, [x17, #+1872] -ldr q23, [x17, #+1888] -ldr q12, [x17, #+1904] -ldr q30, [x0, #864] -ldr q3, [x0, #880] -ldr q0, [x0, #832] -ldr q29, [x0, #848] -sqrdmulh v19.4S, v30.4S, v21.s[0] -mul v30.4S, v30.4S,v9.s[0] -mla v30.4S, v19.4S, v31.s[0] -sub v19.4s, v0.4s, v30.4s -add v0.4s, v0.4s, v30.4s -sqrdmulh v30.4S, v3.4S, v21.s[0] -mul v3.4S, v3.4S,v9.s[0] -mla v3.4S, v30.4S, v31.s[0] -sub v30.4s, v29.4s, v3.4s -add v29.4s, v29.4s, v3.4s -sqrdmulh v3.4S, v29.4S, v21.s[1] -mul v29.4S, v29.4S,v9.s[1] -mla v29.4S, v3.4S, v31.s[0] -sub v3.4s, v0.4s, v29.4s -add v0.4s, v0.4s, v29.4s -sqrdmulh v29.4S, v30.4S, v21.s[2] -mul v30.4S, v30.4S,v9.s[2] -mla v30.4S, v29.4S, v31.s[0] -sub v29.4s, v19.4s, v30.4s -add v19.4s, v19.4s, v30.4s -trn1 v30.4S, v0.4S, v3.4S -trn2 v20.4S, v0.4S, v3.4S -trn1 v5.4S, v19.4S, v29.4S -trn2 v4.4S, v19.4S, v29.4S -trn2 v19.2D, v30.2D, v5.2D -trn2 v29.2D, v20.2D, v4.2D -trn1 v0.2D, v30.2D, v5.2D -trn1 v3.2D, v20.2D, v4.2D -sqrdmulh v4.4S, v19.4S, v13.4S -mul v19.4S, v19.4S,v17.4S -mla v19.4S, v4.4S, v31.s[0] -sub v4.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v29.4S, v13.4S -mul v29.4S, v29.4S,v17.4S -mla v29.4S, v19.4S, v31.s[0] -sub v19.4s, v3.4s, v29.4s -add v3.4s, v3.4s, v29.4s -sqrdmulh v29.4S, v3.4S, v18.4S -mul v3.4S, v3.4S,v24.4S -mla v3.4S, v29.4S, v31.s[0] -sub v29.4s, v0.4s, v3.4s -add v0.4s, v0.4s, v3.4s -sqrdmulh v3.4S, v19.4S, v12.4S -mul v19.4S, v19.4S,v23.4S -mla v19.4S, v3.4S, v31.s[0] -sub v3.4s, v4.4s, v19.4s -add v4.4s, v4.4s, v19.4s -str q0, [x0, #832] -str q29, [x0, #848] -str q4, [x0, #864] -str q3, [x0, #880] -ldr q3, [x17, #+1920] -ldr q4, [x17, #+1936] -ldr q29, [x17, #+1952] -ldr q0, [x17, #+1968] -ldr q19, [x17, #+1984] -ldr q20, [x17, #+2000] -ldr q5, [x17, #+2016] -ldr q30, [x17, #+2032] -ldr q12, [x0, #928] -ldr q23, [x0, #944] -ldr q18, [x0, #896] -ldr q24, [x0, #912] -sqrdmulh v13.4S, v12.4S, v4.s[0] -mul v12.4S, v12.4S,v3.s[0] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v18.4s, v12.4s -add v18.4s, v18.4s, v12.4s -sqrdmulh v12.4S, v23.4S, v4.s[0] -mul v23.4S, v23.4S,v3.s[0] -mla v23.4S, v12.4S, v31.s[0] -sub v12.4s, v24.4s, v23.4s -add v24.4s, v24.4s, v23.4s -sqrdmulh v23.4S, v24.4S, v4.s[1] -mul v24.4S, v24.4S,v3.s[1] -mla v24.4S, v23.4S, v31.s[0] -sub v23.4s, v18.4s, v24.4s -add v18.4s, v18.4s, v24.4s -sqrdmulh v24.4S, v12.4S, v4.s[2] -mul v12.4S, v12.4S,v3.s[2] -mla v12.4S, v24.4S, v31.s[0] -sub v24.4s, v13.4s, v12.4s -add v13.4s, v13.4s, v12.4s -trn1 v12.4S, v18.4S, v23.4S -trn2 v17.4S, v18.4S, v23.4S -trn1 v21.4S, v13.4S, v24.4S -trn2 v9.4S, v13.4S, v24.4S -trn2 v13.2D, v12.2D, v21.2D -trn2 v24.2D, v17.2D, v9.2D -trn1 v18.2D, v12.2D, v21.2D -trn1 v23.2D, v17.2D, v9.2D -sqrdmulh v9.4S, v13.4S, v0.4S -mul v13.4S, v13.4S,v29.4S -mla v13.4S, v9.4S, v31.s[0] -sub v9.4s, v18.4s, v13.4s -add v18.4s, v18.4s, v13.4s -sqrdmulh v13.4S, v24.4S, v0.4S -mul v24.4S, v24.4S,v29.4S -mla v24.4S, v13.4S, v31.s[0] -sub v13.4s, v23.4s, v24.4s -add v23.4s, v23.4s, v24.4s -sqrdmulh v24.4S, v23.4S, v20.4S -mul v23.4S, v23.4S,v19.4S -mla v23.4S, v24.4S, v31.s[0] -sub v24.4s, v18.4s, v23.4s -add v18.4s, v18.4s, v23.4s -sqrdmulh v23.4S, v13.4S, v30.4S -mul v13.4S, v13.4S,v5.4S -mla v13.4S, v23.4S, v31.s[0] -sub v23.4s, v9.4s, v13.4s -add v9.4s, v9.4s, v13.4s -str q18, [x0, #896] -str q24, [x0, #912] -str q9, [x0, #928] -str q23, [x0, #944] -ldr q23, [x17, #+2048] -ldr q9, [x17, #+2064] -ldr q24, [x17, #+2080] -ldr q18, [x17, #+2096] -ldr q13, [x17, #+2112] -ldr q17, [x17, #+2128] -ldr q21, [x17, #+2144] -ldr q12, [x17, #+2160] -ldr q30, [x0, #992] -ldr q5, [x0, #1008] -ldr q20, [x0, #960] -ldr q19, [x0, #976] -sqrdmulh v0.4S, v30.4S, v9.s[0] -mul v30.4S, v30.4S,v23.s[0] -mla v30.4S, v0.4S, v31.s[0] -sub v0.4s, v20.4s, v30.4s -add v20.4s, v20.4s, v30.4s -sqrdmulh v30.4S, v5.4S, v9.s[0] -mul v5.4S, v5.4S,v23.s[0] -mla v5.4S, v30.4S, v31.s[0] -sub v30.4s, v19.4s, v5.4s -add v19.4s, v19.4s, v5.4s -sqrdmulh v5.4S, v19.4S, v9.s[1] -mul v19.4S, v19.4S,v23.s[1] -mla v19.4S, v5.4S, v31.s[0] -sub v5.4s, v20.4s, v19.4s -add v20.4s, v20.4s, v19.4s -sqrdmulh v19.4S, v30.4S, v9.s[2] -mul v30.4S, v30.4S,v23.s[2] -mla v30.4S, v19.4S, v31.s[0] -sub v19.4s, v0.4s, v30.4s -add v0.4s, v0.4s, v30.4s -trn1 v30.4S, v20.4S, v5.4S -trn2 v29.4S, v20.4S, v5.4S -trn1 v4.4S, v0.4S, v19.4S -trn2 v3.4S, v0.4S, v19.4S -trn2 v0.2D, v30.2D, v4.2D -trn2 v19.2D, v29.2D, v3.2D -trn1 v20.2D, v30.2D, v4.2D -trn1 v5.2D, v29.2D, v3.2D -sqrdmulh v3.4S, v0.4S, v18.4S -mul v0.4S, v0.4S,v24.4S -mla v0.4S, v3.4S, v31.s[0] -sub v3.4s, v20.4s, v0.4s -add v20.4s, v20.4s, v0.4s -sqrdmulh v0.4S, v19.4S, v18.4S -mul v19.4S, v19.4S,v24.4S -mla v19.4S, v0.4S, v31.s[0] -sub v0.4s, v5.4s, v19.4s -add v5.4s, v5.4s, v19.4s -sqrdmulh v19.4S, v5.4S, v17.4S -mul v5.4S, v5.4S,v13.4S -mla v5.4S, v19.4S, v31.s[0] -sub v19.4s, v20.4s, v5.4s -add v20.4s, v20.4s, v5.4s -sqrdmulh v5.4S, v0.4S, v12.4S -mul v0.4S, v0.4S,v21.4S -mla v0.4S, v5.4S, v31.s[0] -sub v5.4s, v3.4s, v0.4s -add v3.4s, v3.4s, v0.4s -str q20, [x0, #960] -str q19, [x0, #976] -str q3, [x0, #992] -str q5, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2476 -// Instruction count: 2472 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_17_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_17_0.s deleted file mode 100644 index eeedda8..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_17_0.s +++ /dev/null @@ -1,2486 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_17_0 -.global _ntt_u32_full_neon_asm_var_4_4_17_0 -ntt_u32_full_neon_asm_var_4_4_17_0: -_ntt_u32_full_neon_asm_var_4_4_17_0: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x0, #992] -sqrdmulh v27.4S, v28.4S, v29.s[0] -mul v28.4S, v28.4S,v30.s[0] -ldr q26, [x0, #928] -sqrdmulh v25.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -ldr q24, [x0, #864] -sqrdmulh v23.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -ldr q22, [x0, #800] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #736] -mla v28.4S, v27.4S, v31.s[0] -sqrdmulh v27.4S, v20.4S, v29.s[0] -ldr q19, [x0, #672] -mla v26.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v19.4S, v29.s[0] -ldr q18, [x0, #608] -mla v24.4S, v23.4S, v31.s[0] -sqrdmulh v23.4S, v18.4S, v29.s[0] -ldr q17, [x0, #544] -mla v22.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v17.4S, v29.s[0] -ldr q16, [x0, #480] -ldr q3, [x0, #416] -mul v20.4S, v20.4S,v30.s[0] -sub v2.4s, v16.4s, v28.4s -mul v19.4S, v19.4S,v30.s[0] -add v16.4s, v16.4s, v28.4s -ldr q28, [x0, #352] -ldr q1, [x0, #288] -mla v20.4S, v27.4S, v31.s[0] -sub v27.4s, v3.4s, v26.4s -mla v19.4S, v25.4S, v31.s[0] -add v3.4s, v3.4s, v26.4s -ldr q26, [x0, #224] -ldr q25, [x0, #160] -mul v18.4S, v18.4S,v30.s[0] -sub v0.4s, v28.4s, v24.4s -mul v17.4S, v17.4S,v30.s[0] -add v28.4s, v28.4s, v24.4s -ldr q24, [x0, #96] -ldr q15, [x0, #32] -mla v18.4S, v23.4S, v31.s[0] -sub v23.4s, v1.4s, v22.4s -mla v17.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v2.4S, v29.s[2] -nop -mul v2.4S, v2.4S,v30.s[2] -nop -sqrdmulh v21.4S, v27.4S, v29.s[2] -sub v14.4s, v26.4s, v20.4s -mul v27.4S, v27.4S,v30.s[2] -add v26.4s, v26.4s, v20.4s -sqrdmulh v20.4S, v16.4S, v29.s[1] -sub v13.4s, v25.4s, v19.4s -mul v16.4S, v16.4S,v30.s[1] -add v25.4s, v25.4s, v19.4s -sqrdmulh v19.4S, v3.4S, v29.s[1] -sub v12.4s, v24.4s, v18.4s -mul v3.4S, v3.4S,v30.s[1] -add v24.4s, v24.4s, v18.4s -mla v2.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v17.4s -sqrdmulh v18.4S, v0.4S, v29.s[2] -add v15.4s, v15.4s, v17.4s -mla v27.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v23.4S, v29.s[2] -nop -mla v16.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v28.4S, v29.s[1] -nop -mla v3.4S, v19.4S, v31.s[0] -nop -sqrdmulh v19.4S, v1.4S, v29.s[1] -nop -ldr q17, [x17, #+32] -ldr q11, [x17, #+48] -mul v0.4S, v0.4S,v30.s[2] -sub v10.4s, v14.4s, v2.4s -mul v23.4S, v23.4S,v30.s[2] -add v14.4s, v14.4s, v2.4s -mla v0.4S, v18.4S, v31.s[0] -sub v18.4s, v13.4s, v27.4s -mla v23.4S, v21.4S, v31.s[0] -add v13.4s, v13.4s, v27.4s -mul v28.4S, v28.4S,v30.s[1] -sub v27.4s, v26.4s, v16.4s -mul v1.4S, v1.4S,v30.s[1] -add v26.4s, v26.4s, v16.4s -mla v28.4S, v20.4S, v31.s[0] -sub v20.4s, v25.4s, v3.4s -mla v1.4S, v19.4S, v31.s[0] -add v25.4s, v25.4s, v3.4s -sqrdmulh v3.4S, v10.4S, v11.s[3] -nop -mul v10.4S, v10.4S,v17.s[3] -nop -sqrdmulh v19.4S, v14.4S, v11.s[2] -sub v16.4s, v12.4s, v0.4s -mul v14.4S, v14.4S,v17.s[2] -add v12.4s, v12.4s, v0.4s -sqrdmulh v0.4S, v27.4S, v11.s[1] -sub v21.4s, v22.4s, v23.4s -mul v27.4S, v27.4S,v17.s[1] -add v22.4s, v22.4s, v23.4s -sqrdmulh v23.4S, v26.4S, v11.s[0] -sub v2.4s, v24.4s, v28.4s -mul v26.4S, v26.4S,v17.s[0] -add v24.4s, v24.4s, v28.4s -ldr q28, [x17, #+96] -ldr q9, [x17, #+112] -mla v10.4S, v3.4S, v31.s[0] -sub v3.4s, v15.4s, v1.4s -sqrdmulh v8.4S, v18.4S, v11.s[3] -add v15.4s, v15.4s, v1.4s -mla v14.4S, v19.4S, v31.s[0] -nop -sqrdmulh v19.4S, v13.4S, v11.s[2] -nop -mla v27.4S, v0.4S, v31.s[0] -nop -sqrdmulh v0.4S, v20.4S, v11.s[1] -nop -mla v26.4S, v23.4S, v31.s[0] -nop -sqrdmulh v23.4S, v25.4S, v11.s[0] -nop -ldr q1, [x17, #+64] -ldr q7, [x17, #+80] -mul v18.4S, v18.4S,v17.s[3] -sub v6.4s, v16.4s, v10.4s -mul v13.4S, v13.4S,v17.s[2] -add v16.4s, v16.4s, v10.4s -mla v18.4S, v8.4S, v31.s[0] -sub v8.4s, v12.4s, v14.4s -mla v13.4S, v19.4S, v31.s[0] -add v12.4s, v12.4s, v14.4s -mul v20.4S, v20.4S,v17.s[1] -sub v14.4s, v2.4s, v27.4s -mul v25.4S, v25.4S,v17.s[0] -add v2.4s, v2.4s, v27.4s -mla v20.4S, v0.4S, v31.s[0] -sub v0.4s, v24.4s, v26.4s -mla v25.4S, v23.4S, v31.s[0] -add v24.4s, v24.4s, v26.4s -sqrdmulh v26.4S, v6.4S, v9.s[3] -nop -mul v6.4S, v6.4S,v28.s[3] -nop -sqrdmulh v23.4S, v16.4S, v9.s[2] -sub v27.4s, v21.4s, v18.4s -mul v16.4S, v16.4S,v28.s[2] -add v21.4s, v21.4s, v18.4s -sqrdmulh v18.4S, v8.4S, v9.s[1] -sub v19.4s, v22.4s, v13.4s -mul v8.4S, v8.4S,v28.s[1] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v12.4S, v9.s[0] -sub v10.4s, v3.4s, v20.4s -mul v12.4S, v12.4S,v28.s[0] -add v3.4s, v3.4s, v20.4s -mla v6.4S, v26.4S, v31.s[0] -sub v26.4s, v15.4s, v25.4s -sqrdmulh v20.4S, v14.4S, v7.s[3] -add v15.4s, v15.4s, v25.4s -mla v16.4S, v23.4S, v31.s[0] -sub v23.4s, v27.4s, v6.4s -sqrdmulh v25.4S, v2.4S, v7.s[2] -add v27.4s, v27.4s, v6.4s -mla v8.4S, v18.4S, v31.s[0] -sub v18.4s, v21.4s, v16.4s -sqrdmulh v6.4S, v0.4S, v7.s[1] -add v21.4s, v21.4s, v16.4s -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v19.4s, v8.4s -sqrdmulh v16.4S, v24.4S, v7.s[0] -add v19.4s, v19.4s, v8.4s -mul v14.4S, v14.4S,v1.s[3] -sub v8.4s, v22.4s, v12.4s -mul v2.4S, v2.4S,v1.s[2] -add v22.4s, v22.4s, v12.4s -mla v14.4S, v20.4S, v31.s[0] -str q23, [x0, #992] -mla v2.4S, v25.4S, v31.s[0] -str q27, [x0, #928] -mul v0.4S, v0.4S,v1.s[1] -str q18, [x0, #864] -mul v24.4S, v24.4S,v1.s[0] -str q21, [x0, #800] -mla v0.4S, v6.4S, v31.s[0] -str q13, [x0, #736] -mla v24.4S, v16.4S, v31.s[0] -str q19, [x0, #672] -ldr q19, [x0, #1008] -sqrdmulh v16.4S, v19.4S, v29.s[0] -str q8, [x0, #608] -mul v19.4S, v19.4S,v30.s[0] -sub v8.4s, v10.4s, v14.4s -ldr q13, [x0, #944] -sqrdmulh v6.4S, v13.4S, v29.s[0] -str q22, [x0, #544] -mul v13.4S, v13.4S,v30.s[0] -add v10.4s, v10.4s, v14.4s -ldr q14, [x0, #880] -sqrdmulh v22.4S, v14.4S, v29.s[0] -str q8, [x0, #480] -mul v14.4S, v14.4S,v30.s[0] -sub v8.4s, v3.4s, v2.4s -ldr q21, [x0, #816] -sqrdmulh v18.4S, v21.4S, v29.s[0] -str q10, [x0, #416] -mul v21.4S, v21.4S,v30.s[0] -add v3.4s, v3.4s, v2.4s -ldr q2, [x0, #752] -mla v19.4S, v16.4S, v31.s[0] -str q8, [x0, #352] -sqrdmulh v8.4S, v2.4S, v29.s[0] -sub v16.4s, v26.4s, v0.4s -ldr q10, [x0, #688] -mla v13.4S, v6.4S, v31.s[0] -str q3, [x0, #288] -sqrdmulh v3.4S, v10.4S, v29.s[0] -add v26.4s, v26.4s, v0.4s -ldr q0, [x0, #624] -mla v14.4S, v22.4S, v31.s[0] -str q16, [x0, #224] -sqrdmulh v16.4S, v0.4S, v29.s[0] -sub v22.4s, v15.4s, v24.4s -ldr q6, [x0, #560] -mla v21.4S, v18.4S, v31.s[0] -str q26, [x0, #160] -sqrdmulh v26.4S, v6.4S, v29.s[0] -add v15.4s, v15.4s, v24.4s -ldr q24, [x0, #496] -ldr q18, [x0, #432] -mul v2.4S, v2.4S,v30.s[0] -sub v27.4s, v24.4s, v19.4s -mul v10.4S, v10.4S,v30.s[0] -add v24.4s, v24.4s, v19.4s -ldr q19, [x0, #368] -ldr q25, [x0, #304] -mla v2.4S, v8.4S, v31.s[0] -sub v8.4s, v18.4s, v13.4s -mla v10.4S, v3.4S, v31.s[0] -add v18.4s, v18.4s, v13.4s -ldr q13, [x0, #240] -ldr q3, [x0, #176] -mul v0.4S, v0.4S,v30.s[0] -sub v23.4s, v19.4s, v14.4s -mul v6.4S, v6.4S,v30.s[0] -add v19.4s, v19.4s, v14.4s -ldr q14, [x0, #112] -ldr q20, [x0, #48] -mla v0.4S, v16.4S, v31.s[0] -sub v16.4s, v25.4s, v21.4s -mla v6.4S, v26.4S, v31.s[0] -add v25.4s, v25.4s, v21.4s -sqrdmulh v21.4S, v27.4S, v29.s[2] -nop -mul v27.4S, v27.4S,v30.s[2] -nop -sqrdmulh v26.4S, v8.4S, v29.s[2] -sub v12.4s, v13.4s, v2.4s -mul v8.4S, v8.4S,v30.s[2] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v24.4S, v29.s[1] -sub v5.4s, v3.4s, v10.4s -mul v24.4S, v24.4S,v30.s[1] -add v3.4s, v3.4s, v10.4s -sqrdmulh v10.4S, v18.4S, v29.s[1] -sub v4.4s, v14.4s, v0.4s -mul v18.4S, v18.4S,v30.s[1] -add v14.4s, v14.4s, v0.4s -mla v27.4S, v21.4S, v31.s[0] -sub v21.4s, v20.4s, v6.4s -sqrdmulh v0.4S, v23.4S, v29.s[2] -add v20.4s, v20.4s, v6.4s -mla v8.4S, v26.4S, v31.s[0] -str q22, [x0, #96] -sqrdmulh v22.4S, v16.4S, v29.s[2] -nop -mla v24.4S, v2.4S, v31.s[0] -str q15, [x0, #32] -sqrdmulh v15.4S, v19.4S, v29.s[1] -nop -mla v18.4S, v10.4S, v31.s[0] -nop -sqrdmulh v10.4S, v25.4S, v29.s[1] -nop -mul v23.4S, v23.4S,v30.s[2] -sub v2.4s, v12.4s, v27.4s -mul v16.4S, v16.4S,v30.s[2] -add v12.4s, v12.4s, v27.4s -mla v23.4S, v0.4S, v31.s[0] -sub v0.4s, v5.4s, v8.4s -mla v16.4S, v22.4S, v31.s[0] -add v5.4s, v5.4s, v8.4s -mul v19.4S, v19.4S,v30.s[1] -sub v8.4s, v13.4s, v24.4s -mul v25.4S, v25.4S,v30.s[1] -add v13.4s, v13.4s, v24.4s -mla v19.4S, v15.4S, v31.s[0] -sub v15.4s, v3.4s, v18.4s -mla v25.4S, v10.4S, v31.s[0] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v11.s[3] -nop -mul v2.4S, v2.4S,v17.s[3] -nop -sqrdmulh v10.4S, v12.4S, v11.s[2] -sub v24.4s, v4.4s, v23.4s -mul v12.4S, v12.4S,v17.s[2] -add v4.4s, v4.4s, v23.4s -sqrdmulh v23.4S, v8.4S, v11.s[1] -sub v22.4s, v21.4s, v16.4s -mul v8.4S, v8.4S,v17.s[1] -add v21.4s, v21.4s, v16.4s -sqrdmulh v16.4S, v13.4S, v11.s[0] -sub v27.4s, v14.4s, v19.4s -mul v13.4S, v13.4S,v17.s[0] -add v14.4s, v14.4s, v19.4s -mla v2.4S, v18.4S, v31.s[0] -sub v18.4s, v20.4s, v25.4s -sqrdmulh v19.4S, v0.4S, v11.s[3] -add v20.4s, v20.4s, v25.4s -mla v12.4S, v10.4S, v31.s[0] -nop -sqrdmulh v10.4S, v5.4S, v11.s[2] -nop -mla v8.4S, v23.4S, v31.s[0] -nop -sqrdmulh v23.4S, v15.4S, v11.s[1] -nop -mla v13.4S, v16.4S, v31.s[0] -nop -sqrdmulh v16.4S, v3.4S, v11.s[0] -nop -mul v0.4S, v0.4S,v17.s[3] -sub v25.4s, v24.4s, v2.4s -mul v5.4S, v5.4S,v17.s[2] -add v24.4s, v24.4s, v2.4s -mla v0.4S, v19.4S, v31.s[0] -sub v19.4s, v4.4s, v12.4s -mla v5.4S, v10.4S, v31.s[0] -add v4.4s, v4.4s, v12.4s -mul v15.4S, v15.4S,v17.s[1] -sub v12.4s, v27.4s, v8.4s -mul v3.4S, v3.4S,v17.s[0] -add v27.4s, v27.4s, v8.4s -mla v15.4S, v23.4S, v31.s[0] -sub v23.4s, v14.4s, v13.4s -mla v3.4S, v16.4S, v31.s[0] -add v14.4s, v14.4s, v13.4s -sqrdmulh v13.4S, v25.4S, v9.s[3] -nop -mul v25.4S, v25.4S,v28.s[3] -nop -sqrdmulh v16.4S, v24.4S, v9.s[2] -sub v8.4s, v22.4s, v0.4s -mul v24.4S, v24.4S,v28.s[2] -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v19.4S, v9.s[1] -sub v10.4s, v21.4s, v5.4s -mul v19.4S, v19.4S,v28.s[1] -add v21.4s, v21.4s, v5.4s -sqrdmulh v5.4S, v4.4S, v9.s[0] -sub v2.4s, v18.4s, v15.4s -mul v4.4S, v4.4S,v28.s[0] -add v18.4s, v18.4s, v15.4s -mla v25.4S, v13.4S, v31.s[0] -sub v13.4s, v20.4s, v3.4s -sqrdmulh v15.4S, v12.4S, v7.s[3] -add v20.4s, v20.4s, v3.4s -mla v24.4S, v16.4S, v31.s[0] -sub v16.4s, v8.4s, v25.4s -sqrdmulh v3.4S, v27.4S, v7.s[2] -add v8.4s, v8.4s, v25.4s -mla v19.4S, v0.4S, v31.s[0] -sub v0.4s, v22.4s, v24.4s -sqrdmulh v25.4S, v23.4S, v7.s[1] -add v22.4s, v22.4s, v24.4s -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v10.4s, v19.4s -sqrdmulh v24.4S, v14.4S, v7.s[0] -add v10.4s, v10.4s, v19.4s -mul v12.4S, v12.4S,v1.s[3] -sub v19.4s, v21.4s, v4.4s -mul v27.4S, v27.4S,v1.s[2] -add v21.4s, v21.4s, v4.4s -mla v12.4S, v15.4S, v31.s[0] -str q16, [x0, #1008] -mla v27.4S, v3.4S, v31.s[0] -str q8, [x0, #944] -mul v23.4S, v23.4S,v1.s[1] -str q0, [x0, #880] -mul v14.4S, v14.4S,v1.s[0] -str q22, [x0, #816] -mla v23.4S, v25.4S, v31.s[0] -str q5, [x0, #752] -mla v14.4S, v24.4S, v31.s[0] -str q10, [x0, #688] -ldr q10, [x0, #960] -sqrdmulh v24.4S, v10.4S, v29.s[0] -str q19, [x0, #624] -mul v10.4S, v10.4S,v30.s[0] -sub v19.4s, v2.4s, v12.4s -ldr q5, [x0, #896] -sqrdmulh v25.4S, v5.4S, v29.s[0] -str q21, [x0, #560] -mul v5.4S, v5.4S,v30.s[0] -add v2.4s, v2.4s, v12.4s -ldr q12, [x0, #832] -sqrdmulh v21.4S, v12.4S, v29.s[0] -str q19, [x0, #496] -mul v12.4S, v12.4S,v30.s[0] -sub v19.4s, v18.4s, v27.4s -ldr q22, [x0, #768] -sqrdmulh v0.4S, v22.4S, v29.s[0] -str q2, [x0, #432] -mul v22.4S, v22.4S,v30.s[0] -add v18.4s, v18.4s, v27.4s -ldr q27, [x0, #704] -mla v10.4S, v24.4S, v31.s[0] -str q19, [x0, #368] -sqrdmulh v19.4S, v27.4S, v29.s[0] -sub v24.4s, v13.4s, v23.4s -ldr q2, [x0, #640] -mla v5.4S, v25.4S, v31.s[0] -str q18, [x0, #304] -sqrdmulh v18.4S, v2.4S, v29.s[0] -add v13.4s, v13.4s, v23.4s -ldr q23, [x0, #576] -mla v12.4S, v21.4S, v31.s[0] -str q24, [x0, #240] -sqrdmulh v24.4S, v23.4S, v29.s[0] -sub v21.4s, v20.4s, v14.4s -ldr q25, [x0, #512] -mla v22.4S, v0.4S, v31.s[0] -str q13, [x0, #176] -sqrdmulh v13.4S, v25.4S, v29.s[0] -add v20.4s, v20.4s, v14.4s -ldr q14, [x0, #448] -ldr q0, [x0, #384] -mul v27.4S, v27.4S,v30.s[0] -sub v8.4s, v14.4s, v10.4s -mul v2.4S, v2.4S,v30.s[0] -add v14.4s, v14.4s, v10.4s -ldr q10, [x0, #320] -ldr q3, [x0, #256] -mla v27.4S, v19.4S, v31.s[0] -sub v19.4s, v0.4s, v5.4s -mla v2.4S, v18.4S, v31.s[0] -add v0.4s, v0.4s, v5.4s -ldr q5, [x0, #192] -ldr q18, [x0, #128] -mul v23.4S, v23.4S,v30.s[0] -sub v16.4s, v10.4s, v12.4s -mul v25.4S, v25.4S,v30.s[0] -add v10.4s, v10.4s, v12.4s -ldr q12, [x0, #64] -ldr q15, [x0, #0] -mla v23.4S, v24.4S, v31.s[0] -sub v24.4s, v3.4s, v22.4s -mla v25.4S, v13.4S, v31.s[0] -add v3.4s, v3.4s, v22.4s -sqrdmulh v22.4S, v8.4S, v29.s[2] -nop -mul v8.4S, v8.4S,v30.s[2] -nop -sqrdmulh v13.4S, v19.4S, v29.s[2] -sub v4.4s, v5.4s, v27.4s -mul v19.4S, v19.4S,v30.s[2] -add v5.4s, v5.4s, v27.4s -sqrdmulh v27.4S, v14.4S, v29.s[1] -sub v26.4s, v18.4s, v2.4s -mul v14.4S, v14.4S,v30.s[1] -add v18.4s, v18.4s, v2.4s -sqrdmulh v2.4S, v0.4S, v29.s[1] -sub v6.4s, v12.4s, v23.4s -mul v0.4S, v0.4S,v30.s[1] -add v12.4s, v12.4s, v23.4s -mla v8.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v25.4s -sqrdmulh v23.4S, v16.4S, v29.s[2] -add v15.4s, v15.4s, v25.4s -mla v19.4S, v13.4S, v31.s[0] -str q21, [x0, #112] -sqrdmulh v21.4S, v24.4S, v29.s[2] -nop -mla v14.4S, v27.4S, v31.s[0] -str q20, [x0, #48] -sqrdmulh v20.4S, v10.4S, v29.s[1] -nop -mla v0.4S, v2.4S, v31.s[0] -nop -sqrdmulh v2.4S, v3.4S, v29.s[1] -nop -mul v16.4S, v16.4S,v30.s[2] -sub v27.4s, v4.4s, v8.4s -mul v24.4S, v24.4S,v30.s[2] -add v4.4s, v4.4s, v8.4s -mla v16.4S, v23.4S, v31.s[0] -sub v23.4s, v26.4s, v19.4s -mla v24.4S, v21.4S, v31.s[0] -add v26.4s, v26.4s, v19.4s -mul v10.4S, v10.4S,v30.s[1] -sub v19.4s, v5.4s, v14.4s -mul v3.4S, v3.4S,v30.s[1] -add v5.4s, v5.4s, v14.4s -mla v10.4S, v20.4S, v31.s[0] -sub v20.4s, v18.4s, v0.4s -mla v3.4S, v2.4S, v31.s[0] -add v18.4s, v18.4s, v0.4s -sqrdmulh v0.4S, v27.4S, v11.s[3] -nop -mul v27.4S, v27.4S,v17.s[3] -nop -sqrdmulh v2.4S, v4.4S, v11.s[2] -sub v14.4s, v6.4s, v16.4s -mul v4.4S, v4.4S,v17.s[2] -add v6.4s, v6.4s, v16.4s -sqrdmulh v16.4S, v19.4S, v11.s[1] -sub v21.4s, v22.4s, v24.4s -mul v19.4S, v19.4S,v17.s[1] -add v22.4s, v22.4s, v24.4s -sqrdmulh v24.4S, v5.4S, v11.s[0] -sub v8.4s, v12.4s, v10.4s -mul v5.4S, v5.4S,v17.s[0] -add v12.4s, v12.4s, v10.4s -mla v27.4S, v0.4S, v31.s[0] -sub v0.4s, v15.4s, v3.4s -sqrdmulh v10.4S, v23.4S, v11.s[3] -add v15.4s, v15.4s, v3.4s -mla v4.4S, v2.4S, v31.s[0] -nop -sqrdmulh v2.4S, v26.4S, v11.s[2] -nop -mla v19.4S, v16.4S, v31.s[0] -nop -sqrdmulh v16.4S, v20.4S, v11.s[1] -nop -mla v5.4S, v24.4S, v31.s[0] -nop -sqrdmulh v24.4S, v18.4S, v11.s[0] -nop -mul v23.4S, v23.4S,v17.s[3] -sub v3.4s, v14.4s, v27.4s -mul v26.4S, v26.4S,v17.s[2] -add v14.4s, v14.4s, v27.4s -mla v23.4S, v10.4S, v31.s[0] -sub v10.4s, v6.4s, v4.4s -mla v26.4S, v2.4S, v31.s[0] -add v6.4s, v6.4s, v4.4s -mul v20.4S, v20.4S,v17.s[1] -sub v4.4s, v8.4s, v19.4s -mul v18.4S, v18.4S,v17.s[0] -add v8.4s, v8.4s, v19.4s -mla v20.4S, v16.4S, v31.s[0] -sub v16.4s, v12.4s, v5.4s -mla v18.4S, v24.4S, v31.s[0] -add v12.4s, v12.4s, v5.4s -sqrdmulh v5.4S, v3.4S, v9.s[3] -nop -mul v3.4S, v3.4S,v28.s[3] -nop -sqrdmulh v24.4S, v14.4S, v9.s[2] -sub v19.4s, v21.4s, v23.4s -mul v14.4S, v14.4S,v28.s[2] -add v21.4s, v21.4s, v23.4s -sqrdmulh v23.4S, v10.4S, v9.s[1] -sub v2.4s, v22.4s, v26.4s -mul v10.4S, v10.4S,v28.s[1] -add v22.4s, v22.4s, v26.4s -sqrdmulh v26.4S, v6.4S, v9.s[0] -sub v27.4s, v0.4s, v20.4s -mul v6.4S, v6.4S,v28.s[0] -add v0.4s, v0.4s, v20.4s -mla v3.4S, v5.4S, v31.s[0] -sub v5.4s, v15.4s, v18.4s -sqrdmulh v20.4S, v4.4S, v7.s[3] -add v15.4s, v15.4s, v18.4s -mla v14.4S, v24.4S, v31.s[0] -sub v24.4s, v19.4s, v3.4s -sqrdmulh v18.4S, v8.4S, v7.s[2] -add v19.4s, v19.4s, v3.4s -mla v10.4S, v23.4S, v31.s[0] -sub v23.4s, v21.4s, v14.4s -sqrdmulh v3.4S, v16.4S, v7.s[1] -add v21.4s, v21.4s, v14.4s -mla v6.4S, v26.4S, v31.s[0] -sub v26.4s, v2.4s, v10.4s -sqrdmulh v14.4S, v12.4S, v7.s[0] -add v2.4s, v2.4s, v10.4s -mul v4.4S, v4.4S,v1.s[3] -sub v10.4s, v22.4s, v6.4s -mul v8.4S, v8.4S,v1.s[2] -add v22.4s, v22.4s, v6.4s -mla v4.4S, v20.4S, v31.s[0] -str q24, [x0, #960] -mla v8.4S, v18.4S, v31.s[0] -str q19, [x0, #896] -mul v16.4S, v16.4S,v1.s[1] -str q23, [x0, #832] -mul v12.4S, v12.4S,v1.s[0] -str q21, [x0, #768] -mla v16.4S, v3.4S, v31.s[0] -str q26, [x0, #704] -mla v12.4S, v14.4S, v31.s[0] -str q2, [x0, #640] -ldr q2, [x0, #976] -sqrdmulh v14.4S, v2.4S, v29.s[0] -str q10, [x0, #576] -mul v2.4S, v2.4S,v30.s[0] -sub v10.4s, v27.4s, v4.4s -ldr q26, [x0, #912] -sqrdmulh v3.4S, v26.4S, v29.s[0] -str q22, [x0, #512] -mul v26.4S, v26.4S,v30.s[0] -add v27.4s, v27.4s, v4.4s -ldr q4, [x0, #848] -sqrdmulh v22.4S, v4.4S, v29.s[0] -str q10, [x0, #448] -mul v4.4S, v4.4S,v30.s[0] -sub v10.4s, v0.4s, v8.4s -ldr q21, [x0, #784] -sqrdmulh v23.4S, v21.4S, v29.s[0] -str q27, [x0, #384] -mul v21.4S, v21.4S,v30.s[0] -add v0.4s, v0.4s, v8.4s -ldr q8, [x0, #720] -mla v2.4S, v14.4S, v31.s[0] -str q10, [x0, #320] -sqrdmulh v10.4S, v8.4S, v29.s[0] -sub v14.4s, v5.4s, v16.4s -ldr q27, [x0, #656] -mla v26.4S, v3.4S, v31.s[0] -str q0, [x0, #256] -sqrdmulh v0.4S, v27.4S, v29.s[0] -add v5.4s, v5.4s, v16.4s -ldr q16, [x0, #592] -mla v4.4S, v22.4S, v31.s[0] -str q14, [x0, #192] -sqrdmulh v14.4S, v16.4S, v29.s[0] -sub v22.4s, v15.4s, v12.4s -ldr q3, [x0, #528] -mla v21.4S, v23.4S, v31.s[0] -str q5, [x0, #128] -sqrdmulh v5.4S, v3.4S, v29.s[0] -add v15.4s, v15.4s, v12.4s -ldr q12, [x0, #464] -ldr q23, [x0, #400] -mul v8.4S, v8.4S,v30.s[0] -sub v19.4s, v12.4s, v2.4s -mul v27.4S, v27.4S,v30.s[0] -add v12.4s, v12.4s, v2.4s -ldr q2, [x0, #336] -ldr q18, [x0, #272] -mla v8.4S, v10.4S, v31.s[0] -sub v10.4s, v23.4s, v26.4s -mla v27.4S, v0.4S, v31.s[0] -add v23.4s, v23.4s, v26.4s -ldr q26, [x0, #208] -ldr q0, [x0, #144] -mul v16.4S, v16.4S,v30.s[0] -sub v24.4s, v2.4s, v4.4s -mul v3.4S, v3.4S,v30.s[0] -add v2.4s, v2.4s, v4.4s -ldr q4, [x0, #80] -ldr q20, [x0, #16] -mla v16.4S, v14.4S, v31.s[0] -sub v14.4s, v18.4s, v21.4s -mla v3.4S, v5.4S, v31.s[0] -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v19.4S, v29.s[2] -nop -mul v19.4S, v19.4S,v30.s[2] -nop -sqrdmulh v5.4S, v10.4S, v29.s[2] -sub v6.4s, v26.4s, v8.4s -mul v10.4S, v10.4S,v30.s[2] -add v26.4s, v26.4s, v8.4s -sqrdmulh v8.4S, v12.4S, v29.s[1] -sub v13.4s, v0.4s, v27.4s -mul v12.4S, v12.4S,v30.s[1] -add v0.4s, v0.4s, v27.4s -sqrdmulh v27.4S, v23.4S, v29.s[1] -sub v25.4s, v4.4s, v16.4s -mul v23.4S, v23.4S,v30.s[1] -add v4.4s, v4.4s, v16.4s -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v20.4s, v3.4s -sqrdmulh v16.4S, v24.4S, v29.s[2] -add v20.4s, v20.4s, v3.4s -mla v10.4S, v5.4S, v31.s[0] -str q22, [x0, #64] -sqrdmulh v22.4S, v14.4S, v29.s[2] -nop -mla v12.4S, v8.4S, v31.s[0] -str q15, [x0, #0] -sqrdmulh v15.4S, v2.4S, v29.s[1] -nop -mla v23.4S, v27.4S, v31.s[0] -nop -sqrdmulh v27.4S, v18.4S, v29.s[1] -nop -mul v24.4S, v24.4S,v30.s[2] -sub v8.4s, v6.4s, v19.4s -mul v14.4S, v14.4S,v30.s[2] -add v6.4s, v6.4s, v19.4s -mla v24.4S, v16.4S, v31.s[0] -sub v16.4s, v13.4s, v10.4s -mla v14.4S, v22.4S, v31.s[0] -add v13.4s, v13.4s, v10.4s -mul v2.4S, v2.4S,v30.s[1] -sub v10.4s, v26.4s, v12.4s -mul v18.4S, v18.4S,v30.s[1] -add v26.4s, v26.4s, v12.4s -mla v2.4S, v15.4S, v31.s[0] -sub v15.4s, v0.4s, v23.4s -mla v18.4S, v27.4S, v31.s[0] -add v0.4s, v0.4s, v23.4s -sqrdmulh v29.4S, v8.4S, v11.s[3] -nop -mul v8.4S, v8.4S,v17.s[3] -nop -sqrdmulh v30.4S, v6.4S, v11.s[2] -sub v23.4s, v25.4s, v24.4s -mul v6.4S, v6.4S,v17.s[2] -add v25.4s, v25.4s, v24.4s -sqrdmulh v24.4S, v10.4S, v11.s[1] -sub v27.4s, v21.4s, v14.4s -mul v10.4S, v10.4S,v17.s[1] -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v26.4S, v11.s[0] -sub v12.4s, v4.4s, v2.4s -mul v26.4S, v26.4S,v17.s[0] -add v4.4s, v4.4s, v2.4s -mla v8.4S, v29.4S, v31.s[0] -sub v29.4s, v20.4s, v18.4s -sqrdmulh v2.4S, v16.4S, v11.s[3] -add v20.4s, v20.4s, v18.4s -mla v6.4S, v30.4S, v31.s[0] -nop -sqrdmulh v30.4S, v13.4S, v11.s[2] -nop -mla v10.4S, v24.4S, v31.s[0] -nop -sqrdmulh v24.4S, v15.4S, v11.s[1] -nop -mla v26.4S, v14.4S, v31.s[0] -nop -sqrdmulh v14.4S, v0.4S, v11.s[0] -nop -mul v16.4S, v16.4S,v17.s[3] -sub v18.4s, v23.4s, v8.4s -mul v13.4S, v13.4S,v17.s[2] -add v23.4s, v23.4s, v8.4s -mla v16.4S, v2.4S, v31.s[0] -sub v2.4s, v25.4s, v6.4s -mla v13.4S, v30.4S, v31.s[0] -add v25.4s, v25.4s, v6.4s -mul v15.4S, v15.4S,v17.s[1] -sub v6.4s, v12.4s, v10.4s -mul v0.4S, v0.4S,v17.s[0] -add v12.4s, v12.4s, v10.4s -mla v15.4S, v24.4S, v31.s[0] -sub v24.4s, v4.4s, v26.4s -mla v0.4S, v14.4S, v31.s[0] -add v4.4s, v4.4s, v26.4s -sqrdmulh v11.4S, v18.4S, v9.s[3] -nop -mul v18.4S, v18.4S,v28.s[3] -nop -sqrdmulh v17.4S, v23.4S, v9.s[2] -sub v26.4s, v27.4s, v16.4s -mul v23.4S, v23.4S,v28.s[2] -add v27.4s, v27.4s, v16.4s -sqrdmulh v16.4S, v2.4S, v9.s[1] -sub v14.4s, v21.4s, v13.4s -mul v2.4S, v2.4S,v28.s[1] -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v25.4S, v9.s[0] -sub v10.4s, v29.4s, v15.4s -mul v25.4S, v25.4S,v28.s[0] -add v29.4s, v29.4s, v15.4s -mla v18.4S, v11.4S, v31.s[0] -sub v11.4s, v20.4s, v0.4s -sqrdmulh v9.4S, v6.4S, v7.s[3] -add v20.4s, v20.4s, v0.4s -mla v23.4S, v17.4S, v31.s[0] -sub v17.4s, v26.4s, v18.4s -sqrdmulh v0.4S, v12.4S, v7.s[2] -add v26.4s, v26.4s, v18.4s -mla v2.4S, v16.4S, v31.s[0] -sub v16.4s, v27.4s, v23.4s -sqrdmulh v18.4S, v24.4S, v7.s[1] -add v27.4s, v27.4s, v23.4s -mla v25.4S, v13.4S, v31.s[0] -sub v13.4s, v14.4s, v2.4s -sqrdmulh v23.4S, v4.4S, v7.s[0] -add v14.4s, v14.4s, v2.4s -mul v6.4S, v6.4S,v1.s[3] -sub v2.4s, v21.4s, v25.4s -mul v12.4S, v12.4S,v1.s[2] -add v21.4s, v21.4s, v25.4s -mla v6.4S, v9.4S, v31.s[0] -str q17, [x0, #976] -mla v12.4S, v0.4S, v31.s[0] -str q26, [x0, #912] -mul v24.4S, v24.4S,v1.s[1] -str q16, [x0, #848] -mul v4.4S, v4.4S,v1.s[0] -str q27, [x0, #784] -mla v24.4S, v18.4S, v31.s[0] -str q13, [x0, #720] -mla v4.4S, v23.4S, v31.s[0] -str q14, [x0, #656] -str q2, [x0, #592] -sub v2.4s, v10.4s, v6.4s -str q21, [x0, #528] -add v10.4s, v10.4s, v6.4s -str q2, [x0, #464] -sub v2.4s, v29.4s, v12.4s -str q10, [x0, #400] -add v29.4s, v29.4s, v12.4s -str q2, [x0, #336] -sub v2.4s, v11.4s, v24.4s -str q29, [x0, #272] -add v11.4s, v11.4s, v24.4s -str q2, [x0, #208] -sub v2.4s, v20.4s, v4.4s -str q11, [x0, #144] -add v20.4s, v20.4s, v4.4s -str q2, [x0, #80] -str q20, [x0, #16] -ldr q3, [x17, #+128] -ldr q5, [x17, #+144] -ldr q19, [x17, #+160] -ldr q22, [x17, #+176] -ldr q8, [x17, #+192] -ldr q30, [x17, #+208] -ldr q15, [x17, #+224] -ldr q28, [x17, #+240] -ldr q25, [x0, #32] -ldr q9, [x0, #48] -ldr q17, [x0, #0] -ldr q0, [x0, #16] -sqrdmulh v26.4S, v25.4S, v5.s[0] -mul v25.4S, v25.4S,v3.s[0] -mla v25.4S, v26.4S, v31.s[0] -sub v26.4s, v17.4s, v25.4s -add v17.4s, v17.4s, v25.4s -sqrdmulh v25.4S, v9.4S, v5.s[0] -mul v9.4S, v9.4S,v3.s[0] -mla v9.4S, v25.4S, v31.s[0] -sub v25.4s, v0.4s, v9.4s -add v0.4s, v0.4s, v9.4s -sqrdmulh v9.4S, v0.4S, v5.s[1] -mul v0.4S, v0.4S,v3.s[1] -mla v0.4S, v9.4S, v31.s[0] -sub v9.4s, v17.4s, v0.4s -add v17.4s, v17.4s, v0.4s -sqrdmulh v0.4S, v25.4S, v5.s[2] -mul v25.4S, v25.4S,v3.s[2] -mla v25.4S, v0.4S, v31.s[0] -sub v0.4s, v26.4s, v25.4s -add v26.4s, v26.4s, v25.4s -trn1 v25.4S, v17.4S, v9.4S -trn2 v16.4S, v17.4S, v9.4S -trn1 v27.4S, v26.4S, v0.4S -trn2 v18.4S, v26.4S, v0.4S -trn2 v26.2D, v25.2D, v27.2D -trn2 v0.2D, v16.2D, v18.2D -trn1 v17.2D, v25.2D, v27.2D -trn1 v9.2D, v16.2D, v18.2D -sqrdmulh v18.4S, v26.4S, v22.4S -mul v26.4S, v26.4S,v19.4S -mla v26.4S, v18.4S, v31.s[0] -sub v18.4s, v17.4s, v26.4s -add v17.4s, v17.4s, v26.4s -sqrdmulh v26.4S, v0.4S, v22.4S -mul v0.4S, v0.4S,v19.4S -mla v0.4S, v26.4S, v31.s[0] -sub v26.4s, v9.4s, v0.4s -add v9.4s, v9.4s, v0.4s -sqrdmulh v0.4S, v9.4S, v30.4S -mul v9.4S, v9.4S,v8.4S -mla v9.4S, v0.4S, v31.s[0] -sub v0.4s, v17.4s, v9.4s -add v17.4s, v17.4s, v9.4s -sqrdmulh v9.4S, v26.4S, v28.4S -mul v26.4S, v26.4S,v15.4S -mla v26.4S, v9.4S, v31.s[0] -sub v9.4s, v18.4s, v26.4s -add v18.4s, v18.4s, v26.4s -str q17, [x0, #0] -str q0, [x0, #16] -str q18, [x0, #32] -str q9, [x0, #48] -ldr q9, [x17, #+256] -ldr q18, [x17, #+272] -ldr q0, [x17, #+288] -ldr q17, [x17, #+304] -ldr q26, [x17, #+320] -ldr q16, [x17, #+336] -ldr q27, [x17, #+352] -ldr q25, [x17, #+368] -ldr q28, [x0, #96] -ldr q15, [x0, #112] -ldr q30, [x0, #64] -ldr q8, [x0, #80] -sqrdmulh v22.4S, v28.4S, v18.s[0] -mul v28.4S, v28.4S,v9.s[0] -mla v28.4S, v22.4S, v31.s[0] -sub v22.4s, v30.4s, v28.4s -add v30.4s, v30.4s, v28.4s -sqrdmulh v28.4S, v15.4S, v18.s[0] -mul v15.4S, v15.4S,v9.s[0] -mla v15.4S, v28.4S, v31.s[0] -sub v28.4s, v8.4s, v15.4s -add v8.4s, v8.4s, v15.4s -sqrdmulh v15.4S, v8.4S, v18.s[1] -mul v8.4S, v8.4S,v9.s[1] -mla v8.4S, v15.4S, v31.s[0] -sub v15.4s, v30.4s, v8.4s -add v30.4s, v30.4s, v8.4s -sqrdmulh v8.4S, v28.4S, v18.s[2] -mul v28.4S, v28.4S,v9.s[2] -mla v28.4S, v8.4S, v31.s[0] -sub v8.4s, v22.4s, v28.4s -add v22.4s, v22.4s, v28.4s -trn1 v28.4S, v30.4S, v15.4S -trn2 v19.4S, v30.4S, v15.4S -trn1 v5.4S, v22.4S, v8.4S -trn2 v3.4S, v22.4S, v8.4S -trn2 v22.2D, v28.2D, v5.2D -trn2 v8.2D, v19.2D, v3.2D -trn1 v30.2D, v28.2D, v5.2D -trn1 v15.2D, v19.2D, v3.2D -sqrdmulh v3.4S, v22.4S, v17.4S -mul v22.4S, v22.4S,v0.4S -mla v22.4S, v3.4S, v31.s[0] -sub v3.4s, v30.4s, v22.4s -add v30.4s, v30.4s, v22.4s -sqrdmulh v22.4S, v8.4S, v17.4S -mul v8.4S, v8.4S,v0.4S -mla v8.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v8.4s -add v15.4s, v15.4s, v8.4s -sqrdmulh v8.4S, v15.4S, v16.4S -mul v15.4S, v15.4S,v26.4S -mla v15.4S, v8.4S, v31.s[0] -sub v8.4s, v30.4s, v15.4s -add v30.4s, v30.4s, v15.4s -sqrdmulh v15.4S, v22.4S, v25.4S -mul v22.4S, v22.4S,v27.4S -mla v22.4S, v15.4S, v31.s[0] -sub v15.4s, v3.4s, v22.4s -add v3.4s, v3.4s, v22.4s -str q30, [x0, #64] -str q8, [x0, #80] -str q3, [x0, #96] -str q15, [x0, #112] -ldr q15, [x17, #+384] -ldr q3, [x17, #+400] -ldr q8, [x17, #+416] -ldr q30, [x17, #+432] -ldr q22, [x17, #+448] -ldr q19, [x17, #+464] -ldr q5, [x17, #+480] -ldr q28, [x17, #+496] -ldr q25, [x0, #160] -ldr q27, [x0, #176] -ldr q16, [x0, #128] -ldr q26, [x0, #144] -sqrdmulh v17.4S, v25.4S, v3.s[0] -mul v25.4S, v25.4S,v15.s[0] -mla v25.4S, v17.4S, v31.s[0] -sub v17.4s, v16.4s, v25.4s -add v16.4s, v16.4s, v25.4s -sqrdmulh v25.4S, v27.4S, v3.s[0] -mul v27.4S, v27.4S,v15.s[0] -mla v27.4S, v25.4S, v31.s[0] -sub v25.4s, v26.4s, v27.4s -add v26.4s, v26.4s, v27.4s -sqrdmulh v27.4S, v26.4S, v3.s[1] -mul v26.4S, v26.4S,v15.s[1] -mla v26.4S, v27.4S, v31.s[0] -sub v27.4s, v16.4s, v26.4s -add v16.4s, v16.4s, v26.4s -sqrdmulh v26.4S, v25.4S, v3.s[2] -mul v25.4S, v25.4S,v15.s[2] -mla v25.4S, v26.4S, v31.s[0] -sub v26.4s, v17.4s, v25.4s -add v17.4s, v17.4s, v25.4s -trn1 v25.4S, v16.4S, v27.4S -trn2 v0.4S, v16.4S, v27.4S -trn1 v18.4S, v17.4S, v26.4S -trn2 v9.4S, v17.4S, v26.4S -trn2 v17.2D, v25.2D, v18.2D -trn2 v26.2D, v0.2D, v9.2D -trn1 v16.2D, v25.2D, v18.2D -trn1 v27.2D, v0.2D, v9.2D -sqrdmulh v9.4S, v17.4S, v30.4S -mul v17.4S, v17.4S,v8.4S -mla v17.4S, v9.4S, v31.s[0] -sub v9.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -sqrdmulh v17.4S, v26.4S, v30.4S -mul v26.4S, v26.4S,v8.4S -mla v26.4S, v17.4S, v31.s[0] -sub v17.4s, v27.4s, v26.4s -add v27.4s, v27.4s, v26.4s -sqrdmulh v26.4S, v27.4S, v19.4S -mul v27.4S, v27.4S,v22.4S -mla v27.4S, v26.4S, v31.s[0] -sub v26.4s, v16.4s, v27.4s -add v16.4s, v16.4s, v27.4s -sqrdmulh v27.4S, v17.4S, v28.4S -mul v17.4S, v17.4S,v5.4S -mla v17.4S, v27.4S, v31.s[0] -sub v27.4s, v9.4s, v17.4s -add v9.4s, v9.4s, v17.4s -str q16, [x0, #128] -str q26, [x0, #144] -str q9, [x0, #160] -str q27, [x0, #176] -ldr q27, [x17, #+512] -ldr q9, [x17, #+528] -ldr q26, [x17, #+544] -ldr q16, [x17, #+560] -ldr q17, [x17, #+576] -ldr q0, [x17, #+592] -ldr q18, [x17, #+608] -ldr q25, [x17, #+624] -ldr q28, [x0, #224] -ldr q5, [x0, #240] -ldr q19, [x0, #192] -ldr q22, [x0, #208] -sqrdmulh v30.4S, v28.4S, v9.s[0] -mul v28.4S, v28.4S,v27.s[0] -mla v28.4S, v30.4S, v31.s[0] -sub v30.4s, v19.4s, v28.4s -add v19.4s, v19.4s, v28.4s -sqrdmulh v28.4S, v5.4S, v9.s[0] -mul v5.4S, v5.4S,v27.s[0] -mla v5.4S, v28.4S, v31.s[0] -sub v28.4s, v22.4s, v5.4s -add v22.4s, v22.4s, v5.4s -sqrdmulh v5.4S, v22.4S, v9.s[1] -mul v22.4S, v22.4S,v27.s[1] -mla v22.4S, v5.4S, v31.s[0] -sub v5.4s, v19.4s, v22.4s -add v19.4s, v19.4s, v22.4s -sqrdmulh v22.4S, v28.4S, v9.s[2] -mul v28.4S, v28.4S,v27.s[2] -mla v28.4S, v22.4S, v31.s[0] -sub v22.4s, v30.4s, v28.4s -add v30.4s, v30.4s, v28.4s -trn1 v28.4S, v19.4S, v5.4S -trn2 v8.4S, v19.4S, v5.4S -trn1 v3.4S, v30.4S, v22.4S -trn2 v15.4S, v30.4S, v22.4S -trn2 v30.2D, v28.2D, v3.2D -trn2 v22.2D, v8.2D, v15.2D -trn1 v19.2D, v28.2D, v3.2D -trn1 v5.2D, v8.2D, v15.2D -sqrdmulh v15.4S, v30.4S, v16.4S -mul v30.4S, v30.4S,v26.4S -mla v30.4S, v15.4S, v31.s[0] -sub v15.4s, v19.4s, v30.4s -add v19.4s, v19.4s, v30.4s -sqrdmulh v30.4S, v22.4S, v16.4S -mul v22.4S, v22.4S,v26.4S -mla v22.4S, v30.4S, v31.s[0] -sub v30.4s, v5.4s, v22.4s -add v5.4s, v5.4s, v22.4s -sqrdmulh v22.4S, v5.4S, v0.4S -mul v5.4S, v5.4S,v17.4S -mla v5.4S, v22.4S, v31.s[0] -sub v22.4s, v19.4s, v5.4s -add v19.4s, v19.4s, v5.4s -sqrdmulh v5.4S, v30.4S, v25.4S -mul v30.4S, v30.4S,v18.4S -mla v30.4S, v5.4S, v31.s[0] -sub v5.4s, v15.4s, v30.4s -add v15.4s, v15.4s, v30.4s -str q19, [x0, #192] -str q22, [x0, #208] -str q15, [x0, #224] -str q5, [x0, #240] -ldr q5, [x17, #+640] -ldr q15, [x17, #+656] -ldr q22, [x17, #+672] -ldr q19, [x17, #+688] -ldr q30, [x17, #+704] -ldr q8, [x17, #+720] -ldr q3, [x17, #+736] -ldr q28, [x17, #+752] -ldr q25, [x0, #288] -ldr q18, [x0, #304] -ldr q0, [x0, #256] -ldr q17, [x0, #272] -sqrdmulh v16.4S, v25.4S, v15.s[0] -mul v25.4S, v25.4S,v5.s[0] -mla v25.4S, v16.4S, v31.s[0] -sub v16.4s, v0.4s, v25.4s -add v0.4s, v0.4s, v25.4s -sqrdmulh v25.4S, v18.4S, v15.s[0] -mul v18.4S, v18.4S,v5.s[0] -mla v18.4S, v25.4S, v31.s[0] -sub v25.4s, v17.4s, v18.4s -add v17.4s, v17.4s, v18.4s -sqrdmulh v18.4S, v17.4S, v15.s[1] -mul v17.4S, v17.4S,v5.s[1] -mla v17.4S, v18.4S, v31.s[0] -sub v18.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v25.4S, v15.s[2] -mul v25.4S, v25.4S,v5.s[2] -mla v25.4S, v17.4S, v31.s[0] -sub v17.4s, v16.4s, v25.4s -add v16.4s, v16.4s, v25.4s -trn1 v25.4S, v0.4S, v18.4S -trn2 v26.4S, v0.4S, v18.4S -trn1 v9.4S, v16.4S, v17.4S -trn2 v27.4S, v16.4S, v17.4S -trn2 v16.2D, v25.2D, v9.2D -trn2 v17.2D, v26.2D, v27.2D -trn1 v0.2D, v25.2D, v9.2D -trn1 v18.2D, v26.2D, v27.2D -sqrdmulh v27.4S, v16.4S, v19.4S -mul v16.4S, v16.4S,v22.4S -mla v16.4S, v27.4S, v31.s[0] -sub v27.4s, v0.4s, v16.4s -add v0.4s, v0.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v19.4S -mul v17.4S, v17.4S,v22.4S -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v18.4s, v17.4s -add v18.4s, v18.4s, v17.4s -sqrdmulh v17.4S, v18.4S, v8.4S -mul v18.4S, v18.4S,v30.4S -mla v18.4S, v17.4S, v31.s[0] -sub v17.4s, v0.4s, v18.4s -add v0.4s, v0.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v28.4S -mul v16.4S, v16.4S,v3.4S -mla v16.4S, v18.4S, v31.s[0] -sub v18.4s, v27.4s, v16.4s -add v27.4s, v27.4s, v16.4s -str q0, [x0, #256] -str q17, [x0, #272] -str q27, [x0, #288] -str q18, [x0, #304] -ldr q18, [x17, #+768] -ldr q27, [x17, #+784] -ldr q17, [x17, #+800] -ldr q0, [x17, #+816] -ldr q16, [x17, #+832] -ldr q26, [x17, #+848] -ldr q9, [x17, #+864] -ldr q25, [x17, #+880] -ldr q28, [x0, #352] -ldr q3, [x0, #368] -ldr q8, [x0, #320] -ldr q30, [x0, #336] -sqrdmulh v19.4S, v28.4S, v27.s[0] -mul v28.4S, v28.4S,v18.s[0] -mla v28.4S, v19.4S, v31.s[0] -sub v19.4s, v8.4s, v28.4s -add v8.4s, v8.4s, v28.4s -sqrdmulh v28.4S, v3.4S, v27.s[0] -mul v3.4S, v3.4S,v18.s[0] -mla v3.4S, v28.4S, v31.s[0] -sub v28.4s, v30.4s, v3.4s -add v30.4s, v30.4s, v3.4s -sqrdmulh v3.4S, v30.4S, v27.s[1] -mul v30.4S, v30.4S,v18.s[1] -mla v30.4S, v3.4S, v31.s[0] -sub v3.4s, v8.4s, v30.4s -add v8.4s, v8.4s, v30.4s -sqrdmulh v30.4S, v28.4S, v27.s[2] -mul v28.4S, v28.4S,v18.s[2] -mla v28.4S, v30.4S, v31.s[0] -sub v30.4s, v19.4s, v28.4s -add v19.4s, v19.4s, v28.4s -trn1 v28.4S, v8.4S, v3.4S -trn2 v22.4S, v8.4S, v3.4S -trn1 v15.4S, v19.4S, v30.4S -trn2 v5.4S, v19.4S, v30.4S -trn2 v19.2D, v28.2D, v15.2D -trn2 v30.2D, v22.2D, v5.2D -trn1 v8.2D, v28.2D, v15.2D -trn1 v3.2D, v22.2D, v5.2D -sqrdmulh v5.4S, v19.4S, v0.4S -mul v19.4S, v19.4S,v17.4S -mla v19.4S, v5.4S, v31.s[0] -sub v5.4s, v8.4s, v19.4s -add v8.4s, v8.4s, v19.4s -sqrdmulh v19.4S, v30.4S, v0.4S -mul v30.4S, v30.4S,v17.4S -mla v30.4S, v19.4S, v31.s[0] -sub v19.4s, v3.4s, v30.4s -add v3.4s, v3.4s, v30.4s -sqrdmulh v30.4S, v3.4S, v26.4S -mul v3.4S, v3.4S,v16.4S -mla v3.4S, v30.4S, v31.s[0] -sub v30.4s, v8.4s, v3.4s -add v8.4s, v8.4s, v3.4s -sqrdmulh v3.4S, v19.4S, v25.4S -mul v19.4S, v19.4S,v9.4S -mla v19.4S, v3.4S, v31.s[0] -sub v3.4s, v5.4s, v19.4s -add v5.4s, v5.4s, v19.4s -str q8, [x0, #320] -str q30, [x0, #336] -str q5, [x0, #352] -str q3, [x0, #368] -ldr q3, [x17, #+896] -ldr q5, [x17, #+912] -ldr q30, [x17, #+928] -ldr q8, [x17, #+944] -ldr q19, [x17, #+960] -ldr q22, [x17, #+976] -ldr q15, [x17, #+992] -ldr q28, [x17, #+1008] -ldr q25, [x0, #416] -ldr q9, [x0, #432] -ldr q26, [x0, #384] -ldr q16, [x0, #400] -sqrdmulh v0.4S, v25.4S, v5.s[0] -mul v25.4S, v25.4S,v3.s[0] -mla v25.4S, v0.4S, v31.s[0] -sub v0.4s, v26.4s, v25.4s -add v26.4s, v26.4s, v25.4s -sqrdmulh v25.4S, v9.4S, v5.s[0] -mul v9.4S, v9.4S,v3.s[0] -mla v9.4S, v25.4S, v31.s[0] -sub v25.4s, v16.4s, v9.4s -add v16.4s, v16.4s, v9.4s -sqrdmulh v9.4S, v16.4S, v5.s[1] -mul v16.4S, v16.4S,v3.s[1] -mla v16.4S, v9.4S, v31.s[0] -sub v9.4s, v26.4s, v16.4s -add v26.4s, v26.4s, v16.4s -sqrdmulh v16.4S, v25.4S, v5.s[2] -mul v25.4S, v25.4S,v3.s[2] -mla v25.4S, v16.4S, v31.s[0] -sub v16.4s, v0.4s, v25.4s -add v0.4s, v0.4s, v25.4s -trn1 v25.4S, v26.4S, v9.4S -trn2 v17.4S, v26.4S, v9.4S -trn1 v27.4S, v0.4S, v16.4S -trn2 v18.4S, v0.4S, v16.4S -trn2 v0.2D, v25.2D, v27.2D -trn2 v16.2D, v17.2D, v18.2D -trn1 v26.2D, v25.2D, v27.2D -trn1 v9.2D, v17.2D, v18.2D -sqrdmulh v18.4S, v0.4S, v8.4S -mul v0.4S, v0.4S,v30.4S -mla v0.4S, v18.4S, v31.s[0] -sub v18.4s, v26.4s, v0.4s -add v26.4s, v26.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v8.4S -mul v16.4S, v16.4S,v30.4S -mla v16.4S, v0.4S, v31.s[0] -sub v0.4s, v9.4s, v16.4s -add v9.4s, v9.4s, v16.4s -sqrdmulh v16.4S, v9.4S, v22.4S -mul v9.4S, v9.4S,v19.4S -mla v9.4S, v16.4S, v31.s[0] -sub v16.4s, v26.4s, v9.4s -add v26.4s, v26.4s, v9.4s -sqrdmulh v9.4S, v0.4S, v28.4S -mul v0.4S, v0.4S,v15.4S -mla v0.4S, v9.4S, v31.s[0] -sub v9.4s, v18.4s, v0.4s -add v18.4s, v18.4s, v0.4s -str q26, [x0, #384] -str q16, [x0, #400] -str q18, [x0, #416] -str q9, [x0, #432] -ldr q9, [x17, #+1024] -ldr q18, [x17, #+1040] -ldr q16, [x17, #+1056] -ldr q26, [x17, #+1072] -ldr q0, [x17, #+1088] -ldr q17, [x17, #+1104] -ldr q27, [x17, #+1120] -ldr q25, [x17, #+1136] -ldr q28, [x0, #480] -ldr q15, [x0, #496] -ldr q22, [x0, #448] -ldr q19, [x0, #464] -sqrdmulh v8.4S, v28.4S, v18.s[0] -mul v28.4S, v28.4S,v9.s[0] -mla v28.4S, v8.4S, v31.s[0] -sub v8.4s, v22.4s, v28.4s -add v22.4s, v22.4s, v28.4s -sqrdmulh v28.4S, v15.4S, v18.s[0] -mul v15.4S, v15.4S,v9.s[0] -mla v15.4S, v28.4S, v31.s[0] -sub v28.4s, v19.4s, v15.4s -add v19.4s, v19.4s, v15.4s -sqrdmulh v15.4S, v19.4S, v18.s[1] -mul v19.4S, v19.4S,v9.s[1] -mla v19.4S, v15.4S, v31.s[0] -sub v15.4s, v22.4s, v19.4s -add v22.4s, v22.4s, v19.4s -sqrdmulh v19.4S, v28.4S, v18.s[2] -mul v28.4S, v28.4S,v9.s[2] -mla v28.4S, v19.4S, v31.s[0] -sub v19.4s, v8.4s, v28.4s -add v8.4s, v8.4s, v28.4s -trn1 v28.4S, v22.4S, v15.4S -trn2 v30.4S, v22.4S, v15.4S -trn1 v5.4S, v8.4S, v19.4S -trn2 v3.4S, v8.4S, v19.4S -trn2 v8.2D, v28.2D, v5.2D -trn2 v19.2D, v30.2D, v3.2D -trn1 v22.2D, v28.2D, v5.2D -trn1 v15.2D, v30.2D, v3.2D -sqrdmulh v3.4S, v8.4S, v26.4S -mul v8.4S, v8.4S,v16.4S -mla v8.4S, v3.4S, v31.s[0] -sub v3.4s, v22.4s, v8.4s -add v22.4s, v22.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v26.4S -mul v19.4S, v19.4S,v16.4S -mla v19.4S, v8.4S, v31.s[0] -sub v8.4s, v15.4s, v19.4s -add v15.4s, v15.4s, v19.4s -sqrdmulh v19.4S, v15.4S, v17.4S -mul v15.4S, v15.4S,v0.4S -mla v15.4S, v19.4S, v31.s[0] -sub v19.4s, v22.4s, v15.4s -add v22.4s, v22.4s, v15.4s -sqrdmulh v15.4S, v8.4S, v25.4S -mul v8.4S, v8.4S,v27.4S -mla v8.4S, v15.4S, v31.s[0] -sub v15.4s, v3.4s, v8.4s -add v3.4s, v3.4s, v8.4s -str q22, [x0, #448] -str q19, [x0, #464] -str q3, [x0, #480] -str q15, [x0, #496] -ldr q15, [x17, #+1152] -ldr q3, [x17, #+1168] -ldr q19, [x17, #+1184] -ldr q22, [x17, #+1200] -ldr q8, [x17, #+1216] -ldr q30, [x17, #+1232] -ldr q5, [x17, #+1248] -ldr q28, [x17, #+1264] -ldr q25, [x0, #544] -ldr q27, [x0, #560] -ldr q17, [x0, #512] -ldr q0, [x0, #528] -sqrdmulh v26.4S, v25.4S, v3.s[0] -mul v25.4S, v25.4S,v15.s[0] -mla v25.4S, v26.4S, v31.s[0] -sub v26.4s, v17.4s, v25.4s -add v17.4s, v17.4s, v25.4s -sqrdmulh v25.4S, v27.4S, v3.s[0] -mul v27.4S, v27.4S,v15.s[0] -mla v27.4S, v25.4S, v31.s[0] -sub v25.4s, v0.4s, v27.4s -add v0.4s, v0.4s, v27.4s -sqrdmulh v27.4S, v0.4S, v3.s[1] -mul v0.4S, v0.4S,v15.s[1] -mla v0.4S, v27.4S, v31.s[0] -sub v27.4s, v17.4s, v0.4s -add v17.4s, v17.4s, v0.4s -sqrdmulh v0.4S, v25.4S, v3.s[2] -mul v25.4S, v25.4S,v15.s[2] -mla v25.4S, v0.4S, v31.s[0] -sub v0.4s, v26.4s, v25.4s -add v26.4s, v26.4s, v25.4s -trn1 v25.4S, v17.4S, v27.4S -trn2 v16.4S, v17.4S, v27.4S -trn1 v18.4S, v26.4S, v0.4S -trn2 v9.4S, v26.4S, v0.4S -trn2 v26.2D, v25.2D, v18.2D -trn2 v0.2D, v16.2D, v9.2D -trn1 v17.2D, v25.2D, v18.2D -trn1 v27.2D, v16.2D, v9.2D -sqrdmulh v9.4S, v26.4S, v22.4S -mul v26.4S, v26.4S,v19.4S -mla v26.4S, v9.4S, v31.s[0] -sub v9.4s, v17.4s, v26.4s -add v17.4s, v17.4s, v26.4s -sqrdmulh v26.4S, v0.4S, v22.4S -mul v0.4S, v0.4S,v19.4S -mla v0.4S, v26.4S, v31.s[0] -sub v26.4s, v27.4s, v0.4s -add v27.4s, v27.4s, v0.4s -sqrdmulh v0.4S, v27.4S, v30.4S -mul v27.4S, v27.4S,v8.4S -mla v27.4S, v0.4S, v31.s[0] -sub v0.4s, v17.4s, v27.4s -add v17.4s, v17.4s, v27.4s -sqrdmulh v27.4S, v26.4S, v28.4S -mul v26.4S, v26.4S,v5.4S -mla v26.4S, v27.4S, v31.s[0] -sub v27.4s, v9.4s, v26.4s -add v9.4s, v9.4s, v26.4s -str q17, [x0, #512] -str q0, [x0, #528] -str q9, [x0, #544] -str q27, [x0, #560] -ldr q27, [x17, #+1280] -ldr q9, [x17, #+1296] -ldr q0, [x17, #+1312] -ldr q17, [x17, #+1328] -ldr q26, [x17, #+1344] -ldr q16, [x17, #+1360] -ldr q18, [x17, #+1376] -ldr q25, [x17, #+1392] -ldr q28, [x0, #608] -ldr q5, [x0, #624] -ldr q30, [x0, #576] -ldr q8, [x0, #592] -sqrdmulh v22.4S, v28.4S, v9.s[0] -mul v28.4S, v28.4S,v27.s[0] -mla v28.4S, v22.4S, v31.s[0] -sub v22.4s, v30.4s, v28.4s -add v30.4s, v30.4s, v28.4s -sqrdmulh v28.4S, v5.4S, v9.s[0] -mul v5.4S, v5.4S,v27.s[0] -mla v5.4S, v28.4S, v31.s[0] -sub v28.4s, v8.4s, v5.4s -add v8.4s, v8.4s, v5.4s -sqrdmulh v5.4S, v8.4S, v9.s[1] -mul v8.4S, v8.4S,v27.s[1] -mla v8.4S, v5.4S, v31.s[0] -sub v5.4s, v30.4s, v8.4s -add v30.4s, v30.4s, v8.4s -sqrdmulh v8.4S, v28.4S, v9.s[2] -mul v28.4S, v28.4S,v27.s[2] -mla v28.4S, v8.4S, v31.s[0] -sub v8.4s, v22.4s, v28.4s -add v22.4s, v22.4s, v28.4s -trn1 v28.4S, v30.4S, v5.4S -trn2 v19.4S, v30.4S, v5.4S -trn1 v3.4S, v22.4S, v8.4S -trn2 v15.4S, v22.4S, v8.4S -trn2 v22.2D, v28.2D, v3.2D -trn2 v8.2D, v19.2D, v15.2D -trn1 v30.2D, v28.2D, v3.2D -trn1 v5.2D, v19.2D, v15.2D -sqrdmulh v15.4S, v22.4S, v17.4S -mul v22.4S, v22.4S,v0.4S -mla v22.4S, v15.4S, v31.s[0] -sub v15.4s, v30.4s, v22.4s -add v30.4s, v30.4s, v22.4s -sqrdmulh v22.4S, v8.4S, v17.4S -mul v8.4S, v8.4S,v0.4S -mla v8.4S, v22.4S, v31.s[0] -sub v22.4s, v5.4s, v8.4s -add v5.4s, v5.4s, v8.4s -sqrdmulh v8.4S, v5.4S, v16.4S -mul v5.4S, v5.4S,v26.4S -mla v5.4S, v8.4S, v31.s[0] -sub v8.4s, v30.4s, v5.4s -add v30.4s, v30.4s, v5.4s -sqrdmulh v5.4S, v22.4S, v25.4S -mul v22.4S, v22.4S,v18.4S -mla v22.4S, v5.4S, v31.s[0] -sub v5.4s, v15.4s, v22.4s -add v15.4s, v15.4s, v22.4s -str q30, [x0, #576] -str q8, [x0, #592] -str q15, [x0, #608] -str q5, [x0, #624] -ldr q5, [x17, #+1408] -ldr q15, [x17, #+1424] -ldr q8, [x17, #+1440] -ldr q30, [x17, #+1456] -ldr q22, [x17, #+1472] -ldr q19, [x17, #+1488] -ldr q3, [x17, #+1504] -ldr q28, [x17, #+1520] -ldr q25, [x0, #672] -ldr q18, [x0, #688] -ldr q16, [x0, #640] -ldr q26, [x0, #656] -sqrdmulh v17.4S, v25.4S, v15.s[0] -mul v25.4S, v25.4S,v5.s[0] -mla v25.4S, v17.4S, v31.s[0] -sub v17.4s, v16.4s, v25.4s -add v16.4s, v16.4s, v25.4s -sqrdmulh v25.4S, v18.4S, v15.s[0] -mul v18.4S, v18.4S,v5.s[0] -mla v18.4S, v25.4S, v31.s[0] -sub v25.4s, v26.4s, v18.4s -add v26.4s, v26.4s, v18.4s -sqrdmulh v18.4S, v26.4S, v15.s[1] -mul v26.4S, v26.4S,v5.s[1] -mla v26.4S, v18.4S, v31.s[0] -sub v18.4s, v16.4s, v26.4s -add v16.4s, v16.4s, v26.4s -sqrdmulh v26.4S, v25.4S, v15.s[2] -mul v25.4S, v25.4S,v5.s[2] -mla v25.4S, v26.4S, v31.s[0] -sub v26.4s, v17.4s, v25.4s -add v17.4s, v17.4s, v25.4s -trn1 v25.4S, v16.4S, v18.4S -trn2 v0.4S, v16.4S, v18.4S -trn1 v9.4S, v17.4S, v26.4S -trn2 v27.4S, v17.4S, v26.4S -trn2 v17.2D, v25.2D, v9.2D -trn2 v26.2D, v0.2D, v27.2D -trn1 v16.2D, v25.2D, v9.2D -trn1 v18.2D, v0.2D, v27.2D -sqrdmulh v27.4S, v17.4S, v30.4S -mul v17.4S, v17.4S,v8.4S -mla v17.4S, v27.4S, v31.s[0] -sub v27.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -sqrdmulh v17.4S, v26.4S, v30.4S -mul v26.4S, v26.4S,v8.4S -mla v26.4S, v17.4S, v31.s[0] -sub v17.4s, v18.4s, v26.4s -add v18.4s, v18.4s, v26.4s -sqrdmulh v26.4S, v18.4S, v19.4S -mul v18.4S, v18.4S,v22.4S -mla v18.4S, v26.4S, v31.s[0] -sub v26.4s, v16.4s, v18.4s -add v16.4s, v16.4s, v18.4s -sqrdmulh v18.4S, v17.4S, v28.4S -mul v17.4S, v17.4S,v3.4S -mla v17.4S, v18.4S, v31.s[0] -sub v18.4s, v27.4s, v17.4s -add v27.4s, v27.4s, v17.4s -str q16, [x0, #640] -str q26, [x0, #656] -str q27, [x0, #672] -str q18, [x0, #688] -ldr q18, [x17, #+1536] -ldr q27, [x17, #+1552] -ldr q26, [x17, #+1568] -ldr q16, [x17, #+1584] -ldr q17, [x17, #+1600] -ldr q0, [x17, #+1616] -ldr q9, [x17, #+1632] -ldr q25, [x17, #+1648] -ldr q28, [x0, #736] -ldr q3, [x0, #752] -ldr q19, [x0, #704] -ldr q22, [x0, #720] -sqrdmulh v30.4S, v28.4S, v27.s[0] -mul v28.4S, v28.4S,v18.s[0] -mla v28.4S, v30.4S, v31.s[0] -sub v30.4s, v19.4s, v28.4s -add v19.4s, v19.4s, v28.4s -sqrdmulh v28.4S, v3.4S, v27.s[0] -mul v3.4S, v3.4S,v18.s[0] -mla v3.4S, v28.4S, v31.s[0] -sub v28.4s, v22.4s, v3.4s -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v22.4S, v27.s[1] -mul v22.4S, v22.4S,v18.s[1] -mla v22.4S, v3.4S, v31.s[0] -sub v3.4s, v19.4s, v22.4s -add v19.4s, v19.4s, v22.4s -sqrdmulh v22.4S, v28.4S, v27.s[2] -mul v28.4S, v28.4S,v18.s[2] -mla v28.4S, v22.4S, v31.s[0] -sub v22.4s, v30.4s, v28.4s -add v30.4s, v30.4s, v28.4s -trn1 v28.4S, v19.4S, v3.4S -trn2 v8.4S, v19.4S, v3.4S -trn1 v15.4S, v30.4S, v22.4S -trn2 v5.4S, v30.4S, v22.4S -trn2 v30.2D, v28.2D, v15.2D -trn2 v22.2D, v8.2D, v5.2D -trn1 v19.2D, v28.2D, v15.2D -trn1 v3.2D, v8.2D, v5.2D -sqrdmulh v5.4S, v30.4S, v16.4S -mul v30.4S, v30.4S,v26.4S -mla v30.4S, v5.4S, v31.s[0] -sub v5.4s, v19.4s, v30.4s -add v19.4s, v19.4s, v30.4s -sqrdmulh v30.4S, v22.4S, v16.4S -mul v22.4S, v22.4S,v26.4S -mla v22.4S, v30.4S, v31.s[0] -sub v30.4s, v3.4s, v22.4s -add v3.4s, v3.4s, v22.4s -sqrdmulh v22.4S, v3.4S, v0.4S -mul v3.4S, v3.4S,v17.4S -mla v3.4S, v22.4S, v31.s[0] -sub v22.4s, v19.4s, v3.4s -add v19.4s, v19.4s, v3.4s -sqrdmulh v3.4S, v30.4S, v25.4S -mul v30.4S, v30.4S,v9.4S -mla v30.4S, v3.4S, v31.s[0] -sub v3.4s, v5.4s, v30.4s -add v5.4s, v5.4s, v30.4s -str q19, [x0, #704] -str q22, [x0, #720] -str q5, [x0, #736] -str q3, [x0, #752] -ldr q3, [x17, #+1664] -ldr q5, [x17, #+1680] -ldr q22, [x17, #+1696] -ldr q19, [x17, #+1712] -ldr q30, [x17, #+1728] -ldr q8, [x17, #+1744] -ldr q15, [x17, #+1760] -ldr q28, [x17, #+1776] -ldr q25, [x0, #800] -ldr q9, [x0, #816] -ldr q0, [x0, #768] -ldr q17, [x0, #784] -sqrdmulh v16.4S, v25.4S, v5.s[0] -mul v25.4S, v25.4S,v3.s[0] -mla v25.4S, v16.4S, v31.s[0] -sub v16.4s, v0.4s, v25.4s -add v0.4s, v0.4s, v25.4s -sqrdmulh v25.4S, v9.4S, v5.s[0] -mul v9.4S, v9.4S,v3.s[0] -mla v9.4S, v25.4S, v31.s[0] -sub v25.4s, v17.4s, v9.4s -add v17.4s, v17.4s, v9.4s -sqrdmulh v9.4S, v17.4S, v5.s[1] -mul v17.4S, v17.4S,v3.s[1] -mla v17.4S, v9.4S, v31.s[0] -sub v9.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v25.4S, v5.s[2] -mul v25.4S, v25.4S,v3.s[2] -mla v25.4S, v17.4S, v31.s[0] -sub v17.4s, v16.4s, v25.4s -add v16.4s, v16.4s, v25.4s -trn1 v25.4S, v0.4S, v9.4S -trn2 v26.4S, v0.4S, v9.4S -trn1 v27.4S, v16.4S, v17.4S -trn2 v18.4S, v16.4S, v17.4S -trn2 v16.2D, v25.2D, v27.2D -trn2 v17.2D, v26.2D, v18.2D -trn1 v0.2D, v25.2D, v27.2D -trn1 v9.2D, v26.2D, v18.2D -sqrdmulh v18.4S, v16.4S, v19.4S -mul v16.4S, v16.4S,v22.4S -mla v16.4S, v18.4S, v31.s[0] -sub v18.4s, v0.4s, v16.4s -add v0.4s, v0.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v19.4S -mul v17.4S, v17.4S,v22.4S -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v9.4s, v17.4s -add v9.4s, v9.4s, v17.4s -sqrdmulh v17.4S, v9.4S, v8.4S -mul v9.4S, v9.4S,v30.4S -mla v9.4S, v17.4S, v31.s[0] -sub v17.4s, v0.4s, v9.4s -add v0.4s, v0.4s, v9.4s -sqrdmulh v9.4S, v16.4S, v28.4S -mul v16.4S, v16.4S,v15.4S -mla v16.4S, v9.4S, v31.s[0] -sub v9.4s, v18.4s, v16.4s -add v18.4s, v18.4s, v16.4s -str q0, [x0, #768] -str q17, [x0, #784] -str q18, [x0, #800] -str q9, [x0, #816] -ldr q9, [x17, #+1792] -ldr q18, [x17, #+1808] -ldr q17, [x17, #+1824] -ldr q0, [x17, #+1840] -ldr q16, [x17, #+1856] -ldr q26, [x17, #+1872] -ldr q27, [x17, #+1888] -ldr q25, [x17, #+1904] -ldr q28, [x0, #864] -ldr q15, [x0, #880] -ldr q8, [x0, #832] -ldr q30, [x0, #848] -sqrdmulh v19.4S, v28.4S, v18.s[0] -mul v28.4S, v28.4S,v9.s[0] -mla v28.4S, v19.4S, v31.s[0] -sub v19.4s, v8.4s, v28.4s -add v8.4s, v8.4s, v28.4s -sqrdmulh v28.4S, v15.4S, v18.s[0] -mul v15.4S, v15.4S,v9.s[0] -mla v15.4S, v28.4S, v31.s[0] -sub v28.4s, v30.4s, v15.4s -add v30.4s, v30.4s, v15.4s -sqrdmulh v15.4S, v30.4S, v18.s[1] -mul v30.4S, v30.4S,v9.s[1] -mla v30.4S, v15.4S, v31.s[0] -sub v15.4s, v8.4s, v30.4s -add v8.4s, v8.4s, v30.4s -sqrdmulh v30.4S, v28.4S, v18.s[2] -mul v28.4S, v28.4S,v9.s[2] -mla v28.4S, v30.4S, v31.s[0] -sub v30.4s, v19.4s, v28.4s -add v19.4s, v19.4s, v28.4s -trn1 v28.4S, v8.4S, v15.4S -trn2 v22.4S, v8.4S, v15.4S -trn1 v5.4S, v19.4S, v30.4S -trn2 v3.4S, v19.4S, v30.4S -trn2 v19.2D, v28.2D, v5.2D -trn2 v30.2D, v22.2D, v3.2D -trn1 v8.2D, v28.2D, v5.2D -trn1 v15.2D, v22.2D, v3.2D -sqrdmulh v3.4S, v19.4S, v0.4S -mul v19.4S, v19.4S,v17.4S -mla v19.4S, v3.4S, v31.s[0] -sub v3.4s, v8.4s, v19.4s -add v8.4s, v8.4s, v19.4s -sqrdmulh v19.4S, v30.4S, v0.4S -mul v30.4S, v30.4S,v17.4S -mla v30.4S, v19.4S, v31.s[0] -sub v19.4s, v15.4s, v30.4s -add v15.4s, v15.4s, v30.4s -sqrdmulh v30.4S, v15.4S, v26.4S -mul v15.4S, v15.4S,v16.4S -mla v15.4S, v30.4S, v31.s[0] -sub v30.4s, v8.4s, v15.4s -add v8.4s, v8.4s, v15.4s -sqrdmulh v15.4S, v19.4S, v25.4S -mul v19.4S, v19.4S,v27.4S -mla v19.4S, v15.4S, v31.s[0] -sub v15.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -str q8, [x0, #832] -str q30, [x0, #848] -str q3, [x0, #864] -str q15, [x0, #880] -ldr q15, [x17, #+1920] -ldr q3, [x17, #+1936] -ldr q30, [x17, #+1952] -ldr q8, [x17, #+1968] -ldr q19, [x17, #+1984] -ldr q22, [x17, #+2000] -ldr q5, [x17, #+2016] -ldr q28, [x17, #+2032] -ldr q25, [x0, #928] -ldr q27, [x0, #944] -ldr q26, [x0, #896] -ldr q16, [x0, #912] -sqrdmulh v0.4S, v25.4S, v3.s[0] -mul v25.4S, v25.4S,v15.s[0] -mla v25.4S, v0.4S, v31.s[0] -sub v0.4s, v26.4s, v25.4s -add v26.4s, v26.4s, v25.4s -sqrdmulh v25.4S, v27.4S, v3.s[0] -mul v27.4S, v27.4S,v15.s[0] -mla v27.4S, v25.4S, v31.s[0] -sub v25.4s, v16.4s, v27.4s -add v16.4s, v16.4s, v27.4s -sqrdmulh v27.4S, v16.4S, v3.s[1] -mul v16.4S, v16.4S,v15.s[1] -mla v16.4S, v27.4S, v31.s[0] -sub v27.4s, v26.4s, v16.4s -add v26.4s, v26.4s, v16.4s -sqrdmulh v16.4S, v25.4S, v3.s[2] -mul v25.4S, v25.4S,v15.s[2] -mla v25.4S, v16.4S, v31.s[0] -sub v16.4s, v0.4s, v25.4s -add v0.4s, v0.4s, v25.4s -trn1 v25.4S, v26.4S, v27.4S -trn2 v17.4S, v26.4S, v27.4S -trn1 v18.4S, v0.4S, v16.4S -trn2 v9.4S, v0.4S, v16.4S -trn2 v0.2D, v25.2D, v18.2D -trn2 v16.2D, v17.2D, v9.2D -trn1 v26.2D, v25.2D, v18.2D -trn1 v27.2D, v17.2D, v9.2D -sqrdmulh v9.4S, v0.4S, v8.4S -mul v0.4S, v0.4S,v30.4S -mla v0.4S, v9.4S, v31.s[0] -sub v9.4s, v26.4s, v0.4s -add v26.4s, v26.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v8.4S -mul v16.4S, v16.4S,v30.4S -mla v16.4S, v0.4S, v31.s[0] -sub v0.4s, v27.4s, v16.4s -add v27.4s, v27.4s, v16.4s -sqrdmulh v16.4S, v27.4S, v22.4S -mul v27.4S, v27.4S,v19.4S -mla v27.4S, v16.4S, v31.s[0] -sub v16.4s, v26.4s, v27.4s -add v26.4s, v26.4s, v27.4s -sqrdmulh v27.4S, v0.4S, v28.4S -mul v0.4S, v0.4S,v5.4S -mla v0.4S, v27.4S, v31.s[0] -sub v27.4s, v9.4s, v0.4s -add v9.4s, v9.4s, v0.4s -str q26, [x0, #896] -str q16, [x0, #912] -str q9, [x0, #928] -str q27, [x0, #944] -ldr q27, [x17, #+2048] -ldr q9, [x17, #+2064] -ldr q16, [x17, #+2080] -ldr q26, [x17, #+2096] -ldr q0, [x17, #+2112] -ldr q17, [x17, #+2128] -ldr q18, [x17, #+2144] -ldr q25, [x17, #+2160] -ldr q28, [x0, #992] -ldr q5, [x0, #1008] -ldr q22, [x0, #960] -ldr q19, [x0, #976] -sqrdmulh v8.4S, v28.4S, v9.s[0] -mul v28.4S, v28.4S,v27.s[0] -mla v28.4S, v8.4S, v31.s[0] -sub v8.4s, v22.4s, v28.4s -add v22.4s, v22.4s, v28.4s -sqrdmulh v28.4S, v5.4S, v9.s[0] -mul v5.4S, v5.4S,v27.s[0] -mla v5.4S, v28.4S, v31.s[0] -sub v28.4s, v19.4s, v5.4s -add v19.4s, v19.4s, v5.4s -sqrdmulh v5.4S, v19.4S, v9.s[1] -mul v19.4S, v19.4S,v27.s[1] -mla v19.4S, v5.4S, v31.s[0] -sub v5.4s, v22.4s, v19.4s -add v22.4s, v22.4s, v19.4s -sqrdmulh v19.4S, v28.4S, v9.s[2] -mul v28.4S, v28.4S,v27.s[2] -mla v28.4S, v19.4S, v31.s[0] -sub v19.4s, v8.4s, v28.4s -add v8.4s, v8.4s, v28.4s -trn1 v28.4S, v22.4S, v5.4S -trn2 v30.4S, v22.4S, v5.4S -trn1 v3.4S, v8.4S, v19.4S -trn2 v15.4S, v8.4S, v19.4S -trn2 v8.2D, v28.2D, v3.2D -trn2 v19.2D, v30.2D, v15.2D -trn1 v22.2D, v28.2D, v3.2D -trn1 v5.2D, v30.2D, v15.2D -sqrdmulh v15.4S, v8.4S, v26.4S -mul v8.4S, v8.4S,v16.4S -mla v8.4S, v15.4S, v31.s[0] -sub v15.4s, v22.4s, v8.4s -add v22.4s, v22.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v26.4S -mul v19.4S, v19.4S,v16.4S -mla v19.4S, v8.4S, v31.s[0] -sub v8.4s, v5.4s, v19.4s -add v5.4s, v5.4s, v19.4s -sqrdmulh v19.4S, v5.4S, v17.4S -mul v5.4S, v5.4S,v0.4S -mla v5.4S, v19.4S, v31.s[0] -sub v19.4s, v22.4s, v5.4s -add v22.4s, v22.4s, v5.4s -sqrdmulh v5.4S, v8.4S, v25.4S -mul v8.4S, v8.4S,v18.4S -mla v8.4S, v5.4S, v31.s[0] -sub v5.4s, v15.4s, v8.4s -add v15.4s, v15.4s, v8.4s -str q22, [x0, #960] -str q19, [x0, #976] -str q15, [x0, #992] -str q5, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2456 -// Instruction count: 2452 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_18_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_18_0.s deleted file mode 100644 index 5783747..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_18_0.s +++ /dev/null @@ -1,2486 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_18_0 -.global _ntt_u32_full_neon_asm_var_4_4_18_0 -ntt_u32_full_neon_asm_var_4_4_18_0: -_ntt_u32_full_neon_asm_var_4_4_18_0: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x0, #992] -sqrdmulh v27.4S, v28.4S, v29.s[0] -mul v28.4S, v28.4S,v30.s[0] -ldr q26, [x0, #928] -sqrdmulh v25.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -ldr q24, [x0, #864] -sqrdmulh v23.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -ldr q22, [x0, #800] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #736] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mla v28.4S, v27.4S, v31.s[0] -ldr q27, [x0, #672] -sqrdmulh v18.4S, v27.4S, v29.s[0] -mla v26.4S, v25.4S, v31.s[0] -ldr q25, [x0, #608] -sqrdmulh v17.4S, v25.4S, v29.s[0] -mla v24.4S, v23.4S, v31.s[0] -ldr q23, [x0, #544] -sqrdmulh v16.4S, v23.4S, v29.s[0] -mla v22.4S, v21.4S, v31.s[0] -ldr q21, [x0, #480] -ldr q3, [x0, #416] -mul v27.4S, v27.4S,v30.s[0] -mul v20.4S, v20.4S,v30.s[0] -sub v2.4s, v21.4s, v28.4s -add v21.4s, v21.4s, v28.4s -ldr q28, [x0, #352] -ldr q1, [x0, #288] -mla v27.4S, v18.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -sub v19.4s, v3.4s, v26.4s -add v3.4s, v3.4s, v26.4s -ldr q26, [x0, #224] -ldr q18, [x0, #160] -mul v23.4S, v23.4S,v30.s[0] -mul v25.4S, v25.4S,v30.s[0] -sub v0.4s, v28.4s, v24.4s -add v28.4s, v28.4s, v24.4s -ldr q24, [x0, #96] -ldr q15, [x0, #32] -mla v23.4S, v16.4S, v31.s[0] -mla v25.4S, v17.4S, v31.s[0] -sub v17.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v16.4s, v26.4s, v20.4s -nop -sqrdmulh v14.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -add v26.4s, v26.4s, v20.4s -nop -sqrdmulh v20.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v13.4s, v18.4s, v27.4s -add v18.4s, v18.4s, v27.4s -sqrdmulh v27.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v12.4s, v24.4s, v25.4s -add v24.4s, v24.4s, v25.4s -sqrdmulh v25.4S, v0.4S, v29.s[2] -mla v2.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v23.4s -sqrdmulh v11.4S, v17.4S, v29.s[2] -mla v19.4S, v14.4S, v31.s[0] -add v15.4s, v15.4s, v23.4s -nop -sqrdmulh v23.4S, v28.4S, v29.s[1] -mla v21.4S, v20.4S, v31.s[0] -nop -sqrdmulh v20.4S, v1.4S, v29.s[1] -mla v3.4S, v27.4S, v31.s[0] -nop -nop -ldr q27, [x17, #+32] -ldr q14, [x17, #+48] -mul v17.4S, v17.4S,v30.s[2] -mul v0.4S, v0.4S,v30.s[2] -sub v10.4s, v16.4s, v2.4s -add v16.4s, v16.4s, v2.4s -mla v17.4S, v11.4S, v31.s[0] -mla v0.4S, v25.4S, v31.s[0] -sub v25.4s, v13.4s, v19.4s -add v13.4s, v13.4s, v19.4s -mul v1.4S, v1.4S,v30.s[1] -mul v28.4S, v28.4S,v30.s[1] -sub v19.4s, v26.4s, v21.4s -add v26.4s, v26.4s, v21.4s -mla v1.4S, v20.4S, v31.s[0] -mla v28.4S, v23.4S, v31.s[0] -sub v23.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v10.4S, v14.s[3] -mul v10.4S, v10.4S,v27.s[3] -nop -nop -sqrdmulh v20.4S, v16.4S, v14.s[2] -mul v16.4S, v16.4S,v27.s[2] -sub v21.4s, v12.4s, v0.4s -add v12.4s, v12.4s, v0.4s -sqrdmulh v0.4S, v19.4S, v14.s[1] -mul v19.4S, v19.4S,v27.s[1] -sub v11.4s, v22.4s, v17.4s -add v22.4s, v22.4s, v17.4s -sqrdmulh v17.4S, v26.4S, v14.s[0] -mul v26.4S, v26.4S,v27.s[0] -sub v2.4s, v24.4s, v28.4s -add v24.4s, v24.4s, v28.4s -ldr q28, [x17, #+96] -ldr q9, [x17, #+112] -sqrdmulh v8.4S, v25.4S, v14.s[3] -mla v10.4S, v3.4S, v31.s[0] -sub v3.4s, v15.4s, v1.4s -add v15.4s, v15.4s, v1.4s -sqrdmulh v1.4S, v13.4S, v14.s[2] -mla v16.4S, v20.4S, v31.s[0] -nop -nop -sqrdmulh v20.4S, v23.4S, v14.s[1] -mla v19.4S, v0.4S, v31.s[0] -nop -nop -sqrdmulh v0.4S, v18.4S, v14.s[0] -mla v26.4S, v17.4S, v31.s[0] -nop -nop -ldr q17, [x17, #+64] -ldr q7, [x17, #+80] -mul v13.4S, v13.4S,v27.s[2] -mul v25.4S, v25.4S,v27.s[3] -sub v6.4s, v21.4s, v10.4s -add v21.4s, v21.4s, v10.4s -mla v13.4S, v1.4S, v31.s[0] -mla v25.4S, v8.4S, v31.s[0] -sub v8.4s, v12.4s, v16.4s -add v12.4s, v12.4s, v16.4s -mul v18.4S, v18.4S,v27.s[0] -mul v23.4S, v23.4S,v27.s[1] -sub v16.4s, v2.4s, v19.4s -add v2.4s, v2.4s, v19.4s -mla v18.4S, v0.4S, v31.s[0] -mla v23.4S, v20.4S, v31.s[0] -sub v20.4s, v24.4s, v26.4s -add v24.4s, v24.4s, v26.4s -sqrdmulh v26.4S, v6.4S, v9.s[3] -mul v6.4S, v6.4S,v28.s[3] -nop -nop -sqrdmulh v0.4S, v21.4S, v9.s[2] -mul v21.4S, v21.4S,v28.s[2] -sub v19.4s, v11.4s, v25.4s -add v11.4s, v11.4s, v25.4s -sqrdmulh v25.4S, v8.4S, v9.s[1] -mul v8.4S, v8.4S,v28.s[1] -sub v1.4s, v22.4s, v13.4s -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v12.4S, v9.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v10.4s, v3.4s, v23.4s -add v3.4s, v3.4s, v23.4s -sqrdmulh v23.4S, v16.4S, v7.s[3] -mla v6.4S, v26.4S, v31.s[0] -sub v26.4s, v15.4s, v18.4s -add v15.4s, v15.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v7.s[2] -mla v21.4S, v0.4S, v31.s[0] -sub v0.4s, v19.4s, v6.4s -str q0, [x0, #992] -sqrdmulh v0.4S, v20.4S, v7.s[1] -mla v8.4S, v25.4S, v31.s[0] -add v19.4s, v19.4s, v6.4s -str q19, [x0, #928] -sqrdmulh v19.4S, v24.4S, v7.s[0] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v21.4s -str q13, [x0, #864] -mul v2.4S, v2.4S,v17.s[2] -mul v16.4S, v16.4S,v17.s[3] -add v11.4s, v11.4s, v21.4s -sub v21.4s, v1.4s, v8.4s -mla v2.4S, v18.4S, v31.s[0] -mla v16.4S, v23.4S, v31.s[0] -add v1.4s, v1.4s, v8.4s -str q11, [x0, #800] -mul v24.4S, v24.4S,v17.s[0] -mul v20.4S, v20.4S,v17.s[1] -sub v11.4s, v22.4s, v12.4s -str q21, [x0, #736] -mla v24.4S, v19.4S, v31.s[0] -mla v20.4S, v0.4S, v31.s[0] -add v22.4s, v22.4s, v12.4s -str q1, [x0, #672] -ldr q1, [x0, #1008] -sqrdmulh v12.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -str q11, [x0, #608] -sub v11.4s, v10.4s, v16.4s -ldr q0, [x0, #944] -sqrdmulh v19.4S, v0.4S, v29.s[0] -mul v0.4S, v0.4S,v30.s[0] -str q22, [x0, #544] -add v10.4s, v10.4s, v16.4s -ldr q16, [x0, #880] -sqrdmulh v22.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -str q11, [x0, #480] -sub v11.4s, v3.4s, v2.4s -ldr q21, [x0, #816] -sqrdmulh v8.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -str q10, [x0, #416] -add v3.4s, v3.4s, v2.4s -ldr q2, [x0, #752] -sqrdmulh v10.4S, v2.4S, v29.s[0] -mla v1.4S, v12.4S, v31.s[0] -str q11, [x0, #352] -sub v11.4s, v26.4s, v20.4s -ldr q12, [x0, #688] -sqrdmulh v23.4S, v12.4S, v29.s[0] -mla v0.4S, v19.4S, v31.s[0] -str q3, [x0, #288] -add v26.4s, v26.4s, v20.4s -ldr q20, [x0, #624] -sqrdmulh v3.4S, v20.4S, v29.s[0] -mla v16.4S, v22.4S, v31.s[0] -str q11, [x0, #224] -sub v11.4s, v15.4s, v24.4s -ldr q22, [x0, #560] -sqrdmulh v19.4S, v22.4S, v29.s[0] -mla v21.4S, v8.4S, v31.s[0] -str q26, [x0, #160] -add v15.4s, v15.4s, v24.4s -ldr q24, [x0, #496] -ldr q26, [x0, #432] -mul v12.4S, v12.4S,v30.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v8.4s, v24.4s, v1.4s -add v24.4s, v24.4s, v1.4s -ldr q1, [x0, #368] -ldr q18, [x0, #304] -mla v12.4S, v23.4S, v31.s[0] -mla v2.4S, v10.4S, v31.s[0] -sub v10.4s, v26.4s, v0.4s -add v26.4s, v26.4s, v0.4s -ldr q0, [x0, #240] -ldr q23, [x0, #176] -mul v22.4S, v22.4S,v30.s[0] -mul v20.4S, v20.4S,v30.s[0] -sub v13.4s, v1.4s, v16.4s -add v1.4s, v1.4s, v16.4s -ldr q16, [x0, #112] -ldr q6, [x0, #48] -mla v22.4S, v19.4S, v31.s[0] -mla v20.4S, v3.4S, v31.s[0] -sub v3.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v8.4S, v29.s[2] -mul v8.4S, v8.4S,v30.s[2] -sub v19.4s, v0.4s, v2.4s -nop -sqrdmulh v25.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -add v0.4s, v0.4s, v2.4s -nop -sqrdmulh v2.4S, v24.4S, v29.s[1] -mul v24.4S, v24.4S,v30.s[1] -sub v5.4s, v23.4s, v12.4s -add v23.4s, v23.4s, v12.4s -sqrdmulh v12.4S, v26.4S, v29.s[1] -mul v26.4S, v26.4S,v30.s[1] -sub v4.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v13.4S, v29.s[2] -mla v8.4S, v21.4S, v31.s[0] -sub v21.4s, v6.4s, v22.4s -str q11, [x0, #96] -sqrdmulh v11.4S, v3.4S, v29.s[2] -mla v10.4S, v25.4S, v31.s[0] -add v6.4s, v6.4s, v22.4s -nop -sqrdmulh v22.4S, v1.4S, v29.s[1] -mla v24.4S, v2.4S, v31.s[0] -str q15, [x0, #32] -nop -sqrdmulh v15.4S, v18.4S, v29.s[1] -mla v26.4S, v12.4S, v31.s[0] -nop -nop -mul v3.4S, v3.4S,v30.s[2] -mul v13.4S, v13.4S,v30.s[2] -sub v12.4s, v19.4s, v8.4s -add v19.4s, v19.4s, v8.4s -mla v3.4S, v11.4S, v31.s[0] -mla v13.4S, v20.4S, v31.s[0] -sub v20.4s, v5.4s, v10.4s -add v5.4s, v5.4s, v10.4s -mul v18.4S, v18.4S,v30.s[1] -mul v1.4S, v1.4S,v30.s[1] -sub v10.4s, v0.4s, v24.4s -add v0.4s, v0.4s, v24.4s -mla v18.4S, v15.4S, v31.s[0] -mla v1.4S, v22.4S, v31.s[0] -sub v22.4s, v23.4s, v26.4s -add v23.4s, v23.4s, v26.4s -sqrdmulh v26.4S, v12.4S, v14.s[3] -mul v12.4S, v12.4S,v27.s[3] -nop -nop -sqrdmulh v15.4S, v19.4S, v14.s[2] -mul v19.4S, v19.4S,v27.s[2] -sub v24.4s, v4.4s, v13.4s -add v4.4s, v4.4s, v13.4s -sqrdmulh v13.4S, v10.4S, v14.s[1] -mul v10.4S, v10.4S,v27.s[1] -sub v11.4s, v21.4s, v3.4s -add v21.4s, v21.4s, v3.4s -sqrdmulh v3.4S, v0.4S, v14.s[0] -mul v0.4S, v0.4S,v27.s[0] -sub v8.4s, v16.4s, v1.4s -add v16.4s, v16.4s, v1.4s -sqrdmulh v1.4S, v20.4S, v14.s[3] -mla v12.4S, v26.4S, v31.s[0] -sub v26.4s, v6.4s, v18.4s -add v6.4s, v6.4s, v18.4s -sqrdmulh v18.4S, v5.4S, v14.s[2] -mla v19.4S, v15.4S, v31.s[0] -nop -nop -sqrdmulh v15.4S, v22.4S, v14.s[1] -mla v10.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v23.4S, v14.s[0] -mla v0.4S, v3.4S, v31.s[0] -nop -nop -mul v5.4S, v5.4S,v27.s[2] -mul v20.4S, v20.4S,v27.s[3] -sub v3.4s, v24.4s, v12.4s -add v24.4s, v24.4s, v12.4s -mla v5.4S, v18.4S, v31.s[0] -mla v20.4S, v1.4S, v31.s[0] -sub v1.4s, v4.4s, v19.4s -add v4.4s, v4.4s, v19.4s -mul v23.4S, v23.4S,v27.s[0] -mul v22.4S, v22.4S,v27.s[1] -sub v19.4s, v8.4s, v10.4s -add v8.4s, v8.4s, v10.4s -mla v23.4S, v13.4S, v31.s[0] -mla v22.4S, v15.4S, v31.s[0] -sub v15.4s, v16.4s, v0.4s -add v16.4s, v16.4s, v0.4s -sqrdmulh v0.4S, v3.4S, v9.s[3] -mul v3.4S, v3.4S,v28.s[3] -nop -nop -sqrdmulh v13.4S, v24.4S, v9.s[2] -mul v24.4S, v24.4S,v28.s[2] -sub v10.4s, v11.4s, v20.4s -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v1.4S, v9.s[1] -mul v1.4S, v1.4S,v28.s[1] -sub v18.4s, v21.4s, v5.4s -add v21.4s, v21.4s, v5.4s -sqrdmulh v5.4S, v4.4S, v9.s[0] -mul v4.4S, v4.4S,v28.s[0] -sub v12.4s, v26.4s, v22.4s -add v26.4s, v26.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v7.s[3] -mla v3.4S, v0.4S, v31.s[0] -sub v0.4s, v6.4s, v23.4s -add v6.4s, v6.4s, v23.4s -sqrdmulh v23.4S, v8.4S, v7.s[2] -mla v24.4S, v13.4S, v31.s[0] -sub v13.4s, v10.4s, v3.4s -str q13, [x0, #1008] -sqrdmulh v13.4S, v15.4S, v7.s[1] -mla v1.4S, v20.4S, v31.s[0] -add v10.4s, v10.4s, v3.4s -str q10, [x0, #944] -sqrdmulh v10.4S, v16.4S, v7.s[0] -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v11.4s, v24.4s -str q5, [x0, #880] -mul v8.4S, v8.4S,v17.s[2] -mul v19.4S, v19.4S,v17.s[3] -add v11.4s, v11.4s, v24.4s -sub v24.4s, v18.4s, v1.4s -mla v8.4S, v23.4S, v31.s[0] -mla v19.4S, v22.4S, v31.s[0] -add v18.4s, v18.4s, v1.4s -str q11, [x0, #816] -mul v16.4S, v16.4S,v17.s[0] -mul v15.4S, v15.4S,v17.s[1] -sub v11.4s, v21.4s, v4.4s -str q24, [x0, #752] -mla v16.4S, v10.4S, v31.s[0] -mla v15.4S, v13.4S, v31.s[0] -add v21.4s, v21.4s, v4.4s -str q18, [x0, #688] -ldr q18, [x0, #960] -sqrdmulh v4.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -str q11, [x0, #624] -sub v11.4s, v12.4s, v19.4s -ldr q13, [x0, #896] -sqrdmulh v10.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -str q21, [x0, #560] -add v12.4s, v12.4s, v19.4s -ldr q19, [x0, #832] -sqrdmulh v21.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -str q11, [x0, #496] -sub v11.4s, v26.4s, v8.4s -ldr q24, [x0, #768] -sqrdmulh v1.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -str q12, [x0, #432] -add v26.4s, v26.4s, v8.4s -ldr q8, [x0, #704] -sqrdmulh v12.4S, v8.4S, v29.s[0] -mla v18.4S, v4.4S, v31.s[0] -str q11, [x0, #368] -sub v11.4s, v0.4s, v15.4s -ldr q4, [x0, #640] -sqrdmulh v22.4S, v4.4S, v29.s[0] -mla v13.4S, v10.4S, v31.s[0] -str q26, [x0, #304] -add v0.4s, v0.4s, v15.4s -ldr q15, [x0, #576] -sqrdmulh v26.4S, v15.4S, v29.s[0] -mla v19.4S, v21.4S, v31.s[0] -str q11, [x0, #240] -sub v11.4s, v6.4s, v16.4s -ldr q21, [x0, #512] -sqrdmulh v10.4S, v21.4S, v29.s[0] -mla v24.4S, v1.4S, v31.s[0] -str q0, [x0, #176] -add v6.4s, v6.4s, v16.4s -ldr q16, [x0, #448] -ldr q0, [x0, #384] -mul v4.4S, v4.4S,v30.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v1.4s, v16.4s, v18.4s -add v16.4s, v16.4s, v18.4s -ldr q18, [x0, #320] -ldr q23, [x0, #256] -mla v4.4S, v22.4S, v31.s[0] -mla v8.4S, v12.4S, v31.s[0] -sub v12.4s, v0.4s, v13.4s -add v0.4s, v0.4s, v13.4s -ldr q13, [x0, #192] -ldr q22, [x0, #128] -mul v21.4S, v21.4S,v30.s[0] -mul v15.4S, v15.4S,v30.s[0] -sub v5.4s, v18.4s, v19.4s -add v18.4s, v18.4s, v19.4s -ldr q19, [x0, #64] -ldr q3, [x0, #0] -mla v21.4S, v10.4S, v31.s[0] -mla v15.4S, v26.4S, v31.s[0] -sub v26.4s, v23.4s, v24.4s -add v23.4s, v23.4s, v24.4s -sqrdmulh v24.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -sub v10.4s, v13.4s, v8.4s -nop -sqrdmulh v20.4S, v12.4S, v29.s[2] -mul v12.4S, v12.4S,v30.s[2] -add v13.4s, v13.4s, v8.4s -nop -sqrdmulh v8.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v2.4s, v22.4s, v4.4s -add v22.4s, v22.4s, v4.4s -sqrdmulh v4.4S, v0.4S, v29.s[1] -mul v0.4S, v0.4S,v30.s[1] -sub v25.4s, v19.4s, v15.4s -add v19.4s, v19.4s, v15.4s -sqrdmulh v15.4S, v5.4S, v29.s[2] -mla v1.4S, v24.4S, v31.s[0] -sub v24.4s, v3.4s, v21.4s -str q11, [x0, #112] -sqrdmulh v11.4S, v26.4S, v29.s[2] -mla v12.4S, v20.4S, v31.s[0] -add v3.4s, v3.4s, v21.4s -nop -sqrdmulh v21.4S, v18.4S, v29.s[1] -mla v16.4S, v8.4S, v31.s[0] -str q6, [x0, #48] -nop -sqrdmulh v6.4S, v23.4S, v29.s[1] -mla v0.4S, v4.4S, v31.s[0] -nop -nop -mul v26.4S, v26.4S,v30.s[2] -mul v5.4S, v5.4S,v30.s[2] -sub v4.4s, v10.4s, v1.4s -add v10.4s, v10.4s, v1.4s -mla v26.4S, v11.4S, v31.s[0] -mla v5.4S, v15.4S, v31.s[0] -sub v15.4s, v2.4s, v12.4s -add v2.4s, v2.4s, v12.4s -mul v23.4S, v23.4S,v30.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v12.4s, v13.4s, v16.4s -add v13.4s, v13.4s, v16.4s -mla v23.4S, v6.4S, v31.s[0] -mla v18.4S, v21.4S, v31.s[0] -sub v21.4s, v22.4s, v0.4s -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v4.4S, v14.s[3] -mul v4.4S, v4.4S,v27.s[3] -nop -nop -sqrdmulh v6.4S, v10.4S, v14.s[2] -mul v10.4S, v10.4S,v27.s[2] -sub v16.4s, v25.4s, v5.4s -add v25.4s, v25.4s, v5.4s -sqrdmulh v5.4S, v12.4S, v14.s[1] -mul v12.4S, v12.4S,v27.s[1] -sub v11.4s, v24.4s, v26.4s -add v24.4s, v24.4s, v26.4s -sqrdmulh v26.4S, v13.4S, v14.s[0] -mul v13.4S, v13.4S,v27.s[0] -sub v1.4s, v19.4s, v18.4s -add v19.4s, v19.4s, v18.4s -sqrdmulh v18.4S, v15.4S, v14.s[3] -mla v4.4S, v0.4S, v31.s[0] -sub v0.4s, v3.4s, v23.4s -add v3.4s, v3.4s, v23.4s -sqrdmulh v23.4S, v2.4S, v14.s[2] -mla v10.4S, v6.4S, v31.s[0] -nop -nop -sqrdmulh v6.4S, v21.4S, v14.s[1] -mla v12.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v22.4S, v14.s[0] -mla v13.4S, v26.4S, v31.s[0] -nop -nop -mul v2.4S, v2.4S,v27.s[2] -mul v15.4S, v15.4S,v27.s[3] -sub v26.4s, v16.4s, v4.4s -add v16.4s, v16.4s, v4.4s -mla v2.4S, v23.4S, v31.s[0] -mla v15.4S, v18.4S, v31.s[0] -sub v18.4s, v25.4s, v10.4s -add v25.4s, v25.4s, v10.4s -mul v22.4S, v22.4S,v27.s[0] -mul v21.4S, v21.4S,v27.s[1] -sub v10.4s, v1.4s, v12.4s -add v1.4s, v1.4s, v12.4s -mla v22.4S, v5.4S, v31.s[0] -mla v21.4S, v6.4S, v31.s[0] -sub v6.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -sqrdmulh v13.4S, v26.4S, v9.s[3] -mul v26.4S, v26.4S,v28.s[3] -nop -nop -sqrdmulh v5.4S, v16.4S, v9.s[2] -mul v16.4S, v16.4S,v28.s[2] -sub v12.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v18.4S, v9.s[1] -mul v18.4S, v18.4S,v28.s[1] -sub v23.4s, v24.4s, v2.4s -add v24.4s, v24.4s, v2.4s -sqrdmulh v2.4S, v25.4S, v9.s[0] -mul v25.4S, v25.4S,v28.s[0] -sub v4.4s, v0.4s, v21.4s -add v0.4s, v0.4s, v21.4s -sqrdmulh v21.4S, v10.4S, v7.s[3] -mla v26.4S, v13.4S, v31.s[0] -sub v13.4s, v3.4s, v22.4s -add v3.4s, v3.4s, v22.4s -sqrdmulh v22.4S, v1.4S, v7.s[2] -mla v16.4S, v5.4S, v31.s[0] -sub v5.4s, v12.4s, v26.4s -str q5, [x0, #960] -sqrdmulh v5.4S, v6.4S, v7.s[1] -mla v18.4S, v15.4S, v31.s[0] -add v12.4s, v12.4s, v26.4s -str q12, [x0, #896] -sqrdmulh v12.4S, v19.4S, v7.s[0] -mla v25.4S, v2.4S, v31.s[0] -sub v2.4s, v11.4s, v16.4s -str q2, [x0, #832] -mul v1.4S, v1.4S,v17.s[2] -mul v10.4S, v10.4S,v17.s[3] -add v11.4s, v11.4s, v16.4s -sub v16.4s, v23.4s, v18.4s -mla v1.4S, v22.4S, v31.s[0] -mla v10.4S, v21.4S, v31.s[0] -add v23.4s, v23.4s, v18.4s -str q11, [x0, #768] -mul v19.4S, v19.4S,v17.s[0] -mul v6.4S, v6.4S,v17.s[1] -sub v11.4s, v24.4s, v25.4s -str q16, [x0, #704] -mla v19.4S, v12.4S, v31.s[0] -mla v6.4S, v5.4S, v31.s[0] -add v24.4s, v24.4s, v25.4s -str q23, [x0, #640] -ldr q23, [x0, #976] -sqrdmulh v25.4S, v23.4S, v29.s[0] -mul v23.4S, v23.4S,v30.s[0] -str q11, [x0, #576] -sub v11.4s, v4.4s, v10.4s -ldr q5, [x0, #912] -sqrdmulh v12.4S, v5.4S, v29.s[0] -mul v5.4S, v5.4S,v30.s[0] -str q24, [x0, #512] -add v4.4s, v4.4s, v10.4s -ldr q10, [x0, #848] -sqrdmulh v24.4S, v10.4S, v29.s[0] -mul v10.4S, v10.4S,v30.s[0] -str q11, [x0, #448] -sub v11.4s, v0.4s, v1.4s -ldr q16, [x0, #784] -sqrdmulh v18.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -str q4, [x0, #384] -add v0.4s, v0.4s, v1.4s -ldr q1, [x0, #720] -sqrdmulh v4.4S, v1.4S, v29.s[0] -mla v23.4S, v25.4S, v31.s[0] -str q11, [x0, #320] -sub v11.4s, v13.4s, v6.4s -ldr q25, [x0, #656] -sqrdmulh v21.4S, v25.4S, v29.s[0] -mla v5.4S, v12.4S, v31.s[0] -str q0, [x0, #256] -add v13.4s, v13.4s, v6.4s -ldr q6, [x0, #592] -sqrdmulh v0.4S, v6.4S, v29.s[0] -mla v10.4S, v24.4S, v31.s[0] -str q11, [x0, #192] -sub v11.4s, v3.4s, v19.4s -ldr q24, [x0, #528] -sqrdmulh v12.4S, v24.4S, v29.s[0] -mla v16.4S, v18.4S, v31.s[0] -str q13, [x0, #128] -add v3.4s, v3.4s, v19.4s -ldr q19, [x0, #464] -ldr q13, [x0, #400] -mul v25.4S, v25.4S,v30.s[0] -mul v1.4S, v1.4S,v30.s[0] -sub v18.4s, v19.4s, v23.4s -add v19.4s, v19.4s, v23.4s -ldr q23, [x0, #336] -ldr q22, [x0, #272] -mla v25.4S, v21.4S, v31.s[0] -mla v1.4S, v4.4S, v31.s[0] -sub v4.4s, v13.4s, v5.4s -add v13.4s, v13.4s, v5.4s -ldr q5, [x0, #208] -ldr q21, [x0, #144] -mul v24.4S, v24.4S,v30.s[0] -mul v6.4S, v6.4S,v30.s[0] -sub v2.4s, v23.4s, v10.4s -add v23.4s, v23.4s, v10.4s -ldr q10, [x0, #80] -ldr q26, [x0, #16] -mla v24.4S, v12.4S, v31.s[0] -mla v6.4S, v0.4S, v31.s[0] -sub v0.4s, v22.4s, v16.4s -add v22.4s, v22.4s, v16.4s -sqrdmulh v16.4S, v18.4S, v29.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v12.4s, v5.4s, v1.4s -nop -sqrdmulh v15.4S, v4.4S, v29.s[2] -mul v4.4S, v4.4S,v30.s[2] -add v5.4s, v5.4s, v1.4s -nop -sqrdmulh v1.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -sub v8.4s, v21.4s, v25.4s -add v21.4s, v21.4s, v25.4s -sqrdmulh v25.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v20.4s, v10.4s, v6.4s -add v10.4s, v10.4s, v6.4s -sqrdmulh v6.4S, v2.4S, v29.s[2] -mla v18.4S, v16.4S, v31.s[0] -sub v16.4s, v26.4s, v24.4s -str q11, [x0, #64] -sqrdmulh v11.4S, v0.4S, v29.s[2] -mla v4.4S, v15.4S, v31.s[0] -add v26.4s, v26.4s, v24.4s -nop -sqrdmulh v24.4S, v23.4S, v29.s[1] -mla v19.4S, v1.4S, v31.s[0] -str q3, [x0, #0] -nop -sqrdmulh v3.4S, v22.4S, v29.s[1] -mla v13.4S, v25.4S, v31.s[0] -nop -nop -mul v0.4S, v0.4S,v30.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v25.4s, v12.4s, v18.4s -add v12.4s, v12.4s, v18.4s -mla v0.4S, v11.4S, v31.s[0] -mla v2.4S, v6.4S, v31.s[0] -sub v6.4s, v8.4s, v4.4s -add v8.4s, v8.4s, v4.4s -mul v22.4S, v22.4S,v30.s[1] -mul v23.4S, v23.4S,v30.s[1] -sub v4.4s, v5.4s, v19.4s -add v5.4s, v5.4s, v19.4s -mla v22.4S, v3.4S, v31.s[0] -mla v23.4S, v24.4S, v31.s[0] -sub v24.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v29.4S, v25.4S, v14.s[3] -mul v25.4S, v25.4S,v27.s[3] -nop -nop -sqrdmulh v30.4S, v12.4S, v14.s[2] -mul v12.4S, v12.4S,v27.s[2] -sub v13.4s, v20.4s, v2.4s -add v20.4s, v20.4s, v2.4s -sqrdmulh v2.4S, v4.4S, v14.s[1] -mul v4.4S, v4.4S,v27.s[1] -sub v3.4s, v16.4s, v0.4s -add v16.4s, v16.4s, v0.4s -sqrdmulh v0.4S, v5.4S, v14.s[0] -mul v5.4S, v5.4S,v27.s[0] -sub v19.4s, v10.4s, v23.4s -add v10.4s, v10.4s, v23.4s -sqrdmulh v23.4S, v6.4S, v14.s[3] -mla v25.4S, v29.4S, v31.s[0] -sub v29.4s, v26.4s, v22.4s -add v26.4s, v26.4s, v22.4s -sqrdmulh v22.4S, v8.4S, v14.s[2] -mla v12.4S, v30.4S, v31.s[0] -nop -nop -sqrdmulh v30.4S, v24.4S, v14.s[1] -mla v4.4S, v2.4S, v31.s[0] -nop -nop -sqrdmulh v2.4S, v21.4S, v14.s[0] -mla v5.4S, v0.4S, v31.s[0] -nop -nop -mul v8.4S, v8.4S,v27.s[2] -mul v6.4S, v6.4S,v27.s[3] -sub v0.4s, v13.4s, v25.4s -add v13.4s, v13.4s, v25.4s -mla v8.4S, v22.4S, v31.s[0] -mla v6.4S, v23.4S, v31.s[0] -sub v23.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -mul v21.4S, v21.4S,v27.s[0] -mul v24.4S, v24.4S,v27.s[1] -sub v12.4s, v19.4s, v4.4s -add v19.4s, v19.4s, v4.4s -mla v21.4S, v2.4S, v31.s[0] -mla v24.4S, v30.4S, v31.s[0] -sub v30.4s, v10.4s, v5.4s -add v10.4s, v10.4s, v5.4s -sqrdmulh v14.4S, v0.4S, v9.s[3] -mul v0.4S, v0.4S,v28.s[3] -nop -nop -sqrdmulh v27.4S, v13.4S, v9.s[2] -mul v13.4S, v13.4S,v28.s[2] -sub v5.4s, v3.4s, v6.4s -add v3.4s, v3.4s, v6.4s -sqrdmulh v6.4S, v23.4S, v9.s[1] -mul v23.4S, v23.4S,v28.s[1] -sub v2.4s, v16.4s, v8.4s -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v20.4S, v9.s[0] -mul v20.4S, v20.4S,v28.s[0] -sub v4.4s, v29.4s, v24.4s -add v29.4s, v29.4s, v24.4s -sqrdmulh v9.4S, v12.4S, v7.s[3] -mla v0.4S, v14.4S, v31.s[0] -sub v14.4s, v26.4s, v21.4s -add v26.4s, v26.4s, v21.4s -sqrdmulh v21.4S, v19.4S, v7.s[2] -mla v13.4S, v27.4S, v31.s[0] -sub v27.4s, v5.4s, v0.4s -str q27, [x0, #976] -sqrdmulh v27.4S, v30.4S, v7.s[1] -mla v23.4S, v6.4S, v31.s[0] -add v5.4s, v5.4s, v0.4s -str q5, [x0, #912] -sqrdmulh v5.4S, v10.4S, v7.s[0] -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v3.4s, v13.4s -str q8, [x0, #848] -mul v19.4S, v19.4S,v17.s[2] -mul v12.4S, v12.4S,v17.s[3] -add v3.4s, v3.4s, v13.4s -sub v13.4s, v2.4s, v23.4s -mla v19.4S, v21.4S, v31.s[0] -mla v12.4S, v9.4S, v31.s[0] -add v2.4s, v2.4s, v23.4s -str q3, [x0, #784] -mul v10.4S, v10.4S,v17.s[0] -mul v30.4S, v30.4S,v17.s[1] -sub v3.4s, v16.4s, v20.4s -str q13, [x0, #720] -mla v10.4S, v5.4S, v31.s[0] -mla v30.4S, v27.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -str q2, [x0, #656] -str q3, [x0, #592] -sub v3.4s, v4.4s, v12.4s -str q16, [x0, #528] -add v4.4s, v4.4s, v12.4s -str q3, [x0, #464] -sub v3.4s, v29.4s, v19.4s -str q4, [x0, #400] -add v29.4s, v29.4s, v19.4s -str q3, [x0, #336] -sub v3.4s, v14.4s, v30.4s -str q29, [x0, #272] -add v14.4s, v14.4s, v30.4s -str q3, [x0, #208] -sub v3.4s, v26.4s, v10.4s -str q14, [x0, #144] -add v26.4s, v26.4s, v10.4s -str q3, [x0, #80] -str q26, [x0, #16] -ldr q15, [x17, #+128] -ldr q1, [x17, #+144] -ldr q18, [x17, #+160] -ldr q11, [x17, #+176] -ldr q25, [x17, #+192] -ldr q22, [x17, #+208] -ldr q24, [x17, #+224] -ldr q28, [x17, #+240] -ldr q6, [x0, #32] -ldr q0, [x0, #48] -ldr q8, [x0, #0] -ldr q21, [x0, #16] -sqrdmulh v9.4S, v6.4S, v1.s[0] -mul v6.4S, v6.4S,v15.s[0] -mla v6.4S, v9.4S, v31.s[0] -sub v9.4s, v8.4s, v6.4s -add v8.4s, v8.4s, v6.4s -sqrdmulh v6.4S, v0.4S, v1.s[0] -mul v0.4S, v0.4S,v15.s[0] -mla v0.4S, v6.4S, v31.s[0] -sub v6.4s, v21.4s, v0.4s -add v21.4s, v21.4s, v0.4s -sqrdmulh v0.4S, v21.4S, v1.s[1] -mul v21.4S, v21.4S,v15.s[1] -mla v21.4S, v0.4S, v31.s[0] -sub v0.4s, v8.4s, v21.4s -add v8.4s, v8.4s, v21.4s -sqrdmulh v21.4S, v6.4S, v1.s[2] -mul v6.4S, v6.4S,v15.s[2] -mla v6.4S, v21.4S, v31.s[0] -sub v21.4s, v9.4s, v6.4s -add v9.4s, v9.4s, v6.4s -trn1 v6.4S, v8.4S, v0.4S -trn2 v23.4S, v8.4S, v0.4S -trn1 v13.4S, v9.4S, v21.4S -trn2 v5.4S, v9.4S, v21.4S -trn2 v9.2D, v6.2D, v13.2D -trn2 v21.2D, v23.2D, v5.2D -trn1 v8.2D, v6.2D, v13.2D -trn1 v0.2D, v23.2D, v5.2D -sqrdmulh v5.4S, v9.4S, v11.4S -mul v9.4S, v9.4S,v18.4S -mla v9.4S, v5.4S, v31.s[0] -sub v5.4s, v8.4s, v9.4s -add v8.4s, v8.4s, v9.4s -sqrdmulh v9.4S, v21.4S, v11.4S -mul v21.4S, v21.4S,v18.4S -mla v21.4S, v9.4S, v31.s[0] -sub v9.4s, v0.4s, v21.4s -add v0.4s, v0.4s, v21.4s -sqrdmulh v21.4S, v0.4S, v22.4S -mul v0.4S, v0.4S,v25.4S -mla v0.4S, v21.4S, v31.s[0] -sub v21.4s, v8.4s, v0.4s -add v8.4s, v8.4s, v0.4s -sqrdmulh v0.4S, v9.4S, v28.4S -mul v9.4S, v9.4S,v24.4S -mla v9.4S, v0.4S, v31.s[0] -sub v0.4s, v5.4s, v9.4s -add v5.4s, v5.4s, v9.4s -str q8, [x0, #0] -str q21, [x0, #16] -str q5, [x0, #32] -str q0, [x0, #48] -ldr q0, [x17, #+256] -ldr q5, [x17, #+272] -ldr q21, [x17, #+288] -ldr q8, [x17, #+304] -ldr q9, [x17, #+320] -ldr q23, [x17, #+336] -ldr q13, [x17, #+352] -ldr q6, [x17, #+368] -ldr q28, [x0, #96] -ldr q24, [x0, #112] -ldr q22, [x0, #64] -ldr q25, [x0, #80] -sqrdmulh v11.4S, v28.4S, v5.s[0] -mul v28.4S, v28.4S,v0.s[0] -mla v28.4S, v11.4S, v31.s[0] -sub v11.4s, v22.4s, v28.4s -add v22.4s, v22.4s, v28.4s -sqrdmulh v28.4S, v24.4S, v5.s[0] -mul v24.4S, v24.4S,v0.s[0] -mla v24.4S, v28.4S, v31.s[0] -sub v28.4s, v25.4s, v24.4s -add v25.4s, v25.4s, v24.4s -sqrdmulh v24.4S, v25.4S, v5.s[1] -mul v25.4S, v25.4S,v0.s[1] -mla v25.4S, v24.4S, v31.s[0] -sub v24.4s, v22.4s, v25.4s -add v22.4s, v22.4s, v25.4s -sqrdmulh v25.4S, v28.4S, v5.s[2] -mul v28.4S, v28.4S,v0.s[2] -mla v28.4S, v25.4S, v31.s[0] -sub v25.4s, v11.4s, v28.4s -add v11.4s, v11.4s, v28.4s -trn1 v28.4S, v22.4S, v24.4S -trn2 v18.4S, v22.4S, v24.4S -trn1 v1.4S, v11.4S, v25.4S -trn2 v15.4S, v11.4S, v25.4S -trn2 v11.2D, v28.2D, v1.2D -trn2 v25.2D, v18.2D, v15.2D -trn1 v22.2D, v28.2D, v1.2D -trn1 v24.2D, v18.2D, v15.2D -sqrdmulh v15.4S, v11.4S, v8.4S -mul v11.4S, v11.4S,v21.4S -mla v11.4S, v15.4S, v31.s[0] -sub v15.4s, v22.4s, v11.4s -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v25.4S, v8.4S -mul v25.4S, v25.4S,v21.4S -mla v25.4S, v11.4S, v31.s[0] -sub v11.4s, v24.4s, v25.4s -add v24.4s, v24.4s, v25.4s -sqrdmulh v25.4S, v24.4S, v23.4S -mul v24.4S, v24.4S,v9.4S -mla v24.4S, v25.4S, v31.s[0] -sub v25.4s, v22.4s, v24.4s -add v22.4s, v22.4s, v24.4s -sqrdmulh v24.4S, v11.4S, v6.4S -mul v11.4S, v11.4S,v13.4S -mla v11.4S, v24.4S, v31.s[0] -sub v24.4s, v15.4s, v11.4s -add v15.4s, v15.4s, v11.4s -str q22, [x0, #64] -str q25, [x0, #80] -str q15, [x0, #96] -str q24, [x0, #112] -ldr q24, [x17, #+384] -ldr q15, [x17, #+400] -ldr q25, [x17, #+416] -ldr q22, [x17, #+432] -ldr q11, [x17, #+448] -ldr q18, [x17, #+464] -ldr q1, [x17, #+480] -ldr q28, [x17, #+496] -ldr q6, [x0, #160] -ldr q13, [x0, #176] -ldr q23, [x0, #128] -ldr q9, [x0, #144] -sqrdmulh v8.4S, v6.4S, v15.s[0] -mul v6.4S, v6.4S,v24.s[0] -mla v6.4S, v8.4S, v31.s[0] -sub v8.4s, v23.4s, v6.4s -add v23.4s, v23.4s, v6.4s -sqrdmulh v6.4S, v13.4S, v15.s[0] -mul v13.4S, v13.4S,v24.s[0] -mla v13.4S, v6.4S, v31.s[0] -sub v6.4s, v9.4s, v13.4s -add v9.4s, v9.4s, v13.4s -sqrdmulh v13.4S, v9.4S, v15.s[1] -mul v9.4S, v9.4S,v24.s[1] -mla v9.4S, v13.4S, v31.s[0] -sub v13.4s, v23.4s, v9.4s -add v23.4s, v23.4s, v9.4s -sqrdmulh v9.4S, v6.4S, v15.s[2] -mul v6.4S, v6.4S,v24.s[2] -mla v6.4S, v9.4S, v31.s[0] -sub v9.4s, v8.4s, v6.4s -add v8.4s, v8.4s, v6.4s -trn1 v6.4S, v23.4S, v13.4S -trn2 v21.4S, v23.4S, v13.4S -trn1 v5.4S, v8.4S, v9.4S -trn2 v0.4S, v8.4S, v9.4S -trn2 v8.2D, v6.2D, v5.2D -trn2 v9.2D, v21.2D, v0.2D -trn1 v23.2D, v6.2D, v5.2D -trn1 v13.2D, v21.2D, v0.2D -sqrdmulh v0.4S, v8.4S, v22.4S -mul v8.4S, v8.4S,v25.4S -mla v8.4S, v0.4S, v31.s[0] -sub v0.4s, v23.4s, v8.4s -add v23.4s, v23.4s, v8.4s -sqrdmulh v8.4S, v9.4S, v22.4S -mul v9.4S, v9.4S,v25.4S -mla v9.4S, v8.4S, v31.s[0] -sub v8.4s, v13.4s, v9.4s -add v13.4s, v13.4s, v9.4s -sqrdmulh v9.4S, v13.4S, v18.4S -mul v13.4S, v13.4S,v11.4S -mla v13.4S, v9.4S, v31.s[0] -sub v9.4s, v23.4s, v13.4s -add v23.4s, v23.4s, v13.4s -sqrdmulh v13.4S, v8.4S, v28.4S -mul v8.4S, v8.4S,v1.4S -mla v8.4S, v13.4S, v31.s[0] -sub v13.4s, v0.4s, v8.4s -add v0.4s, v0.4s, v8.4s -str q23, [x0, #128] -str q9, [x0, #144] -str q0, [x0, #160] -str q13, [x0, #176] -ldr q13, [x17, #+512] -ldr q0, [x17, #+528] -ldr q9, [x17, #+544] -ldr q23, [x17, #+560] -ldr q8, [x17, #+576] -ldr q21, [x17, #+592] -ldr q5, [x17, #+608] -ldr q6, [x17, #+624] -ldr q28, [x0, #224] -ldr q1, [x0, #240] -ldr q18, [x0, #192] -ldr q11, [x0, #208] -sqrdmulh v22.4S, v28.4S, v0.s[0] -mul v28.4S, v28.4S,v13.s[0] -mla v28.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v28.4s -add v18.4s, v18.4s, v28.4s -sqrdmulh v28.4S, v1.4S, v0.s[0] -mul v1.4S, v1.4S,v13.s[0] -mla v1.4S, v28.4S, v31.s[0] -sub v28.4s, v11.4s, v1.4s -add v11.4s, v11.4s, v1.4s -sqrdmulh v1.4S, v11.4S, v0.s[1] -mul v11.4S, v11.4S,v13.s[1] -mla v11.4S, v1.4S, v31.s[0] -sub v1.4s, v18.4s, v11.4s -add v18.4s, v18.4s, v11.4s -sqrdmulh v11.4S, v28.4S, v0.s[2] -mul v28.4S, v28.4S,v13.s[2] -mla v28.4S, v11.4S, v31.s[0] -sub v11.4s, v22.4s, v28.4s -add v22.4s, v22.4s, v28.4s -trn1 v28.4S, v18.4S, v1.4S -trn2 v25.4S, v18.4S, v1.4S -trn1 v15.4S, v22.4S, v11.4S -trn2 v24.4S, v22.4S, v11.4S -trn2 v22.2D, v28.2D, v15.2D -trn2 v11.2D, v25.2D, v24.2D -trn1 v18.2D, v28.2D, v15.2D -trn1 v1.2D, v25.2D, v24.2D -sqrdmulh v24.4S, v22.4S, v23.4S -mul v22.4S, v22.4S,v9.4S -mla v22.4S, v24.4S, v31.s[0] -sub v24.4s, v18.4s, v22.4s -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v11.4S, v23.4S -mul v11.4S, v11.4S,v9.4S -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v1.4s, v11.4s -add v1.4s, v1.4s, v11.4s -sqrdmulh v11.4S, v1.4S, v21.4S -mul v1.4S, v1.4S,v8.4S -mla v1.4S, v11.4S, v31.s[0] -sub v11.4s, v18.4s, v1.4s -add v18.4s, v18.4s, v1.4s -sqrdmulh v1.4S, v22.4S, v6.4S -mul v22.4S, v22.4S,v5.4S -mla v22.4S, v1.4S, v31.s[0] -sub v1.4s, v24.4s, v22.4s -add v24.4s, v24.4s, v22.4s -str q18, [x0, #192] -str q11, [x0, #208] -str q24, [x0, #224] -str q1, [x0, #240] -ldr q1, [x17, #+640] -ldr q24, [x17, #+656] -ldr q11, [x17, #+672] -ldr q18, [x17, #+688] -ldr q22, [x17, #+704] -ldr q25, [x17, #+720] -ldr q15, [x17, #+736] -ldr q28, [x17, #+752] -ldr q6, [x0, #288] -ldr q5, [x0, #304] -ldr q21, [x0, #256] -ldr q8, [x0, #272] -sqrdmulh v23.4S, v6.4S, v24.s[0] -mul v6.4S, v6.4S,v1.s[0] -mla v6.4S, v23.4S, v31.s[0] -sub v23.4s, v21.4s, v6.4s -add v21.4s, v21.4s, v6.4s -sqrdmulh v6.4S, v5.4S, v24.s[0] -mul v5.4S, v5.4S,v1.s[0] -mla v5.4S, v6.4S, v31.s[0] -sub v6.4s, v8.4s, v5.4s -add v8.4s, v8.4s, v5.4s -sqrdmulh v5.4S, v8.4S, v24.s[1] -mul v8.4S, v8.4S,v1.s[1] -mla v8.4S, v5.4S, v31.s[0] -sub v5.4s, v21.4s, v8.4s -add v21.4s, v21.4s, v8.4s -sqrdmulh v8.4S, v6.4S, v24.s[2] -mul v6.4S, v6.4S,v1.s[2] -mla v6.4S, v8.4S, v31.s[0] -sub v8.4s, v23.4s, v6.4s -add v23.4s, v23.4s, v6.4s -trn1 v6.4S, v21.4S, v5.4S -trn2 v9.4S, v21.4S, v5.4S -trn1 v0.4S, v23.4S, v8.4S -trn2 v13.4S, v23.4S, v8.4S -trn2 v23.2D, v6.2D, v0.2D -trn2 v8.2D, v9.2D, v13.2D -trn1 v21.2D, v6.2D, v0.2D -trn1 v5.2D, v9.2D, v13.2D -sqrdmulh v13.4S, v23.4S, v18.4S -mul v23.4S, v23.4S,v11.4S -mla v23.4S, v13.4S, v31.s[0] -sub v13.4s, v21.4s, v23.4s -add v21.4s, v21.4s, v23.4s -sqrdmulh v23.4S, v8.4S, v18.4S -mul v8.4S, v8.4S,v11.4S -mla v8.4S, v23.4S, v31.s[0] -sub v23.4s, v5.4s, v8.4s -add v5.4s, v5.4s, v8.4s -sqrdmulh v8.4S, v5.4S, v25.4S -mul v5.4S, v5.4S,v22.4S -mla v5.4S, v8.4S, v31.s[0] -sub v8.4s, v21.4s, v5.4s -add v21.4s, v21.4s, v5.4s -sqrdmulh v5.4S, v23.4S, v28.4S -mul v23.4S, v23.4S,v15.4S -mla v23.4S, v5.4S, v31.s[0] -sub v5.4s, v13.4s, v23.4s -add v13.4s, v13.4s, v23.4s -str q21, [x0, #256] -str q8, [x0, #272] -str q13, [x0, #288] -str q5, [x0, #304] -ldr q5, [x17, #+768] -ldr q13, [x17, #+784] -ldr q8, [x17, #+800] -ldr q21, [x17, #+816] -ldr q23, [x17, #+832] -ldr q9, [x17, #+848] -ldr q0, [x17, #+864] -ldr q6, [x17, #+880] -ldr q28, [x0, #352] -ldr q15, [x0, #368] -ldr q25, [x0, #320] -ldr q22, [x0, #336] -sqrdmulh v18.4S, v28.4S, v13.s[0] -mul v28.4S, v28.4S,v5.s[0] -mla v28.4S, v18.4S, v31.s[0] -sub v18.4s, v25.4s, v28.4s -add v25.4s, v25.4s, v28.4s -sqrdmulh v28.4S, v15.4S, v13.s[0] -mul v15.4S, v15.4S,v5.s[0] -mla v15.4S, v28.4S, v31.s[0] -sub v28.4s, v22.4s, v15.4s -add v22.4s, v22.4s, v15.4s -sqrdmulh v15.4S, v22.4S, v13.s[1] -mul v22.4S, v22.4S,v5.s[1] -mla v22.4S, v15.4S, v31.s[0] -sub v15.4s, v25.4s, v22.4s -add v25.4s, v25.4s, v22.4s -sqrdmulh v22.4S, v28.4S, v13.s[2] -mul v28.4S, v28.4S,v5.s[2] -mla v28.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v28.4s -add v18.4s, v18.4s, v28.4s -trn1 v28.4S, v25.4S, v15.4S -trn2 v11.4S, v25.4S, v15.4S -trn1 v24.4S, v18.4S, v22.4S -trn2 v1.4S, v18.4S, v22.4S -trn2 v18.2D, v28.2D, v24.2D -trn2 v22.2D, v11.2D, v1.2D -trn1 v25.2D, v28.2D, v24.2D -trn1 v15.2D, v11.2D, v1.2D -sqrdmulh v1.4S, v18.4S, v21.4S -mul v18.4S, v18.4S,v8.4S -mla v18.4S, v1.4S, v31.s[0] -sub v1.4s, v25.4s, v18.4s -add v25.4s, v25.4s, v18.4s -sqrdmulh v18.4S, v22.4S, v21.4S -mul v22.4S, v22.4S,v8.4S -mla v22.4S, v18.4S, v31.s[0] -sub v18.4s, v15.4s, v22.4s -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v15.4S, v9.4S -mul v15.4S, v15.4S,v23.4S -mla v15.4S, v22.4S, v31.s[0] -sub v22.4s, v25.4s, v15.4s -add v25.4s, v25.4s, v15.4s -sqrdmulh v15.4S, v18.4S, v6.4S -mul v18.4S, v18.4S,v0.4S -mla v18.4S, v15.4S, v31.s[0] -sub v15.4s, v1.4s, v18.4s -add v1.4s, v1.4s, v18.4s -str q25, [x0, #320] -str q22, [x0, #336] -str q1, [x0, #352] -str q15, [x0, #368] -ldr q15, [x17, #+896] -ldr q1, [x17, #+912] -ldr q22, [x17, #+928] -ldr q25, [x17, #+944] -ldr q18, [x17, #+960] -ldr q11, [x17, #+976] -ldr q24, [x17, #+992] -ldr q28, [x17, #+1008] -ldr q6, [x0, #416] -ldr q0, [x0, #432] -ldr q9, [x0, #384] -ldr q23, [x0, #400] -sqrdmulh v21.4S, v6.4S, v1.s[0] -mul v6.4S, v6.4S,v15.s[0] -mla v6.4S, v21.4S, v31.s[0] -sub v21.4s, v9.4s, v6.4s -add v9.4s, v9.4s, v6.4s -sqrdmulh v6.4S, v0.4S, v1.s[0] -mul v0.4S, v0.4S,v15.s[0] -mla v0.4S, v6.4S, v31.s[0] -sub v6.4s, v23.4s, v0.4s -add v23.4s, v23.4s, v0.4s -sqrdmulh v0.4S, v23.4S, v1.s[1] -mul v23.4S, v23.4S,v15.s[1] -mla v23.4S, v0.4S, v31.s[0] -sub v0.4s, v9.4s, v23.4s -add v9.4s, v9.4s, v23.4s -sqrdmulh v23.4S, v6.4S, v1.s[2] -mul v6.4S, v6.4S,v15.s[2] -mla v6.4S, v23.4S, v31.s[0] -sub v23.4s, v21.4s, v6.4s -add v21.4s, v21.4s, v6.4s -trn1 v6.4S, v9.4S, v0.4S -trn2 v8.4S, v9.4S, v0.4S -trn1 v13.4S, v21.4S, v23.4S -trn2 v5.4S, v21.4S, v23.4S -trn2 v21.2D, v6.2D, v13.2D -trn2 v23.2D, v8.2D, v5.2D -trn1 v9.2D, v6.2D, v13.2D -trn1 v0.2D, v8.2D, v5.2D -sqrdmulh v5.4S, v21.4S, v25.4S -mul v21.4S, v21.4S,v22.4S -mla v21.4S, v5.4S, v31.s[0] -sub v5.4s, v9.4s, v21.4s -add v9.4s, v9.4s, v21.4s -sqrdmulh v21.4S, v23.4S, v25.4S -mul v23.4S, v23.4S,v22.4S -mla v23.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v23.4s -add v0.4s, v0.4s, v23.4s -sqrdmulh v23.4S, v0.4S, v11.4S -mul v0.4S, v0.4S,v18.4S -mla v0.4S, v23.4S, v31.s[0] -sub v23.4s, v9.4s, v0.4s -add v9.4s, v9.4s, v0.4s -sqrdmulh v0.4S, v21.4S, v28.4S -mul v21.4S, v21.4S,v24.4S -mla v21.4S, v0.4S, v31.s[0] -sub v0.4s, v5.4s, v21.4s -add v5.4s, v5.4s, v21.4s -str q9, [x0, #384] -str q23, [x0, #400] -str q5, [x0, #416] -str q0, [x0, #432] -ldr q0, [x17, #+1024] -ldr q5, [x17, #+1040] -ldr q23, [x17, #+1056] -ldr q9, [x17, #+1072] -ldr q21, [x17, #+1088] -ldr q8, [x17, #+1104] -ldr q13, [x17, #+1120] -ldr q6, [x17, #+1136] -ldr q28, [x0, #480] -ldr q24, [x0, #496] -ldr q11, [x0, #448] -ldr q18, [x0, #464] -sqrdmulh v25.4S, v28.4S, v5.s[0] -mul v28.4S, v28.4S,v0.s[0] -mla v28.4S, v25.4S, v31.s[0] -sub v25.4s, v11.4s, v28.4s -add v11.4s, v11.4s, v28.4s -sqrdmulh v28.4S, v24.4S, v5.s[0] -mul v24.4S, v24.4S,v0.s[0] -mla v24.4S, v28.4S, v31.s[0] -sub v28.4s, v18.4s, v24.4s -add v18.4s, v18.4s, v24.4s -sqrdmulh v24.4S, v18.4S, v5.s[1] -mul v18.4S, v18.4S,v0.s[1] -mla v18.4S, v24.4S, v31.s[0] -sub v24.4s, v11.4s, v18.4s -add v11.4s, v11.4s, v18.4s -sqrdmulh v18.4S, v28.4S, v5.s[2] -mul v28.4S, v28.4S,v0.s[2] -mla v28.4S, v18.4S, v31.s[0] -sub v18.4s, v25.4s, v28.4s -add v25.4s, v25.4s, v28.4s -trn1 v28.4S, v11.4S, v24.4S -trn2 v22.4S, v11.4S, v24.4S -trn1 v1.4S, v25.4S, v18.4S -trn2 v15.4S, v25.4S, v18.4S -trn2 v25.2D, v28.2D, v1.2D -trn2 v18.2D, v22.2D, v15.2D -trn1 v11.2D, v28.2D, v1.2D -trn1 v24.2D, v22.2D, v15.2D -sqrdmulh v15.4S, v25.4S, v9.4S -mul v25.4S, v25.4S,v23.4S -mla v25.4S, v15.4S, v31.s[0] -sub v15.4s, v11.4s, v25.4s -add v11.4s, v11.4s, v25.4s -sqrdmulh v25.4S, v18.4S, v9.4S -mul v18.4S, v18.4S,v23.4S -mla v18.4S, v25.4S, v31.s[0] -sub v25.4s, v24.4s, v18.4s -add v24.4s, v24.4s, v18.4s -sqrdmulh v18.4S, v24.4S, v8.4S -mul v24.4S, v24.4S,v21.4S -mla v24.4S, v18.4S, v31.s[0] -sub v18.4s, v11.4s, v24.4s -add v11.4s, v11.4s, v24.4s -sqrdmulh v24.4S, v25.4S, v6.4S -mul v25.4S, v25.4S,v13.4S -mla v25.4S, v24.4S, v31.s[0] -sub v24.4s, v15.4s, v25.4s -add v15.4s, v15.4s, v25.4s -str q11, [x0, #448] -str q18, [x0, #464] -str q15, [x0, #480] -str q24, [x0, #496] -ldr q24, [x17, #+1152] -ldr q15, [x17, #+1168] -ldr q18, [x17, #+1184] -ldr q11, [x17, #+1200] -ldr q25, [x17, #+1216] -ldr q22, [x17, #+1232] -ldr q1, [x17, #+1248] -ldr q28, [x17, #+1264] -ldr q6, [x0, #544] -ldr q13, [x0, #560] -ldr q8, [x0, #512] -ldr q21, [x0, #528] -sqrdmulh v9.4S, v6.4S, v15.s[0] -mul v6.4S, v6.4S,v24.s[0] -mla v6.4S, v9.4S, v31.s[0] -sub v9.4s, v8.4s, v6.4s -add v8.4s, v8.4s, v6.4s -sqrdmulh v6.4S, v13.4S, v15.s[0] -mul v13.4S, v13.4S,v24.s[0] -mla v13.4S, v6.4S, v31.s[0] -sub v6.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v21.4S, v15.s[1] -mul v21.4S, v21.4S,v24.s[1] -mla v21.4S, v13.4S, v31.s[0] -sub v13.4s, v8.4s, v21.4s -add v8.4s, v8.4s, v21.4s -sqrdmulh v21.4S, v6.4S, v15.s[2] -mul v6.4S, v6.4S,v24.s[2] -mla v6.4S, v21.4S, v31.s[0] -sub v21.4s, v9.4s, v6.4s -add v9.4s, v9.4s, v6.4s -trn1 v6.4S, v8.4S, v13.4S -trn2 v23.4S, v8.4S, v13.4S -trn1 v5.4S, v9.4S, v21.4S -trn2 v0.4S, v9.4S, v21.4S -trn2 v9.2D, v6.2D, v5.2D -trn2 v21.2D, v23.2D, v0.2D -trn1 v8.2D, v6.2D, v5.2D -trn1 v13.2D, v23.2D, v0.2D -sqrdmulh v0.4S, v9.4S, v11.4S -mul v9.4S, v9.4S,v18.4S -mla v9.4S, v0.4S, v31.s[0] -sub v0.4s, v8.4s, v9.4s -add v8.4s, v8.4s, v9.4s -sqrdmulh v9.4S, v21.4S, v11.4S -mul v21.4S, v21.4S,v18.4S -mla v21.4S, v9.4S, v31.s[0] -sub v9.4s, v13.4s, v21.4s -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v13.4S, v22.4S -mul v13.4S, v13.4S,v25.4S -mla v13.4S, v21.4S, v31.s[0] -sub v21.4s, v8.4s, v13.4s -add v8.4s, v8.4s, v13.4s -sqrdmulh v13.4S, v9.4S, v28.4S -mul v9.4S, v9.4S,v1.4S -mla v9.4S, v13.4S, v31.s[0] -sub v13.4s, v0.4s, v9.4s -add v0.4s, v0.4s, v9.4s -str q8, [x0, #512] -str q21, [x0, #528] -str q0, [x0, #544] -str q13, [x0, #560] -ldr q13, [x17, #+1280] -ldr q0, [x17, #+1296] -ldr q21, [x17, #+1312] -ldr q8, [x17, #+1328] -ldr q9, [x17, #+1344] -ldr q23, [x17, #+1360] -ldr q5, [x17, #+1376] -ldr q6, [x17, #+1392] -ldr q28, [x0, #608] -ldr q1, [x0, #624] -ldr q22, [x0, #576] -ldr q25, [x0, #592] -sqrdmulh v11.4S, v28.4S, v0.s[0] -mul v28.4S, v28.4S,v13.s[0] -mla v28.4S, v11.4S, v31.s[0] -sub v11.4s, v22.4s, v28.4s -add v22.4s, v22.4s, v28.4s -sqrdmulh v28.4S, v1.4S, v0.s[0] -mul v1.4S, v1.4S,v13.s[0] -mla v1.4S, v28.4S, v31.s[0] -sub v28.4s, v25.4s, v1.4s -add v25.4s, v25.4s, v1.4s -sqrdmulh v1.4S, v25.4S, v0.s[1] -mul v25.4S, v25.4S,v13.s[1] -mla v25.4S, v1.4S, v31.s[0] -sub v1.4s, v22.4s, v25.4s -add v22.4s, v22.4s, v25.4s -sqrdmulh v25.4S, v28.4S, v0.s[2] -mul v28.4S, v28.4S,v13.s[2] -mla v28.4S, v25.4S, v31.s[0] -sub v25.4s, v11.4s, v28.4s -add v11.4s, v11.4s, v28.4s -trn1 v28.4S, v22.4S, v1.4S -trn2 v18.4S, v22.4S, v1.4S -trn1 v15.4S, v11.4S, v25.4S -trn2 v24.4S, v11.4S, v25.4S -trn2 v11.2D, v28.2D, v15.2D -trn2 v25.2D, v18.2D, v24.2D -trn1 v22.2D, v28.2D, v15.2D -trn1 v1.2D, v18.2D, v24.2D -sqrdmulh v24.4S, v11.4S, v8.4S -mul v11.4S, v11.4S,v21.4S -mla v11.4S, v24.4S, v31.s[0] -sub v24.4s, v22.4s, v11.4s -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v25.4S, v8.4S -mul v25.4S, v25.4S,v21.4S -mla v25.4S, v11.4S, v31.s[0] -sub v11.4s, v1.4s, v25.4s -add v1.4s, v1.4s, v25.4s -sqrdmulh v25.4S, v1.4S, v23.4S -mul v1.4S, v1.4S,v9.4S -mla v1.4S, v25.4S, v31.s[0] -sub v25.4s, v22.4s, v1.4s -add v22.4s, v22.4s, v1.4s -sqrdmulh v1.4S, v11.4S, v6.4S -mul v11.4S, v11.4S,v5.4S -mla v11.4S, v1.4S, v31.s[0] -sub v1.4s, v24.4s, v11.4s -add v24.4s, v24.4s, v11.4s -str q22, [x0, #576] -str q25, [x0, #592] -str q24, [x0, #608] -str q1, [x0, #624] -ldr q1, [x17, #+1408] -ldr q24, [x17, #+1424] -ldr q25, [x17, #+1440] -ldr q22, [x17, #+1456] -ldr q11, [x17, #+1472] -ldr q18, [x17, #+1488] -ldr q15, [x17, #+1504] -ldr q28, [x17, #+1520] -ldr q6, [x0, #672] -ldr q5, [x0, #688] -ldr q23, [x0, #640] -ldr q9, [x0, #656] -sqrdmulh v8.4S, v6.4S, v24.s[0] -mul v6.4S, v6.4S,v1.s[0] -mla v6.4S, v8.4S, v31.s[0] -sub v8.4s, v23.4s, v6.4s -add v23.4s, v23.4s, v6.4s -sqrdmulh v6.4S, v5.4S, v24.s[0] -mul v5.4S, v5.4S,v1.s[0] -mla v5.4S, v6.4S, v31.s[0] -sub v6.4s, v9.4s, v5.4s -add v9.4s, v9.4s, v5.4s -sqrdmulh v5.4S, v9.4S, v24.s[1] -mul v9.4S, v9.4S,v1.s[1] -mla v9.4S, v5.4S, v31.s[0] -sub v5.4s, v23.4s, v9.4s -add v23.4s, v23.4s, v9.4s -sqrdmulh v9.4S, v6.4S, v24.s[2] -mul v6.4S, v6.4S,v1.s[2] -mla v6.4S, v9.4S, v31.s[0] -sub v9.4s, v8.4s, v6.4s -add v8.4s, v8.4s, v6.4s -trn1 v6.4S, v23.4S, v5.4S -trn2 v21.4S, v23.4S, v5.4S -trn1 v0.4S, v8.4S, v9.4S -trn2 v13.4S, v8.4S, v9.4S -trn2 v8.2D, v6.2D, v0.2D -trn2 v9.2D, v21.2D, v13.2D -trn1 v23.2D, v6.2D, v0.2D -trn1 v5.2D, v21.2D, v13.2D -sqrdmulh v13.4S, v8.4S, v22.4S -mul v8.4S, v8.4S,v25.4S -mla v8.4S, v13.4S, v31.s[0] -sub v13.4s, v23.4s, v8.4s -add v23.4s, v23.4s, v8.4s -sqrdmulh v8.4S, v9.4S, v22.4S -mul v9.4S, v9.4S,v25.4S -mla v9.4S, v8.4S, v31.s[0] -sub v8.4s, v5.4s, v9.4s -add v5.4s, v5.4s, v9.4s -sqrdmulh v9.4S, v5.4S, v18.4S -mul v5.4S, v5.4S,v11.4S -mla v5.4S, v9.4S, v31.s[0] -sub v9.4s, v23.4s, v5.4s -add v23.4s, v23.4s, v5.4s -sqrdmulh v5.4S, v8.4S, v28.4S -mul v8.4S, v8.4S,v15.4S -mla v8.4S, v5.4S, v31.s[0] -sub v5.4s, v13.4s, v8.4s -add v13.4s, v13.4s, v8.4s -str q23, [x0, #640] -str q9, [x0, #656] -str q13, [x0, #672] -str q5, [x0, #688] -ldr q5, [x17, #+1536] -ldr q13, [x17, #+1552] -ldr q9, [x17, #+1568] -ldr q23, [x17, #+1584] -ldr q8, [x17, #+1600] -ldr q21, [x17, #+1616] -ldr q0, [x17, #+1632] -ldr q6, [x17, #+1648] -ldr q28, [x0, #736] -ldr q15, [x0, #752] -ldr q18, [x0, #704] -ldr q11, [x0, #720] -sqrdmulh v22.4S, v28.4S, v13.s[0] -mul v28.4S, v28.4S,v5.s[0] -mla v28.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v28.4s -add v18.4s, v18.4s, v28.4s -sqrdmulh v28.4S, v15.4S, v13.s[0] -mul v15.4S, v15.4S,v5.s[0] -mla v15.4S, v28.4S, v31.s[0] -sub v28.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v13.s[1] -mul v11.4S, v11.4S,v5.s[1] -mla v11.4S, v15.4S, v31.s[0] -sub v15.4s, v18.4s, v11.4s -add v18.4s, v18.4s, v11.4s -sqrdmulh v11.4S, v28.4S, v13.s[2] -mul v28.4S, v28.4S,v5.s[2] -mla v28.4S, v11.4S, v31.s[0] -sub v11.4s, v22.4s, v28.4s -add v22.4s, v22.4s, v28.4s -trn1 v28.4S, v18.4S, v15.4S -trn2 v25.4S, v18.4S, v15.4S -trn1 v24.4S, v22.4S, v11.4S -trn2 v1.4S, v22.4S, v11.4S -trn2 v22.2D, v28.2D, v24.2D -trn2 v11.2D, v25.2D, v1.2D -trn1 v18.2D, v28.2D, v24.2D -trn1 v15.2D, v25.2D, v1.2D -sqrdmulh v1.4S, v22.4S, v23.4S -mul v22.4S, v22.4S,v9.4S -mla v22.4S, v1.4S, v31.s[0] -sub v1.4s, v18.4s, v22.4s -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v11.4S, v23.4S -mul v11.4S, v11.4S,v9.4S -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v11.4s -add v15.4s, v15.4s, v11.4s -sqrdmulh v11.4S, v15.4S, v21.4S -mul v15.4S, v15.4S,v8.4S -mla v15.4S, v11.4S, v31.s[0] -sub v11.4s, v18.4s, v15.4s -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v22.4S, v6.4S -mul v22.4S, v22.4S,v0.4S -mla v22.4S, v15.4S, v31.s[0] -sub v15.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -str q18, [x0, #704] -str q11, [x0, #720] -str q1, [x0, #736] -str q15, [x0, #752] -ldr q15, [x17, #+1664] -ldr q1, [x17, #+1680] -ldr q11, [x17, #+1696] -ldr q18, [x17, #+1712] -ldr q22, [x17, #+1728] -ldr q25, [x17, #+1744] -ldr q24, [x17, #+1760] -ldr q28, [x17, #+1776] -ldr q6, [x0, #800] -ldr q0, [x0, #816] -ldr q21, [x0, #768] -ldr q8, [x0, #784] -sqrdmulh v23.4S, v6.4S, v1.s[0] -mul v6.4S, v6.4S,v15.s[0] -mla v6.4S, v23.4S, v31.s[0] -sub v23.4s, v21.4s, v6.4s -add v21.4s, v21.4s, v6.4s -sqrdmulh v6.4S, v0.4S, v1.s[0] -mul v0.4S, v0.4S,v15.s[0] -mla v0.4S, v6.4S, v31.s[0] -sub v6.4s, v8.4s, v0.4s -add v8.4s, v8.4s, v0.4s -sqrdmulh v0.4S, v8.4S, v1.s[1] -mul v8.4S, v8.4S,v15.s[1] -mla v8.4S, v0.4S, v31.s[0] -sub v0.4s, v21.4s, v8.4s -add v21.4s, v21.4s, v8.4s -sqrdmulh v8.4S, v6.4S, v1.s[2] -mul v6.4S, v6.4S,v15.s[2] -mla v6.4S, v8.4S, v31.s[0] -sub v8.4s, v23.4s, v6.4s -add v23.4s, v23.4s, v6.4s -trn1 v6.4S, v21.4S, v0.4S -trn2 v9.4S, v21.4S, v0.4S -trn1 v13.4S, v23.4S, v8.4S -trn2 v5.4S, v23.4S, v8.4S -trn2 v23.2D, v6.2D, v13.2D -trn2 v8.2D, v9.2D, v5.2D -trn1 v21.2D, v6.2D, v13.2D -trn1 v0.2D, v9.2D, v5.2D -sqrdmulh v5.4S, v23.4S, v18.4S -mul v23.4S, v23.4S,v11.4S -mla v23.4S, v5.4S, v31.s[0] -sub v5.4s, v21.4s, v23.4s -add v21.4s, v21.4s, v23.4s -sqrdmulh v23.4S, v8.4S, v18.4S -mul v8.4S, v8.4S,v11.4S -mla v8.4S, v23.4S, v31.s[0] -sub v23.4s, v0.4s, v8.4s -add v0.4s, v0.4s, v8.4s -sqrdmulh v8.4S, v0.4S, v25.4S -mul v0.4S, v0.4S,v22.4S -mla v0.4S, v8.4S, v31.s[0] -sub v8.4s, v21.4s, v0.4s -add v21.4s, v21.4s, v0.4s -sqrdmulh v0.4S, v23.4S, v28.4S -mul v23.4S, v23.4S,v24.4S -mla v23.4S, v0.4S, v31.s[0] -sub v0.4s, v5.4s, v23.4s -add v5.4s, v5.4s, v23.4s -str q21, [x0, #768] -str q8, [x0, #784] -str q5, [x0, #800] -str q0, [x0, #816] -ldr q0, [x17, #+1792] -ldr q5, [x17, #+1808] -ldr q8, [x17, #+1824] -ldr q21, [x17, #+1840] -ldr q23, [x17, #+1856] -ldr q9, [x17, #+1872] -ldr q13, [x17, #+1888] -ldr q6, [x17, #+1904] -ldr q28, [x0, #864] -ldr q24, [x0, #880] -ldr q25, [x0, #832] -ldr q22, [x0, #848] -sqrdmulh v18.4S, v28.4S, v5.s[0] -mul v28.4S, v28.4S,v0.s[0] -mla v28.4S, v18.4S, v31.s[0] -sub v18.4s, v25.4s, v28.4s -add v25.4s, v25.4s, v28.4s -sqrdmulh v28.4S, v24.4S, v5.s[0] -mul v24.4S, v24.4S,v0.s[0] -mla v24.4S, v28.4S, v31.s[0] -sub v28.4s, v22.4s, v24.4s -add v22.4s, v22.4s, v24.4s -sqrdmulh v24.4S, v22.4S, v5.s[1] -mul v22.4S, v22.4S,v0.s[1] -mla v22.4S, v24.4S, v31.s[0] -sub v24.4s, v25.4s, v22.4s -add v25.4s, v25.4s, v22.4s -sqrdmulh v22.4S, v28.4S, v5.s[2] -mul v28.4S, v28.4S,v0.s[2] -mla v28.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v28.4s -add v18.4s, v18.4s, v28.4s -trn1 v28.4S, v25.4S, v24.4S -trn2 v11.4S, v25.4S, v24.4S -trn1 v1.4S, v18.4S, v22.4S -trn2 v15.4S, v18.4S, v22.4S -trn2 v18.2D, v28.2D, v1.2D -trn2 v22.2D, v11.2D, v15.2D -trn1 v25.2D, v28.2D, v1.2D -trn1 v24.2D, v11.2D, v15.2D -sqrdmulh v15.4S, v18.4S, v21.4S -mul v18.4S, v18.4S,v8.4S -mla v18.4S, v15.4S, v31.s[0] -sub v15.4s, v25.4s, v18.4s -add v25.4s, v25.4s, v18.4s -sqrdmulh v18.4S, v22.4S, v21.4S -mul v22.4S, v22.4S,v8.4S -mla v22.4S, v18.4S, v31.s[0] -sub v18.4s, v24.4s, v22.4s -add v24.4s, v24.4s, v22.4s -sqrdmulh v22.4S, v24.4S, v9.4S -mul v24.4S, v24.4S,v23.4S -mla v24.4S, v22.4S, v31.s[0] -sub v22.4s, v25.4s, v24.4s -add v25.4s, v25.4s, v24.4s -sqrdmulh v24.4S, v18.4S, v6.4S -mul v18.4S, v18.4S,v13.4S -mla v18.4S, v24.4S, v31.s[0] -sub v24.4s, v15.4s, v18.4s -add v15.4s, v15.4s, v18.4s -str q25, [x0, #832] -str q22, [x0, #848] -str q15, [x0, #864] -str q24, [x0, #880] -ldr q24, [x17, #+1920] -ldr q15, [x17, #+1936] -ldr q22, [x17, #+1952] -ldr q25, [x17, #+1968] -ldr q18, [x17, #+1984] -ldr q11, [x17, #+2000] -ldr q1, [x17, #+2016] -ldr q28, [x17, #+2032] -ldr q6, [x0, #928] -ldr q13, [x0, #944] -ldr q9, [x0, #896] -ldr q23, [x0, #912] -sqrdmulh v21.4S, v6.4S, v15.s[0] -mul v6.4S, v6.4S,v24.s[0] -mla v6.4S, v21.4S, v31.s[0] -sub v21.4s, v9.4s, v6.4s -add v9.4s, v9.4s, v6.4s -sqrdmulh v6.4S, v13.4S, v15.s[0] -mul v13.4S, v13.4S,v24.s[0] -mla v13.4S, v6.4S, v31.s[0] -sub v6.4s, v23.4s, v13.4s -add v23.4s, v23.4s, v13.4s -sqrdmulh v13.4S, v23.4S, v15.s[1] -mul v23.4S, v23.4S,v24.s[1] -mla v23.4S, v13.4S, v31.s[0] -sub v13.4s, v9.4s, v23.4s -add v9.4s, v9.4s, v23.4s -sqrdmulh v23.4S, v6.4S, v15.s[2] -mul v6.4S, v6.4S,v24.s[2] -mla v6.4S, v23.4S, v31.s[0] -sub v23.4s, v21.4s, v6.4s -add v21.4s, v21.4s, v6.4s -trn1 v6.4S, v9.4S, v13.4S -trn2 v8.4S, v9.4S, v13.4S -trn1 v5.4S, v21.4S, v23.4S -trn2 v0.4S, v21.4S, v23.4S -trn2 v21.2D, v6.2D, v5.2D -trn2 v23.2D, v8.2D, v0.2D -trn1 v9.2D, v6.2D, v5.2D -trn1 v13.2D, v8.2D, v0.2D -sqrdmulh v0.4S, v21.4S, v25.4S -mul v21.4S, v21.4S,v22.4S -mla v21.4S, v0.4S, v31.s[0] -sub v0.4s, v9.4s, v21.4s -add v9.4s, v9.4s, v21.4s -sqrdmulh v21.4S, v23.4S, v25.4S -mul v23.4S, v23.4S,v22.4S -mla v23.4S, v21.4S, v31.s[0] -sub v21.4s, v13.4s, v23.4s -add v13.4s, v13.4s, v23.4s -sqrdmulh v23.4S, v13.4S, v11.4S -mul v13.4S, v13.4S,v18.4S -mla v13.4S, v23.4S, v31.s[0] -sub v23.4s, v9.4s, v13.4s -add v9.4s, v9.4s, v13.4s -sqrdmulh v13.4S, v21.4S, v28.4S -mul v21.4S, v21.4S,v1.4S -mla v21.4S, v13.4S, v31.s[0] -sub v13.4s, v0.4s, v21.4s -add v0.4s, v0.4s, v21.4s -str q9, [x0, #896] -str q23, [x0, #912] -str q0, [x0, #928] -str q13, [x0, #944] -ldr q13, [x17, #+2048] -ldr q0, [x17, #+2064] -ldr q23, [x17, #+2080] -ldr q9, [x17, #+2096] -ldr q21, [x17, #+2112] -ldr q8, [x17, #+2128] -ldr q5, [x17, #+2144] -ldr q6, [x17, #+2160] -ldr q28, [x0, #992] -ldr q1, [x0, #1008] -ldr q11, [x0, #960] -ldr q18, [x0, #976] -sqrdmulh v25.4S, v28.4S, v0.s[0] -mul v28.4S, v28.4S,v13.s[0] -mla v28.4S, v25.4S, v31.s[0] -sub v25.4s, v11.4s, v28.4s -add v11.4s, v11.4s, v28.4s -sqrdmulh v28.4S, v1.4S, v0.s[0] -mul v1.4S, v1.4S,v13.s[0] -mla v1.4S, v28.4S, v31.s[0] -sub v28.4s, v18.4s, v1.4s -add v18.4s, v18.4s, v1.4s -sqrdmulh v1.4S, v18.4S, v0.s[1] -mul v18.4S, v18.4S,v13.s[1] -mla v18.4S, v1.4S, v31.s[0] -sub v1.4s, v11.4s, v18.4s -add v11.4s, v11.4s, v18.4s -sqrdmulh v18.4S, v28.4S, v0.s[2] -mul v28.4S, v28.4S,v13.s[2] -mla v28.4S, v18.4S, v31.s[0] -sub v18.4s, v25.4s, v28.4s -add v25.4s, v25.4s, v28.4s -trn1 v28.4S, v11.4S, v1.4S -trn2 v22.4S, v11.4S, v1.4S -trn1 v15.4S, v25.4S, v18.4S -trn2 v24.4S, v25.4S, v18.4S -trn2 v25.2D, v28.2D, v15.2D -trn2 v18.2D, v22.2D, v24.2D -trn1 v11.2D, v28.2D, v15.2D -trn1 v1.2D, v22.2D, v24.2D -sqrdmulh v24.4S, v25.4S, v9.4S -mul v25.4S, v25.4S,v23.4S -mla v25.4S, v24.4S, v31.s[0] -sub v24.4s, v11.4s, v25.4s -add v11.4s, v11.4s, v25.4s -sqrdmulh v25.4S, v18.4S, v9.4S -mul v18.4S, v18.4S,v23.4S -mla v18.4S, v25.4S, v31.s[0] -sub v25.4s, v1.4s, v18.4s -add v1.4s, v1.4s, v18.4s -sqrdmulh v18.4S, v1.4S, v8.4S -mul v1.4S, v1.4S,v21.4S -mla v1.4S, v18.4S, v31.s[0] -sub v18.4s, v11.4s, v1.4s -add v11.4s, v11.4s, v1.4s -sqrdmulh v1.4S, v25.4S, v6.4S -mul v25.4S, v25.4S,v5.4S -mla v25.4S, v1.4S, v31.s[0] -sub v1.4s, v24.4s, v25.4s -add v24.4s, v24.4s, v25.4s -str q11, [x0, #960] -str q18, [x0, #976] -str q24, [x0, #992] -str q1, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2456 -// Instruction count: 2452 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_1_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_1_0.s deleted file mode 100644 index d6c8d9d..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_1_0.s +++ /dev/null @@ -1,2422 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_1_0 -.global _ntt_u32_full_neon_asm_var_4_4_1_0 -ntt_u32_full_neon_asm_var_4_4_1_0: -_ntt_u32_full_neon_asm_var_4_4_1_0: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #800] -ldr q21, [x0, #864] -ldr q20, [x0, #928] -ldr q19, [x0, #992] -ldr q18, [x0, #288] -ldr q17, [x0, #352] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -ldr q2, [x0, #544] -ldr q1, [x0, #608] -ldr q0, [x0, #672] -ldr q15, [x0, #736] -ldr q14, [x0, #32] -ldr q13, [x0, #96] -ldr q12, [x0, #160] -ldr q11, [x0, #224] -sqrdmulh v10.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sqrdmulh v9.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -mla v22.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -mla v21.4S, v9.4S, v31.s[0] -sub v9.4s, v18.4s, v22.4s -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -mla v20.4S, v10.4S, v31.s[0] -sub v10.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -mla v19.4S, v22.4S, v31.s[0] -sub v22.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -mla v2.4S, v21.4S, v31.s[0] -sub v21.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v0.4S, v29.s[0] -mul v0.4S, v0.4S,v30.s[0] -mla v1.4S, v20.4S, v31.s[0] -sub v20.4s, v14.4s, v2.4s -add v14.4s, v14.4s, v2.4s -sqrdmulh v2.4S, v15.4S, v29.s[0] -mul v15.4S, v15.4S,v30.s[0] -mla v0.4S, v19.4S, v31.s[0] -sub v19.4s, v13.4s, v1.4s -add v13.4s, v13.4s, v1.4s -sqrdmulh v1.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -mla v15.4S, v2.4S, v31.s[0] -sub v2.4s, v12.4s, v0.4s -add v12.4s, v12.4s, v0.4s -sqrdmulh v0.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -mla v16.4S, v1.4S, v31.s[0] -sub v1.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -mla v3.4S, v0.4S, v31.s[0] -sub v0.4s, v12.4s, v16.4s -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -mla v18.4S, v15.4S, v31.s[0] -sub v15.4s, v11.4s, v3.4s -add v11.4s, v11.4s, v3.4s -sqrdmulh v3.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v14.4s, v18.4s -add v14.4s, v14.4s, v18.4s -sqrdmulh v18.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -mla v22.4S, v3.4S, v31.s[0] -sub v3.4s, v13.4s, v17.4s -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -mla v21.4S, v18.4S, v31.s[0] -sub v18.4s, v2.4s, v22.4s -add v2.4s, v2.4s, v22.4s -sqrdmulh v22.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -mla v9.4S, v17.4S, v31.s[0] -sub v17.4s, v1.4s, v21.4s -add v1.4s, v1.4s, v21.4s -sqrdmulh v21.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -mla v10.4S, v22.4S, v31.s[0] -sub v22.4s, v20.4s, v9.4s -add v20.4s, v20.4s, v9.4s -sqrdmulh v9.4S, v11.4S, v27.s[0] -mul v11.4S, v11.4S,v28.s[0] -mla v12.4S, v21.4S, v31.s[0] -sub v21.4s, v19.4s, v10.4s -add v19.4s, v19.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -mla v11.4S, v9.4S, v31.s[0] -sub v9.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -sqrdmulh v12.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -mla v0.4S, v10.4S, v31.s[0] -sub v10.4s, v13.4s, v11.4s -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -mla v15.4S, v12.4S, v31.s[0] -sub v12.4s, v16.4s, v0.4s -add v16.4s, v16.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -mla v2.4S, v11.4S, v31.s[0] -sub v11.4s, v3.4s, v15.4s -add v3.4s, v3.4s, v15.4s -sqrdmulh v15.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -mla v1.4S, v0.4S, v31.s[0] -sub v0.4s, v20.4s, v2.4s -add v20.4s, v20.4s, v2.4s -sqrdmulh v2.4S, v17.4S, v27.s[3] -mul v17.4S, v17.4S,v28.s[3] -mla v18.4S, v15.4S, v31.s[0] -sub v15.4s, v19.4s, v1.4s -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v13.4S, v25.s[0] -mul v13.4S, v13.4S,v26.s[0] -mla v17.4S, v2.4S, v31.s[0] -sub v2.4s, v22.4s, v18.4s -add v22.4s, v22.4s, v18.4s -sqrdmulh v18.4S, v10.4S, v25.s[1] -mul v10.4S, v10.4S,v26.s[1] -mla v13.4S, v1.4S, v31.s[0] -sub v1.4s, v21.4s, v17.4s -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v3.4S, v25.s[2] -mul v3.4S, v3.4S,v26.s[2] -mla v10.4S, v18.4S, v31.s[0] -sub v18.4s, v14.4s, v13.4s -add v14.4s, v14.4s, v13.4s -sqrdmulh v13.4S, v11.4S, v25.s[3] -mul v11.4S, v11.4S,v26.s[3] -mla v3.4S, v17.4S, v31.s[0] -sub v17.4s, v9.4s, v10.4s -add v9.4s, v9.4s, v10.4s -sqrdmulh v10.4S, v19.4S, v23.s[0] -mul v19.4S, v19.4S,v24.s[0] -mla v11.4S, v13.4S, v31.s[0] -sub v13.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v15.4S, v23.s[1] -mul v15.4S, v15.4S,v24.s[1] -mla v19.4S, v10.4S, v31.s[0] -sub v10.4s, v12.4s, v11.4s -add v12.4s, v12.4s, v11.4s -sqrdmulh v11.4S, v21.4S, v23.s[2] -mul v21.4S, v21.4S,v24.s[2] -mla v15.4S, v3.4S, v31.s[0] -sub v3.4s, v20.4s, v19.4s -add v20.4s, v20.4s, v19.4s -sqrdmulh v19.4S, v1.4S, v23.s[3] -mul v1.4S, v1.4S,v24.s[3] -mla v21.4S, v11.4S, v31.s[0] -sub v11.4s, v0.4s, v15.4s -add v0.4s, v0.4s, v15.4s -mla v1.4S, v19.4S, v31.s[0] -sub v19.4s, v22.4s, v21.4s -add v22.4s, v22.4s, v21.4s -sub v21.4s, v2.4s, v1.4s -add v2.4s, v2.4s, v1.4s -str q14, [x0, #32] -str q18, [x0, #96] -str q9, [x0, #160] -str q17, [x0, #224] -str q16, [x0, #288] -str q13, [x0, #352] -str q12, [x0, #416] -str q10, [x0, #480] -str q20, [x0, #544] -str q3, [x0, #608] -str q0, [x0, #672] -str q11, [x0, #736] -str q22, [x0, #800] -str q19, [x0, #864] -str q2, [x0, #928] -str q21, [x0, #992] -ldr q21, [x0, #816] -ldr q2, [x0, #880] -ldr q19, [x0, #944] -ldr q22, [x0, #1008] -ldr q11, [x0, #304] -ldr q0, [x0, #368] -ldr q3, [x0, #432] -ldr q20, [x0, #496] -ldr q10, [x0, #560] -ldr q12, [x0, #624] -ldr q13, [x0, #688] -ldr q16, [x0, #752] -ldr q17, [x0, #48] -ldr q9, [x0, #112] -ldr q18, [x0, #176] -ldr q14, [x0, #240] -sqrdmulh v1.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -sqrdmulh v15.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -mla v21.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -mla v2.4S, v15.4S, v31.s[0] -sub v15.4s, v11.4s, v21.4s -add v11.4s, v11.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -mla v19.4S, v1.4S, v31.s[0] -sub v1.4s, v0.4s, v2.4s -add v0.4s, v0.4s, v2.4s -sqrdmulh v2.4S, v10.4S, v29.s[0] -mul v10.4S, v10.4S,v30.s[0] -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -mla v10.4S, v2.4S, v31.s[0] -sub v2.4s, v20.4s, v22.4s -add v20.4s, v20.4s, v22.4s -sqrdmulh v22.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -mla v12.4S, v19.4S, v31.s[0] -sub v19.4s, v17.4s, v10.4s -add v17.4s, v17.4s, v10.4s -sqrdmulh v10.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v9.4s, v12.4s -add v9.4s, v9.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -mla v16.4S, v10.4S, v31.s[0] -sub v10.4s, v18.4s, v13.4s -add v18.4s, v18.4s, v13.4s -sqrdmulh v13.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -mla v3.4S, v12.4S, v31.s[0] -sub v12.4s, v14.4s, v16.4s -add v14.4s, v14.4s, v16.4s -sqrdmulh v16.4S, v11.4S, v29.s[1] -mul v11.4S, v11.4S,v30.s[1] -mla v20.4S, v13.4S, v31.s[0] -sub v13.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v0.4S, v29.s[1] -mul v0.4S, v0.4S,v30.s[1] -mla v11.4S, v16.4S, v31.s[0] -sub v16.4s, v14.4s, v20.4s -add v14.4s, v14.4s, v20.4s -sqrdmulh v20.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -mla v0.4S, v3.4S, v31.s[0] -sub v3.4s, v17.4s, v11.4s -add v17.4s, v17.4s, v11.4s -sqrdmulh v11.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -mla v21.4S, v20.4S, v31.s[0] -sub v20.4s, v9.4s, v0.4s -add v9.4s, v9.4s, v0.4s -sqrdmulh v0.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -mla v2.4S, v11.4S, v31.s[0] -sub v11.4s, v10.4s, v21.4s -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -mla v15.4S, v0.4S, v31.s[0] -sub v0.4s, v12.4s, v2.4s -add v12.4s, v12.4s, v2.4s -sqrdmulh v2.4S, v18.4S, v27.s[0] -mul v18.4S, v18.4S,v28.s[0] -mla v1.4S, v21.4S, v31.s[0] -sub v21.4s, v19.4s, v15.4s -add v19.4s, v19.4s, v15.4s -sqrdmulh v15.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -mla v18.4S, v2.4S, v31.s[0] -sub v2.4s, v22.4s, v1.4s -add v22.4s, v22.4s, v1.4s -sqrdmulh v1.4S, v13.4S, v27.s[1] -mul v13.4S, v13.4S,v28.s[1] -mla v14.4S, v15.4S, v31.s[0] -sub v15.4s, v17.4s, v18.4s -add v17.4s, v17.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v27.s[1] -mul v16.4S, v16.4S,v28.s[1] -mla v13.4S, v1.4S, v31.s[0] -sub v1.4s, v9.4s, v14.4s -add v9.4s, v9.4s, v14.4s -sqrdmulh v14.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -mla v16.4S, v18.4S, v31.s[0] -sub v18.4s, v3.4s, v13.4s -add v3.4s, v3.4s, v13.4s -sqrdmulh v13.4S, v12.4S, v27.s[2] -mul v12.4S, v12.4S,v28.s[2] -mla v10.4S, v14.4S, v31.s[0] -sub v14.4s, v20.4s, v16.4s -add v20.4s, v20.4s, v16.4s -sqrdmulh v16.4S, v11.4S, v27.s[3] -mul v11.4S, v11.4S,v28.s[3] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v19.4s, v10.4s -add v19.4s, v19.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v27.s[3] -mul v0.4S, v0.4S,v28.s[3] -mla v11.4S, v16.4S, v31.s[0] -sub v16.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v9.4S, v25.s[0] -mul v9.4S, v9.4S,v26.s[0] -mla v0.4S, v10.4S, v31.s[0] -sub v10.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v1.4S, v25.s[1] -mul v1.4S, v1.4S,v26.s[1] -mla v9.4S, v12.4S, v31.s[0] -sub v12.4s, v2.4s, v0.4s -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v20.4S, v25.s[2] -mul v20.4S, v20.4S,v26.s[2] -mla v1.4S, v11.4S, v31.s[0] -sub v11.4s, v17.4s, v9.4s -add v17.4s, v17.4s, v9.4s -sqrdmulh v9.4S, v14.4S, v25.s[3] -mul v14.4S, v14.4S,v26.s[3] -mla v20.4S, v0.4S, v31.s[0] -sub v0.4s, v15.4s, v1.4s -add v15.4s, v15.4s, v1.4s -sqrdmulh v1.4S, v22.4S, v23.s[0] -mul v22.4S, v22.4S,v24.s[0] -mla v14.4S, v9.4S, v31.s[0] -sub v9.4s, v3.4s, v20.4s -add v3.4s, v3.4s, v20.4s -sqrdmulh v20.4S, v16.4S, v23.s[1] -mul v16.4S, v16.4S,v24.s[1] -mla v22.4S, v1.4S, v31.s[0] -sub v1.4s, v18.4s, v14.4s -add v18.4s, v18.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v23.s[2] -mul v2.4S, v2.4S,v24.s[2] -mla v16.4S, v20.4S, v31.s[0] -sub v20.4s, v19.4s, v22.4s -add v19.4s, v19.4s, v22.4s -sqrdmulh v22.4S, v12.4S, v23.s[3] -mul v12.4S, v12.4S,v24.s[3] -mla v2.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v16.4s -add v13.4s, v13.4s, v16.4s -mla v12.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v2.4s -add v21.4s, v21.4s, v2.4s -sub v2.4s, v10.4s, v12.4s -add v10.4s, v10.4s, v12.4s -str q17, [x0, #48] -str q11, [x0, #112] -str q15, [x0, #176] -str q0, [x0, #240] -str q3, [x0, #304] -str q9, [x0, #368] -str q18, [x0, #432] -str q1, [x0, #496] -str q19, [x0, #560] -str q20, [x0, #624] -str q13, [x0, #688] -str q14, [x0, #752] -str q21, [x0, #816] -str q22, [x0, #880] -str q10, [x0, #944] -str q2, [x0, #1008] -ldr q2, [x0, #768] -ldr q10, [x0, #832] -ldr q22, [x0, #896] -ldr q21, [x0, #960] -ldr q14, [x0, #256] -ldr q13, [x0, #320] -ldr q20, [x0, #384] -ldr q19, [x0, #448] -ldr q1, [x0, #512] -ldr q18, [x0, #576] -ldr q9, [x0, #640] -ldr q3, [x0, #704] -ldr q0, [x0, #0] -ldr q15, [x0, #64] -ldr q11, [x0, #128] -ldr q17, [x0, #192] -sqrdmulh v12.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sqrdmulh v16.4S, v10.4S, v29.s[0] -mul v10.4S, v10.4S,v30.s[0] -mla v2.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -mla v10.4S, v16.4S, v31.s[0] -sub v16.4s, v14.4s, v2.4s -add v14.4s, v14.4s, v2.4s -sqrdmulh v2.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -mla v22.4S, v12.4S, v31.s[0] -sub v12.4s, v13.4s, v10.4s -add v13.4s, v13.4s, v10.4s -sqrdmulh v10.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -mla v21.4S, v2.4S, v31.s[0] -sub v2.4s, v20.4s, v22.4s -add v20.4s, v20.4s, v22.4s -sqrdmulh v22.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -mla v1.4S, v10.4S, v31.s[0] -sub v10.4s, v19.4s, v21.4s -add v19.4s, v19.4s, v21.4s -sqrdmulh v21.4S, v9.4S, v29.s[0] -mul v9.4S, v9.4S,v30.s[0] -mla v18.4S, v22.4S, v31.s[0] -sub v22.4s, v0.4s, v1.4s -add v0.4s, v0.4s, v1.4s -sqrdmulh v1.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -mla v9.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v18.4s -add v15.4s, v15.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -mla v3.4S, v1.4S, v31.s[0] -sub v1.4s, v11.4s, v9.4s -add v11.4s, v11.4s, v9.4s -sqrdmulh v9.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -mla v20.4S, v18.4S, v31.s[0] -sub v18.4s, v17.4s, v3.4s -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -mla v19.4S, v9.4S, v31.s[0] -sub v9.4s, v11.4s, v20.4s -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -mla v14.4S, v3.4S, v31.s[0] -sub v3.4s, v17.4s, v19.4s -add v17.4s, v17.4s, v19.4s -sqrdmulh v19.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -mla v13.4S, v20.4S, v31.s[0] -sub v20.4s, v0.4s, v14.4s -add v0.4s, v0.4s, v14.4s -sqrdmulh v14.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -mla v2.4S, v19.4S, v31.s[0] -sub v19.4s, v15.4s, v13.4s -add v15.4s, v15.4s, v13.4s -sqrdmulh v13.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -mla v10.4S, v14.4S, v31.s[0] -sub v14.4s, v1.4s, v2.4s -add v1.4s, v1.4s, v2.4s -sqrdmulh v2.4S, v12.4S, v29.s[2] -mul v12.4S, v12.4S,v30.s[2] -mla v16.4S, v13.4S, v31.s[0] -sub v13.4s, v18.4s, v10.4s -add v18.4s, v18.4s, v10.4s -sqrdmulh v10.4S, v11.4S, v27.s[0] -mul v11.4S, v11.4S,v28.s[0] -mla v12.4S, v2.4S, v31.s[0] -sub v2.4s, v22.4s, v16.4s -add v22.4s, v22.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v27.s[0] -mul v17.4S, v17.4S,v28.s[0] -mla v11.4S, v10.4S, v31.s[0] -sub v10.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v9.4S, v27.s[1] -mul v9.4S, v9.4S,v28.s[1] -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v0.4s, v11.4s -add v0.4s, v0.4s, v11.4s -sqrdmulh v11.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -mla v9.4S, v12.4S, v31.s[0] -sub v12.4s, v15.4s, v17.4s -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -mla v3.4S, v11.4S, v31.s[0] -sub v11.4s, v20.4s, v9.4s -add v20.4s, v20.4s, v9.4s -sqrdmulh v9.4S, v18.4S, v27.s[2] -mul v18.4S, v18.4S,v28.s[2] -mla v1.4S, v17.4S, v31.s[0] -sub v17.4s, v19.4s, v3.4s -add v19.4s, v19.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v27.s[3] -mul v14.4S, v14.4S,v28.s[3] -mla v18.4S, v9.4S, v31.s[0] -sub v9.4s, v22.4s, v1.4s -add v22.4s, v22.4s, v1.4s -sqrdmulh v1.4S, v13.4S, v27.s[3] -mul v13.4S, v13.4S,v28.s[3] -mla v14.4S, v3.4S, v31.s[0] -sub v3.4s, v21.4s, v18.4s -add v21.4s, v21.4s, v18.4s -sqrdmulh v18.4S, v15.4S, v25.s[0] -mul v15.4S, v15.4S,v26.s[0] -mla v13.4S, v1.4S, v31.s[0] -sub v1.4s, v2.4s, v14.4s -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v12.4S, v25.s[1] -mul v12.4S, v12.4S,v26.s[1] -mla v15.4S, v18.4S, v31.s[0] -sub v18.4s, v10.4s, v13.4s -add v10.4s, v10.4s, v13.4s -sqrdmulh v13.4S, v19.4S, v25.s[2] -mul v19.4S, v19.4S,v26.s[2] -mla v12.4S, v14.4S, v31.s[0] -sub v14.4s, v0.4s, v15.4s -add v0.4s, v0.4s, v15.4s -sqrdmulh v15.4S, v17.4S, v25.s[3] -mul v17.4S, v17.4S,v26.s[3] -mla v19.4S, v13.4S, v31.s[0] -sub v13.4s, v16.4s, v12.4s -add v16.4s, v16.4s, v12.4s -sqrdmulh v12.4S, v21.4S, v23.s[0] -mul v21.4S, v21.4S,v24.s[0] -mla v17.4S, v15.4S, v31.s[0] -sub v15.4s, v20.4s, v19.4s -add v20.4s, v20.4s, v19.4s -sqrdmulh v19.4S, v3.4S, v23.s[1] -mul v3.4S, v3.4S,v24.s[1] -mla v21.4S, v12.4S, v31.s[0] -sub v12.4s, v11.4s, v17.4s -add v11.4s, v11.4s, v17.4s -sqrdmulh v17.4S, v10.4S, v23.s[2] -mul v10.4S, v10.4S,v24.s[2] -mla v3.4S, v19.4S, v31.s[0] -sub v19.4s, v22.4s, v21.4s -add v22.4s, v22.4s, v21.4s -sqrdmulh v21.4S, v18.4S, v23.s[3] -mul v18.4S, v18.4S,v24.s[3] -mla v10.4S, v17.4S, v31.s[0] -sub v17.4s, v9.4s, v3.4s -add v9.4s, v9.4s, v3.4s -mla v18.4S, v21.4S, v31.s[0] -sub v21.4s, v2.4s, v10.4s -add v2.4s, v2.4s, v10.4s -sub v10.4s, v1.4s, v18.4s -add v1.4s, v1.4s, v18.4s -str q0, [x0, #0] -str q14, [x0, #64] -str q16, [x0, #128] -str q13, [x0, #192] -str q20, [x0, #256] -str q15, [x0, #320] -str q11, [x0, #384] -str q12, [x0, #448] -str q22, [x0, #512] -str q19, [x0, #576] -str q9, [x0, #640] -str q17, [x0, #704] -str q2, [x0, #768] -str q21, [x0, #832] -str q1, [x0, #896] -str q10, [x0, #960] -ldr q10, [x0, #784] -ldr q1, [x0, #848] -ldr q21, [x0, #912] -ldr q2, [x0, #976] -ldr q17, [x0, #272] -ldr q9, [x0, #336] -ldr q19, [x0, #400] -ldr q22, [x0, #464] -ldr q12, [x0, #528] -ldr q11, [x0, #592] -ldr q15, [x0, #656] -ldr q20, [x0, #720] -ldr q13, [x0, #16] -ldr q16, [x0, #80] -ldr q14, [x0, #144] -ldr q0, [x0, #208] -sqrdmulh v18.4S, v10.4S, v29.s[0] -mul v10.4S, v10.4S,v30.s[0] -sqrdmulh v3.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -mla v10.4S, v18.4S, v31.s[0] -sqrdmulh v18.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -mla v1.4S, v3.4S, v31.s[0] -sub v3.4s, v17.4s, v10.4s -add v17.4s, v17.4s, v10.4s -sqrdmulh v10.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -mla v21.4S, v18.4S, v31.s[0] -sub v18.4s, v9.4s, v1.4s -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -mla v2.4S, v10.4S, v31.s[0] -sub v10.4s, v19.4s, v21.4s -add v19.4s, v19.4s, v21.4s -sqrdmulh v21.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -mla v12.4S, v1.4S, v31.s[0] -sub v1.4s, v22.4s, v2.4s -add v22.4s, v22.4s, v2.4s -sqrdmulh v2.4S, v15.4S, v29.s[0] -mul v15.4S, v15.4S,v30.s[0] -mla v11.4S, v21.4S, v31.s[0] -sub v21.4s, v13.4s, v12.4s -add v13.4s, v13.4s, v12.4s -sqrdmulh v12.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -mla v15.4S, v2.4S, v31.s[0] -sub v2.4s, v16.4s, v11.4s -add v16.4s, v16.4s, v11.4s -sqrdmulh v11.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -mla v20.4S, v12.4S, v31.s[0] -sub v12.4s, v14.4s, v15.4s -add v14.4s, v14.4s, v15.4s -sqrdmulh v15.4S, v22.4S, v29.s[1] -mul v22.4S, v22.4S,v30.s[1] -mla v19.4S, v11.4S, v31.s[0] -sub v11.4s, v0.4s, v20.4s -add v0.4s, v0.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -mla v22.4S, v15.4S, v31.s[0] -sub v15.4s, v14.4s, v19.4s -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v9.4S, v29.s[1] -mul v9.4S, v9.4S,v30.s[1] -mla v17.4S, v20.4S, v31.s[0] -sub v20.4s, v0.4s, v22.4s -add v0.4s, v0.4s, v22.4s -sqrdmulh v22.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -mla v9.4S, v19.4S, v31.s[0] -sub v19.4s, v13.4s, v17.4s -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -mla v10.4S, v22.4S, v31.s[0] -sub v22.4s, v16.4s, v9.4s -add v16.4s, v16.4s, v9.4s -sqrdmulh v9.4S, v3.4S, v29.s[2] -mul v3.4S, v3.4S,v30.s[2] -mla v1.4S, v17.4S, v31.s[0] -sub v17.4s, v12.4s, v10.4s -add v12.4s, v12.4s, v10.4s -sqrdmulh v10.4S, v18.4S, v29.s[2] -mul v18.4S, v18.4S,v30.s[2] -mla v3.4S, v9.4S, v31.s[0] -sub v9.4s, v11.4s, v1.4s -add v11.4s, v11.4s, v1.4s -sqrdmulh v1.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -mla v18.4S, v10.4S, v31.s[0] -sub v10.4s, v21.4s, v3.4s -add v21.4s, v21.4s, v3.4s -sqrdmulh v3.4S, v0.4S, v27.s[0] -mul v0.4S, v0.4S,v28.s[0] -mla v14.4S, v1.4S, v31.s[0] -sub v1.4s, v2.4s, v18.4s -add v2.4s, v2.4s, v18.4s -sqrdmulh v18.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -mla v0.4S, v3.4S, v31.s[0] -sub v3.4s, v13.4s, v14.4s -add v13.4s, v13.4s, v14.4s -sqrdmulh v14.4S, v20.4S, v27.s[1] -mul v20.4S, v20.4S,v28.s[1] -mla v15.4S, v18.4S, v31.s[0] -sub v18.4s, v16.4s, v0.4s -add v16.4s, v16.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[2] -mul v12.4S, v12.4S,v28.s[2] -mla v20.4S, v14.4S, v31.s[0] -sub v14.4s, v19.4s, v15.4s -add v19.4s, v19.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v27.s[2] -mul v11.4S, v11.4S,v28.s[2] -mla v12.4S, v0.4S, v31.s[0] -sub v0.4s, v22.4s, v20.4s -add v22.4s, v22.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v27.s[3] -mul v17.4S, v17.4S,v28.s[3] -mla v11.4S, v15.4S, v31.s[0] -sub v15.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v9.4S, v27.s[3] -mul v9.4S, v9.4S,v28.s[3] -mla v17.4S, v20.4S, v31.s[0] -sub v20.4s, v2.4s, v11.4s -add v2.4s, v2.4s, v11.4s -sqrdmulh v11.4S, v16.4S, v25.s[0] -mul v16.4S, v16.4S,v26.s[0] -mla v9.4S, v12.4S, v31.s[0] -sub v12.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -sqrdmulh v17.4S, v18.4S, v25.s[1] -mul v18.4S, v18.4S,v26.s[1] -mla v16.4S, v11.4S, v31.s[0] -sub v11.4s, v1.4s, v9.4s -add v1.4s, v1.4s, v9.4s -sqrdmulh v9.4S, v22.4S, v25.s[2] -mul v22.4S, v22.4S,v26.s[2] -mla v18.4S, v17.4S, v31.s[0] -sub v17.4s, v13.4s, v16.4s -add v13.4s, v13.4s, v16.4s -sqrdmulh v16.4S, v0.4S, v25.s[3] -mul v0.4S, v0.4S,v26.s[3] -mla v22.4S, v9.4S, v31.s[0] -sub v9.4s, v3.4s, v18.4s -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v23.s[0] -mul v2.4S, v2.4S,v24.s[0] -mla v0.4S, v16.4S, v31.s[0] -sub v16.4s, v19.4s, v22.4s -add v19.4s, v19.4s, v22.4s -sqrdmulh v22.4S, v20.4S, v23.s[1] -mul v20.4S, v20.4S,v24.s[1] -mla v2.4S, v18.4S, v31.s[0] -sub v18.4s, v14.4s, v0.4s -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v23.s[2] -mul v1.4S, v1.4S,v24.s[2] -mla v20.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v2.4s -add v21.4s, v21.4s, v2.4s -sqrdmulh v2.4S, v11.4S, v23.s[3] -mul v11.4S, v11.4S,v24.s[3] -mla v1.4S, v0.4S, v31.s[0] -sub v0.4s, v15.4s, v20.4s -add v15.4s, v15.4s, v20.4s -mla v11.4S, v2.4S, v31.s[0] -sub v2.4s, v10.4s, v1.4s -add v10.4s, v10.4s, v1.4s -sub v1.4s, v12.4s, v11.4s -add v12.4s, v12.4s, v11.4s -str q13, [x0, #16] -str q17, [x0, #80] -str q3, [x0, #144] -str q9, [x0, #208] -str q19, [x0, #272] -str q16, [x0, #336] -str q14, [x0, #400] -str q18, [x0, #464] -str q21, [x0, #528] -str q22, [x0, #592] -str q15, [x0, #656] -str q0, [x0, #720] -str q10, [x0, #784] -str q2, [x0, #848] -str q12, [x0, #912] -str q1, [x0, #976] -ldr q4, [x17, #+128] -ldr q5, [x17, #+144] -ldr q6, [x17, #+160] -ldr q7, [x17, #+176] -ldr q8, [x17, #+192] -ldr q20, [x17, #+208] -ldr q11, [x17, #+224] -ldr q13, [x17, #+240] -ldr q17, [x0, #32] -ldr q3, [x0, #48] -ldr q9, [x0, #0] -ldr q19, [x0, #16] -sqrdmulh v16.4S, v17.4S, v5.s[0] -mul v17.4S, v17.4S,v4.s[0] -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v9.4s, v17.4s -add v9.4s, v9.4s, v17.4s -sqrdmulh v17.4S, v3.4S, v5.s[0] -mul v3.4S, v3.4S,v4.s[0] -mla v3.4S, v17.4S, v31.s[0] -sub v17.4s, v19.4s, v3.4s -add v19.4s, v19.4s, v3.4s -sqrdmulh v3.4S, v19.4S, v5.s[1] -mul v19.4S, v19.4S,v4.s[1] -mla v19.4S, v3.4S, v31.s[0] -sub v3.4s, v9.4s, v19.4s -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v17.4S, v5.s[2] -mul v17.4S, v17.4S,v4.s[2] -mla v17.4S, v19.4S, v31.s[0] -sub v19.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -trn1 v17.4S, v9.4S, v3.4S -trn2 v14.4S, v9.4S, v3.4S -trn1 v18.4S, v16.4S, v19.4S -trn2 v21.4S, v16.4S, v19.4S -trn2 v16.2D, v17.2D, v18.2D -trn2 v19.2D, v14.2D, v21.2D -trn1 v9.2D, v17.2D, v18.2D -trn1 v3.2D, v14.2D, v21.2D -sqrdmulh v21.4S, v16.4S, v7.4S -mul v16.4S, v16.4S,v6.4S -mla v16.4S, v21.4S, v31.s[0] -sub v21.4s, v9.4s, v16.4s -add v9.4s, v9.4s, v16.4s -sqrdmulh v16.4S, v19.4S, v7.4S -mul v19.4S, v19.4S,v6.4S -mla v19.4S, v16.4S, v31.s[0] -sub v16.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v3.4S, v20.4S -mul v3.4S, v3.4S,v8.4S -mla v3.4S, v19.4S, v31.s[0] -sub v19.4s, v9.4s, v3.4s -add v9.4s, v9.4s, v3.4s -sqrdmulh v3.4S, v16.4S, v13.4S -mul v16.4S, v16.4S,v11.4S -mla v16.4S, v3.4S, v31.s[0] -sub v3.4s, v21.4s, v16.4s -add v21.4s, v21.4s, v16.4s -str q9, [x0, #0] -str q19, [x0, #16] -str q21, [x0, #32] -str q3, [x0, #48] -ldr q3, [x17, #+256] -ldr q21, [x17, #+272] -ldr q19, [x17, #+288] -ldr q9, [x17, #+304] -ldr q16, [x17, #+320] -ldr q14, [x17, #+336] -ldr q18, [x17, #+352] -ldr q17, [x17, #+368] -ldr q13, [x0, #96] -ldr q11, [x0, #112] -ldr q20, [x0, #64] -ldr q8, [x0, #80] -sqrdmulh v7.4S, v13.4S, v21.s[0] -mul v13.4S, v13.4S,v3.s[0] -mla v13.4S, v7.4S, v31.s[0] -sub v7.4s, v20.4s, v13.4s -add v20.4s, v20.4s, v13.4s -sqrdmulh v13.4S, v11.4S, v21.s[0] -mul v11.4S, v11.4S,v3.s[0] -mla v11.4S, v13.4S, v31.s[0] -sub v13.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -sqrdmulh v11.4S, v8.4S, v21.s[1] -mul v8.4S, v8.4S,v3.s[1] -mla v8.4S, v11.4S, v31.s[0] -sub v11.4s, v20.4s, v8.4s -add v20.4s, v20.4s, v8.4s -sqrdmulh v8.4S, v13.4S, v21.s[2] -mul v13.4S, v13.4S,v3.s[2] -mla v13.4S, v8.4S, v31.s[0] -sub v8.4s, v7.4s, v13.4s -add v7.4s, v7.4s, v13.4s -trn1 v13.4S, v20.4S, v11.4S -trn2 v6.4S, v20.4S, v11.4S -trn1 v5.4S, v7.4S, v8.4S -trn2 v4.4S, v7.4S, v8.4S -trn2 v7.2D, v13.2D, v5.2D -trn2 v8.2D, v6.2D, v4.2D -trn1 v20.2D, v13.2D, v5.2D -trn1 v11.2D, v6.2D, v4.2D -sqrdmulh v4.4S, v7.4S, v9.4S -mul v7.4S, v7.4S,v19.4S -mla v7.4S, v4.4S, v31.s[0] -sub v4.4s, v20.4s, v7.4s -add v20.4s, v20.4s, v7.4s -sqrdmulh v7.4S, v8.4S, v9.4S -mul v8.4S, v8.4S,v19.4S -mla v8.4S, v7.4S, v31.s[0] -sub v7.4s, v11.4s, v8.4s -add v11.4s, v11.4s, v8.4s -sqrdmulh v8.4S, v11.4S, v14.4S -mul v11.4S, v11.4S,v16.4S -mla v11.4S, v8.4S, v31.s[0] -sub v8.4s, v20.4s, v11.4s -add v20.4s, v20.4s, v11.4s -sqrdmulh v11.4S, v7.4S, v17.4S -mul v7.4S, v7.4S,v18.4S -mla v7.4S, v11.4S, v31.s[0] -sub v11.4s, v4.4s, v7.4s -add v4.4s, v4.4s, v7.4s -str q20, [x0, #64] -str q8, [x0, #80] -str q4, [x0, #96] -str q11, [x0, #112] -ldr q11, [x17, #+384] -ldr q4, [x17, #+400] -ldr q8, [x17, #+416] -ldr q20, [x17, #+432] -ldr q7, [x17, #+448] -ldr q6, [x17, #+464] -ldr q5, [x17, #+480] -ldr q13, [x17, #+496] -ldr q17, [x0, #160] -ldr q18, [x0, #176] -ldr q14, [x0, #128] -ldr q16, [x0, #144] -sqrdmulh v9.4S, v17.4S, v4.s[0] -mul v17.4S, v17.4S,v11.s[0] -mla v17.4S, v9.4S, v31.s[0] -sub v9.4s, v14.4s, v17.4s -add v14.4s, v14.4s, v17.4s -sqrdmulh v17.4S, v18.4S, v4.s[0] -mul v18.4S, v18.4S,v11.s[0] -mla v18.4S, v17.4S, v31.s[0] -sub v17.4s, v16.4s, v18.4s -add v16.4s, v16.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v4.s[1] -mul v16.4S, v16.4S,v11.s[1] -mla v16.4S, v18.4S, v31.s[0] -sub v18.4s, v14.4s, v16.4s -add v14.4s, v14.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v4.s[2] -mul v17.4S, v17.4S,v11.s[2] -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v9.4s, v17.4s -add v9.4s, v9.4s, v17.4s -trn1 v17.4S, v14.4S, v18.4S -trn2 v19.4S, v14.4S, v18.4S -trn1 v21.4S, v9.4S, v16.4S -trn2 v3.4S, v9.4S, v16.4S -trn2 v9.2D, v17.2D, v21.2D -trn2 v16.2D, v19.2D, v3.2D -trn1 v14.2D, v17.2D, v21.2D -trn1 v18.2D, v19.2D, v3.2D -sqrdmulh v3.4S, v9.4S, v20.4S -mul v9.4S, v9.4S,v8.4S -mla v9.4S, v3.4S, v31.s[0] -sub v3.4s, v14.4s, v9.4s -add v14.4s, v14.4s, v9.4s -sqrdmulh v9.4S, v16.4S, v20.4S -mul v16.4S, v16.4S,v8.4S -mla v16.4S, v9.4S, v31.s[0] -sub v9.4s, v18.4s, v16.4s -add v18.4s, v18.4s, v16.4s -sqrdmulh v16.4S, v18.4S, v6.4S -mul v18.4S, v18.4S,v7.4S -mla v18.4S, v16.4S, v31.s[0] -sub v16.4s, v14.4s, v18.4s -add v14.4s, v14.4s, v18.4s -sqrdmulh v18.4S, v9.4S, v13.4S -mul v9.4S, v9.4S,v5.4S -mla v9.4S, v18.4S, v31.s[0] -sub v18.4s, v3.4s, v9.4s -add v3.4s, v3.4s, v9.4s -str q14, [x0, #128] -str q16, [x0, #144] -str q3, [x0, #160] -str q18, [x0, #176] -ldr q18, [x17, #+512] -ldr q3, [x17, #+528] -ldr q16, [x17, #+544] -ldr q14, [x17, #+560] -ldr q9, [x17, #+576] -ldr q19, [x17, #+592] -ldr q21, [x17, #+608] -ldr q17, [x17, #+624] -ldr q13, [x0, #224] -ldr q5, [x0, #240] -ldr q6, [x0, #192] -ldr q7, [x0, #208] -sqrdmulh v20.4S, v13.4S, v3.s[0] -mul v13.4S, v13.4S,v18.s[0] -mla v13.4S, v20.4S, v31.s[0] -sub v20.4s, v6.4s, v13.4s -add v6.4s, v6.4s, v13.4s -sqrdmulh v13.4S, v5.4S, v3.s[0] -mul v5.4S, v5.4S,v18.s[0] -mla v5.4S, v13.4S, v31.s[0] -sub v13.4s, v7.4s, v5.4s -add v7.4s, v7.4s, v5.4s -sqrdmulh v5.4S, v7.4S, v3.s[1] -mul v7.4S, v7.4S,v18.s[1] -mla v7.4S, v5.4S, v31.s[0] -sub v5.4s, v6.4s, v7.4s -add v6.4s, v6.4s, v7.4s -sqrdmulh v7.4S, v13.4S, v3.s[2] -mul v13.4S, v13.4S,v18.s[2] -mla v13.4S, v7.4S, v31.s[0] -sub v7.4s, v20.4s, v13.4s -add v20.4s, v20.4s, v13.4s -trn1 v13.4S, v6.4S, v5.4S -trn2 v8.4S, v6.4S, v5.4S -trn1 v4.4S, v20.4S, v7.4S -trn2 v11.4S, v20.4S, v7.4S -trn2 v20.2D, v13.2D, v4.2D -trn2 v7.2D, v8.2D, v11.2D -trn1 v6.2D, v13.2D, v4.2D -trn1 v5.2D, v8.2D, v11.2D -sqrdmulh v11.4S, v20.4S, v14.4S -mul v20.4S, v20.4S,v16.4S -mla v20.4S, v11.4S, v31.s[0] -sub v11.4s, v6.4s, v20.4s -add v6.4s, v6.4s, v20.4s -sqrdmulh v20.4S, v7.4S, v14.4S -mul v7.4S, v7.4S,v16.4S -mla v7.4S, v20.4S, v31.s[0] -sub v20.4s, v5.4s, v7.4s -add v5.4s, v5.4s, v7.4s -sqrdmulh v7.4S, v5.4S, v19.4S -mul v5.4S, v5.4S,v9.4S -mla v5.4S, v7.4S, v31.s[0] -sub v7.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v20.4S, v17.4S -mul v20.4S, v20.4S,v21.4S -mla v20.4S, v5.4S, v31.s[0] -sub v5.4s, v11.4s, v20.4s -add v11.4s, v11.4s, v20.4s -str q6, [x0, #192] -str q7, [x0, #208] -str q11, [x0, #224] -str q5, [x0, #240] -ldr q5, [x17, #+640] -ldr q11, [x17, #+656] -ldr q7, [x17, #+672] -ldr q6, [x17, #+688] -ldr q20, [x17, #+704] -ldr q8, [x17, #+720] -ldr q4, [x17, #+736] -ldr q13, [x17, #+752] -ldr q17, [x0, #288] -ldr q21, [x0, #304] -ldr q19, [x0, #256] -ldr q9, [x0, #272] -sqrdmulh v14.4S, v17.4S, v11.s[0] -mul v17.4S, v17.4S,v5.s[0] -mla v17.4S, v14.4S, v31.s[0] -sub v14.4s, v19.4s, v17.4s -add v19.4s, v19.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v11.s[0] -mul v21.4S, v21.4S,v5.s[0] -mla v21.4S, v17.4S, v31.s[0] -sub v17.4s, v9.4s, v21.4s -add v9.4s, v9.4s, v21.4s -sqrdmulh v21.4S, v9.4S, v11.s[1] -mul v9.4S, v9.4S,v5.s[1] -mla v9.4S, v21.4S, v31.s[0] -sub v21.4s, v19.4s, v9.4s -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v17.4S, v11.s[2] -mul v17.4S, v17.4S,v5.s[2] -mla v17.4S, v9.4S, v31.s[0] -sub v9.4s, v14.4s, v17.4s -add v14.4s, v14.4s, v17.4s -trn1 v17.4S, v19.4S, v21.4S -trn2 v16.4S, v19.4S, v21.4S -trn1 v3.4S, v14.4S, v9.4S -trn2 v18.4S, v14.4S, v9.4S -trn2 v14.2D, v17.2D, v3.2D -trn2 v9.2D, v16.2D, v18.2D -trn1 v19.2D, v17.2D, v3.2D -trn1 v21.2D, v16.2D, v18.2D -sqrdmulh v18.4S, v14.4S, v6.4S -mul v14.4S, v14.4S,v7.4S -mla v14.4S, v18.4S, v31.s[0] -sub v18.4s, v19.4s, v14.4s -add v19.4s, v19.4s, v14.4s -sqrdmulh v14.4S, v9.4S, v6.4S -mul v9.4S, v9.4S,v7.4S -mla v9.4S, v14.4S, v31.s[0] -sub v14.4s, v21.4s, v9.4s -add v21.4s, v21.4s, v9.4s -sqrdmulh v9.4S, v21.4S, v8.4S -mul v21.4S, v21.4S,v20.4S -mla v21.4S, v9.4S, v31.s[0] -sub v9.4s, v19.4s, v21.4s -add v19.4s, v19.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v13.4S -mul v14.4S, v14.4S,v4.4S -mla v14.4S, v21.4S, v31.s[0] -sub v21.4s, v18.4s, v14.4s -add v18.4s, v18.4s, v14.4s -str q19, [x0, #256] -str q9, [x0, #272] -str q18, [x0, #288] -str q21, [x0, #304] -ldr q21, [x17, #+768] -ldr q18, [x17, #+784] -ldr q9, [x17, #+800] -ldr q19, [x17, #+816] -ldr q14, [x17, #+832] -ldr q16, [x17, #+848] -ldr q3, [x17, #+864] -ldr q17, [x17, #+880] -ldr q13, [x0, #352] -ldr q4, [x0, #368] -ldr q8, [x0, #320] -ldr q20, [x0, #336] -sqrdmulh v6.4S, v13.4S, v18.s[0] -mul v13.4S, v13.4S,v21.s[0] -mla v13.4S, v6.4S, v31.s[0] -sub v6.4s, v8.4s, v13.4s -add v8.4s, v8.4s, v13.4s -sqrdmulh v13.4S, v4.4S, v18.s[0] -mul v4.4S, v4.4S,v21.s[0] -mla v4.4S, v13.4S, v31.s[0] -sub v13.4s, v20.4s, v4.4s -add v20.4s, v20.4s, v4.4s -sqrdmulh v4.4S, v20.4S, v18.s[1] -mul v20.4S, v20.4S,v21.s[1] -mla v20.4S, v4.4S, v31.s[0] -sub v4.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -sqrdmulh v20.4S, v13.4S, v18.s[2] -mul v13.4S, v13.4S,v21.s[2] -mla v13.4S, v20.4S, v31.s[0] -sub v20.4s, v6.4s, v13.4s -add v6.4s, v6.4s, v13.4s -trn1 v13.4S, v8.4S, v4.4S -trn2 v7.4S, v8.4S, v4.4S -trn1 v11.4S, v6.4S, v20.4S -trn2 v5.4S, v6.4S, v20.4S -trn2 v6.2D, v13.2D, v11.2D -trn2 v20.2D, v7.2D, v5.2D -trn1 v8.2D, v13.2D, v11.2D -trn1 v4.2D, v7.2D, v5.2D -sqrdmulh v5.4S, v6.4S, v19.4S -mul v6.4S, v6.4S,v9.4S -mla v6.4S, v5.4S, v31.s[0] -sub v5.4s, v8.4s, v6.4s -add v8.4s, v8.4s, v6.4s -sqrdmulh v6.4S, v20.4S, v19.4S -mul v20.4S, v20.4S,v9.4S -mla v20.4S, v6.4S, v31.s[0] -sub v6.4s, v4.4s, v20.4s -add v4.4s, v4.4s, v20.4s -sqrdmulh v20.4S, v4.4S, v16.4S -mul v4.4S, v4.4S,v14.4S -mla v4.4S, v20.4S, v31.s[0] -sub v20.4s, v8.4s, v4.4s -add v8.4s, v8.4s, v4.4s -sqrdmulh v4.4S, v6.4S, v17.4S -mul v6.4S, v6.4S,v3.4S -mla v6.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v6.4s -add v5.4s, v5.4s, v6.4s -str q8, [x0, #320] -str q20, [x0, #336] -str q5, [x0, #352] -str q4, [x0, #368] -ldr q4, [x17, #+896] -ldr q5, [x17, #+912] -ldr q20, [x17, #+928] -ldr q8, [x17, #+944] -ldr q6, [x17, #+960] -ldr q7, [x17, #+976] -ldr q11, [x17, #+992] -ldr q13, [x17, #+1008] -ldr q17, [x0, #416] -ldr q3, [x0, #432] -ldr q16, [x0, #384] -ldr q14, [x0, #400] -sqrdmulh v19.4S, v17.4S, v5.s[0] -mul v17.4S, v17.4S,v4.s[0] -mla v17.4S, v19.4S, v31.s[0] -sub v19.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -sqrdmulh v17.4S, v3.4S, v5.s[0] -mul v3.4S, v3.4S,v4.s[0] -mla v3.4S, v17.4S, v31.s[0] -sub v17.4s, v14.4s, v3.4s -add v14.4s, v14.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v5.s[1] -mul v14.4S, v14.4S,v4.s[1] -mla v14.4S, v3.4S, v31.s[0] -sub v3.4s, v16.4s, v14.4s -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v17.4S, v5.s[2] -mul v17.4S, v17.4S,v4.s[2] -mla v17.4S, v14.4S, v31.s[0] -sub v14.4s, v19.4s, v17.4s -add v19.4s, v19.4s, v17.4s -trn1 v17.4S, v16.4S, v3.4S -trn2 v9.4S, v16.4S, v3.4S -trn1 v18.4S, v19.4S, v14.4S -trn2 v21.4S, v19.4S, v14.4S -trn2 v19.2D, v17.2D, v18.2D -trn2 v14.2D, v9.2D, v21.2D -trn1 v16.2D, v17.2D, v18.2D -trn1 v3.2D, v9.2D, v21.2D -sqrdmulh v21.4S, v19.4S, v8.4S -mul v19.4S, v19.4S,v20.4S -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v19.4s -add v16.4s, v16.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v8.4S -mul v14.4S, v14.4S,v20.4S -mla v14.4S, v19.4S, v31.s[0] -sub v19.4s, v3.4s, v14.4s -add v3.4s, v3.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v7.4S -mul v3.4S, v3.4S,v6.4S -mla v3.4S, v14.4S, v31.s[0] -sub v14.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v19.4S, v13.4S -mul v19.4S, v19.4S,v11.4S -mla v19.4S, v3.4S, v31.s[0] -sub v3.4s, v21.4s, v19.4s -add v21.4s, v21.4s, v19.4s -str q16, [x0, #384] -str q14, [x0, #400] -str q21, [x0, #416] -str q3, [x0, #432] -ldr q3, [x17, #+1024] -ldr q21, [x17, #+1040] -ldr q14, [x17, #+1056] -ldr q16, [x17, #+1072] -ldr q19, [x17, #+1088] -ldr q9, [x17, #+1104] -ldr q18, [x17, #+1120] -ldr q17, [x17, #+1136] -ldr q13, [x0, #480] -ldr q11, [x0, #496] -ldr q7, [x0, #448] -ldr q6, [x0, #464] -sqrdmulh v8.4S, v13.4S, v21.s[0] -mul v13.4S, v13.4S,v3.s[0] -mla v13.4S, v8.4S, v31.s[0] -sub v8.4s, v7.4s, v13.4s -add v7.4s, v7.4s, v13.4s -sqrdmulh v13.4S, v11.4S, v21.s[0] -mul v11.4S, v11.4S,v3.s[0] -mla v11.4S, v13.4S, v31.s[0] -sub v13.4s, v6.4s, v11.4s -add v6.4s, v6.4s, v11.4s -sqrdmulh v11.4S, v6.4S, v21.s[1] -mul v6.4S, v6.4S,v3.s[1] -mla v6.4S, v11.4S, v31.s[0] -sub v11.4s, v7.4s, v6.4s -add v7.4s, v7.4s, v6.4s -sqrdmulh v6.4S, v13.4S, v21.s[2] -mul v13.4S, v13.4S,v3.s[2] -mla v13.4S, v6.4S, v31.s[0] -sub v6.4s, v8.4s, v13.4s -add v8.4s, v8.4s, v13.4s -trn1 v13.4S, v7.4S, v11.4S -trn2 v20.4S, v7.4S, v11.4S -trn1 v5.4S, v8.4S, v6.4S -trn2 v4.4S, v8.4S, v6.4S -trn2 v8.2D, v13.2D, v5.2D -trn2 v6.2D, v20.2D, v4.2D -trn1 v7.2D, v13.2D, v5.2D -trn1 v11.2D, v20.2D, v4.2D -sqrdmulh v4.4S, v8.4S, v16.4S -mul v8.4S, v8.4S,v14.4S -mla v8.4S, v4.4S, v31.s[0] -sub v4.4s, v7.4s, v8.4s -add v7.4s, v7.4s, v8.4s -sqrdmulh v8.4S, v6.4S, v16.4S -mul v6.4S, v6.4S,v14.4S -mla v6.4S, v8.4S, v31.s[0] -sub v8.4s, v11.4s, v6.4s -add v11.4s, v11.4s, v6.4s -sqrdmulh v6.4S, v11.4S, v9.4S -mul v11.4S, v11.4S,v19.4S -mla v11.4S, v6.4S, v31.s[0] -sub v6.4s, v7.4s, v11.4s -add v7.4s, v7.4s, v11.4s -sqrdmulh v11.4S, v8.4S, v17.4S -mul v8.4S, v8.4S,v18.4S -mla v8.4S, v11.4S, v31.s[0] -sub v11.4s, v4.4s, v8.4s -add v4.4s, v4.4s, v8.4s -str q7, [x0, #448] -str q6, [x0, #464] -str q4, [x0, #480] -str q11, [x0, #496] -ldr q11, [x17, #+1152] -ldr q4, [x17, #+1168] -ldr q6, [x17, #+1184] -ldr q7, [x17, #+1200] -ldr q8, [x17, #+1216] -ldr q20, [x17, #+1232] -ldr q5, [x17, #+1248] -ldr q13, [x17, #+1264] -ldr q17, [x0, #544] -ldr q18, [x0, #560] -ldr q9, [x0, #512] -ldr q19, [x0, #528] -sqrdmulh v16.4S, v17.4S, v4.s[0] -mul v17.4S, v17.4S,v11.s[0] -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v9.4s, v17.4s -add v9.4s, v9.4s, v17.4s -sqrdmulh v17.4S, v18.4S, v4.s[0] -mul v18.4S, v18.4S,v11.s[0] -mla v18.4S, v17.4S, v31.s[0] -sub v17.4s, v19.4s, v18.4s -add v19.4s, v19.4s, v18.4s -sqrdmulh v18.4S, v19.4S, v4.s[1] -mul v19.4S, v19.4S,v11.s[1] -mla v19.4S, v18.4S, v31.s[0] -sub v18.4s, v9.4s, v19.4s -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v17.4S, v4.s[2] -mul v17.4S, v17.4S,v11.s[2] -mla v17.4S, v19.4S, v31.s[0] -sub v19.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -trn1 v17.4S, v9.4S, v18.4S -trn2 v14.4S, v9.4S, v18.4S -trn1 v21.4S, v16.4S, v19.4S -trn2 v3.4S, v16.4S, v19.4S -trn2 v16.2D, v17.2D, v21.2D -trn2 v19.2D, v14.2D, v3.2D -trn1 v9.2D, v17.2D, v21.2D -trn1 v18.2D, v14.2D, v3.2D -sqrdmulh v3.4S, v16.4S, v7.4S -mul v16.4S, v16.4S,v6.4S -mla v16.4S, v3.4S, v31.s[0] -sub v3.4s, v9.4s, v16.4s -add v9.4s, v9.4s, v16.4s -sqrdmulh v16.4S, v19.4S, v7.4S -mul v19.4S, v19.4S,v6.4S -mla v19.4S, v16.4S, v31.s[0] -sub v16.4s, v18.4s, v19.4s -add v18.4s, v18.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v20.4S -mul v18.4S, v18.4S,v8.4S -mla v18.4S, v19.4S, v31.s[0] -sub v19.4s, v9.4s, v18.4s -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v13.4S -mul v16.4S, v16.4S,v5.4S -mla v16.4S, v18.4S, v31.s[0] -sub v18.4s, v3.4s, v16.4s -add v3.4s, v3.4s, v16.4s -str q9, [x0, #512] -str q19, [x0, #528] -str q3, [x0, #544] -str q18, [x0, #560] -ldr q18, [x17, #+1280] -ldr q3, [x17, #+1296] -ldr q19, [x17, #+1312] -ldr q9, [x17, #+1328] -ldr q16, [x17, #+1344] -ldr q14, [x17, #+1360] -ldr q21, [x17, #+1376] -ldr q17, [x17, #+1392] -ldr q13, [x0, #608] -ldr q5, [x0, #624] -ldr q20, [x0, #576] -ldr q8, [x0, #592] -sqrdmulh v7.4S, v13.4S, v3.s[0] -mul v13.4S, v13.4S,v18.s[0] -mla v13.4S, v7.4S, v31.s[0] -sub v7.4s, v20.4s, v13.4s -add v20.4s, v20.4s, v13.4s -sqrdmulh v13.4S, v5.4S, v3.s[0] -mul v5.4S, v5.4S,v18.s[0] -mla v5.4S, v13.4S, v31.s[0] -sub v13.4s, v8.4s, v5.4s -add v8.4s, v8.4s, v5.4s -sqrdmulh v5.4S, v8.4S, v3.s[1] -mul v8.4S, v8.4S,v18.s[1] -mla v8.4S, v5.4S, v31.s[0] -sub v5.4s, v20.4s, v8.4s -add v20.4s, v20.4s, v8.4s -sqrdmulh v8.4S, v13.4S, v3.s[2] -mul v13.4S, v13.4S,v18.s[2] -mla v13.4S, v8.4S, v31.s[0] -sub v8.4s, v7.4s, v13.4s -add v7.4s, v7.4s, v13.4s -trn1 v13.4S, v20.4S, v5.4S -trn2 v6.4S, v20.4S, v5.4S -trn1 v4.4S, v7.4S, v8.4S -trn2 v11.4S, v7.4S, v8.4S -trn2 v7.2D, v13.2D, v4.2D -trn2 v8.2D, v6.2D, v11.2D -trn1 v20.2D, v13.2D, v4.2D -trn1 v5.2D, v6.2D, v11.2D -sqrdmulh v11.4S, v7.4S, v9.4S -mul v7.4S, v7.4S,v19.4S -mla v7.4S, v11.4S, v31.s[0] -sub v11.4s, v20.4s, v7.4s -add v20.4s, v20.4s, v7.4s -sqrdmulh v7.4S, v8.4S, v9.4S -mul v8.4S, v8.4S,v19.4S -mla v8.4S, v7.4S, v31.s[0] -sub v7.4s, v5.4s, v8.4s -add v5.4s, v5.4s, v8.4s -sqrdmulh v8.4S, v5.4S, v14.4S -mul v5.4S, v5.4S,v16.4S -mla v5.4S, v8.4S, v31.s[0] -sub v8.4s, v20.4s, v5.4s -add v20.4s, v20.4s, v5.4s -sqrdmulh v5.4S, v7.4S, v17.4S -mul v7.4S, v7.4S,v21.4S -mla v7.4S, v5.4S, v31.s[0] -sub v5.4s, v11.4s, v7.4s -add v11.4s, v11.4s, v7.4s -str q20, [x0, #576] -str q8, [x0, #592] -str q11, [x0, #608] -str q5, [x0, #624] -ldr q5, [x17, #+1408] -ldr q11, [x17, #+1424] -ldr q8, [x17, #+1440] -ldr q20, [x17, #+1456] -ldr q7, [x17, #+1472] -ldr q6, [x17, #+1488] -ldr q4, [x17, #+1504] -ldr q13, [x17, #+1520] -ldr q17, [x0, #672] -ldr q21, [x0, #688] -ldr q14, [x0, #640] -ldr q16, [x0, #656] -sqrdmulh v9.4S, v17.4S, v11.s[0] -mul v17.4S, v17.4S,v5.s[0] -mla v17.4S, v9.4S, v31.s[0] -sub v9.4s, v14.4s, v17.4s -add v14.4s, v14.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v11.s[0] -mul v21.4S, v21.4S,v5.s[0] -mla v21.4S, v17.4S, v31.s[0] -sub v17.4s, v16.4s, v21.4s -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v16.4S, v11.s[1] -mul v16.4S, v16.4S,v5.s[1] -mla v16.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v16.4s -add v14.4s, v14.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v11.s[2] -mul v17.4S, v17.4S,v5.s[2] -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v9.4s, v17.4s -add v9.4s, v9.4s, v17.4s -trn1 v17.4S, v14.4S, v21.4S -trn2 v19.4S, v14.4S, v21.4S -trn1 v3.4S, v9.4S, v16.4S -trn2 v18.4S, v9.4S, v16.4S -trn2 v9.2D, v17.2D, v3.2D -trn2 v16.2D, v19.2D, v18.2D -trn1 v14.2D, v17.2D, v3.2D -trn1 v21.2D, v19.2D, v18.2D -sqrdmulh v18.4S, v9.4S, v20.4S -mul v9.4S, v9.4S,v8.4S -mla v9.4S, v18.4S, v31.s[0] -sub v18.4s, v14.4s, v9.4s -add v14.4s, v14.4s, v9.4s -sqrdmulh v9.4S, v16.4S, v20.4S -mul v16.4S, v16.4S,v8.4S -mla v16.4S, v9.4S, v31.s[0] -sub v9.4s, v21.4s, v16.4s -add v21.4s, v21.4s, v16.4s -sqrdmulh v16.4S, v21.4S, v6.4S -mul v21.4S, v21.4S,v7.4S -mla v21.4S, v16.4S, v31.s[0] -sub v16.4s, v14.4s, v21.4s -add v14.4s, v14.4s, v21.4s -sqrdmulh v21.4S, v9.4S, v13.4S -mul v9.4S, v9.4S,v4.4S -mla v9.4S, v21.4S, v31.s[0] -sub v21.4s, v18.4s, v9.4s -add v18.4s, v18.4s, v9.4s -str q14, [x0, #640] -str q16, [x0, #656] -str q18, [x0, #672] -str q21, [x0, #688] -ldr q21, [x17, #+1536] -ldr q18, [x17, #+1552] -ldr q16, [x17, #+1568] -ldr q14, [x17, #+1584] -ldr q9, [x17, #+1600] -ldr q19, [x17, #+1616] -ldr q3, [x17, #+1632] -ldr q17, [x17, #+1648] -ldr q13, [x0, #736] -ldr q4, [x0, #752] -ldr q6, [x0, #704] -ldr q7, [x0, #720] -sqrdmulh v20.4S, v13.4S, v18.s[0] -mul v13.4S, v13.4S,v21.s[0] -mla v13.4S, v20.4S, v31.s[0] -sub v20.4s, v6.4s, v13.4s -add v6.4s, v6.4s, v13.4s -sqrdmulh v13.4S, v4.4S, v18.s[0] -mul v4.4S, v4.4S,v21.s[0] -mla v4.4S, v13.4S, v31.s[0] -sub v13.4s, v7.4s, v4.4s -add v7.4s, v7.4s, v4.4s -sqrdmulh v4.4S, v7.4S, v18.s[1] -mul v7.4S, v7.4S,v21.s[1] -mla v7.4S, v4.4S, v31.s[0] -sub v4.4s, v6.4s, v7.4s -add v6.4s, v6.4s, v7.4s -sqrdmulh v7.4S, v13.4S, v18.s[2] -mul v13.4S, v13.4S,v21.s[2] -mla v13.4S, v7.4S, v31.s[0] -sub v7.4s, v20.4s, v13.4s -add v20.4s, v20.4s, v13.4s -trn1 v13.4S, v6.4S, v4.4S -trn2 v8.4S, v6.4S, v4.4S -trn1 v11.4S, v20.4S, v7.4S -trn2 v5.4S, v20.4S, v7.4S -trn2 v20.2D, v13.2D, v11.2D -trn2 v7.2D, v8.2D, v5.2D -trn1 v6.2D, v13.2D, v11.2D -trn1 v4.2D, v8.2D, v5.2D -sqrdmulh v5.4S, v20.4S, v14.4S -mul v20.4S, v20.4S,v16.4S -mla v20.4S, v5.4S, v31.s[0] -sub v5.4s, v6.4s, v20.4s -add v6.4s, v6.4s, v20.4s -sqrdmulh v20.4S, v7.4S, v14.4S -mul v7.4S, v7.4S,v16.4S -mla v7.4S, v20.4S, v31.s[0] -sub v20.4s, v4.4s, v7.4s -add v4.4s, v4.4s, v7.4s -sqrdmulh v7.4S, v4.4S, v19.4S -mul v4.4S, v4.4S,v9.4S -mla v4.4S, v7.4S, v31.s[0] -sub v7.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -sqrdmulh v4.4S, v20.4S, v17.4S -mul v20.4S, v20.4S,v3.4S -mla v20.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v20.4s -add v5.4s, v5.4s, v20.4s -str q6, [x0, #704] -str q7, [x0, #720] -str q5, [x0, #736] -str q4, [x0, #752] -ldr q4, [x17, #+1664] -ldr q5, [x17, #+1680] -ldr q7, [x17, #+1696] -ldr q6, [x17, #+1712] -ldr q20, [x17, #+1728] -ldr q8, [x17, #+1744] -ldr q11, [x17, #+1760] -ldr q13, [x17, #+1776] -ldr q17, [x0, #800] -ldr q3, [x0, #816] -ldr q19, [x0, #768] -ldr q9, [x0, #784] -sqrdmulh v14.4S, v17.4S, v5.s[0] -mul v17.4S, v17.4S,v4.s[0] -mla v17.4S, v14.4S, v31.s[0] -sub v14.4s, v19.4s, v17.4s -add v19.4s, v19.4s, v17.4s -sqrdmulh v17.4S, v3.4S, v5.s[0] -mul v3.4S, v3.4S,v4.s[0] -mla v3.4S, v17.4S, v31.s[0] -sub v17.4s, v9.4s, v3.4s -add v9.4s, v9.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v5.s[1] -mul v9.4S, v9.4S,v4.s[1] -mla v9.4S, v3.4S, v31.s[0] -sub v3.4s, v19.4s, v9.4s -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v17.4S, v5.s[2] -mul v17.4S, v17.4S,v4.s[2] -mla v17.4S, v9.4S, v31.s[0] -sub v9.4s, v14.4s, v17.4s -add v14.4s, v14.4s, v17.4s -trn1 v17.4S, v19.4S, v3.4S -trn2 v16.4S, v19.4S, v3.4S -trn1 v18.4S, v14.4S, v9.4S -trn2 v21.4S, v14.4S, v9.4S -trn2 v14.2D, v17.2D, v18.2D -trn2 v9.2D, v16.2D, v21.2D -trn1 v19.2D, v17.2D, v18.2D -trn1 v3.2D, v16.2D, v21.2D -sqrdmulh v21.4S, v14.4S, v6.4S -mul v14.4S, v14.4S,v7.4S -mla v14.4S, v21.4S, v31.s[0] -sub v21.4s, v19.4s, v14.4s -add v19.4s, v19.4s, v14.4s -sqrdmulh v14.4S, v9.4S, v6.4S -mul v9.4S, v9.4S,v7.4S -mla v9.4S, v14.4S, v31.s[0] -sub v14.4s, v3.4s, v9.4s -add v3.4s, v3.4s, v9.4s -sqrdmulh v9.4S, v3.4S, v8.4S -mul v3.4S, v3.4S,v20.4S -mla v3.4S, v9.4S, v31.s[0] -sub v9.4s, v19.4s, v3.4s -add v19.4s, v19.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v13.4S -mul v14.4S, v14.4S,v11.4S -mla v14.4S, v3.4S, v31.s[0] -sub v3.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -str q19, [x0, #768] -str q9, [x0, #784] -str q21, [x0, #800] -str q3, [x0, #816] -ldr q3, [x17, #+1792] -ldr q21, [x17, #+1808] -ldr q9, [x17, #+1824] -ldr q19, [x17, #+1840] -ldr q14, [x17, #+1856] -ldr q16, [x17, #+1872] -ldr q18, [x17, #+1888] -ldr q17, [x17, #+1904] -ldr q13, [x0, #864] -ldr q11, [x0, #880] -ldr q8, [x0, #832] -ldr q20, [x0, #848] -sqrdmulh v6.4S, v13.4S, v21.s[0] -mul v13.4S, v13.4S,v3.s[0] -mla v13.4S, v6.4S, v31.s[0] -sub v6.4s, v8.4s, v13.4s -add v8.4s, v8.4s, v13.4s -sqrdmulh v13.4S, v11.4S, v21.s[0] -mul v11.4S, v11.4S,v3.s[0] -mla v11.4S, v13.4S, v31.s[0] -sub v13.4s, v20.4s, v11.4s -add v20.4s, v20.4s, v11.4s -sqrdmulh v11.4S, v20.4S, v21.s[1] -mul v20.4S, v20.4S,v3.s[1] -mla v20.4S, v11.4S, v31.s[0] -sub v11.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -sqrdmulh v20.4S, v13.4S, v21.s[2] -mul v13.4S, v13.4S,v3.s[2] -mla v13.4S, v20.4S, v31.s[0] -sub v20.4s, v6.4s, v13.4s -add v6.4s, v6.4s, v13.4s -trn1 v13.4S, v8.4S, v11.4S -trn2 v7.4S, v8.4S, v11.4S -trn1 v5.4S, v6.4S, v20.4S -trn2 v4.4S, v6.4S, v20.4S -trn2 v6.2D, v13.2D, v5.2D -trn2 v20.2D, v7.2D, v4.2D -trn1 v8.2D, v13.2D, v5.2D -trn1 v11.2D, v7.2D, v4.2D -sqrdmulh v4.4S, v6.4S, v19.4S -mul v6.4S, v6.4S,v9.4S -mla v6.4S, v4.4S, v31.s[0] -sub v4.4s, v8.4s, v6.4s -add v8.4s, v8.4s, v6.4s -sqrdmulh v6.4S, v20.4S, v19.4S -mul v20.4S, v20.4S,v9.4S -mla v20.4S, v6.4S, v31.s[0] -sub v6.4s, v11.4s, v20.4s -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v11.4S, v16.4S -mul v11.4S, v11.4S,v14.4S -mla v11.4S, v20.4S, v31.s[0] -sub v20.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -sqrdmulh v11.4S, v6.4S, v17.4S -mul v6.4S, v6.4S,v18.4S -mla v6.4S, v11.4S, v31.s[0] -sub v11.4s, v4.4s, v6.4s -add v4.4s, v4.4s, v6.4s -str q8, [x0, #832] -str q20, [x0, #848] -str q4, [x0, #864] -str q11, [x0, #880] -ldr q11, [x17, #+1920] -ldr q4, [x17, #+1936] -ldr q20, [x17, #+1952] -ldr q8, [x17, #+1968] -ldr q6, [x17, #+1984] -ldr q7, [x17, #+2000] -ldr q5, [x17, #+2016] -ldr q13, [x17, #+2032] -ldr q17, [x0, #928] -ldr q18, [x0, #944] -ldr q16, [x0, #896] -ldr q14, [x0, #912] -sqrdmulh v19.4S, v17.4S, v4.s[0] -mul v17.4S, v17.4S,v11.s[0] -mla v17.4S, v19.4S, v31.s[0] -sub v19.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -sqrdmulh v17.4S, v18.4S, v4.s[0] -mul v18.4S, v18.4S,v11.s[0] -mla v18.4S, v17.4S, v31.s[0] -sub v17.4s, v14.4s, v18.4s -add v14.4s, v14.4s, v18.4s -sqrdmulh v18.4S, v14.4S, v4.s[1] -mul v14.4S, v14.4S,v11.s[1] -mla v14.4S, v18.4S, v31.s[0] -sub v18.4s, v16.4s, v14.4s -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v17.4S, v4.s[2] -mul v17.4S, v17.4S,v11.s[2] -mla v17.4S, v14.4S, v31.s[0] -sub v14.4s, v19.4s, v17.4s -add v19.4s, v19.4s, v17.4s -trn1 v17.4S, v16.4S, v18.4S -trn2 v9.4S, v16.4S, v18.4S -trn1 v21.4S, v19.4S, v14.4S -trn2 v3.4S, v19.4S, v14.4S -trn2 v19.2D, v17.2D, v21.2D -trn2 v14.2D, v9.2D, v3.2D -trn1 v16.2D, v17.2D, v21.2D -trn1 v18.2D, v9.2D, v3.2D -sqrdmulh v3.4S, v19.4S, v8.4S -mul v19.4S, v19.4S,v20.4S -mla v19.4S, v3.4S, v31.s[0] -sub v3.4s, v16.4s, v19.4s -add v16.4s, v16.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v8.4S -mul v14.4S, v14.4S,v20.4S -mla v14.4S, v19.4S, v31.s[0] -sub v19.4s, v18.4s, v14.4s -add v18.4s, v18.4s, v14.4s -sqrdmulh v14.4S, v18.4S, v7.4S -mul v18.4S, v18.4S,v6.4S -mla v18.4S, v14.4S, v31.s[0] -sub v14.4s, v16.4s, v18.4s -add v16.4s, v16.4s, v18.4s -sqrdmulh v18.4S, v19.4S, v13.4S -mul v19.4S, v19.4S,v5.4S -mla v19.4S, v18.4S, v31.s[0] -sub v18.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -str q16, [x0, #896] -str q14, [x0, #912] -str q3, [x0, #928] -str q18, [x0, #944] -ldr q18, [x17, #+2048] -ldr q3, [x17, #+2064] -ldr q14, [x17, #+2080] -ldr q16, [x17, #+2096] -ldr q19, [x17, #+2112] -ldr q9, [x17, #+2128] -ldr q21, [x17, #+2144] -ldr q17, [x17, #+2160] -ldr q13, [x0, #992] -ldr q5, [x0, #1008] -ldr q7, [x0, #960] -ldr q6, [x0, #976] -sqrdmulh v8.4S, v13.4S, v3.s[0] -mul v13.4S, v13.4S,v18.s[0] -mla v13.4S, v8.4S, v31.s[0] -sub v8.4s, v7.4s, v13.4s -add v7.4s, v7.4s, v13.4s -sqrdmulh v13.4S, v5.4S, v3.s[0] -mul v5.4S, v5.4S,v18.s[0] -mla v5.4S, v13.4S, v31.s[0] -sub v13.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v6.4S, v3.s[1] -mul v6.4S, v6.4S,v18.s[1] -mla v6.4S, v5.4S, v31.s[0] -sub v5.4s, v7.4s, v6.4s -add v7.4s, v7.4s, v6.4s -sqrdmulh v6.4S, v13.4S, v3.s[2] -mul v13.4S, v13.4S,v18.s[2] -mla v13.4S, v6.4S, v31.s[0] -sub v6.4s, v8.4s, v13.4s -add v8.4s, v8.4s, v13.4s -trn1 v13.4S, v7.4S, v5.4S -trn2 v20.4S, v7.4S, v5.4S -trn1 v4.4S, v8.4S, v6.4S -trn2 v11.4S, v8.4S, v6.4S -trn2 v8.2D, v13.2D, v4.2D -trn2 v6.2D, v20.2D, v11.2D -trn1 v7.2D, v13.2D, v4.2D -trn1 v5.2D, v20.2D, v11.2D -sqrdmulh v11.4S, v8.4S, v16.4S -mul v8.4S, v8.4S,v14.4S -mla v8.4S, v11.4S, v31.s[0] -sub v11.4s, v7.4s, v8.4s -add v7.4s, v7.4s, v8.4s -sqrdmulh v8.4S, v6.4S, v16.4S -mul v6.4S, v6.4S,v14.4S -mla v6.4S, v8.4S, v31.s[0] -sub v8.4s, v5.4s, v6.4s -add v5.4s, v5.4s, v6.4s -sqrdmulh v6.4S, v5.4S, v9.4S -mul v5.4S, v5.4S,v19.4S -mla v5.4S, v6.4S, v31.s[0] -sub v6.4s, v7.4s, v5.4s -add v7.4s, v7.4s, v5.4s -sqrdmulh v5.4S, v8.4S, v17.4S -mul v8.4S, v8.4S,v21.4S -mla v8.4S, v5.4S, v31.s[0] -sub v5.4s, v11.4s, v8.4s -add v11.4s, v11.4s, v8.4s -str q7, [x0, #960] -str q6, [x0, #976] -str q11, [x0, #992] -str q5, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2392 -// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_2_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_2_0.s deleted file mode 100644 index 1f606a8..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_2_0.s +++ /dev/null @@ -1,2422 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_2_0 -.global _ntt_u32_full_neon_asm_var_4_4_2_0 -ntt_u32_full_neon_asm_var_4_4_2_0: -_ntt_u32_full_neon_asm_var_4_4_2_0: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #800] -ldr q21, [x0, #864] -ldr q20, [x0, #928] -ldr q19, [x0, #992] -ldr q18, [x0, #288] -ldr q17, [x0, #352] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -sqrdmulh v2.4S, v22.4S, v29.s[0] -ldr q1, [x0, #544] -mul v22.4S, v22.4S,v30.s[0] -ldr q0, [x0, #608] -sqrdmulh v15.4S, v21.4S, v29.s[0] -ldr q14, [x0, #672] -mul v21.4S, v21.4S,v30.s[0] -ldr q13, [x0, #736] -mla v22.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q12, [x0, #32] -mla v21.4S, v15.4S, v31.s[0] -sub v15.4s, v18.4s, v22.4s -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q11, [x0, #96] -mla v20.4S, v2.4S, v31.s[0] -sub v2.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v1.4S, v29.s[0] -ldr q10, [x0, #160] -mul v1.4S, v1.4S,v30.s[0] -mla v19.4S, v22.4S, v31.s[0] -sub v22.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v0.4S, v29.s[0] -ldr q9, [x0, #224] -mul v0.4S, v0.4S,v30.s[0] -mla v1.4S, v21.4S, v31.s[0] -sub v21.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -mla v0.4S, v20.4S, v31.s[0] -sub v20.4s, v12.4s, v1.4s -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -mla v14.4S, v19.4S, v31.s[0] -sub v19.4s, v11.4s, v0.4s -add v11.4s, v11.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -mla v13.4S, v1.4S, v31.s[0] -sub v1.4s, v10.4s, v14.4s -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -mla v16.4S, v0.4S, v31.s[0] -sub v0.4s, v9.4s, v13.4s -add v9.4s, v9.4s, v13.4s -sqrdmulh v13.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -mla v3.4S, v14.4S, v31.s[0] -sub v14.4s, v10.4s, v16.4s -add v10.4s, v10.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -mla v18.4S, v13.4S, v31.s[0] -sub v13.4s, v9.4s, v3.4s -add v9.4s, v9.4s, v3.4s -sqrdmulh v3.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v12.4s, v18.4s -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -mla v22.4S, v3.4S, v31.s[0] -sub v3.4s, v11.4s, v17.4s -add v11.4s, v11.4s, v17.4s -sqrdmulh v17.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -mla v21.4S, v18.4S, v31.s[0] -sub v18.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -mla v15.4S, v17.4S, v31.s[0] -sub v17.4s, v0.4s, v21.4s -add v0.4s, v0.4s, v21.4s -sqrdmulh v21.4S, v10.4S, v27.s[0] -mul v10.4S, v10.4S,v28.s[0] -mla v2.4S, v22.4S, v31.s[0] -sub v22.4s, v20.4s, v15.4s -add v20.4s, v20.4s, v15.4s -sqrdmulh v15.4S, v9.4S, v27.s[0] -mul v9.4S, v9.4S,v28.s[0] -mla v10.4S, v21.4S, v31.s[0] -sub v21.4s, v19.4s, v2.4s -add v19.4s, v19.4s, v2.4s -sqrdmulh v2.4S, v14.4S, v27.s[1] -mul v14.4S, v14.4S,v28.s[1] -mla v9.4S, v15.4S, v31.s[0] -sub v15.4s, v12.4s, v10.4s -add v12.4s, v12.4s, v10.4s -sqrdmulh v10.4S, v13.4S, v27.s[1] -mul v13.4S, v13.4S,v28.s[1] -mla v14.4S, v2.4S, v31.s[0] -sub v2.4s, v11.4s, v9.4s -add v11.4s, v11.4s, v9.4s -sqrdmulh v9.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -mla v13.4S, v10.4S, v31.s[0] -sub v10.4s, v16.4s, v14.4s -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v0.4S, v27.s[2] -mul v0.4S, v0.4S,v28.s[2] -mla v1.4S, v9.4S, v31.s[0] -sub v9.4s, v3.4s, v13.4s -add v3.4s, v3.4s, v13.4s -sqrdmulh v13.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -mla v0.4S, v14.4S, v31.s[0] -sub v14.4s, v20.4s, v1.4s -add v20.4s, v20.4s, v1.4s -sqrdmulh v1.4S, v17.4S, v27.s[3] -mul v17.4S, v17.4S,v28.s[3] -mla v18.4S, v13.4S, v31.s[0] -sub v13.4s, v19.4s, v0.4s -add v19.4s, v19.4s, v0.4s -sqrdmulh v0.4S, v11.4S, v25.s[0] -mul v11.4S, v11.4S,v26.s[0] -mla v17.4S, v1.4S, v31.s[0] -sub v1.4s, v22.4s, v18.4s -add v22.4s, v22.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v25.s[1] -mul v2.4S, v2.4S,v26.s[1] -mla v11.4S, v0.4S, v31.s[0] -sub v0.4s, v21.4s, v17.4s -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v3.4S, v25.s[2] -mul v3.4S, v3.4S,v26.s[2] -mla v2.4S, v18.4S, v31.s[0] -sub v18.4s, v12.4s, v11.4s -add v12.4s, v12.4s, v11.4s -sqrdmulh v11.4S, v9.4S, v25.s[3] -mul v9.4S, v9.4S,v26.s[3] -mla v3.4S, v17.4S, v31.s[0] -sub v17.4s, v15.4s, v2.4s -add v15.4s, v15.4s, v2.4s -str q12, [x0, #32] -sqrdmulh v12.4S, v19.4S, v23.s[0] -str q18, [x0, #96] -mul v19.4S, v19.4S,v24.s[0] -ldr q18, [x0, #816] -mla v9.4S, v11.4S, v31.s[0] -ldr q11, [x0, #880] -sub v2.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -str q15, [x0, #160] -sqrdmulh v15.4S, v13.4S, v23.s[1] -str q17, [x0, #224] -mul v13.4S, v13.4S,v24.s[1] -ldr q17, [x0, #944] -mla v19.4S, v12.4S, v31.s[0] -ldr q12, [x0, #1008] -sub v3.4s, v10.4s, v9.4s -add v10.4s, v10.4s, v9.4s -str q16, [x0, #288] -sqrdmulh v16.4S, v21.4S, v23.s[2] -str q2, [x0, #352] -mul v21.4S, v21.4S,v24.s[2] -ldr q2, [x0, #304] -mla v13.4S, v15.4S, v31.s[0] -ldr q15, [x0, #368] -sub v9.4s, v20.4s, v19.4s -add v20.4s, v20.4s, v19.4s -str q10, [x0, #416] -sqrdmulh v10.4S, v0.4S, v23.s[3] -str q3, [x0, #480] -mul v0.4S, v0.4S,v24.s[3] -ldr q3, [x0, #432] -mla v21.4S, v16.4S, v31.s[0] -ldr q16, [x0, #496] -sub v19.4s, v14.4s, v13.4s -add v14.4s, v14.4s, v13.4s -str q20, [x0, #544] -sqrdmulh v20.4S, v18.4S, v29.s[0] -str q9, [x0, #608] -ldr q9, [x0, #560] -mul v18.4S, v18.4S,v30.s[0] -ldr q13, [x0, #624] -mla v0.4S, v10.4S, v31.s[0] -sub v10.4s, v22.4s, v21.4s -add v22.4s, v22.4s, v21.4s -str q14, [x0, #672] -sqrdmulh v14.4S, v11.4S, v29.s[0] -str q19, [x0, #736] -ldr q19, [x0, #688] -mul v11.4S, v11.4S,v30.s[0] -ldr q21, [x0, #752] -mla v18.4S, v20.4S, v31.s[0] -sub v20.4s, v1.4s, v0.4s -add v1.4s, v1.4s, v0.4s -str q22, [x0, #800] -sqrdmulh v22.4S, v17.4S, v29.s[0] -str q10, [x0, #864] -mul v17.4S, v17.4S,v30.s[0] -ldr q10, [x0, #48] -mla v11.4S, v14.4S, v31.s[0] -sub v14.4s, v2.4s, v18.4s -add v2.4s, v2.4s, v18.4s -str q1, [x0, #928] -sqrdmulh v1.4S, v12.4S, v29.s[0] -str q20, [x0, #992] -mul v12.4S, v12.4S,v30.s[0] -ldr q20, [x0, #112] -mla v17.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v11.4s -add v15.4s, v15.4s, v11.4s -sqrdmulh v11.4S, v9.4S, v29.s[0] -ldr q18, [x0, #176] -mul v9.4S, v9.4S,v30.s[0] -mla v12.4S, v1.4S, v31.s[0] -sub v1.4s, v3.4s, v17.4s -add v3.4s, v3.4s, v17.4s -sqrdmulh v17.4S, v13.4S, v29.s[0] -ldr q0, [x0, #240] -mul v13.4S, v13.4S,v30.s[0] -mla v9.4S, v11.4S, v31.s[0] -sub v11.4s, v16.4s, v12.4s -add v16.4s, v16.4s, v12.4s -sqrdmulh v12.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -mla v13.4S, v17.4S, v31.s[0] -sub v17.4s, v10.4s, v9.4s -add v10.4s, v10.4s, v9.4s -sqrdmulh v9.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -mla v19.4S, v12.4S, v31.s[0] -sub v12.4s, v20.4s, v13.4s -add v20.4s, v20.4s, v13.4s -sqrdmulh v13.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -mla v21.4S, v9.4S, v31.s[0] -sub v9.4s, v18.4s, v19.4s -add v18.4s, v18.4s, v19.4s -sqrdmulh v19.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -mla v3.4S, v13.4S, v31.s[0] -sub v13.4s, v0.4s, v21.4s -add v0.4s, v0.4s, v21.4s -sqrdmulh v21.4S, v2.4S, v29.s[1] -mul v2.4S, v2.4S,v30.s[1] -mla v16.4S, v19.4S, v31.s[0] -sub v19.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -mla v2.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v16.4s -add v0.4s, v0.4s, v16.4s -sqrdmulh v16.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -mla v15.4S, v3.4S, v31.s[0] -sub v3.4s, v10.4s, v2.4s -add v10.4s, v10.4s, v2.4s -sqrdmulh v2.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -mla v1.4S, v16.4S, v31.s[0] -sub v16.4s, v20.4s, v15.4s -add v20.4s, v20.4s, v15.4s -sqrdmulh v15.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -mla v11.4S, v2.4S, v31.s[0] -sub v2.4s, v9.4s, v1.4s -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -mla v14.4S, v15.4S, v31.s[0] -sub v15.4s, v13.4s, v11.4s -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v18.4S, v27.s[0] -mul v18.4S, v18.4S,v28.s[0] -mla v22.4S, v1.4S, v31.s[0] -sub v1.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -sqrdmulh v14.4S, v0.4S, v27.s[0] -mul v0.4S, v0.4S,v28.s[0] -mla v18.4S, v11.4S, v31.s[0] -sub v11.4s, v12.4s, v22.4s -add v12.4s, v12.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -mla v0.4S, v14.4S, v31.s[0] -sub v14.4s, v10.4s, v18.4s -add v10.4s, v10.4s, v18.4s -sqrdmulh v18.4S, v21.4S, v27.s[1] -mul v21.4S, v21.4S,v28.s[1] -mla v19.4S, v22.4S, v31.s[0] -sub v22.4s, v20.4s, v0.4s -add v20.4s, v20.4s, v0.4s -sqrdmulh v0.4S, v9.4S, v27.s[2] -mul v9.4S, v9.4S,v28.s[2] -mla v21.4S, v18.4S, v31.s[0] -sub v18.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v13.4S, v27.s[2] -mul v13.4S, v13.4S,v28.s[2] -mla v9.4S, v0.4S, v31.s[0] -sub v0.4s, v16.4s, v21.4s -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v2.4S, v27.s[3] -mul v2.4S, v2.4S,v28.s[3] -mla v13.4S, v19.4S, v31.s[0] -sub v19.4s, v17.4s, v9.4s -add v17.4s, v17.4s, v9.4s -sqrdmulh v9.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -mla v2.4S, v21.4S, v31.s[0] -sub v21.4s, v12.4s, v13.4s -add v12.4s, v12.4s, v13.4s -sqrdmulh v13.4S, v20.4S, v25.s[0] -mul v20.4S, v20.4S,v26.s[0] -mla v15.4S, v9.4S, v31.s[0] -sub v9.4s, v1.4s, v2.4s -add v1.4s, v1.4s, v2.4s -sqrdmulh v2.4S, v22.4S, v25.s[1] -mul v22.4S, v22.4S,v26.s[1] -mla v20.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -mla v22.4S, v2.4S, v31.s[0] -sub v2.4s, v10.4s, v20.4s -add v10.4s, v10.4s, v20.4s -sqrdmulh v20.4S, v0.4S, v25.s[3] -mul v0.4S, v0.4S,v26.s[3] -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v14.4s, v22.4s -add v14.4s, v14.4s, v22.4s -str q10, [x0, #48] -sqrdmulh v10.4S, v12.4S, v23.s[0] -str q2, [x0, #112] -mul v12.4S, v12.4S,v24.s[0] -ldr q2, [x0, #768] -mla v0.4S, v20.4S, v31.s[0] -ldr q20, [x0, #832] -sub v22.4s, v3.4s, v16.4s -add v3.4s, v3.4s, v16.4s -str q14, [x0, #176] -sqrdmulh v14.4S, v21.4S, v23.s[1] -str q15, [x0, #240] -mul v21.4S, v21.4S,v24.s[1] -ldr q15, [x0, #896] -mla v12.4S, v10.4S, v31.s[0] -ldr q10, [x0, #960] -sub v16.4s, v18.4s, v0.4s -add v18.4s, v18.4s, v0.4s -str q3, [x0, #304] -sqrdmulh v3.4S, v11.4S, v23.s[2] -str q22, [x0, #368] -mul v11.4S, v11.4S,v24.s[2] -ldr q22, [x0, #256] -mla v21.4S, v14.4S, v31.s[0] -ldr q14, [x0, #320] -sub v0.4s, v17.4s, v12.4s -add v17.4s, v17.4s, v12.4s -str q18, [x0, #432] -sqrdmulh v18.4S, v13.4S, v23.s[3] -str q16, [x0, #496] -mul v13.4S, v13.4S,v24.s[3] -ldr q16, [x0, #384] -mla v11.4S, v3.4S, v31.s[0] -ldr q3, [x0, #448] -sub v12.4s, v19.4s, v21.4s -add v19.4s, v19.4s, v21.4s -str q17, [x0, #560] -sqrdmulh v17.4S, v2.4S, v29.s[0] -str q0, [x0, #624] -ldr q0, [x0, #512] -mul v2.4S, v2.4S,v30.s[0] -ldr q21, [x0, #576] -mla v13.4S, v18.4S, v31.s[0] -sub v18.4s, v1.4s, v11.4s -add v1.4s, v1.4s, v11.4s -str q19, [x0, #688] -sqrdmulh v19.4S, v20.4S, v29.s[0] -str q12, [x0, #752] -ldr q12, [x0, #640] -mul v20.4S, v20.4S,v30.s[0] -ldr q11, [x0, #704] -mla v2.4S, v17.4S, v31.s[0] -sub v17.4s, v9.4s, v13.4s -add v9.4s, v9.4s, v13.4s -str q1, [x0, #816] -sqrdmulh v1.4S, v15.4S, v29.s[0] -str q18, [x0, #880] -mul v15.4S, v15.4S,v30.s[0] -ldr q18, [x0, #0] -mla v20.4S, v19.4S, v31.s[0] -sub v19.4s, v22.4s, v2.4s -add v22.4s, v22.4s, v2.4s -str q9, [x0, #944] -sqrdmulh v9.4S, v10.4S, v29.s[0] -str q17, [x0, #1008] -mul v10.4S, v10.4S,v30.s[0] -ldr q17, [x0, #64] -mla v15.4S, v1.4S, v31.s[0] -sub v1.4s, v14.4s, v20.4s -add v14.4s, v14.4s, v20.4s -sqrdmulh v20.4S, v0.4S, v29.s[0] -ldr q2, [x0, #128] -mul v0.4S, v0.4S,v30.s[0] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v16.4s, v15.4s -add v16.4s, v16.4s, v15.4s -sqrdmulh v15.4S, v21.4S, v29.s[0] -ldr q13, [x0, #192] -mul v21.4S, v21.4S,v30.s[0] -mla v0.4S, v20.4S, v31.s[0] -sub v20.4s, v3.4s, v10.4s -add v3.4s, v3.4s, v10.4s -sqrdmulh v10.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -mla v21.4S, v15.4S, v31.s[0] -sub v15.4s, v18.4s, v0.4s -add v18.4s, v18.4s, v0.4s -sqrdmulh v0.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -mla v12.4S, v10.4S, v31.s[0] -sub v10.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -mla v11.4S, v0.4S, v31.s[0] -sub v0.4s, v2.4s, v12.4s -add v2.4s, v2.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -mla v16.4S, v21.4S, v31.s[0] -sub v21.4s, v13.4s, v11.4s -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v29.s[1] -mul v22.4S, v22.4S,v30.s[1] -mla v3.4S, v12.4S, v31.s[0] -sub v12.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v13.4s, v3.4s -add v13.4s, v13.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -mla v14.4S, v16.4S, v31.s[0] -sub v16.4s, v18.4s, v22.4s -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -mla v9.4S, v3.4S, v31.s[0] -sub v3.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -sqrdmulh v14.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -mla v20.4S, v22.4S, v31.s[0] -sub v22.4s, v0.4s, v9.4s -add v0.4s, v0.4s, v9.4s -sqrdmulh v9.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -mla v19.4S, v14.4S, v31.s[0] -sub v14.4s, v21.4s, v20.4s -add v21.4s, v21.4s, v20.4s -sqrdmulh v20.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -mla v1.4S, v9.4S, v31.s[0] -sub v9.4s, v15.4s, v19.4s -add v15.4s, v15.4s, v19.4s -sqrdmulh v19.4S, v13.4S, v27.s[0] -mul v13.4S, v13.4S,v28.s[0] -mla v2.4S, v20.4S, v31.s[0] -sub v20.4s, v10.4s, v1.4s -add v10.4s, v10.4s, v1.4s -sqrdmulh v1.4S, v12.4S, v27.s[1] -mul v12.4S, v12.4S,v28.s[1] -mla v13.4S, v19.4S, v31.s[0] -sub v19.4s, v18.4s, v2.4s -add v18.4s, v18.4s, v2.4s -sqrdmulh v2.4S, v11.4S, v27.s[1] -mul v11.4S, v11.4S,v28.s[1] -mla v12.4S, v1.4S, v31.s[0] -sub v1.4s, v17.4s, v13.4s -add v17.4s, v17.4s, v13.4s -sqrdmulh v13.4S, v0.4S, v27.s[2] -mul v0.4S, v0.4S,v28.s[2] -mla v11.4S, v2.4S, v31.s[0] -sub v2.4s, v16.4s, v12.4s -add v16.4s, v16.4s, v12.4s -sqrdmulh v12.4S, v21.4S, v27.s[2] -mul v21.4S, v21.4S,v28.s[2] -mla v0.4S, v13.4S, v31.s[0] -sub v13.4s, v3.4s, v11.4s -add v3.4s, v3.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v27.s[3] -mul v22.4S, v22.4S,v28.s[3] -mla v21.4S, v12.4S, v31.s[0] -sub v12.4s, v15.4s, v0.4s -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v14.4S, v27.s[3] -mul v14.4S, v14.4S,v28.s[3] -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v10.4s, v21.4s -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v25.s[0] -mul v17.4S, v17.4S,v26.s[0] -mla v14.4S, v0.4S, v31.s[0] -sub v0.4s, v9.4s, v22.4s -add v9.4s, v9.4s, v22.4s -sqrdmulh v22.4S, v1.4S, v25.s[1] -mul v1.4S, v1.4S,v26.s[1] -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v20.4s, v14.4s -add v20.4s, v20.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v25.s[2] -mul v3.4S, v3.4S,v26.s[2] -mla v1.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v17.4s -add v18.4s, v18.4s, v17.4s -sqrdmulh v17.4S, v13.4S, v25.s[3] -mul v13.4S, v13.4S,v26.s[3] -mla v3.4S, v14.4S, v31.s[0] -sub v14.4s, v19.4s, v1.4s -add v19.4s, v19.4s, v1.4s -str q18, [x0, #0] -sqrdmulh v18.4S, v10.4S, v23.s[0] -str q22, [x0, #64] -mul v10.4S, v10.4S,v24.s[0] -ldr q22, [x0, #784] -mla v13.4S, v17.4S, v31.s[0] -ldr q17, [x0, #848] -sub v1.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -str q19, [x0, #128] -sqrdmulh v19.4S, v11.4S, v23.s[1] -str q14, [x0, #192] -mul v11.4S, v11.4S,v24.s[1] -ldr q14, [x0, #912] -mla v10.4S, v18.4S, v31.s[0] -ldr q18, [x0, #976] -sub v3.4s, v2.4s, v13.4s -add v2.4s, v2.4s, v13.4s -str q16, [x0, #256] -sqrdmulh v16.4S, v20.4S, v23.s[2] -str q1, [x0, #320] -mul v20.4S, v20.4S,v24.s[2] -ldr q1, [x0, #272] -mla v11.4S, v19.4S, v31.s[0] -ldr q19, [x0, #336] -sub v13.4s, v15.4s, v10.4s -add v15.4s, v15.4s, v10.4s -str q2, [x0, #384] -sqrdmulh v2.4S, v21.4S, v23.s[3] -str q3, [x0, #448] -mul v21.4S, v21.4S,v24.s[3] -ldr q3, [x0, #400] -mla v20.4S, v16.4S, v31.s[0] -ldr q16, [x0, #464] -sub v10.4s, v12.4s, v11.4s -add v12.4s, v12.4s, v11.4s -str q15, [x0, #512] -sqrdmulh v15.4S, v22.4S, v29.s[0] -str q13, [x0, #576] -ldr q13, [x0, #528] -mul v22.4S, v22.4S,v30.s[0] -ldr q11, [x0, #592] -mla v21.4S, v2.4S, v31.s[0] -sub v2.4s, v9.4s, v20.4s -add v9.4s, v9.4s, v20.4s -str q12, [x0, #640] -sqrdmulh v12.4S, v17.4S, v29.s[0] -str q10, [x0, #704] -ldr q10, [x0, #656] -mul v17.4S, v17.4S,v30.s[0] -ldr q20, [x0, #720] -mla v22.4S, v15.4S, v31.s[0] -sub v15.4s, v0.4s, v21.4s -add v0.4s, v0.4s, v21.4s -str q9, [x0, #768] -sqrdmulh v9.4S, v14.4S, v29.s[0] -str q2, [x0, #832] -mul v14.4S, v14.4S,v30.s[0] -ldr q2, [x0, #16] -mla v17.4S, v12.4S, v31.s[0] -sub v12.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -str q0, [x0, #896] -sqrdmulh v0.4S, v18.4S, v29.s[0] -str q15, [x0, #960] -mul v18.4S, v18.4S,v30.s[0] -ldr q15, [x0, #80] -mla v14.4S, v9.4S, v31.s[0] -sub v9.4s, v19.4s, v17.4s -add v19.4s, v19.4s, v17.4s -sqrdmulh v17.4S, v13.4S, v29.s[0] -ldr q22, [x0, #144] -mul v13.4S, v13.4S,v30.s[0] -mla v18.4S, v0.4S, v31.s[0] -sub v0.4s, v3.4s, v14.4s -add v3.4s, v3.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v29.s[0] -ldr q21, [x0, #208] -mul v11.4S, v11.4S,v30.s[0] -mla v13.4S, v17.4S, v31.s[0] -sub v17.4s, v16.4s, v18.4s -add v16.4s, v16.4s, v18.4s -sqrdmulh v18.4S, v10.4S, v29.s[0] -mul v10.4S, v10.4S,v30.s[0] -mla v11.4S, v14.4S, v31.s[0] -sub v14.4s, v2.4s, v13.4s -add v2.4s, v2.4s, v13.4s -sqrdmulh v13.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -mla v10.4S, v18.4S, v31.s[0] -sub v18.4s, v15.4s, v11.4s -add v15.4s, v15.4s, v11.4s -sqrdmulh v11.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -mla v20.4S, v13.4S, v31.s[0] -sub v13.4s, v22.4s, v10.4s -add v22.4s, v22.4s, v10.4s -sqrdmulh v10.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -mla v3.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v20.4s -add v21.4s, v21.4s, v20.4s -sqrdmulh v20.4S, v1.4S, v29.s[1] -mul v1.4S, v1.4S,v30.s[1] -mla v16.4S, v10.4S, v31.s[0] -sub v10.4s, v22.4s, v3.4s -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -mla v1.4S, v20.4S, v31.s[0] -sub v20.4s, v21.4s, v16.4s -add v21.4s, v21.4s, v16.4s -sqrdmulh v16.4S, v0.4S, v29.s[2] -mul v0.4S, v0.4S,v30.s[2] -mla v19.4S, v3.4S, v31.s[0] -sub v3.4s, v2.4s, v1.4s -add v2.4s, v2.4s, v1.4s -sqrdmulh v1.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -mla v0.4S, v16.4S, v31.s[0] -sub v16.4s, v15.4s, v19.4s -add v15.4s, v15.4s, v19.4s -sqrdmulh v19.4S, v12.4S, v29.s[2] -mul v12.4S, v12.4S,v30.s[2] -mla v17.4S, v1.4S, v31.s[0] -sub v1.4s, v13.4s, v0.4s -add v13.4s, v13.4s, v0.4s -sqrdmulh v0.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -mla v12.4S, v19.4S, v31.s[0] -sub v19.4s, v11.4s, v17.4s -add v11.4s, v11.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v27.s[0] -mul v22.4S, v22.4S,v28.s[0] -mla v9.4S, v0.4S, v31.s[0] -sub v0.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -sqrdmulh v12.4S, v21.4S, v27.s[0] -mul v21.4S, v21.4S,v28.s[0] -mla v22.4S, v17.4S, v31.s[0] -sub v17.4s, v18.4s, v9.4s -add v18.4s, v18.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v27.s[1] -mul v10.4S, v10.4S,v28.s[1] -mla v21.4S, v12.4S, v31.s[0] -sub v12.4s, v2.4s, v22.4s -add v2.4s, v2.4s, v22.4s -sqrdmulh v22.4S, v20.4S, v27.s[1] -mul v20.4S, v20.4S,v28.s[1] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v15.4s, v21.4s -add v15.4s, v15.4s, v21.4s -sqrdmulh v21.4S, v13.4S, v27.s[2] -mul v13.4S, v13.4S,v28.s[2] -mla v20.4S, v22.4S, v31.s[0] -sub v22.4s, v3.4s, v10.4s -add v3.4s, v3.4s, v10.4s -sqrdmulh v10.4S, v11.4S, v27.s[2] -mul v11.4S, v11.4S,v28.s[2] -mla v13.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v1.4S, v27.s[3] -mul v1.4S, v1.4S,v28.s[3] -mla v11.4S, v10.4S, v31.s[0] -sub v10.4s, v14.4s, v13.4s -add v14.4s, v14.4s, v13.4s -sqrdmulh v13.4S, v19.4S, v27.s[3] -mul v19.4S, v19.4S,v28.s[3] -mla v1.4S, v20.4S, v31.s[0] -sub v20.4s, v18.4s, v11.4s -add v18.4s, v18.4s, v11.4s -sqrdmulh v11.4S, v15.4S, v25.s[0] -mul v15.4S, v15.4S,v26.s[0] -mla v19.4S, v13.4S, v31.s[0] -sub v13.4s, v0.4s, v1.4s -add v0.4s, v0.4s, v1.4s -sqrdmulh v1.4S, v9.4S, v25.s[1] -mul v9.4S, v9.4S,v26.s[1] -mla v15.4S, v11.4S, v31.s[0] -sub v11.4s, v17.4s, v19.4s -add v17.4s, v17.4s, v19.4s -sqrdmulh v19.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -mla v9.4S, v1.4S, v31.s[0] -sub v1.4s, v2.4s, v15.4s -add v2.4s, v2.4s, v15.4s -sqrdmulh v15.4S, v21.4S, v25.s[3] -mul v21.4S, v21.4S,v26.s[3] -mla v16.4S, v19.4S, v31.s[0] -sub v19.4s, v12.4s, v9.4s -add v12.4s, v12.4s, v9.4s -str q2, [x0, #16] -sqrdmulh v2.4S, v18.4S, v23.s[0] -str q1, [x0, #80] -mul v18.4S, v18.4S,v24.s[0] -mla v21.4S, v15.4S, v31.s[0] -sub v15.4s, v3.4s, v16.4s -add v3.4s, v3.4s, v16.4s -str q12, [x0, #144] -sqrdmulh v12.4S, v20.4S, v23.s[1] -str q19, [x0, #208] -mul v20.4S, v20.4S,v24.s[1] -mla v18.4S, v2.4S, v31.s[0] -sub v2.4s, v22.4s, v21.4s -add v22.4s, v22.4s, v21.4s -str q3, [x0, #272] -sqrdmulh v3.4S, v17.4S, v23.s[2] -str q15, [x0, #336] -mul v17.4S, v17.4S,v24.s[2] -mla v20.4S, v12.4S, v31.s[0] -sub v12.4s, v14.4s, v18.4s -add v14.4s, v14.4s, v18.4s -str q22, [x0, #400] -sqrdmulh v22.4S, v11.4S, v23.s[3] -str q2, [x0, #464] -mul v11.4S, v11.4S,v24.s[3] -mla v17.4S, v3.4S, v31.s[0] -sub v3.4s, v10.4s, v20.4s -add v10.4s, v10.4s, v20.4s -str q14, [x0, #528] -str q12, [x0, #592] -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -str q10, [x0, #656] -str q3, [x0, #720] -sub v3.4s, v13.4s, v11.4s -add v13.4s, v13.4s, v11.4s -str q0, [x0, #784] -str q22, [x0, #848] -str q13, [x0, #912] -str q3, [x0, #976] -ldr q4, [x17, #+128] -ldr q5, [x17, #+144] -ldr q6, [x17, #+160] -ldr q7, [x17, #+176] -ldr q8, [x17, #+192] -ldr q9, [x17, #+208] -ldr q1, [x17, #+224] -ldr q16, [x17, #+240] -ldr q19, [x0, #32] -ldr q21, [x0, #48] -ldr q15, [x0, #0] -ldr q18, [x0, #16] -sqrdmulh v2.4S, v19.4S, v5.s[0] -mul v19.4S, v19.4S,v4.s[0] -mla v19.4S, v2.4S, v31.s[0] -sub v2.4s, v15.4s, v19.4s -add v15.4s, v15.4s, v19.4s -sqrdmulh v19.4S, v21.4S, v5.s[0] -mul v21.4S, v21.4S,v4.s[0] -mla v21.4S, v19.4S, v31.s[0] -sub v19.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v18.4S, v5.s[1] -mul v18.4S, v18.4S,v4.s[1] -mla v18.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v18.4s -add v15.4s, v15.4s, v18.4s -sqrdmulh v18.4S, v19.4S, v5.s[2] -mul v19.4S, v19.4S,v4.s[2] -mla v19.4S, v18.4S, v31.s[0] -sub v18.4s, v2.4s, v19.4s -add v2.4s, v2.4s, v19.4s -trn1 v19.4S, v15.4S, v21.4S -trn2 v20.4S, v15.4S, v21.4S -trn1 v14.4S, v2.4S, v18.4S -trn2 v12.4S, v2.4S, v18.4S -trn2 v2.2D, v19.2D, v14.2D -trn2 v18.2D, v20.2D, v12.2D -trn1 v15.2D, v19.2D, v14.2D -trn1 v21.2D, v20.2D, v12.2D -sqrdmulh v12.4S, v2.4S, v7.4S -mul v2.4S, v2.4S,v6.4S -mla v2.4S, v12.4S, v31.s[0] -sub v12.4s, v15.4s, v2.4s -add v15.4s, v15.4s, v2.4s -sqrdmulh v2.4S, v18.4S, v7.4S -mul v18.4S, v18.4S,v6.4S -mla v18.4S, v2.4S, v31.s[0] -sub v2.4s, v21.4s, v18.4s -add v21.4s, v21.4s, v18.4s -sqrdmulh v18.4S, v21.4S, v9.4S -mul v21.4S, v21.4S,v8.4S -mla v21.4S, v18.4S, v31.s[0] -sub v18.4s, v15.4s, v21.4s -add v15.4s, v15.4s, v21.4s -sqrdmulh v21.4S, v2.4S, v16.4S -mul v2.4S, v2.4S,v1.4S -mla v2.4S, v21.4S, v31.s[0] -sub v21.4s, v12.4s, v2.4s -add v12.4s, v12.4s, v2.4s -str q15, [x0, #0] -str q18, [x0, #16] -str q12, [x0, #32] -str q21, [x0, #48] -ldr q21, [x17, #+256] -ldr q12, [x17, #+272] -ldr q18, [x17, #+288] -ldr q15, [x17, #+304] -ldr q2, [x17, #+320] -ldr q20, [x17, #+336] -ldr q14, [x17, #+352] -ldr q19, [x17, #+368] -ldr q16, [x0, #96] -ldr q1, [x0, #112] -ldr q9, [x0, #64] -ldr q8, [x0, #80] -sqrdmulh v7.4S, v16.4S, v12.s[0] -mul v16.4S, v16.4S,v21.s[0] -mla v16.4S, v7.4S, v31.s[0] -sub v7.4s, v9.4s, v16.4s -add v9.4s, v9.4s, v16.4s -sqrdmulh v16.4S, v1.4S, v12.s[0] -mul v1.4S, v1.4S,v21.s[0] -mla v1.4S, v16.4S, v31.s[0] -sub v16.4s, v8.4s, v1.4s -add v8.4s, v8.4s, v1.4s -sqrdmulh v1.4S, v8.4S, v12.s[1] -mul v8.4S, v8.4S,v21.s[1] -mla v8.4S, v1.4S, v31.s[0] -sub v1.4s, v9.4s, v8.4s -add v9.4s, v9.4s, v8.4s -sqrdmulh v8.4S, v16.4S, v12.s[2] -mul v16.4S, v16.4S,v21.s[2] -mla v16.4S, v8.4S, v31.s[0] -sub v8.4s, v7.4s, v16.4s -add v7.4s, v7.4s, v16.4s -trn1 v16.4S, v9.4S, v1.4S -trn2 v6.4S, v9.4S, v1.4S -trn1 v5.4S, v7.4S, v8.4S -trn2 v4.4S, v7.4S, v8.4S -trn2 v7.2D, v16.2D, v5.2D -trn2 v8.2D, v6.2D, v4.2D -trn1 v9.2D, v16.2D, v5.2D -trn1 v1.2D, v6.2D, v4.2D -sqrdmulh v4.4S, v7.4S, v15.4S -mul v7.4S, v7.4S,v18.4S -mla v7.4S, v4.4S, v31.s[0] -sub v4.4s, v9.4s, v7.4s -add v9.4s, v9.4s, v7.4s -sqrdmulh v7.4S, v8.4S, v15.4S -mul v8.4S, v8.4S,v18.4S -mla v8.4S, v7.4S, v31.s[0] -sub v7.4s, v1.4s, v8.4s -add v1.4s, v1.4s, v8.4s -sqrdmulh v8.4S, v1.4S, v20.4S -mul v1.4S, v1.4S,v2.4S -mla v1.4S, v8.4S, v31.s[0] -sub v8.4s, v9.4s, v1.4s -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v7.4S, v19.4S -mul v7.4S, v7.4S,v14.4S -mla v7.4S, v1.4S, v31.s[0] -sub v1.4s, v4.4s, v7.4s -add v4.4s, v4.4s, v7.4s -str q9, [x0, #64] -str q8, [x0, #80] -str q4, [x0, #96] -str q1, [x0, #112] -ldr q1, [x17, #+384] -ldr q4, [x17, #+400] -ldr q8, [x17, #+416] -ldr q9, [x17, #+432] -ldr q7, [x17, #+448] -ldr q6, [x17, #+464] -ldr q5, [x17, #+480] -ldr q16, [x17, #+496] -ldr q19, [x0, #160] -ldr q14, [x0, #176] -ldr q20, [x0, #128] -ldr q2, [x0, #144] -sqrdmulh v15.4S, v19.4S, v4.s[0] -mul v19.4S, v19.4S,v1.s[0] -mla v19.4S, v15.4S, v31.s[0] -sub v15.4s, v20.4s, v19.4s -add v20.4s, v20.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v4.s[0] -mul v14.4S, v14.4S,v1.s[0] -mla v14.4S, v19.4S, v31.s[0] -sub v19.4s, v2.4s, v14.4s -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v4.s[1] -mul v2.4S, v2.4S,v1.s[1] -mla v2.4S, v14.4S, v31.s[0] -sub v14.4s, v20.4s, v2.4s -add v20.4s, v20.4s, v2.4s -sqrdmulh v2.4S, v19.4S, v4.s[2] -mul v19.4S, v19.4S,v1.s[2] -mla v19.4S, v2.4S, v31.s[0] -sub v2.4s, v15.4s, v19.4s -add v15.4s, v15.4s, v19.4s -trn1 v19.4S, v20.4S, v14.4S -trn2 v18.4S, v20.4S, v14.4S -trn1 v12.4S, v15.4S, v2.4S -trn2 v21.4S, v15.4S, v2.4S -trn2 v15.2D, v19.2D, v12.2D -trn2 v2.2D, v18.2D, v21.2D -trn1 v20.2D, v19.2D, v12.2D -trn1 v14.2D, v18.2D, v21.2D -sqrdmulh v21.4S, v15.4S, v9.4S -mul v15.4S, v15.4S,v8.4S -mla v15.4S, v21.4S, v31.s[0] -sub v21.4s, v20.4s, v15.4s -add v20.4s, v20.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v9.4S -mul v2.4S, v2.4S,v8.4S -mla v2.4S, v15.4S, v31.s[0] -sub v15.4s, v14.4s, v2.4s -add v14.4s, v14.4s, v2.4s -sqrdmulh v2.4S, v14.4S, v6.4S -mul v14.4S, v14.4S,v7.4S -mla v14.4S, v2.4S, v31.s[0] -sub v2.4s, v20.4s, v14.4s -add v20.4s, v20.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v16.4S -mul v15.4S, v15.4S,v5.4S -mla v15.4S, v14.4S, v31.s[0] -sub v14.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -str q20, [x0, #128] -str q2, [x0, #144] -str q21, [x0, #160] -str q14, [x0, #176] -ldr q14, [x17, #+512] -ldr q21, [x17, #+528] -ldr q2, [x17, #+544] -ldr q20, [x17, #+560] -ldr q15, [x17, #+576] -ldr q18, [x17, #+592] -ldr q12, [x17, #+608] -ldr q19, [x17, #+624] -ldr q16, [x0, #224] -ldr q5, [x0, #240] -ldr q6, [x0, #192] -ldr q7, [x0, #208] -sqrdmulh v9.4S, v16.4S, v21.s[0] -mul v16.4S, v16.4S,v14.s[0] -mla v16.4S, v9.4S, v31.s[0] -sub v9.4s, v6.4s, v16.4s -add v6.4s, v6.4s, v16.4s -sqrdmulh v16.4S, v5.4S, v21.s[0] -mul v5.4S, v5.4S,v14.s[0] -mla v5.4S, v16.4S, v31.s[0] -sub v16.4s, v7.4s, v5.4s -add v7.4s, v7.4s, v5.4s -sqrdmulh v5.4S, v7.4S, v21.s[1] -mul v7.4S, v7.4S,v14.s[1] -mla v7.4S, v5.4S, v31.s[0] -sub v5.4s, v6.4s, v7.4s -add v6.4s, v6.4s, v7.4s -sqrdmulh v7.4S, v16.4S, v21.s[2] -mul v16.4S, v16.4S,v14.s[2] -mla v16.4S, v7.4S, v31.s[0] -sub v7.4s, v9.4s, v16.4s -add v9.4s, v9.4s, v16.4s -trn1 v16.4S, v6.4S, v5.4S -trn2 v8.4S, v6.4S, v5.4S -trn1 v4.4S, v9.4S, v7.4S -trn2 v1.4S, v9.4S, v7.4S -trn2 v9.2D, v16.2D, v4.2D -trn2 v7.2D, v8.2D, v1.2D -trn1 v6.2D, v16.2D, v4.2D -trn1 v5.2D, v8.2D, v1.2D -sqrdmulh v1.4S, v9.4S, v20.4S -mul v9.4S, v9.4S,v2.4S -mla v9.4S, v1.4S, v31.s[0] -sub v1.4s, v6.4s, v9.4s -add v6.4s, v6.4s, v9.4s -sqrdmulh v9.4S, v7.4S, v20.4S -mul v7.4S, v7.4S,v2.4S -mla v7.4S, v9.4S, v31.s[0] -sub v9.4s, v5.4s, v7.4s -add v5.4s, v5.4s, v7.4s -sqrdmulh v7.4S, v5.4S, v18.4S -mul v5.4S, v5.4S,v15.4S -mla v5.4S, v7.4S, v31.s[0] -sub v7.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v9.4S, v19.4S -mul v9.4S, v9.4S,v12.4S -mla v9.4S, v5.4S, v31.s[0] -sub v5.4s, v1.4s, v9.4s -add v1.4s, v1.4s, v9.4s -str q6, [x0, #192] -str q7, [x0, #208] -str q1, [x0, #224] -str q5, [x0, #240] -ldr q5, [x17, #+640] -ldr q1, [x17, #+656] -ldr q7, [x17, #+672] -ldr q6, [x17, #+688] -ldr q9, [x17, #+704] -ldr q8, [x17, #+720] -ldr q4, [x17, #+736] -ldr q16, [x17, #+752] -ldr q19, [x0, #288] -ldr q12, [x0, #304] -ldr q18, [x0, #256] -ldr q15, [x0, #272] -sqrdmulh v20.4S, v19.4S, v1.s[0] -mul v19.4S, v19.4S,v5.s[0] -mla v19.4S, v20.4S, v31.s[0] -sub v20.4s, v18.4s, v19.4s -add v18.4s, v18.4s, v19.4s -sqrdmulh v19.4S, v12.4S, v1.s[0] -mul v12.4S, v12.4S,v5.s[0] -mla v12.4S, v19.4S, v31.s[0] -sub v19.4s, v15.4s, v12.4s -add v15.4s, v15.4s, v12.4s -sqrdmulh v12.4S, v15.4S, v1.s[1] -mul v15.4S, v15.4S,v5.s[1] -mla v15.4S, v12.4S, v31.s[0] -sub v12.4s, v18.4s, v15.4s -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v19.4S, v1.s[2] -mul v19.4S, v19.4S,v5.s[2] -mla v19.4S, v15.4S, v31.s[0] -sub v15.4s, v20.4s, v19.4s -add v20.4s, v20.4s, v19.4s -trn1 v19.4S, v18.4S, v12.4S -trn2 v2.4S, v18.4S, v12.4S -trn1 v21.4S, v20.4S, v15.4S -trn2 v14.4S, v20.4S, v15.4S -trn2 v20.2D, v19.2D, v21.2D -trn2 v15.2D, v2.2D, v14.2D -trn1 v18.2D, v19.2D, v21.2D -trn1 v12.2D, v2.2D, v14.2D -sqrdmulh v14.4S, v20.4S, v6.4S -mul v20.4S, v20.4S,v7.4S -mla v20.4S, v14.4S, v31.s[0] -sub v14.4s, v18.4s, v20.4s -add v18.4s, v18.4s, v20.4s -sqrdmulh v20.4S, v15.4S, v6.4S -mul v15.4S, v15.4S,v7.4S -mla v15.4S, v20.4S, v31.s[0] -sub v20.4s, v12.4s, v15.4s -add v12.4s, v12.4s, v15.4s -sqrdmulh v15.4S, v12.4S, v8.4S -mul v12.4S, v12.4S,v9.4S -mla v12.4S, v15.4S, v31.s[0] -sub v15.4s, v18.4s, v12.4s -add v18.4s, v18.4s, v12.4s -sqrdmulh v12.4S, v20.4S, v16.4S -mul v20.4S, v20.4S,v4.4S -mla v20.4S, v12.4S, v31.s[0] -sub v12.4s, v14.4s, v20.4s -add v14.4s, v14.4s, v20.4s -str q18, [x0, #256] -str q15, [x0, #272] -str q14, [x0, #288] -str q12, [x0, #304] -ldr q12, [x17, #+768] -ldr q14, [x17, #+784] -ldr q15, [x17, #+800] -ldr q18, [x17, #+816] -ldr q20, [x17, #+832] -ldr q2, [x17, #+848] -ldr q21, [x17, #+864] -ldr q19, [x17, #+880] -ldr q16, [x0, #352] -ldr q4, [x0, #368] -ldr q8, [x0, #320] -ldr q9, [x0, #336] -sqrdmulh v6.4S, v16.4S, v14.s[0] -mul v16.4S, v16.4S,v12.s[0] -mla v16.4S, v6.4S, v31.s[0] -sub v6.4s, v8.4s, v16.4s -add v8.4s, v8.4s, v16.4s -sqrdmulh v16.4S, v4.4S, v14.s[0] -mul v4.4S, v4.4S,v12.s[0] -mla v4.4S, v16.4S, v31.s[0] -sub v16.4s, v9.4s, v4.4s -add v9.4s, v9.4s, v4.4s -sqrdmulh v4.4S, v9.4S, v14.s[1] -mul v9.4S, v9.4S,v12.s[1] -mla v9.4S, v4.4S, v31.s[0] -sub v4.4s, v8.4s, v9.4s -add v8.4s, v8.4s, v9.4s -sqrdmulh v9.4S, v16.4S, v14.s[2] -mul v16.4S, v16.4S,v12.s[2] -mla v16.4S, v9.4S, v31.s[0] -sub v9.4s, v6.4s, v16.4s -add v6.4s, v6.4s, v16.4s -trn1 v16.4S, v8.4S, v4.4S -trn2 v7.4S, v8.4S, v4.4S -trn1 v1.4S, v6.4S, v9.4S -trn2 v5.4S, v6.4S, v9.4S -trn2 v6.2D, v16.2D, v1.2D -trn2 v9.2D, v7.2D, v5.2D -trn1 v8.2D, v16.2D, v1.2D -trn1 v4.2D, v7.2D, v5.2D -sqrdmulh v5.4S, v6.4S, v18.4S -mul v6.4S, v6.4S,v15.4S -mla v6.4S, v5.4S, v31.s[0] -sub v5.4s, v8.4s, v6.4s -add v8.4s, v8.4s, v6.4s -sqrdmulh v6.4S, v9.4S, v18.4S -mul v9.4S, v9.4S,v15.4S -mla v9.4S, v6.4S, v31.s[0] -sub v6.4s, v4.4s, v9.4s -add v4.4s, v4.4s, v9.4s -sqrdmulh v9.4S, v4.4S, v2.4S -mul v4.4S, v4.4S,v20.4S -mla v4.4S, v9.4S, v31.s[0] -sub v9.4s, v8.4s, v4.4s -add v8.4s, v8.4s, v4.4s -sqrdmulh v4.4S, v6.4S, v19.4S -mul v6.4S, v6.4S,v21.4S -mla v6.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v6.4s -add v5.4s, v5.4s, v6.4s -str q8, [x0, #320] -str q9, [x0, #336] -str q5, [x0, #352] -str q4, [x0, #368] -ldr q4, [x17, #+896] -ldr q5, [x17, #+912] -ldr q9, [x17, #+928] -ldr q8, [x17, #+944] -ldr q6, [x17, #+960] -ldr q7, [x17, #+976] -ldr q1, [x17, #+992] -ldr q16, [x17, #+1008] -ldr q19, [x0, #416] -ldr q21, [x0, #432] -ldr q2, [x0, #384] -ldr q20, [x0, #400] -sqrdmulh v18.4S, v19.4S, v5.s[0] -mul v19.4S, v19.4S,v4.s[0] -mla v19.4S, v18.4S, v31.s[0] -sub v18.4s, v2.4s, v19.4s -add v2.4s, v2.4s, v19.4s -sqrdmulh v19.4S, v21.4S, v5.s[0] -mul v21.4S, v21.4S,v4.s[0] -mla v21.4S, v19.4S, v31.s[0] -sub v19.4s, v20.4s, v21.4s -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v20.4S, v5.s[1] -mul v20.4S, v20.4S,v4.s[1] -mla v20.4S, v21.4S, v31.s[0] -sub v21.4s, v2.4s, v20.4s -add v2.4s, v2.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v5.s[2] -mul v19.4S, v19.4S,v4.s[2] -mla v19.4S, v20.4S, v31.s[0] -sub v20.4s, v18.4s, v19.4s -add v18.4s, v18.4s, v19.4s -trn1 v19.4S, v2.4S, v21.4S -trn2 v15.4S, v2.4S, v21.4S -trn1 v14.4S, v18.4S, v20.4S -trn2 v12.4S, v18.4S, v20.4S -trn2 v18.2D, v19.2D, v14.2D -trn2 v20.2D, v15.2D, v12.2D -trn1 v2.2D, v19.2D, v14.2D -trn1 v21.2D, v15.2D, v12.2D -sqrdmulh v12.4S, v18.4S, v8.4S -mul v18.4S, v18.4S,v9.4S -mla v18.4S, v12.4S, v31.s[0] -sub v12.4s, v2.4s, v18.4s -add v2.4s, v2.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v8.4S -mul v20.4S, v20.4S,v9.4S -mla v20.4S, v18.4S, v31.s[0] -sub v18.4s, v21.4s, v20.4s -add v21.4s, v21.4s, v20.4s -sqrdmulh v20.4S, v21.4S, v7.4S -mul v21.4S, v21.4S,v6.4S -mla v21.4S, v20.4S, v31.s[0] -sub v20.4s, v2.4s, v21.4s -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v18.4S, v16.4S -mul v18.4S, v18.4S,v1.4S -mla v18.4S, v21.4S, v31.s[0] -sub v21.4s, v12.4s, v18.4s -add v12.4s, v12.4s, v18.4s -str q2, [x0, #384] -str q20, [x0, #400] -str q12, [x0, #416] -str q21, [x0, #432] -ldr q21, [x17, #+1024] -ldr q12, [x17, #+1040] -ldr q20, [x17, #+1056] -ldr q2, [x17, #+1072] -ldr q18, [x17, #+1088] -ldr q15, [x17, #+1104] -ldr q14, [x17, #+1120] -ldr q19, [x17, #+1136] -ldr q16, [x0, #480] -ldr q1, [x0, #496] -ldr q7, [x0, #448] -ldr q6, [x0, #464] -sqrdmulh v8.4S, v16.4S, v12.s[0] -mul v16.4S, v16.4S,v21.s[0] -mla v16.4S, v8.4S, v31.s[0] -sub v8.4s, v7.4s, v16.4s -add v7.4s, v7.4s, v16.4s -sqrdmulh v16.4S, v1.4S, v12.s[0] -mul v1.4S, v1.4S,v21.s[0] -mla v1.4S, v16.4S, v31.s[0] -sub v16.4s, v6.4s, v1.4s -add v6.4s, v6.4s, v1.4s -sqrdmulh v1.4S, v6.4S, v12.s[1] -mul v6.4S, v6.4S,v21.s[1] -mla v6.4S, v1.4S, v31.s[0] -sub v1.4s, v7.4s, v6.4s -add v7.4s, v7.4s, v6.4s -sqrdmulh v6.4S, v16.4S, v12.s[2] -mul v16.4S, v16.4S,v21.s[2] -mla v16.4S, v6.4S, v31.s[0] -sub v6.4s, v8.4s, v16.4s -add v8.4s, v8.4s, v16.4s -trn1 v16.4S, v7.4S, v1.4S -trn2 v9.4S, v7.4S, v1.4S -trn1 v5.4S, v8.4S, v6.4S -trn2 v4.4S, v8.4S, v6.4S -trn2 v8.2D, v16.2D, v5.2D -trn2 v6.2D, v9.2D, v4.2D -trn1 v7.2D, v16.2D, v5.2D -trn1 v1.2D, v9.2D, v4.2D -sqrdmulh v4.4S, v8.4S, v2.4S -mul v8.4S, v8.4S,v20.4S -mla v8.4S, v4.4S, v31.s[0] -sub v4.4s, v7.4s, v8.4s -add v7.4s, v7.4s, v8.4s -sqrdmulh v8.4S, v6.4S, v2.4S -mul v6.4S, v6.4S,v20.4S -mla v6.4S, v8.4S, v31.s[0] -sub v8.4s, v1.4s, v6.4s -add v1.4s, v1.4s, v6.4s -sqrdmulh v6.4S, v1.4S, v15.4S -mul v1.4S, v1.4S,v18.4S -mla v1.4S, v6.4S, v31.s[0] -sub v6.4s, v7.4s, v1.4s -add v7.4s, v7.4s, v1.4s -sqrdmulh v1.4S, v8.4S, v19.4S -mul v8.4S, v8.4S,v14.4S -mla v8.4S, v1.4S, v31.s[0] -sub v1.4s, v4.4s, v8.4s -add v4.4s, v4.4s, v8.4s -str q7, [x0, #448] -str q6, [x0, #464] -str q4, [x0, #480] -str q1, [x0, #496] -ldr q1, [x17, #+1152] -ldr q4, [x17, #+1168] -ldr q6, [x17, #+1184] -ldr q7, [x17, #+1200] -ldr q8, [x17, #+1216] -ldr q9, [x17, #+1232] -ldr q5, [x17, #+1248] -ldr q16, [x17, #+1264] -ldr q19, [x0, #544] -ldr q14, [x0, #560] -ldr q15, [x0, #512] -ldr q18, [x0, #528] -sqrdmulh v2.4S, v19.4S, v4.s[0] -mul v19.4S, v19.4S,v1.s[0] -mla v19.4S, v2.4S, v31.s[0] -sub v2.4s, v15.4s, v19.4s -add v15.4s, v15.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v4.s[0] -mul v14.4S, v14.4S,v1.s[0] -mla v14.4S, v19.4S, v31.s[0] -sub v19.4s, v18.4s, v14.4s -add v18.4s, v18.4s, v14.4s -sqrdmulh v14.4S, v18.4S, v4.s[1] -mul v18.4S, v18.4S,v1.s[1] -mla v18.4S, v14.4S, v31.s[0] -sub v14.4s, v15.4s, v18.4s -add v15.4s, v15.4s, v18.4s -sqrdmulh v18.4S, v19.4S, v4.s[2] -mul v19.4S, v19.4S,v1.s[2] -mla v19.4S, v18.4S, v31.s[0] -sub v18.4s, v2.4s, v19.4s -add v2.4s, v2.4s, v19.4s -trn1 v19.4S, v15.4S, v14.4S -trn2 v20.4S, v15.4S, v14.4S -trn1 v12.4S, v2.4S, v18.4S -trn2 v21.4S, v2.4S, v18.4S -trn2 v2.2D, v19.2D, v12.2D -trn2 v18.2D, v20.2D, v21.2D -trn1 v15.2D, v19.2D, v12.2D -trn1 v14.2D, v20.2D, v21.2D -sqrdmulh v21.4S, v2.4S, v7.4S -mul v2.4S, v2.4S,v6.4S -mla v2.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v2.4s -add v15.4s, v15.4s, v2.4s -sqrdmulh v2.4S, v18.4S, v7.4S -mul v18.4S, v18.4S,v6.4S -mla v18.4S, v2.4S, v31.s[0] -sub v2.4s, v14.4s, v18.4s -add v14.4s, v14.4s, v18.4s -sqrdmulh v18.4S, v14.4S, v9.4S -mul v14.4S, v14.4S,v8.4S -mla v14.4S, v18.4S, v31.s[0] -sub v18.4s, v15.4s, v14.4s -add v15.4s, v15.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v16.4S -mul v2.4S, v2.4S,v5.4S -mla v2.4S, v14.4S, v31.s[0] -sub v14.4s, v21.4s, v2.4s -add v21.4s, v21.4s, v2.4s -str q15, [x0, #512] -str q18, [x0, #528] -str q21, [x0, #544] -str q14, [x0, #560] -ldr q14, [x17, #+1280] -ldr q21, [x17, #+1296] -ldr q18, [x17, #+1312] -ldr q15, [x17, #+1328] -ldr q2, [x17, #+1344] -ldr q20, [x17, #+1360] -ldr q12, [x17, #+1376] -ldr q19, [x17, #+1392] -ldr q16, [x0, #608] -ldr q5, [x0, #624] -ldr q9, [x0, #576] -ldr q8, [x0, #592] -sqrdmulh v7.4S, v16.4S, v21.s[0] -mul v16.4S, v16.4S,v14.s[0] -mla v16.4S, v7.4S, v31.s[0] -sub v7.4s, v9.4s, v16.4s -add v9.4s, v9.4s, v16.4s -sqrdmulh v16.4S, v5.4S, v21.s[0] -mul v5.4S, v5.4S,v14.s[0] -mla v5.4S, v16.4S, v31.s[0] -sub v16.4s, v8.4s, v5.4s -add v8.4s, v8.4s, v5.4s -sqrdmulh v5.4S, v8.4S, v21.s[1] -mul v8.4S, v8.4S,v14.s[1] -mla v8.4S, v5.4S, v31.s[0] -sub v5.4s, v9.4s, v8.4s -add v9.4s, v9.4s, v8.4s -sqrdmulh v8.4S, v16.4S, v21.s[2] -mul v16.4S, v16.4S,v14.s[2] -mla v16.4S, v8.4S, v31.s[0] -sub v8.4s, v7.4s, v16.4s -add v7.4s, v7.4s, v16.4s -trn1 v16.4S, v9.4S, v5.4S -trn2 v6.4S, v9.4S, v5.4S -trn1 v4.4S, v7.4S, v8.4S -trn2 v1.4S, v7.4S, v8.4S -trn2 v7.2D, v16.2D, v4.2D -trn2 v8.2D, v6.2D, v1.2D -trn1 v9.2D, v16.2D, v4.2D -trn1 v5.2D, v6.2D, v1.2D -sqrdmulh v1.4S, v7.4S, v15.4S -mul v7.4S, v7.4S,v18.4S -mla v7.4S, v1.4S, v31.s[0] -sub v1.4s, v9.4s, v7.4s -add v9.4s, v9.4s, v7.4s -sqrdmulh v7.4S, v8.4S, v15.4S -mul v8.4S, v8.4S,v18.4S -mla v8.4S, v7.4S, v31.s[0] -sub v7.4s, v5.4s, v8.4s -add v5.4s, v5.4s, v8.4s -sqrdmulh v8.4S, v5.4S, v20.4S -mul v5.4S, v5.4S,v2.4S -mla v5.4S, v8.4S, v31.s[0] -sub v8.4s, v9.4s, v5.4s -add v9.4s, v9.4s, v5.4s -sqrdmulh v5.4S, v7.4S, v19.4S -mul v7.4S, v7.4S,v12.4S -mla v7.4S, v5.4S, v31.s[0] -sub v5.4s, v1.4s, v7.4s -add v1.4s, v1.4s, v7.4s -str q9, [x0, #576] -str q8, [x0, #592] -str q1, [x0, #608] -str q5, [x0, #624] -ldr q5, [x17, #+1408] -ldr q1, [x17, #+1424] -ldr q8, [x17, #+1440] -ldr q9, [x17, #+1456] -ldr q7, [x17, #+1472] -ldr q6, [x17, #+1488] -ldr q4, [x17, #+1504] -ldr q16, [x17, #+1520] -ldr q19, [x0, #672] -ldr q12, [x0, #688] -ldr q20, [x0, #640] -ldr q2, [x0, #656] -sqrdmulh v15.4S, v19.4S, v1.s[0] -mul v19.4S, v19.4S,v5.s[0] -mla v19.4S, v15.4S, v31.s[0] -sub v15.4s, v20.4s, v19.4s -add v20.4s, v20.4s, v19.4s -sqrdmulh v19.4S, v12.4S, v1.s[0] -mul v12.4S, v12.4S,v5.s[0] -mla v12.4S, v19.4S, v31.s[0] -sub v19.4s, v2.4s, v12.4s -add v2.4s, v2.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v1.s[1] -mul v2.4S, v2.4S,v5.s[1] -mla v2.4S, v12.4S, v31.s[0] -sub v12.4s, v20.4s, v2.4s -add v20.4s, v20.4s, v2.4s -sqrdmulh v2.4S, v19.4S, v1.s[2] -mul v19.4S, v19.4S,v5.s[2] -mla v19.4S, v2.4S, v31.s[0] -sub v2.4s, v15.4s, v19.4s -add v15.4s, v15.4s, v19.4s -trn1 v19.4S, v20.4S, v12.4S -trn2 v18.4S, v20.4S, v12.4S -trn1 v21.4S, v15.4S, v2.4S -trn2 v14.4S, v15.4S, v2.4S -trn2 v15.2D, v19.2D, v21.2D -trn2 v2.2D, v18.2D, v14.2D -trn1 v20.2D, v19.2D, v21.2D -trn1 v12.2D, v18.2D, v14.2D -sqrdmulh v14.4S, v15.4S, v9.4S -mul v15.4S, v15.4S,v8.4S -mla v15.4S, v14.4S, v31.s[0] -sub v14.4s, v20.4s, v15.4s -add v20.4s, v20.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v9.4S -mul v2.4S, v2.4S,v8.4S -mla v2.4S, v15.4S, v31.s[0] -sub v15.4s, v12.4s, v2.4s -add v12.4s, v12.4s, v2.4s -sqrdmulh v2.4S, v12.4S, v6.4S -mul v12.4S, v12.4S,v7.4S -mla v12.4S, v2.4S, v31.s[0] -sub v2.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -sqrdmulh v12.4S, v15.4S, v16.4S -mul v15.4S, v15.4S,v4.4S -mla v15.4S, v12.4S, v31.s[0] -sub v12.4s, v14.4s, v15.4s -add v14.4s, v14.4s, v15.4s -str q20, [x0, #640] -str q2, [x0, #656] -str q14, [x0, #672] -str q12, [x0, #688] -ldr q12, [x17, #+1536] -ldr q14, [x17, #+1552] -ldr q2, [x17, #+1568] -ldr q20, [x17, #+1584] -ldr q15, [x17, #+1600] -ldr q18, [x17, #+1616] -ldr q21, [x17, #+1632] -ldr q19, [x17, #+1648] -ldr q16, [x0, #736] -ldr q4, [x0, #752] -ldr q6, [x0, #704] -ldr q7, [x0, #720] -sqrdmulh v9.4S, v16.4S, v14.s[0] -mul v16.4S, v16.4S,v12.s[0] -mla v16.4S, v9.4S, v31.s[0] -sub v9.4s, v6.4s, v16.4s -add v6.4s, v6.4s, v16.4s -sqrdmulh v16.4S, v4.4S, v14.s[0] -mul v4.4S, v4.4S,v12.s[0] -mla v4.4S, v16.4S, v31.s[0] -sub v16.4s, v7.4s, v4.4s -add v7.4s, v7.4s, v4.4s -sqrdmulh v4.4S, v7.4S, v14.s[1] -mul v7.4S, v7.4S,v12.s[1] -mla v7.4S, v4.4S, v31.s[0] -sub v4.4s, v6.4s, v7.4s -add v6.4s, v6.4s, v7.4s -sqrdmulh v7.4S, v16.4S, v14.s[2] -mul v16.4S, v16.4S,v12.s[2] -mla v16.4S, v7.4S, v31.s[0] -sub v7.4s, v9.4s, v16.4s -add v9.4s, v9.4s, v16.4s -trn1 v16.4S, v6.4S, v4.4S -trn2 v8.4S, v6.4S, v4.4S -trn1 v1.4S, v9.4S, v7.4S -trn2 v5.4S, v9.4S, v7.4S -trn2 v9.2D, v16.2D, v1.2D -trn2 v7.2D, v8.2D, v5.2D -trn1 v6.2D, v16.2D, v1.2D -trn1 v4.2D, v8.2D, v5.2D -sqrdmulh v5.4S, v9.4S, v20.4S -mul v9.4S, v9.4S,v2.4S -mla v9.4S, v5.4S, v31.s[0] -sub v5.4s, v6.4s, v9.4s -add v6.4s, v6.4s, v9.4s -sqrdmulh v9.4S, v7.4S, v20.4S -mul v7.4S, v7.4S,v2.4S -mla v7.4S, v9.4S, v31.s[0] -sub v9.4s, v4.4s, v7.4s -add v4.4s, v4.4s, v7.4s -sqrdmulh v7.4S, v4.4S, v18.4S -mul v4.4S, v4.4S,v15.4S -mla v4.4S, v7.4S, v31.s[0] -sub v7.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -sqrdmulh v4.4S, v9.4S, v19.4S -mul v9.4S, v9.4S,v21.4S -mla v9.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v9.4s -add v5.4s, v5.4s, v9.4s -str q6, [x0, #704] -str q7, [x0, #720] -str q5, [x0, #736] -str q4, [x0, #752] -ldr q4, [x17, #+1664] -ldr q5, [x17, #+1680] -ldr q7, [x17, #+1696] -ldr q6, [x17, #+1712] -ldr q9, [x17, #+1728] -ldr q8, [x17, #+1744] -ldr q1, [x17, #+1760] -ldr q16, [x17, #+1776] -ldr q19, [x0, #800] -ldr q21, [x0, #816] -ldr q18, [x0, #768] -ldr q15, [x0, #784] -sqrdmulh v20.4S, v19.4S, v5.s[0] -mul v19.4S, v19.4S,v4.s[0] -mla v19.4S, v20.4S, v31.s[0] -sub v20.4s, v18.4s, v19.4s -add v18.4s, v18.4s, v19.4s -sqrdmulh v19.4S, v21.4S, v5.s[0] -mul v21.4S, v21.4S,v4.s[0] -mla v21.4S, v19.4S, v31.s[0] -sub v19.4s, v15.4s, v21.4s -add v15.4s, v15.4s, v21.4s -sqrdmulh v21.4S, v15.4S, v5.s[1] -mul v15.4S, v15.4S,v4.s[1] -mla v15.4S, v21.4S, v31.s[0] -sub v21.4s, v18.4s, v15.4s -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v19.4S, v5.s[2] -mul v19.4S, v19.4S,v4.s[2] -mla v19.4S, v15.4S, v31.s[0] -sub v15.4s, v20.4s, v19.4s -add v20.4s, v20.4s, v19.4s -trn1 v19.4S, v18.4S, v21.4S -trn2 v2.4S, v18.4S, v21.4S -trn1 v14.4S, v20.4S, v15.4S -trn2 v12.4S, v20.4S, v15.4S -trn2 v20.2D, v19.2D, v14.2D -trn2 v15.2D, v2.2D, v12.2D -trn1 v18.2D, v19.2D, v14.2D -trn1 v21.2D, v2.2D, v12.2D -sqrdmulh v12.4S, v20.4S, v6.4S -mul v20.4S, v20.4S,v7.4S -mla v20.4S, v12.4S, v31.s[0] -sub v12.4s, v18.4s, v20.4s -add v18.4s, v18.4s, v20.4s -sqrdmulh v20.4S, v15.4S, v6.4S -mul v15.4S, v15.4S,v7.4S -mla v15.4S, v20.4S, v31.s[0] -sub v20.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -sqrdmulh v15.4S, v21.4S, v8.4S -mul v21.4S, v21.4S,v9.4S -mla v21.4S, v15.4S, v31.s[0] -sub v15.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v20.4S, v16.4S -mul v20.4S, v20.4S,v1.4S -mla v20.4S, v21.4S, v31.s[0] -sub v21.4s, v12.4s, v20.4s -add v12.4s, v12.4s, v20.4s -str q18, [x0, #768] -str q15, [x0, #784] -str q12, [x0, #800] -str q21, [x0, #816] -ldr q21, [x17, #+1792] -ldr q12, [x17, #+1808] -ldr q15, [x17, #+1824] -ldr q18, [x17, #+1840] -ldr q20, [x17, #+1856] -ldr q2, [x17, #+1872] -ldr q14, [x17, #+1888] -ldr q19, [x17, #+1904] -ldr q16, [x0, #864] -ldr q1, [x0, #880] -ldr q8, [x0, #832] -ldr q9, [x0, #848] -sqrdmulh v6.4S, v16.4S, v12.s[0] -mul v16.4S, v16.4S,v21.s[0] -mla v16.4S, v6.4S, v31.s[0] -sub v6.4s, v8.4s, v16.4s -add v8.4s, v8.4s, v16.4s -sqrdmulh v16.4S, v1.4S, v12.s[0] -mul v1.4S, v1.4S,v21.s[0] -mla v1.4S, v16.4S, v31.s[0] -sub v16.4s, v9.4s, v1.4s -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v9.4S, v12.s[1] -mul v9.4S, v9.4S,v21.s[1] -mla v9.4S, v1.4S, v31.s[0] -sub v1.4s, v8.4s, v9.4s -add v8.4s, v8.4s, v9.4s -sqrdmulh v9.4S, v16.4S, v12.s[2] -mul v16.4S, v16.4S,v21.s[2] -mla v16.4S, v9.4S, v31.s[0] -sub v9.4s, v6.4s, v16.4s -add v6.4s, v6.4s, v16.4s -trn1 v16.4S, v8.4S, v1.4S -trn2 v7.4S, v8.4S, v1.4S -trn1 v5.4S, v6.4S, v9.4S -trn2 v4.4S, v6.4S, v9.4S -trn2 v6.2D, v16.2D, v5.2D -trn2 v9.2D, v7.2D, v4.2D -trn1 v8.2D, v16.2D, v5.2D -trn1 v1.2D, v7.2D, v4.2D -sqrdmulh v4.4S, v6.4S, v18.4S -mul v6.4S, v6.4S,v15.4S -mla v6.4S, v4.4S, v31.s[0] -sub v4.4s, v8.4s, v6.4s -add v8.4s, v8.4s, v6.4s -sqrdmulh v6.4S, v9.4S, v18.4S -mul v9.4S, v9.4S,v15.4S -mla v9.4S, v6.4S, v31.s[0] -sub v6.4s, v1.4s, v9.4s -add v1.4s, v1.4s, v9.4s -sqrdmulh v9.4S, v1.4S, v2.4S -mul v1.4S, v1.4S,v20.4S -mla v1.4S, v9.4S, v31.s[0] -sub v9.4s, v8.4s, v1.4s -add v8.4s, v8.4s, v1.4s -sqrdmulh v1.4S, v6.4S, v19.4S -mul v6.4S, v6.4S,v14.4S -mla v6.4S, v1.4S, v31.s[0] -sub v1.4s, v4.4s, v6.4s -add v4.4s, v4.4s, v6.4s -str q8, [x0, #832] -str q9, [x0, #848] -str q4, [x0, #864] -str q1, [x0, #880] -ldr q1, [x17, #+1920] -ldr q4, [x17, #+1936] -ldr q9, [x17, #+1952] -ldr q8, [x17, #+1968] -ldr q6, [x17, #+1984] -ldr q7, [x17, #+2000] -ldr q5, [x17, #+2016] -ldr q16, [x17, #+2032] -ldr q19, [x0, #928] -ldr q14, [x0, #944] -ldr q2, [x0, #896] -ldr q20, [x0, #912] -sqrdmulh v18.4S, v19.4S, v4.s[0] -mul v19.4S, v19.4S,v1.s[0] -mla v19.4S, v18.4S, v31.s[0] -sub v18.4s, v2.4s, v19.4s -add v2.4s, v2.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v4.s[0] -mul v14.4S, v14.4S,v1.s[0] -mla v14.4S, v19.4S, v31.s[0] -sub v19.4s, v20.4s, v14.4s -add v20.4s, v20.4s, v14.4s -sqrdmulh v14.4S, v20.4S, v4.s[1] -mul v20.4S, v20.4S,v1.s[1] -mla v20.4S, v14.4S, v31.s[0] -sub v14.4s, v2.4s, v20.4s -add v2.4s, v2.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v4.s[2] -mul v19.4S, v19.4S,v1.s[2] -mla v19.4S, v20.4S, v31.s[0] -sub v20.4s, v18.4s, v19.4s -add v18.4s, v18.4s, v19.4s -trn1 v19.4S, v2.4S, v14.4S -trn2 v15.4S, v2.4S, v14.4S -trn1 v12.4S, v18.4S, v20.4S -trn2 v21.4S, v18.4S, v20.4S -trn2 v18.2D, v19.2D, v12.2D -trn2 v20.2D, v15.2D, v21.2D -trn1 v2.2D, v19.2D, v12.2D -trn1 v14.2D, v15.2D, v21.2D -sqrdmulh v21.4S, v18.4S, v8.4S -mul v18.4S, v18.4S,v9.4S -mla v18.4S, v21.4S, v31.s[0] -sub v21.4s, v2.4s, v18.4s -add v2.4s, v2.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v8.4S -mul v20.4S, v20.4S,v9.4S -mla v20.4S, v18.4S, v31.s[0] -sub v18.4s, v14.4s, v20.4s -add v14.4s, v14.4s, v20.4s -sqrdmulh v20.4S, v14.4S, v7.4S -mul v14.4S, v14.4S,v6.4S -mla v14.4S, v20.4S, v31.s[0] -sub v20.4s, v2.4s, v14.4s -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v18.4S, v16.4S -mul v18.4S, v18.4S,v5.4S -mla v18.4S, v14.4S, v31.s[0] -sub v14.4s, v21.4s, v18.4s -add v21.4s, v21.4s, v18.4s -str q2, [x0, #896] -str q20, [x0, #912] -str q21, [x0, #928] -str q14, [x0, #944] -ldr q14, [x17, #+2048] -ldr q21, [x17, #+2064] -ldr q20, [x17, #+2080] -ldr q2, [x17, #+2096] -ldr q18, [x17, #+2112] -ldr q15, [x17, #+2128] -ldr q12, [x17, #+2144] -ldr q19, [x17, #+2160] -ldr q16, [x0, #992] -ldr q5, [x0, #1008] -ldr q7, [x0, #960] -ldr q6, [x0, #976] -sqrdmulh v8.4S, v16.4S, v21.s[0] -mul v16.4S, v16.4S,v14.s[0] -mla v16.4S, v8.4S, v31.s[0] -sub v8.4s, v7.4s, v16.4s -add v7.4s, v7.4s, v16.4s -sqrdmulh v16.4S, v5.4S, v21.s[0] -mul v5.4S, v5.4S,v14.s[0] -mla v5.4S, v16.4S, v31.s[0] -sub v16.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v6.4S, v21.s[1] -mul v6.4S, v6.4S,v14.s[1] -mla v6.4S, v5.4S, v31.s[0] -sub v5.4s, v7.4s, v6.4s -add v7.4s, v7.4s, v6.4s -sqrdmulh v6.4S, v16.4S, v21.s[2] -mul v16.4S, v16.4S,v14.s[2] -mla v16.4S, v6.4S, v31.s[0] -sub v6.4s, v8.4s, v16.4s -add v8.4s, v8.4s, v16.4s -trn1 v16.4S, v7.4S, v5.4S -trn2 v9.4S, v7.4S, v5.4S -trn1 v4.4S, v8.4S, v6.4S -trn2 v1.4S, v8.4S, v6.4S -trn2 v8.2D, v16.2D, v4.2D -trn2 v6.2D, v9.2D, v1.2D -trn1 v7.2D, v16.2D, v4.2D -trn1 v5.2D, v9.2D, v1.2D -sqrdmulh v1.4S, v8.4S, v2.4S -mul v8.4S, v8.4S,v20.4S -mla v8.4S, v1.4S, v31.s[0] -sub v1.4s, v7.4s, v8.4s -add v7.4s, v7.4s, v8.4s -sqrdmulh v8.4S, v6.4S, v2.4S -mul v6.4S, v6.4S,v20.4S -mla v6.4S, v8.4S, v31.s[0] -sub v8.4s, v5.4s, v6.4s -add v5.4s, v5.4s, v6.4s -sqrdmulh v6.4S, v5.4S, v15.4S -mul v5.4S, v5.4S,v18.4S -mla v5.4S, v6.4S, v31.s[0] -sub v6.4s, v7.4s, v5.4s -add v7.4s, v7.4s, v5.4s -sqrdmulh v5.4S, v8.4S, v19.4S -mul v8.4S, v8.4S,v12.4S -mla v8.4S, v5.4S, v31.s[0] -sub v5.4s, v1.4s, v8.4s -add v1.4s, v1.4s, v8.4s -str q7, [x0, #960] -str q6, [x0, #976] -str q1, [x0, #992] -str q5, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2392 -// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_0.s deleted file mode 100644 index d3538e3..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_0.s +++ /dev/null @@ -1,2422 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_3_0 -.global _ntt_u32_full_neon_asm_var_4_4_3_0 -ntt_u32_full_neon_asm_var_4_4_3_0: -_ntt_u32_full_neon_asm_var_4_4_3_0: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #800] -ldr q21, [x0, #864] -ldr q20, [x0, #928] -ldr q19, [x0, #992] -ldr q18, [x0, #288] -ldr q17, [x0, #352] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -sqrdmulh v2.4S, v22.4S, v29.s[0] -ldr q1, [x0, #544] -mul v22.4S, v22.4S,v30.s[0] -ldr q0, [x0, #608] -sqrdmulh v15.4S, v21.4S, v29.s[0] -ldr q14, [x0, #672] -mul v21.4S, v21.4S,v30.s[0] -ldr q13, [x0, #736] -mla v22.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q12, [x0, #32] -sub v11.4s, v18.4s, v22.4s -mla v21.4S, v15.4S, v31.s[0] -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q15, [x0, #96] -sub v10.4s, v17.4s, v21.4s -mla v20.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v1.4S, v29.s[0] -ldr q2, [x0, #160] -mul v1.4S, v1.4S,v30.s[0] -sub v9.4s, v16.4s, v20.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v0.4S, v29.s[0] -ldr q22, [x0, #224] -mul v0.4S, v0.4S,v30.s[0] -sub v8.4s, v3.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v12.4s, v1.4s -mla v0.4S, v20.4S, v31.s[0] -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -sub v20.4s, v15.4s, v0.4s -mla v14.4S, v19.4S, v31.s[0] -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v19.4s, v2.4s, v14.4s -mla v13.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v1.4s, v22.4s, v13.4s -mla v16.4S, v0.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v0.4s, v2.4s, v16.4s -mla v3.4S, v14.4S, v31.s[0] -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v14.4s, v22.4s, v3.4s -mla v18.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v13.4s, v12.4s, v18.4s -mla v17.4S, v16.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v8.4S, v29.s[2] -mul v8.4S, v8.4S,v30.s[2] -sub v16.4s, v15.4s, v17.4s -mla v9.4S, v3.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v3.4s, v19.4s, v9.4s -mla v8.4S, v18.4S, v31.s[0] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v8.4s -mla v11.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v8.4s -sqrdmulh v8.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v17.4s, v21.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v27.s[0] -mul v22.4S, v22.4S,v28.s[0] -sub v9.4s, v20.4s, v10.4s -mla v2.4S, v8.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v8.4s, v12.4s, v2.4s -mla v22.4S, v11.4S, v31.s[0] -add v12.4s, v12.4s, v2.4s -sqrdmulh v2.4S, v14.4S, v27.s[1] -mul v14.4S, v14.4S,v28.s[1] -sub v11.4s, v15.4s, v22.4s -mla v0.4S, v10.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v27.s[2] -mul v19.4S, v19.4S,v28.s[2] -sub v10.4s, v13.4s, v0.4s -mla v14.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v2.4s, v16.4s, v14.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v27.s[3] -mul v3.4S, v3.4S,v28.s[3] -sub v22.4s, v21.4s, v19.4s -mla v1.4S, v0.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v0.4s, v20.4s, v1.4s -mla v3.4S, v14.4S, v31.s[0] -add v20.4s, v20.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v25.s[0] -mul v15.4S, v15.4S,v26.s[0] -sub v14.4s, v17.4s, v3.4s -mla v18.4S, v19.4S, v31.s[0] -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v11.4S, v25.s[1] -mul v11.4S, v11.4S,v26.s[1] -sub v19.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v1.4s, v12.4s, v15.4s -mla v11.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v25.s[3] -mul v2.4S, v2.4S,v26.s[3] -sub v3.4s, v8.4s, v11.4s -mla v16.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v11.4s -str q12, [x0, #32] -sqrdmulh v12.4S, v20.4S, v23.s[0] -str q1, [x0, #96] -mul v20.4S, v20.4S,v24.s[0] -ldr q1, [x0, #816] -sub v11.4s, v13.4s, v16.4s -ldr q18, [x0, #880] -mla v2.4S, v15.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -str q8, [x0, #160] -sqrdmulh v8.4S, v0.4S, v23.s[1] -str q3, [x0, #224] -mul v0.4S, v0.4S,v24.s[1] -ldr q3, [x0, #944] -sub v16.4s, v10.4s, v2.4s -ldr q15, [x0, #1008] -mla v20.4S, v12.4S, v31.s[0] -add v10.4s, v10.4s, v2.4s -str q13, [x0, #288] -sqrdmulh v13.4S, v9.4S, v23.s[2] -str q11, [x0, #352] -mul v9.4S, v9.4S,v24.s[2] -ldr q11, [x0, #304] -sub v2.4s, v21.4s, v20.4s -ldr q12, [x0, #368] -mla v0.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v20.4s -str q10, [x0, #416] -sqrdmulh v10.4S, v19.4S, v23.s[3] -str q16, [x0, #480] -mul v19.4S, v19.4S,v24.s[3] -ldr q16, [x0, #432] -sub v20.4s, v22.4s, v0.4s -ldr q8, [x0, #496] -mla v9.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -str q21, [x0, #544] -sqrdmulh v21.4S, v1.4S, v29.s[0] -str q2, [x0, #608] -ldr q2, [x0, #560] -mul v1.4S, v1.4S,v30.s[0] -ldr q0, [x0, #624] -sub v13.4s, v17.4s, v9.4s -mla v19.4S, v10.4S, v31.s[0] -add v17.4s, v17.4s, v9.4s -str q22, [x0, #672] -sqrdmulh v22.4S, v18.4S, v29.s[0] -str q20, [x0, #736] -ldr q20, [x0, #688] -mul v18.4S, v18.4S,v30.s[0] -ldr q9, [x0, #752] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -str q17, [x0, #800] -sqrdmulh v17.4S, v3.4S, v29.s[0] -str q13, [x0, #864] -mul v3.4S, v3.4S,v30.s[0] -ldr q13, [x0, #48] -sub v19.4s, v11.4s, v1.4s -mla v18.4S, v22.4S, v31.s[0] -add v11.4s, v11.4s, v1.4s -str q14, [x0, #928] -sqrdmulh v14.4S, v15.4S, v29.s[0] -str q10, [x0, #992] -mul v15.4S, v15.4S,v30.s[0] -ldr q10, [x0, #112] -sub v1.4s, v12.4s, v18.4s -mla v3.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v29.s[0] -ldr q17, [x0, #176] -mul v2.4S, v2.4S,v30.s[0] -sub v22.4s, v16.4s, v3.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v0.4S, v29.s[0] -ldr q14, [x0, #240] -mul v0.4S, v0.4S,v30.s[0] -sub v21.4s, v8.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -sub v18.4s, v13.4s, v2.4s -mla v0.4S, v3.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v9.4S, v29.s[0] -mul v9.4S, v9.4S,v30.s[0] -sub v3.4s, v10.4s, v0.4s -mla v20.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v15.4s, v17.4s, v20.4s -mla v9.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sub v2.4s, v14.4s, v9.4s -mla v16.4S, v0.4S, v31.s[0] -add v14.4s, v14.4s, v9.4s -sqrdmulh v9.4S, v11.4S, v29.s[1] -mul v11.4S, v11.4S,v30.s[1] -sub v0.4s, v17.4s, v16.4s -mla v8.4S, v20.4S, v31.s[0] -add v17.4s, v17.4s, v16.4s -sqrdmulh v16.4S, v12.4S, v29.s[1] -mul v12.4S, v12.4S,v30.s[1] -sub v20.4s, v14.4s, v8.4s -mla v11.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v8.4s -sqrdmulh v8.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v9.4s, v13.4s, v11.4s -mla v12.4S, v16.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v16.4s, v10.4s, v12.4s -mla v22.4S, v8.4S, v31.s[0] -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v8.4s, v15.4s, v22.4s -mla v21.4S, v11.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -sub v11.4s, v2.4s, v21.4s -mla v19.4S, v12.4S, v31.s[0] -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[0] -mul v17.4S, v17.4S,v28.s[0] -sub v12.4s, v18.4s, v19.4s -mla v1.4S, v22.4S, v31.s[0] -add v18.4s, v18.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v22.4s, v3.4s, v1.4s -mla v17.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v21.4s, v13.4s, v17.4s -mla v14.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v20.4S, v27.s[1] -mul v20.4S, v20.4S,v28.s[1] -sub v19.4s, v10.4s, v14.4s -mla v0.4S, v1.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v27.s[2] -mul v15.4S, v15.4S,v28.s[2] -sub v1.4s, v9.4s, v0.4s -mla v20.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v0.4s -sqrdmulh v0.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v17.4s, v16.4s, v20.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v14.4s, v18.4s, v15.4s -mla v2.4S, v0.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v27.s[3] -mul v11.4S, v11.4S,v28.s[3] -sub v0.4s, v3.4s, v2.4s -mla v8.4S, v20.4S, v31.s[0] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v10.4S, v25.s[0] -mul v10.4S, v10.4S,v26.s[0] -sub v20.4s, v12.4s, v8.4s -mla v11.4S, v15.4S, v31.s[0] -add v12.4s, v12.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v15.4s, v22.4s, v11.4s -mla v10.4S, v2.4S, v31.s[0] -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v2.4s, v13.4s, v10.4s -mla v19.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v10.4s -sqrdmulh v10.4S, v17.4S, v25.s[3] -mul v17.4S, v17.4S,v26.s[3] -sub v8.4s, v21.4s, v19.4s -mla v16.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -str q13, [x0, #48] -sqrdmulh v13.4S, v3.4S, v23.s[0] -str q2, [x0, #112] -mul v3.4S, v3.4S,v24.s[0] -ldr q2, [x0, #768] -sub v19.4s, v9.4s, v16.4s -ldr q11, [x0, #832] -mla v17.4S, v10.4S, v31.s[0] -add v9.4s, v9.4s, v16.4s -str q21, [x0, #176] -sqrdmulh v21.4S, v0.4S, v23.s[1] -str q8, [x0, #240] -mul v0.4S, v0.4S,v24.s[1] -ldr q8, [x0, #896] -sub v16.4s, v1.4s, v17.4s -ldr q10, [x0, #960] -mla v3.4S, v13.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -str q9, [x0, #304] -sqrdmulh v9.4S, v22.4S, v23.s[2] -str q19, [x0, #368] -mul v22.4S, v22.4S,v24.s[2] -ldr q19, [x0, #256] -sub v17.4s, v18.4s, v3.4s -ldr q13, [x0, #320] -mla v0.4S, v21.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -str q1, [x0, #432] -sqrdmulh v1.4S, v15.4S, v23.s[3] -str q16, [x0, #496] -mul v15.4S, v15.4S,v24.s[3] -ldr q16, [x0, #384] -sub v3.4s, v14.4s, v0.4s -ldr q21, [x0, #448] -mla v22.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -str q18, [x0, #560] -sqrdmulh v18.4S, v2.4S, v29.s[0] -str q17, [x0, #624] -ldr q17, [x0, #512] -mul v2.4S, v2.4S,v30.s[0] -ldr q0, [x0, #576] -sub v9.4s, v12.4s, v22.4s -mla v15.4S, v1.4S, v31.s[0] -add v12.4s, v12.4s, v22.4s -str q14, [x0, #688] -sqrdmulh v14.4S, v11.4S, v29.s[0] -str q3, [x0, #752] -ldr q3, [x0, #640] -mul v11.4S, v11.4S,v30.s[0] -ldr q22, [x0, #704] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -str q12, [x0, #816] -sqrdmulh v12.4S, v8.4S, v29.s[0] -str q9, [x0, #880] -mul v8.4S, v8.4S,v30.s[0] -ldr q9, [x0, #0] -sub v15.4s, v19.4s, v2.4s -mla v11.4S, v14.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -str q20, [x0, #944] -sqrdmulh v20.4S, v10.4S, v29.s[0] -str q1, [x0, #1008] -mul v10.4S, v10.4S,v30.s[0] -ldr q1, [x0, #64] -sub v2.4s, v13.4s, v11.4s -mla v8.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v29.s[0] -ldr q12, [x0, #128] -mul v17.4S, v17.4S,v30.s[0] -sub v14.4s, v16.4s, v8.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v0.4S, v29.s[0] -ldr q20, [x0, #192] -mul v0.4S, v0.4S,v30.s[0] -sub v18.4s, v21.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -sub v11.4s, v9.4s, v17.4s -mla v0.4S, v8.4S, v31.s[0] -add v9.4s, v9.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v8.4s, v1.4s, v0.4s -mla v3.4S, v10.4S, v31.s[0] -add v1.4s, v1.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v10.4s, v12.4s, v3.4s -mla v22.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v17.4s, v20.4s, v22.4s -mla v16.4S, v0.4S, v31.s[0] -add v20.4s, v20.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -sub v0.4s, v12.4s, v16.4s -mla v21.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v3.4s, v20.4s, v21.4s -mla v19.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v22.4s, v9.4s, v19.4s -mla v13.4S, v16.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v29.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v16.4s, v1.4s, v13.4s -mla v14.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v13.4s -sqrdmulh v13.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v21.4s, v10.4s, v14.4s -mla v18.4S, v19.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v19.4s, v17.4s, v18.4s -mla v15.4S, v13.4S, v31.s[0] -add v17.4s, v17.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v13.4s, v11.4s, v15.4s -mla v2.4S, v14.4S, v31.s[0] -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v27.s[0] -mul v20.4S, v20.4S,v28.s[0] -sub v14.4s, v8.4s, v2.4s -mla v12.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v2.4s -sqrdmulh v2.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v18.4s, v9.4s, v12.4s -mla v20.4S, v15.4S, v31.s[0] -add v9.4s, v9.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -sub v15.4s, v1.4s, v20.4s -mla v0.4S, v2.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -sub v2.4s, v22.4s, v0.4s -mla v3.4S, v12.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v12.4s, v16.4s, v3.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v11.4s, v10.4s -mla v17.4S, v0.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -sqrdmulh v10.4S, v19.4S, v27.s[3] -mul v19.4S, v19.4S,v28.s[3] -sub v0.4s, v8.4s, v17.4s -mla v21.4S, v3.4S, v31.s[0] -add v8.4s, v8.4s, v17.4s -sqrdmulh v17.4S, v1.4S, v25.s[0] -mul v1.4S, v1.4S,v26.s[0] -sub v3.4s, v13.4s, v21.4s -mla v19.4S, v10.4S, v31.s[0] -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v15.4S, v25.s[1] -mul v15.4S, v15.4S,v26.s[1] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v17.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v17.4s, v9.4s, v1.4s -mla v15.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -sub v21.4s, v18.4s, v15.4s -mla v16.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -str q9, [x0, #0] -sqrdmulh v9.4S, v8.4S, v23.s[0] -str q17, [x0, #64] -mul v8.4S, v8.4S,v24.s[0] -ldr q17, [x0, #784] -sub v15.4s, v22.4s, v16.4s -ldr q19, [x0, #848] -mla v12.4S, v1.4S, v31.s[0] -add v22.4s, v22.4s, v16.4s -str q18, [x0, #128] -sqrdmulh v18.4S, v0.4S, v23.s[1] -str q21, [x0, #192] -mul v0.4S, v0.4S,v24.s[1] -ldr q21, [x0, #912] -sub v16.4s, v2.4s, v12.4s -ldr q1, [x0, #976] -mla v8.4S, v9.4S, v31.s[0] -add v2.4s, v2.4s, v12.4s -str q22, [x0, #256] -sqrdmulh v22.4S, v14.4S, v23.s[2] -str q15, [x0, #320] -mul v14.4S, v14.4S,v24.s[2] -ldr q15, [x0, #272] -sub v12.4s, v11.4s, v8.4s -ldr q9, [x0, #336] -mla v0.4S, v18.4S, v31.s[0] -add v11.4s, v11.4s, v8.4s -str q2, [x0, #384] -sqrdmulh v2.4S, v10.4S, v23.s[3] -str q16, [x0, #448] -mul v10.4S, v10.4S,v24.s[3] -ldr q16, [x0, #400] -sub v8.4s, v20.4s, v0.4s -ldr q18, [x0, #464] -mla v14.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v0.4s -str q11, [x0, #512] -sqrdmulh v11.4S, v17.4S, v29.s[0] -str q12, [x0, #576] -ldr q12, [x0, #528] -mul v17.4S, v17.4S,v30.s[0] -ldr q0, [x0, #592] -sub v22.4s, v13.4s, v14.4s -mla v10.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v14.4s -str q20, [x0, #640] -sqrdmulh v20.4S, v19.4S, v29.s[0] -str q8, [x0, #704] -ldr q8, [x0, #656] -mul v19.4S, v19.4S,v30.s[0] -ldr q14, [x0, #720] -sub v2.4s, v3.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v3.4s, v3.4s, v10.4s -str q13, [x0, #768] -sqrdmulh v13.4S, v21.4S, v29.s[0] -str q22, [x0, #832] -mul v21.4S, v21.4S,v30.s[0] -ldr q22, [x0, #16] -sub v10.4s, v15.4s, v17.4s -mla v19.4S, v20.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -str q3, [x0, #896] -sqrdmulh v3.4S, v1.4S, v29.s[0] -str q2, [x0, #960] -mul v1.4S, v1.4S,v30.s[0] -ldr q2, [x0, #80] -sub v17.4s, v9.4s, v19.4s -mla v21.4S, v13.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v12.4S, v29.s[0] -ldr q13, [x0, #144] -mul v12.4S, v12.4S,v30.s[0] -sub v20.4s, v16.4s, v21.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v0.4S, v29.s[0] -ldr q3, [x0, #208] -mul v0.4S, v0.4S,v30.s[0] -sub v11.4s, v18.4s, v1.4s -mla v12.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v1.4s -sqrdmulh v1.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v19.4s, v22.4s, v12.4s -mla v0.4S, v21.4S, v31.s[0] -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v2.4s, v0.4s -mla v8.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v1.4s, v13.4s, v8.4s -mla v14.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v12.4s, v3.4s, v14.4s -mla v16.4S, v0.4S, v31.s[0] -add v3.4s, v3.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -sub v0.4s, v13.4s, v16.4s -mla v18.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -sqrdmulh v16.4S, v9.4S, v29.s[1] -mul v9.4S, v9.4S,v30.s[1] -sub v8.4s, v3.4s, v18.4s -mla v15.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v14.4s, v22.4s, v15.4s -mla v9.4S, v16.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v16.4s, v2.4s, v9.4s -mla v20.4S, v18.4S, v31.s[0] -add v2.4s, v2.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v20.4s -mla v11.4S, v15.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v15.4s, v12.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v12.4s, v12.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v27.s[0] -mul v13.4S, v13.4S,v28.s[0] -sub v9.4s, v19.4s, v10.4s -mla v17.4S, v20.4S, v31.s[0] -add v19.4s, v19.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v27.s[0] -mul v3.4S, v3.4S,v28.s[0] -sub v20.4s, v21.4s, v17.4s -mla v13.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v11.4s, v22.4s, v13.4s -mla v3.4S, v10.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -sub v10.4s, v2.4s, v3.4s -mla v0.4S, v17.4S, v31.s[0] -add v2.4s, v2.4s, v3.4s -sqrdmulh v3.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v17.4s, v14.4s, v0.4s -mla v8.4S, v13.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[2] -mul v12.4S, v12.4S,v28.s[2] -sub v13.4s, v16.4s, v8.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v3.4s, v19.4s, v1.4s -mla v12.4S, v0.4S, v31.s[0] -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v0.4s, v21.4s, v12.4s -mla v18.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v10.4S, v25.s[1] -mul v10.4S, v10.4S,v26.s[1] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v12.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v12.4s, v22.4s, v2.4s -mla v10.4S, v18.4S, v31.s[0] -add v22.4s, v22.4s, v2.4s -sqrdmulh v2.4S, v13.4S, v25.s[3] -mul v13.4S, v13.4S,v26.s[3] -sub v18.4s, v11.4s, v10.4s -mla v16.4S, v15.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -str q22, [x0, #16] -sqrdmulh v22.4S, v21.4S, v23.s[0] -str q12, [x0, #80] -mul v21.4S, v21.4S,v24.s[0] -sub v12.4s, v14.4s, v16.4s -mla v13.4S, v2.4S, v31.s[0] -add v14.4s, v14.4s, v16.4s -str q11, [x0, #144] -sqrdmulh v11.4S, v0.4S, v23.s[1] -str q18, [x0, #208] -mul v0.4S, v0.4S,v24.s[1] -sub v18.4s, v17.4s, v13.4s -mla v21.4S, v22.4S, v31.s[0] -add v17.4s, v17.4s, v13.4s -str q14, [x0, #272] -sqrdmulh v14.4S, v20.4S, v23.s[2] -str q12, [x0, #336] -mul v20.4S, v20.4S,v24.s[2] -sub v12.4s, v19.4s, v21.4s -mla v0.4S, v11.4S, v31.s[0] -add v19.4s, v19.4s, v21.4s -str q17, [x0, #400] -sqrdmulh v17.4S, v1.4S, v23.s[3] -str q18, [x0, #464] -mul v1.4S, v1.4S,v24.s[3] -sub v18.4s, v3.4s, v0.4s -mla v20.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v0.4s -str q19, [x0, #528] -str q12, [x0, #592] -sub v12.4s, v9.4s, v20.4s -mla v1.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v20.4s -str q3, [x0, #656] -str q18, [x0, #720] -sub v18.4s, v8.4s, v1.4s -add v8.4s, v8.4s, v1.4s -str q9, [x0, #784] -str q12, [x0, #848] -str q8, [x0, #912] -str q18, [x0, #976] -ldr q4, [x17, #+128] -ldr q5, [x17, #+144] -ldr q6, [x17, #+160] -ldr q7, [x17, #+176] -ldr q15, [x17, #+192] -ldr q10, [x17, #+208] -ldr q2, [x17, #+224] -ldr q16, [x17, #+240] -ldr q22, [x0, #32] -ldr q13, [x0, #48] -ldr q11, [x0, #0] -ldr q21, [x0, #16] -sqrdmulh v14.4S, v22.4S, v5.s[0] -mul v22.4S, v22.4S,v4.s[0] -mla v22.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v13.4S, v5.s[0] -mul v13.4S, v13.4S,v4.s[0] -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v21.4S, v5.s[1] -mul v21.4S, v21.4S,v4.s[1] -mla v21.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v21.4s -add v11.4s, v11.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v5.s[2] -mul v22.4S, v22.4S,v4.s[2] -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v22.4s -add v14.4s, v14.4s, v22.4s -trn1 v22.4S, v11.4S, v13.4S -trn2 v0.4S, v11.4S, v13.4S -trn1 v19.4S, v14.4S, v21.4S -trn2 v17.4S, v14.4S, v21.4S -trn2 v14.2D, v22.2D, v19.2D -trn2 v21.2D, v0.2D, v17.2D -trn1 v11.2D, v22.2D, v19.2D -trn1 v13.2D, v0.2D, v17.2D -sqrdmulh v17.4S, v14.4S, v7.4S -mul v14.4S, v14.4S,v6.4S -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v7.4S -mul v21.4S, v21.4S,v6.4S -mla v21.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v21.4s -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v13.4S, v10.4S -mul v13.4S, v13.4S,v15.4S -mla v13.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v14.4S, v16.4S -mul v14.4S, v14.4S,v2.4S -mla v14.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -str q11, [x0, #0] -str q21, [x0, #16] -str q17, [x0, #32] -str q13, [x0, #48] -ldr q13, [x17, #+256] -ldr q17, [x17, #+272] -ldr q21, [x17, #+288] -ldr q11, [x17, #+304] -ldr q14, [x17, #+320] -ldr q0, [x17, #+336] -ldr q19, [x17, #+352] -ldr q22, [x17, #+368] -ldr q16, [x0, #96] -ldr q2, [x0, #112] -ldr q10, [x0, #64] -ldr q15, [x0, #80] -sqrdmulh v7.4S, v16.4S, v17.s[0] -mul v16.4S, v16.4S,v13.s[0] -mla v16.4S, v7.4S, v31.s[0] -sub v7.4s, v10.4s, v16.4s -add v10.4s, v10.4s, v16.4s -sqrdmulh v16.4S, v2.4S, v17.s[0] -mul v2.4S, v2.4S,v13.s[0] -mla v2.4S, v16.4S, v31.s[0] -sub v16.4s, v15.4s, v2.4s -add v15.4s, v15.4s, v2.4s -sqrdmulh v2.4S, v15.4S, v17.s[1] -mul v15.4S, v15.4S,v13.s[1] -mla v15.4S, v2.4S, v31.s[0] -sub v2.4s, v10.4s, v15.4s -add v10.4s, v10.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v17.s[2] -mul v16.4S, v16.4S,v13.s[2] -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v7.4s, v16.4s -add v7.4s, v7.4s, v16.4s -trn1 v16.4S, v10.4S, v2.4S -trn2 v6.4S, v10.4S, v2.4S -trn1 v5.4S, v7.4S, v15.4S -trn2 v4.4S, v7.4S, v15.4S -trn2 v7.2D, v16.2D, v5.2D -trn2 v15.2D, v6.2D, v4.2D -trn1 v10.2D, v16.2D, v5.2D -trn1 v2.2D, v6.2D, v4.2D -sqrdmulh v4.4S, v7.4S, v11.4S -mul v7.4S, v7.4S,v21.4S -mla v7.4S, v4.4S, v31.s[0] -sub v4.4s, v10.4s, v7.4s -add v10.4s, v10.4s, v7.4s -sqrdmulh v7.4S, v15.4S, v11.4S -mul v15.4S, v15.4S,v21.4S -mla v15.4S, v7.4S, v31.s[0] -sub v7.4s, v2.4s, v15.4s -add v2.4s, v2.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v0.4S -mul v2.4S, v2.4S,v14.4S -mla v2.4S, v15.4S, v31.s[0] -sub v15.4s, v10.4s, v2.4s -add v10.4s, v10.4s, v2.4s -sqrdmulh v2.4S, v7.4S, v22.4S -mul v7.4S, v7.4S,v19.4S -mla v7.4S, v2.4S, v31.s[0] -sub v2.4s, v4.4s, v7.4s -add v4.4s, v4.4s, v7.4s -str q10, [x0, #64] -str q15, [x0, #80] -str q4, [x0, #96] -str q2, [x0, #112] -ldr q2, [x17, #+384] -ldr q4, [x17, #+400] -ldr q15, [x17, #+416] -ldr q10, [x17, #+432] -ldr q7, [x17, #+448] -ldr q6, [x17, #+464] -ldr q5, [x17, #+480] -ldr q16, [x17, #+496] -ldr q22, [x0, #160] -ldr q19, [x0, #176] -ldr q0, [x0, #128] -ldr q14, [x0, #144] -sqrdmulh v11.4S, v22.4S, v4.s[0] -mul v22.4S, v22.4S,v2.s[0] -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v0.4s, v22.4s -add v0.4s, v0.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v4.s[0] -mul v19.4S, v19.4S,v2.s[0] -mla v19.4S, v22.4S, v31.s[0] -sub v22.4s, v14.4s, v19.4s -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v4.s[1] -mul v14.4S, v14.4S,v2.s[1] -mla v14.4S, v19.4S, v31.s[0] -sub v19.4s, v0.4s, v14.4s -add v0.4s, v0.4s, v14.4s -sqrdmulh v14.4S, v22.4S, v4.s[2] -mul v22.4S, v22.4S,v2.s[2] -mla v22.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -trn1 v22.4S, v0.4S, v19.4S -trn2 v21.4S, v0.4S, v19.4S -trn1 v17.4S, v11.4S, v14.4S -trn2 v13.4S, v11.4S, v14.4S -trn2 v11.2D, v22.2D, v17.2D -trn2 v14.2D, v21.2D, v13.2D -trn1 v0.2D, v22.2D, v17.2D -trn1 v19.2D, v21.2D, v13.2D -sqrdmulh v13.4S, v11.4S, v10.4S -mul v11.4S, v11.4S,v15.4S -mla v11.4S, v13.4S, v31.s[0] -sub v13.4s, v0.4s, v11.4s -add v0.4s, v0.4s, v11.4s -sqrdmulh v11.4S, v14.4S, v10.4S -mul v14.4S, v14.4S,v15.4S -mla v14.4S, v11.4S, v31.s[0] -sub v11.4s, v19.4s, v14.4s -add v19.4s, v19.4s, v14.4s -sqrdmulh v14.4S, v19.4S, v6.4S -mul v19.4S, v19.4S,v7.4S -mla v19.4S, v14.4S, v31.s[0] -sub v14.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v11.4S, v16.4S -mul v11.4S, v11.4S,v5.4S -mla v11.4S, v19.4S, v31.s[0] -sub v19.4s, v13.4s, v11.4s -add v13.4s, v13.4s, v11.4s -str q0, [x0, #128] -str q14, [x0, #144] -str q13, [x0, #160] -str q19, [x0, #176] -ldr q19, [x17, #+512] -ldr q13, [x17, #+528] -ldr q14, [x17, #+544] -ldr q0, [x17, #+560] -ldr q11, [x17, #+576] -ldr q21, [x17, #+592] -ldr q17, [x17, #+608] -ldr q22, [x17, #+624] -ldr q16, [x0, #224] -ldr q5, [x0, #240] -ldr q6, [x0, #192] -ldr q7, [x0, #208] -sqrdmulh v10.4S, v16.4S, v13.s[0] -mul v16.4S, v16.4S,v19.s[0] -mla v16.4S, v10.4S, v31.s[0] -sub v10.4s, v6.4s, v16.4s -add v6.4s, v6.4s, v16.4s -sqrdmulh v16.4S, v5.4S, v13.s[0] -mul v5.4S, v5.4S,v19.s[0] -mla v5.4S, v16.4S, v31.s[0] -sub v16.4s, v7.4s, v5.4s -add v7.4s, v7.4s, v5.4s -sqrdmulh v5.4S, v7.4S, v13.s[1] -mul v7.4S, v7.4S,v19.s[1] -mla v7.4S, v5.4S, v31.s[0] -sub v5.4s, v6.4s, v7.4s -add v6.4s, v6.4s, v7.4s -sqrdmulh v7.4S, v16.4S, v13.s[2] -mul v16.4S, v16.4S,v19.s[2] -mla v16.4S, v7.4S, v31.s[0] -sub v7.4s, v10.4s, v16.4s -add v10.4s, v10.4s, v16.4s -trn1 v16.4S, v6.4S, v5.4S -trn2 v15.4S, v6.4S, v5.4S -trn1 v4.4S, v10.4S, v7.4S -trn2 v2.4S, v10.4S, v7.4S -trn2 v10.2D, v16.2D, v4.2D -trn2 v7.2D, v15.2D, v2.2D -trn1 v6.2D, v16.2D, v4.2D -trn1 v5.2D, v15.2D, v2.2D -sqrdmulh v2.4S, v10.4S, v0.4S -mul v10.4S, v10.4S,v14.4S -mla v10.4S, v2.4S, v31.s[0] -sub v2.4s, v6.4s, v10.4s -add v6.4s, v6.4s, v10.4s -sqrdmulh v10.4S, v7.4S, v0.4S -mul v7.4S, v7.4S,v14.4S -mla v7.4S, v10.4S, v31.s[0] -sub v10.4s, v5.4s, v7.4s -add v5.4s, v5.4s, v7.4s -sqrdmulh v7.4S, v5.4S, v21.4S -mul v5.4S, v5.4S,v11.4S -mla v5.4S, v7.4S, v31.s[0] -sub v7.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v10.4S, v22.4S -mul v10.4S, v10.4S,v17.4S -mla v10.4S, v5.4S, v31.s[0] -sub v5.4s, v2.4s, v10.4s -add v2.4s, v2.4s, v10.4s -str q6, [x0, #192] -str q7, [x0, #208] -str q2, [x0, #224] -str q5, [x0, #240] -ldr q5, [x17, #+640] -ldr q2, [x17, #+656] -ldr q7, [x17, #+672] -ldr q6, [x17, #+688] -ldr q10, [x17, #+704] -ldr q15, [x17, #+720] -ldr q4, [x17, #+736] -ldr q16, [x17, #+752] -ldr q22, [x0, #288] -ldr q17, [x0, #304] -ldr q21, [x0, #256] -ldr q11, [x0, #272] -sqrdmulh v0.4S, v22.4S, v2.s[0] -mul v22.4S, v22.4S,v5.s[0] -mla v22.4S, v0.4S, v31.s[0] -sub v0.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v17.4S, v2.s[0] -mul v17.4S, v17.4S,v5.s[0] -mla v17.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v17.4s -add v11.4s, v11.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v2.s[1] -mul v11.4S, v11.4S,v5.s[1] -mla v11.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v2.s[2] -mul v22.4S, v22.4S,v5.s[2] -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v0.4s, v22.4s -add v0.4s, v0.4s, v22.4s -trn1 v22.4S, v21.4S, v17.4S -trn2 v14.4S, v21.4S, v17.4S -trn1 v13.4S, v0.4S, v11.4S -trn2 v19.4S, v0.4S, v11.4S -trn2 v0.2D, v22.2D, v13.2D -trn2 v11.2D, v14.2D, v19.2D -trn1 v21.2D, v22.2D, v13.2D -trn1 v17.2D, v14.2D, v19.2D -sqrdmulh v19.4S, v0.4S, v6.4S -mul v0.4S, v0.4S,v7.4S -mla v0.4S, v19.4S, v31.s[0] -sub v19.4s, v21.4s, v0.4s -add v21.4s, v21.4s, v0.4s -sqrdmulh v0.4S, v11.4S, v6.4S -mul v11.4S, v11.4S,v7.4S -mla v11.4S, v0.4S, v31.s[0] -sub v0.4s, v17.4s, v11.4s -add v17.4s, v17.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v15.4S -mul v17.4S, v17.4S,v10.4S -mla v17.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v17.4s -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v16.4S -mul v0.4S, v0.4S,v4.4S -mla v0.4S, v17.4S, v31.s[0] -sub v17.4s, v19.4s, v0.4s -add v19.4s, v19.4s, v0.4s -str q21, [x0, #256] -str q11, [x0, #272] -str q19, [x0, #288] -str q17, [x0, #304] -ldr q17, [x17, #+768] -ldr q19, [x17, #+784] -ldr q11, [x17, #+800] -ldr q21, [x17, #+816] -ldr q0, [x17, #+832] -ldr q14, [x17, #+848] -ldr q13, [x17, #+864] -ldr q22, [x17, #+880] -ldr q16, [x0, #352] -ldr q4, [x0, #368] -ldr q15, [x0, #320] -ldr q10, [x0, #336] -sqrdmulh v6.4S, v16.4S, v19.s[0] -mul v16.4S, v16.4S,v17.s[0] -mla v16.4S, v6.4S, v31.s[0] -sub v6.4s, v15.4s, v16.4s -add v15.4s, v15.4s, v16.4s -sqrdmulh v16.4S, v4.4S, v19.s[0] -mul v4.4S, v4.4S,v17.s[0] -mla v4.4S, v16.4S, v31.s[0] -sub v16.4s, v10.4s, v4.4s -add v10.4s, v10.4s, v4.4s -sqrdmulh v4.4S, v10.4S, v19.s[1] -mul v10.4S, v10.4S,v17.s[1] -mla v10.4S, v4.4S, v31.s[0] -sub v4.4s, v15.4s, v10.4s -add v15.4s, v15.4s, v10.4s -sqrdmulh v10.4S, v16.4S, v19.s[2] -mul v16.4S, v16.4S,v17.s[2] -mla v16.4S, v10.4S, v31.s[0] -sub v10.4s, v6.4s, v16.4s -add v6.4s, v6.4s, v16.4s -trn1 v16.4S, v15.4S, v4.4S -trn2 v7.4S, v15.4S, v4.4S -trn1 v2.4S, v6.4S, v10.4S -trn2 v5.4S, v6.4S, v10.4S -trn2 v6.2D, v16.2D, v2.2D -trn2 v10.2D, v7.2D, v5.2D -trn1 v15.2D, v16.2D, v2.2D -trn1 v4.2D, v7.2D, v5.2D -sqrdmulh v5.4S, v6.4S, v21.4S -mul v6.4S, v6.4S,v11.4S -mla v6.4S, v5.4S, v31.s[0] -sub v5.4s, v15.4s, v6.4s -add v15.4s, v15.4s, v6.4s -sqrdmulh v6.4S, v10.4S, v21.4S -mul v10.4S, v10.4S,v11.4S -mla v10.4S, v6.4S, v31.s[0] -sub v6.4s, v4.4s, v10.4s -add v4.4s, v4.4s, v10.4s -sqrdmulh v10.4S, v4.4S, v14.4S -mul v4.4S, v4.4S,v0.4S -mla v4.4S, v10.4S, v31.s[0] -sub v10.4s, v15.4s, v4.4s -add v15.4s, v15.4s, v4.4s -sqrdmulh v4.4S, v6.4S, v22.4S -mul v6.4S, v6.4S,v13.4S -mla v6.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v6.4s -add v5.4s, v5.4s, v6.4s -str q15, [x0, #320] -str q10, [x0, #336] -str q5, [x0, #352] -str q4, [x0, #368] -ldr q4, [x17, #+896] -ldr q5, [x17, #+912] -ldr q10, [x17, #+928] -ldr q15, [x17, #+944] -ldr q6, [x17, #+960] -ldr q7, [x17, #+976] -ldr q2, [x17, #+992] -ldr q16, [x17, #+1008] -ldr q22, [x0, #416] -ldr q13, [x0, #432] -ldr q14, [x0, #384] -ldr q0, [x0, #400] -sqrdmulh v21.4S, v22.4S, v5.s[0] -mul v22.4S, v22.4S,v4.s[0] -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v22.4s -add v14.4s, v14.4s, v22.4s -sqrdmulh v22.4S, v13.4S, v5.s[0] -mul v13.4S, v13.4S,v4.s[0] -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v0.4s, v13.4s -add v0.4s, v0.4s, v13.4s -sqrdmulh v13.4S, v0.4S, v5.s[1] -mul v0.4S, v0.4S,v4.s[1] -mla v0.4S, v13.4S, v31.s[0] -sub v13.4s, v14.4s, v0.4s -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v22.4S, v5.s[2] -mul v22.4S, v22.4S,v4.s[2] -mla v22.4S, v0.4S, v31.s[0] -sub v0.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -trn1 v22.4S, v14.4S, v13.4S -trn2 v11.4S, v14.4S, v13.4S -trn1 v19.4S, v21.4S, v0.4S -trn2 v17.4S, v21.4S, v0.4S -trn2 v21.2D, v22.2D, v19.2D -trn2 v0.2D, v11.2D, v17.2D -trn1 v14.2D, v22.2D, v19.2D -trn1 v13.2D, v11.2D, v17.2D -sqrdmulh v17.4S, v21.4S, v15.4S -mul v21.4S, v21.4S,v10.4S -mla v21.4S, v17.4S, v31.s[0] -sub v17.4s, v14.4s, v21.4s -add v14.4s, v14.4s, v21.4s -sqrdmulh v21.4S, v0.4S, v15.4S -mul v0.4S, v0.4S,v10.4S -mla v0.4S, v21.4S, v31.s[0] -sub v21.4s, v13.4s, v0.4s -add v13.4s, v13.4s, v0.4s -sqrdmulh v0.4S, v13.4S, v7.4S -mul v13.4S, v13.4S,v6.4S -mla v13.4S, v0.4S, v31.s[0] -sub v0.4s, v14.4s, v13.4s -add v14.4s, v14.4s, v13.4s -sqrdmulh v13.4S, v21.4S, v16.4S -mul v21.4S, v21.4S,v2.4S -mla v21.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -str q14, [x0, #384] -str q0, [x0, #400] -str q17, [x0, #416] -str q13, [x0, #432] -ldr q13, [x17, #+1024] -ldr q17, [x17, #+1040] -ldr q0, [x17, #+1056] -ldr q14, [x17, #+1072] -ldr q21, [x17, #+1088] -ldr q11, [x17, #+1104] -ldr q19, [x17, #+1120] -ldr q22, [x17, #+1136] -ldr q16, [x0, #480] -ldr q2, [x0, #496] -ldr q7, [x0, #448] -ldr q6, [x0, #464] -sqrdmulh v15.4S, v16.4S, v17.s[0] -mul v16.4S, v16.4S,v13.s[0] -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v7.4s, v16.4s -add v7.4s, v7.4s, v16.4s -sqrdmulh v16.4S, v2.4S, v17.s[0] -mul v2.4S, v2.4S,v13.s[0] -mla v2.4S, v16.4S, v31.s[0] -sub v16.4s, v6.4s, v2.4s -add v6.4s, v6.4s, v2.4s -sqrdmulh v2.4S, v6.4S, v17.s[1] -mul v6.4S, v6.4S,v13.s[1] -mla v6.4S, v2.4S, v31.s[0] -sub v2.4s, v7.4s, v6.4s -add v7.4s, v7.4s, v6.4s -sqrdmulh v6.4S, v16.4S, v17.s[2] -mul v16.4S, v16.4S,v13.s[2] -mla v16.4S, v6.4S, v31.s[0] -sub v6.4s, v15.4s, v16.4s -add v15.4s, v15.4s, v16.4s -trn1 v16.4S, v7.4S, v2.4S -trn2 v10.4S, v7.4S, v2.4S -trn1 v5.4S, v15.4S, v6.4S -trn2 v4.4S, v15.4S, v6.4S -trn2 v15.2D, v16.2D, v5.2D -trn2 v6.2D, v10.2D, v4.2D -trn1 v7.2D, v16.2D, v5.2D -trn1 v2.2D, v10.2D, v4.2D -sqrdmulh v4.4S, v15.4S, v14.4S -mul v15.4S, v15.4S,v0.4S -mla v15.4S, v4.4S, v31.s[0] -sub v4.4s, v7.4s, v15.4s -add v7.4s, v7.4s, v15.4s -sqrdmulh v15.4S, v6.4S, v14.4S -mul v6.4S, v6.4S,v0.4S -mla v6.4S, v15.4S, v31.s[0] -sub v15.4s, v2.4s, v6.4s -add v2.4s, v2.4s, v6.4s -sqrdmulh v6.4S, v2.4S, v11.4S -mul v2.4S, v2.4S,v21.4S -mla v2.4S, v6.4S, v31.s[0] -sub v6.4s, v7.4s, v2.4s -add v7.4s, v7.4s, v2.4s -sqrdmulh v2.4S, v15.4S, v22.4S -mul v15.4S, v15.4S,v19.4S -mla v15.4S, v2.4S, v31.s[0] -sub v2.4s, v4.4s, v15.4s -add v4.4s, v4.4s, v15.4s -str q7, [x0, #448] -str q6, [x0, #464] -str q4, [x0, #480] -str q2, [x0, #496] -ldr q2, [x17, #+1152] -ldr q4, [x17, #+1168] -ldr q6, [x17, #+1184] -ldr q7, [x17, #+1200] -ldr q15, [x17, #+1216] -ldr q10, [x17, #+1232] -ldr q5, [x17, #+1248] -ldr q16, [x17, #+1264] -ldr q22, [x0, #544] -ldr q19, [x0, #560] -ldr q11, [x0, #512] -ldr q21, [x0, #528] -sqrdmulh v14.4S, v22.4S, v4.s[0] -mul v22.4S, v22.4S,v2.s[0] -mla v22.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v4.s[0] -mul v19.4S, v19.4S,v2.s[0] -mla v19.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v19.4s -add v21.4s, v21.4s, v19.4s -sqrdmulh v19.4S, v21.4S, v4.s[1] -mul v21.4S, v21.4S,v2.s[1] -mla v21.4S, v19.4S, v31.s[0] -sub v19.4s, v11.4s, v21.4s -add v11.4s, v11.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v4.s[2] -mul v22.4S, v22.4S,v2.s[2] -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v22.4s -add v14.4s, v14.4s, v22.4s -trn1 v22.4S, v11.4S, v19.4S -trn2 v0.4S, v11.4S, v19.4S -trn1 v17.4S, v14.4S, v21.4S -trn2 v13.4S, v14.4S, v21.4S -trn2 v14.2D, v22.2D, v17.2D -trn2 v21.2D, v0.2D, v13.2D -trn1 v11.2D, v22.2D, v17.2D -trn1 v19.2D, v0.2D, v13.2D -sqrdmulh v13.4S, v14.4S, v7.4S -mul v14.4S, v14.4S,v6.4S -mla v14.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v7.4S -mul v21.4S, v21.4S,v6.4S -mla v21.4S, v14.4S, v31.s[0] -sub v14.4s, v19.4s, v21.4s -add v19.4s, v19.4s, v21.4s -sqrdmulh v21.4S, v19.4S, v10.4S -mul v19.4S, v19.4S,v15.4S -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v19.4s -add v11.4s, v11.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v16.4S -mul v14.4S, v14.4S,v5.4S -mla v14.4S, v19.4S, v31.s[0] -sub v19.4s, v13.4s, v14.4s -add v13.4s, v13.4s, v14.4s -str q11, [x0, #512] -str q21, [x0, #528] -str q13, [x0, #544] -str q19, [x0, #560] -ldr q19, [x17, #+1280] -ldr q13, [x17, #+1296] -ldr q21, [x17, #+1312] -ldr q11, [x17, #+1328] -ldr q14, [x17, #+1344] -ldr q0, [x17, #+1360] -ldr q17, [x17, #+1376] -ldr q22, [x17, #+1392] -ldr q16, [x0, #608] -ldr q5, [x0, #624] -ldr q10, [x0, #576] -ldr q15, [x0, #592] -sqrdmulh v7.4S, v16.4S, v13.s[0] -mul v16.4S, v16.4S,v19.s[0] -mla v16.4S, v7.4S, v31.s[0] -sub v7.4s, v10.4s, v16.4s -add v10.4s, v10.4s, v16.4s -sqrdmulh v16.4S, v5.4S, v13.s[0] -mul v5.4S, v5.4S,v19.s[0] -mla v5.4S, v16.4S, v31.s[0] -sub v16.4s, v15.4s, v5.4s -add v15.4s, v15.4s, v5.4s -sqrdmulh v5.4S, v15.4S, v13.s[1] -mul v15.4S, v15.4S,v19.s[1] -mla v15.4S, v5.4S, v31.s[0] -sub v5.4s, v10.4s, v15.4s -add v10.4s, v10.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v13.s[2] -mul v16.4S, v16.4S,v19.s[2] -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v7.4s, v16.4s -add v7.4s, v7.4s, v16.4s -trn1 v16.4S, v10.4S, v5.4S -trn2 v6.4S, v10.4S, v5.4S -trn1 v4.4S, v7.4S, v15.4S -trn2 v2.4S, v7.4S, v15.4S -trn2 v7.2D, v16.2D, v4.2D -trn2 v15.2D, v6.2D, v2.2D -trn1 v10.2D, v16.2D, v4.2D -trn1 v5.2D, v6.2D, v2.2D -sqrdmulh v2.4S, v7.4S, v11.4S -mul v7.4S, v7.4S,v21.4S -mla v7.4S, v2.4S, v31.s[0] -sub v2.4s, v10.4s, v7.4s -add v10.4s, v10.4s, v7.4s -sqrdmulh v7.4S, v15.4S, v11.4S -mul v15.4S, v15.4S,v21.4S -mla v15.4S, v7.4S, v31.s[0] -sub v7.4s, v5.4s, v15.4s -add v5.4s, v5.4s, v15.4s -sqrdmulh v15.4S, v5.4S, v0.4S -mul v5.4S, v5.4S,v14.4S -mla v5.4S, v15.4S, v31.s[0] -sub v15.4s, v10.4s, v5.4s -add v10.4s, v10.4s, v5.4s -sqrdmulh v5.4S, v7.4S, v22.4S -mul v7.4S, v7.4S,v17.4S -mla v7.4S, v5.4S, v31.s[0] -sub v5.4s, v2.4s, v7.4s -add v2.4s, v2.4s, v7.4s -str q10, [x0, #576] -str q15, [x0, #592] -str q2, [x0, #608] -str q5, [x0, #624] -ldr q5, [x17, #+1408] -ldr q2, [x17, #+1424] -ldr q15, [x17, #+1440] -ldr q10, [x17, #+1456] -ldr q7, [x17, #+1472] -ldr q6, [x17, #+1488] -ldr q4, [x17, #+1504] -ldr q16, [x17, #+1520] -ldr q22, [x0, #672] -ldr q17, [x0, #688] -ldr q0, [x0, #640] -ldr q14, [x0, #656] -sqrdmulh v11.4S, v22.4S, v2.s[0] -mul v22.4S, v22.4S,v5.s[0] -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v0.4s, v22.4s -add v0.4s, v0.4s, v22.4s -sqrdmulh v22.4S, v17.4S, v2.s[0] -mul v17.4S, v17.4S,v5.s[0] -mla v17.4S, v22.4S, v31.s[0] -sub v22.4s, v14.4s, v17.4s -add v14.4s, v14.4s, v17.4s -sqrdmulh v17.4S, v14.4S, v2.s[1] -mul v14.4S, v14.4S,v5.s[1] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v0.4s, v14.4s -add v0.4s, v0.4s, v14.4s -sqrdmulh v14.4S, v22.4S, v2.s[2] -mul v22.4S, v22.4S,v5.s[2] -mla v22.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -trn1 v22.4S, v0.4S, v17.4S -trn2 v21.4S, v0.4S, v17.4S -trn1 v13.4S, v11.4S, v14.4S -trn2 v19.4S, v11.4S, v14.4S -trn2 v11.2D, v22.2D, v13.2D -trn2 v14.2D, v21.2D, v19.2D -trn1 v0.2D, v22.2D, v13.2D -trn1 v17.2D, v21.2D, v19.2D -sqrdmulh v19.4S, v11.4S, v10.4S -mul v11.4S, v11.4S,v15.4S -mla v11.4S, v19.4S, v31.s[0] -sub v19.4s, v0.4s, v11.4s -add v0.4s, v0.4s, v11.4s -sqrdmulh v11.4S, v14.4S, v10.4S -mul v14.4S, v14.4S,v15.4S -mla v14.4S, v11.4S, v31.s[0] -sub v11.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -sqrdmulh v14.4S, v17.4S, v6.4S -mul v17.4S, v17.4S,v7.4S -mla v17.4S, v14.4S, v31.s[0] -sub v14.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v16.4S -mul v11.4S, v11.4S,v4.4S -mla v11.4S, v17.4S, v31.s[0] -sub v17.4s, v19.4s, v11.4s -add v19.4s, v19.4s, v11.4s -str q0, [x0, #640] -str q14, [x0, #656] -str q19, [x0, #672] -str q17, [x0, #688] -ldr q17, [x17, #+1536] -ldr q19, [x17, #+1552] -ldr q14, [x17, #+1568] -ldr q0, [x17, #+1584] -ldr q11, [x17, #+1600] -ldr q21, [x17, #+1616] -ldr q13, [x17, #+1632] -ldr q22, [x17, #+1648] -ldr q16, [x0, #736] -ldr q4, [x0, #752] -ldr q6, [x0, #704] -ldr q7, [x0, #720] -sqrdmulh v10.4S, v16.4S, v19.s[0] -mul v16.4S, v16.4S,v17.s[0] -mla v16.4S, v10.4S, v31.s[0] -sub v10.4s, v6.4s, v16.4s -add v6.4s, v6.4s, v16.4s -sqrdmulh v16.4S, v4.4S, v19.s[0] -mul v4.4S, v4.4S,v17.s[0] -mla v4.4S, v16.4S, v31.s[0] -sub v16.4s, v7.4s, v4.4s -add v7.4s, v7.4s, v4.4s -sqrdmulh v4.4S, v7.4S, v19.s[1] -mul v7.4S, v7.4S,v17.s[1] -mla v7.4S, v4.4S, v31.s[0] -sub v4.4s, v6.4s, v7.4s -add v6.4s, v6.4s, v7.4s -sqrdmulh v7.4S, v16.4S, v19.s[2] -mul v16.4S, v16.4S,v17.s[2] -mla v16.4S, v7.4S, v31.s[0] -sub v7.4s, v10.4s, v16.4s -add v10.4s, v10.4s, v16.4s -trn1 v16.4S, v6.4S, v4.4S -trn2 v15.4S, v6.4S, v4.4S -trn1 v2.4S, v10.4S, v7.4S -trn2 v5.4S, v10.4S, v7.4S -trn2 v10.2D, v16.2D, v2.2D -trn2 v7.2D, v15.2D, v5.2D -trn1 v6.2D, v16.2D, v2.2D -trn1 v4.2D, v15.2D, v5.2D -sqrdmulh v5.4S, v10.4S, v0.4S -mul v10.4S, v10.4S,v14.4S -mla v10.4S, v5.4S, v31.s[0] -sub v5.4s, v6.4s, v10.4s -add v6.4s, v6.4s, v10.4s -sqrdmulh v10.4S, v7.4S, v0.4S -mul v7.4S, v7.4S,v14.4S -mla v7.4S, v10.4S, v31.s[0] -sub v10.4s, v4.4s, v7.4s -add v4.4s, v4.4s, v7.4s -sqrdmulh v7.4S, v4.4S, v21.4S -mul v4.4S, v4.4S,v11.4S -mla v4.4S, v7.4S, v31.s[0] -sub v7.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -sqrdmulh v4.4S, v10.4S, v22.4S -mul v10.4S, v10.4S,v13.4S -mla v10.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v10.4s -add v5.4s, v5.4s, v10.4s -str q6, [x0, #704] -str q7, [x0, #720] -str q5, [x0, #736] -str q4, [x0, #752] -ldr q4, [x17, #+1664] -ldr q5, [x17, #+1680] -ldr q7, [x17, #+1696] -ldr q6, [x17, #+1712] -ldr q10, [x17, #+1728] -ldr q15, [x17, #+1744] -ldr q2, [x17, #+1760] -ldr q16, [x17, #+1776] -ldr q22, [x0, #800] -ldr q13, [x0, #816] -ldr q21, [x0, #768] -ldr q11, [x0, #784] -sqrdmulh v0.4S, v22.4S, v5.s[0] -mul v22.4S, v22.4S,v4.s[0] -mla v22.4S, v0.4S, v31.s[0] -sub v0.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v13.4S, v5.s[0] -mul v13.4S, v13.4S,v4.s[0] -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v11.4S, v5.s[1] -mul v11.4S, v11.4S,v4.s[1] -mla v11.4S, v13.4S, v31.s[0] -sub v13.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v5.s[2] -mul v22.4S, v22.4S,v4.s[2] -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v0.4s, v22.4s -add v0.4s, v0.4s, v22.4s -trn1 v22.4S, v21.4S, v13.4S -trn2 v14.4S, v21.4S, v13.4S -trn1 v19.4S, v0.4S, v11.4S -trn2 v17.4S, v0.4S, v11.4S -trn2 v0.2D, v22.2D, v19.2D -trn2 v11.2D, v14.2D, v17.2D -trn1 v21.2D, v22.2D, v19.2D -trn1 v13.2D, v14.2D, v17.2D -sqrdmulh v17.4S, v0.4S, v6.4S -mul v0.4S, v0.4S,v7.4S -mla v0.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v0.4s -add v21.4s, v21.4s, v0.4s -sqrdmulh v0.4S, v11.4S, v6.4S -mul v11.4S, v11.4S,v7.4S -mla v11.4S, v0.4S, v31.s[0] -sub v0.4s, v13.4s, v11.4s -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v15.4S -mul v13.4S, v13.4S,v10.4S -mla v13.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v0.4S, v16.4S -mul v0.4S, v0.4S,v2.4S -mla v0.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v0.4s -add v17.4s, v17.4s, v0.4s -str q21, [x0, #768] -str q11, [x0, #784] -str q17, [x0, #800] -str q13, [x0, #816] -ldr q13, [x17, #+1792] -ldr q17, [x17, #+1808] -ldr q11, [x17, #+1824] -ldr q21, [x17, #+1840] -ldr q0, [x17, #+1856] -ldr q14, [x17, #+1872] -ldr q19, [x17, #+1888] -ldr q22, [x17, #+1904] -ldr q16, [x0, #864] -ldr q2, [x0, #880] -ldr q15, [x0, #832] -ldr q10, [x0, #848] -sqrdmulh v6.4S, v16.4S, v17.s[0] -mul v16.4S, v16.4S,v13.s[0] -mla v16.4S, v6.4S, v31.s[0] -sub v6.4s, v15.4s, v16.4s -add v15.4s, v15.4s, v16.4s -sqrdmulh v16.4S, v2.4S, v17.s[0] -mul v2.4S, v2.4S,v13.s[0] -mla v2.4S, v16.4S, v31.s[0] -sub v16.4s, v10.4s, v2.4s -add v10.4s, v10.4s, v2.4s -sqrdmulh v2.4S, v10.4S, v17.s[1] -mul v10.4S, v10.4S,v13.s[1] -mla v10.4S, v2.4S, v31.s[0] -sub v2.4s, v15.4s, v10.4s -add v15.4s, v15.4s, v10.4s -sqrdmulh v10.4S, v16.4S, v17.s[2] -mul v16.4S, v16.4S,v13.s[2] -mla v16.4S, v10.4S, v31.s[0] -sub v10.4s, v6.4s, v16.4s -add v6.4s, v6.4s, v16.4s -trn1 v16.4S, v15.4S, v2.4S -trn2 v7.4S, v15.4S, v2.4S -trn1 v5.4S, v6.4S, v10.4S -trn2 v4.4S, v6.4S, v10.4S -trn2 v6.2D, v16.2D, v5.2D -trn2 v10.2D, v7.2D, v4.2D -trn1 v15.2D, v16.2D, v5.2D -trn1 v2.2D, v7.2D, v4.2D -sqrdmulh v4.4S, v6.4S, v21.4S -mul v6.4S, v6.4S,v11.4S -mla v6.4S, v4.4S, v31.s[0] -sub v4.4s, v15.4s, v6.4s -add v15.4s, v15.4s, v6.4s -sqrdmulh v6.4S, v10.4S, v21.4S -mul v10.4S, v10.4S,v11.4S -mla v10.4S, v6.4S, v31.s[0] -sub v6.4s, v2.4s, v10.4s -add v2.4s, v2.4s, v10.4s -sqrdmulh v10.4S, v2.4S, v14.4S -mul v2.4S, v2.4S,v0.4S -mla v2.4S, v10.4S, v31.s[0] -sub v10.4s, v15.4s, v2.4s -add v15.4s, v15.4s, v2.4s -sqrdmulh v2.4S, v6.4S, v22.4S -mul v6.4S, v6.4S,v19.4S -mla v6.4S, v2.4S, v31.s[0] -sub v2.4s, v4.4s, v6.4s -add v4.4s, v4.4s, v6.4s -str q15, [x0, #832] -str q10, [x0, #848] -str q4, [x0, #864] -str q2, [x0, #880] -ldr q2, [x17, #+1920] -ldr q4, [x17, #+1936] -ldr q10, [x17, #+1952] -ldr q15, [x17, #+1968] -ldr q6, [x17, #+1984] -ldr q7, [x17, #+2000] -ldr q5, [x17, #+2016] -ldr q16, [x17, #+2032] -ldr q22, [x0, #928] -ldr q19, [x0, #944] -ldr q14, [x0, #896] -ldr q0, [x0, #912] -sqrdmulh v21.4S, v22.4S, v4.s[0] -mul v22.4S, v22.4S,v2.s[0] -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v22.4s -add v14.4s, v14.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v4.s[0] -mul v19.4S, v19.4S,v2.s[0] -mla v19.4S, v22.4S, v31.s[0] -sub v22.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v0.4S, v4.s[1] -mul v0.4S, v0.4S,v2.s[1] -mla v0.4S, v19.4S, v31.s[0] -sub v19.4s, v14.4s, v0.4s -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v22.4S, v4.s[2] -mul v22.4S, v22.4S,v2.s[2] -mla v22.4S, v0.4S, v31.s[0] -sub v0.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -trn1 v22.4S, v14.4S, v19.4S -trn2 v11.4S, v14.4S, v19.4S -trn1 v17.4S, v21.4S, v0.4S -trn2 v13.4S, v21.4S, v0.4S -trn2 v21.2D, v22.2D, v17.2D -trn2 v0.2D, v11.2D, v13.2D -trn1 v14.2D, v22.2D, v17.2D -trn1 v19.2D, v11.2D, v13.2D -sqrdmulh v13.4S, v21.4S, v15.4S -mul v21.4S, v21.4S,v10.4S -mla v21.4S, v13.4S, v31.s[0] -sub v13.4s, v14.4s, v21.4s -add v14.4s, v14.4s, v21.4s -sqrdmulh v21.4S, v0.4S, v15.4S -mul v0.4S, v0.4S,v10.4S -mla v0.4S, v21.4S, v31.s[0] -sub v21.4s, v19.4s, v0.4s -add v19.4s, v19.4s, v0.4s -sqrdmulh v0.4S, v19.4S, v7.4S -mul v19.4S, v19.4S,v6.4S -mla v19.4S, v0.4S, v31.s[0] -sub v0.4s, v14.4s, v19.4s -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v21.4S, v16.4S -mul v21.4S, v21.4S,v5.4S -mla v21.4S, v19.4S, v31.s[0] -sub v19.4s, v13.4s, v21.4s -add v13.4s, v13.4s, v21.4s -str q14, [x0, #896] -str q0, [x0, #912] -str q13, [x0, #928] -str q19, [x0, #944] -ldr q19, [x17, #+2048] -ldr q13, [x17, #+2064] -ldr q0, [x17, #+2080] -ldr q14, [x17, #+2096] -ldr q21, [x17, #+2112] -ldr q11, [x17, #+2128] -ldr q17, [x17, #+2144] -ldr q22, [x17, #+2160] -ldr q16, [x0, #992] -ldr q5, [x0, #1008] -ldr q7, [x0, #960] -ldr q6, [x0, #976] -sqrdmulh v15.4S, v16.4S, v13.s[0] -mul v16.4S, v16.4S,v19.s[0] -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v7.4s, v16.4s -add v7.4s, v7.4s, v16.4s -sqrdmulh v16.4S, v5.4S, v13.s[0] -mul v5.4S, v5.4S,v19.s[0] -mla v5.4S, v16.4S, v31.s[0] -sub v16.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v6.4S, v13.s[1] -mul v6.4S, v6.4S,v19.s[1] -mla v6.4S, v5.4S, v31.s[0] -sub v5.4s, v7.4s, v6.4s -add v7.4s, v7.4s, v6.4s -sqrdmulh v6.4S, v16.4S, v13.s[2] -mul v16.4S, v16.4S,v19.s[2] -mla v16.4S, v6.4S, v31.s[0] -sub v6.4s, v15.4s, v16.4s -add v15.4s, v15.4s, v16.4s -trn1 v16.4S, v7.4S, v5.4S -trn2 v10.4S, v7.4S, v5.4S -trn1 v4.4S, v15.4S, v6.4S -trn2 v2.4S, v15.4S, v6.4S -trn2 v15.2D, v16.2D, v4.2D -trn2 v6.2D, v10.2D, v2.2D -trn1 v7.2D, v16.2D, v4.2D -trn1 v5.2D, v10.2D, v2.2D -sqrdmulh v2.4S, v15.4S, v14.4S -mul v15.4S, v15.4S,v0.4S -mla v15.4S, v2.4S, v31.s[0] -sub v2.4s, v7.4s, v15.4s -add v7.4s, v7.4s, v15.4s -sqrdmulh v15.4S, v6.4S, v14.4S -mul v6.4S, v6.4S,v0.4S -mla v6.4S, v15.4S, v31.s[0] -sub v15.4s, v5.4s, v6.4s -add v5.4s, v5.4s, v6.4s -sqrdmulh v6.4S, v5.4S, v11.4S -mul v5.4S, v5.4S,v21.4S -mla v5.4S, v6.4S, v31.s[0] -sub v6.4s, v7.4s, v5.4s -add v7.4s, v7.4s, v5.4s -sqrdmulh v5.4S, v15.4S, v22.4S -mul v15.4S, v15.4S,v17.4S -mla v15.4S, v5.4S, v31.s[0] -sub v5.4s, v2.4s, v15.4s -add v2.4s, v2.4s, v15.4s -str q7, [x0, #960] -str q6, [x0, #976] -str q2, [x0, #992] -str q5, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2392 -// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_0.s deleted file mode 100644 index dadc45d..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_0.s +++ /dev/null @@ -1,2422 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_3_z2_0 -.global _ntt_u32_full_neon_asm_var_4_4_3_z2_0 -ntt_u32_full_neon_asm_var_4_4_3_z2_0: -_ntt_u32_full_neon_asm_var_4_4_3_z2_0: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #800] -ldr q21, [x0, #864] -ldr q20, [x0, #928] -ldr q19, [x0, #992] -ldr q18, [x0, #288] -ldr q17, [x0, #352] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -sqrdmulh v2.4S, v22.4S, v29.s[0] -ldr q1, [x0, #544] -mul v22.4S, v22.4S,v30.s[0] -ldr q0, [x0, #608] -sqrdmulh v15.4S, v21.4S, v29.s[0] -ldr q14, [x0, #672] -mul v21.4S, v21.4S,v30.s[0] -ldr q13, [x0, #736] -mla v22.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q12, [x0, #32] -sub v11.4s, v18.4s, v22.4s -mla v21.4S, v15.4S, v31.s[0] -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q15, [x0, #96] -sub v10.4s, v17.4s, v21.4s -mla v20.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v1.4S, v29.s[0] -ldr q2, [x0, #160] -mul v1.4S, v1.4S,v30.s[0] -sub v9.4s, v16.4s, v20.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v0.4S, v29.s[0] -ldr q22, [x0, #224] -mul v0.4S, v0.4S,v30.s[0] -sub v8.4s, v3.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v12.4s, v1.4s -mla v0.4S, v20.4S, v31.s[0] -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -sub v20.4s, v15.4s, v0.4s -mla v14.4S, v19.4S, v31.s[0] -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v19.4s, v2.4s, v14.4s -mla v13.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v1.4s, v22.4s, v13.4s -mla v16.4S, v0.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v0.4s, v2.4s, v16.4s -mla v3.4S, v14.4S, v31.s[0] -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v14.4s, v22.4s, v3.4s -mla v18.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v13.4s, v12.4s, v18.4s -mla v17.4S, v16.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v8.4S, v29.s[2] -mul v8.4S, v8.4S,v30.s[2] -sub v16.4s, v15.4s, v17.4s -mla v9.4S, v3.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v3.4s, v19.4s, v9.4s -mla v8.4S, v18.4S, v31.s[0] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v8.4s -mla v11.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v8.4s -sqrdmulh v8.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v17.4s, v21.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v27.s[0] -mul v22.4S, v22.4S,v28.s[0] -sub v9.4s, v20.4s, v10.4s -mla v2.4S, v8.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v8.4s, v12.4s, v2.4s -mla v22.4S, v11.4S, v31.s[0] -add v12.4s, v12.4s, v2.4s -sqrdmulh v2.4S, v14.4S, v27.s[1] -mul v14.4S, v14.4S,v28.s[1] -sub v11.4s, v15.4s, v22.4s -mla v0.4S, v10.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v27.s[2] -mul v19.4S, v19.4S,v28.s[2] -sub v10.4s, v13.4s, v0.4s -mla v14.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v2.4s, v16.4s, v14.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v27.s[3] -mul v3.4S, v3.4S,v28.s[3] -sub v22.4s, v21.4s, v19.4s -mla v1.4S, v0.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v0.4s, v20.4s, v1.4s -mla v3.4S, v14.4S, v31.s[0] -add v20.4s, v20.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v25.s[0] -mul v15.4S, v15.4S,v26.s[0] -sub v14.4s, v17.4s, v3.4s -mla v18.4S, v19.4S, v31.s[0] -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v11.4S, v25.s[1] -mul v11.4S, v11.4S,v26.s[1] -sub v19.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v1.4s, v12.4s, v15.4s -mla v11.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v25.s[3] -mul v2.4S, v2.4S,v26.s[3] -sub v3.4s, v8.4s, v11.4s -mla v16.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v11.4s -str q12, [x0, #32] -sqrdmulh v12.4S, v20.4S, v23.s[0] -str q1, [x0, #96] -mul v20.4S, v20.4S,v24.s[0] -ldr q1, [x0, #816] -sub v11.4s, v13.4s, v16.4s -ldr q18, [x0, #880] -mla v2.4S, v15.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -str q8, [x0, #160] -sqrdmulh v8.4S, v0.4S, v23.s[1] -str q3, [x0, #224] -mul v0.4S, v0.4S,v24.s[1] -ldr q3, [x0, #944] -sub v16.4s, v10.4s, v2.4s -ldr q15, [x0, #1008] -mla v20.4S, v12.4S, v31.s[0] -add v10.4s, v10.4s, v2.4s -str q13, [x0, #288] -sqrdmulh v13.4S, v9.4S, v23.s[2] -str q11, [x0, #352] -mul v9.4S, v9.4S,v24.s[2] -ldr q11, [x0, #304] -sub v2.4s, v21.4s, v20.4s -ldr q12, [x0, #368] -mla v0.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v20.4s -str q10, [x0, #416] -sqrdmulh v10.4S, v19.4S, v23.s[3] -str q16, [x0, #480] -mul v19.4S, v19.4S,v24.s[3] -ldr q16, [x0, #432] -sub v20.4s, v22.4s, v0.4s -ldr q8, [x0, #496] -mla v9.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -str q21, [x0, #544] -sqrdmulh v21.4S, v1.4S, v29.s[0] -str q2, [x0, #608] -ldr q2, [x0, #560] -mul v1.4S, v1.4S,v30.s[0] -ldr q0, [x0, #624] -sub v13.4s, v17.4s, v9.4s -mla v19.4S, v10.4S, v31.s[0] -add v17.4s, v17.4s, v9.4s -str q22, [x0, #672] -sqrdmulh v22.4S, v18.4S, v29.s[0] -str q20, [x0, #736] -ldr q20, [x0, #688] -mul v18.4S, v18.4S,v30.s[0] -ldr q9, [x0, #752] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -str q17, [x0, #800] -sqrdmulh v17.4S, v3.4S, v29.s[0] -str q13, [x0, #864] -mul v3.4S, v3.4S,v30.s[0] -ldr q13, [x0, #48] -sub v19.4s, v11.4s, v1.4s -mla v18.4S, v22.4S, v31.s[0] -add v11.4s, v11.4s, v1.4s -str q14, [x0, #928] -sqrdmulh v14.4S, v15.4S, v29.s[0] -str q10, [x0, #992] -mul v15.4S, v15.4S,v30.s[0] -ldr q10, [x0, #112] -sub v1.4s, v12.4s, v18.4s -mla v3.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v29.s[0] -ldr q17, [x0, #176] -mul v2.4S, v2.4S,v30.s[0] -sub v22.4s, v16.4s, v3.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v0.4S, v29.s[0] -ldr q14, [x0, #240] -mul v0.4S, v0.4S,v30.s[0] -sub v21.4s, v8.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -sub v18.4s, v13.4s, v2.4s -mla v0.4S, v3.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v9.4S, v29.s[0] -mul v9.4S, v9.4S,v30.s[0] -sub v3.4s, v10.4s, v0.4s -mla v20.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v15.4s, v17.4s, v20.4s -mla v9.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sub v2.4s, v14.4s, v9.4s -mla v16.4S, v0.4S, v31.s[0] -add v14.4s, v14.4s, v9.4s -sqrdmulh v9.4S, v11.4S, v29.s[1] -mul v11.4S, v11.4S,v30.s[1] -sub v0.4s, v17.4s, v16.4s -mla v8.4S, v20.4S, v31.s[0] -add v17.4s, v17.4s, v16.4s -sqrdmulh v16.4S, v12.4S, v29.s[1] -mul v12.4S, v12.4S,v30.s[1] -sub v20.4s, v14.4s, v8.4s -mla v11.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v8.4s -sqrdmulh v8.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v9.4s, v13.4s, v11.4s -mla v12.4S, v16.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v16.4s, v10.4s, v12.4s -mla v22.4S, v8.4S, v31.s[0] -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v8.4s, v15.4s, v22.4s -mla v21.4S, v11.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -sub v11.4s, v2.4s, v21.4s -mla v19.4S, v12.4S, v31.s[0] -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[0] -mul v17.4S, v17.4S,v28.s[0] -sub v12.4s, v18.4s, v19.4s -mla v1.4S, v22.4S, v31.s[0] -add v18.4s, v18.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v22.4s, v3.4s, v1.4s -mla v17.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v21.4s, v13.4s, v17.4s -mla v14.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v20.4S, v27.s[1] -mul v20.4S, v20.4S,v28.s[1] -sub v19.4s, v10.4s, v14.4s -mla v0.4S, v1.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v27.s[2] -mul v15.4S, v15.4S,v28.s[2] -sub v1.4s, v9.4s, v0.4s -mla v20.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v0.4s -sqrdmulh v0.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v17.4s, v16.4s, v20.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v14.4s, v18.4s, v15.4s -mla v2.4S, v0.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v27.s[3] -mul v11.4S, v11.4S,v28.s[3] -sub v0.4s, v3.4s, v2.4s -mla v8.4S, v20.4S, v31.s[0] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v10.4S, v25.s[0] -mul v10.4S, v10.4S,v26.s[0] -sub v20.4s, v12.4s, v8.4s -mla v11.4S, v15.4S, v31.s[0] -add v12.4s, v12.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v15.4s, v22.4s, v11.4s -mla v10.4S, v2.4S, v31.s[0] -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v2.4s, v13.4s, v10.4s -mla v19.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v10.4s -sqrdmulh v10.4S, v17.4S, v25.s[3] -mul v17.4S, v17.4S,v26.s[3] -sub v8.4s, v21.4s, v19.4s -mla v16.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -str q13, [x0, #48] -sqrdmulh v13.4S, v3.4S, v23.s[0] -str q2, [x0, #112] -mul v3.4S, v3.4S,v24.s[0] -ldr q2, [x0, #768] -sub v19.4s, v9.4s, v16.4s -ldr q11, [x0, #832] -mla v17.4S, v10.4S, v31.s[0] -add v9.4s, v9.4s, v16.4s -str q21, [x0, #176] -sqrdmulh v21.4S, v0.4S, v23.s[1] -str q8, [x0, #240] -mul v0.4S, v0.4S,v24.s[1] -ldr q8, [x0, #896] -sub v16.4s, v1.4s, v17.4s -ldr q10, [x0, #960] -mla v3.4S, v13.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -str q9, [x0, #304] -sqrdmulh v9.4S, v22.4S, v23.s[2] -str q19, [x0, #368] -mul v22.4S, v22.4S,v24.s[2] -ldr q19, [x0, #256] -sub v17.4s, v18.4s, v3.4s -ldr q13, [x0, #320] -mla v0.4S, v21.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -str q1, [x0, #432] -sqrdmulh v1.4S, v15.4S, v23.s[3] -str q16, [x0, #496] -mul v15.4S, v15.4S,v24.s[3] -ldr q16, [x0, #384] -sub v3.4s, v14.4s, v0.4s -ldr q21, [x0, #448] -mla v22.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -str q18, [x0, #560] -sqrdmulh v18.4S, v2.4S, v29.s[0] -str q17, [x0, #624] -ldr q17, [x0, #512] -mul v2.4S, v2.4S,v30.s[0] -ldr q0, [x0, #576] -sub v9.4s, v12.4s, v22.4s -mla v15.4S, v1.4S, v31.s[0] -add v12.4s, v12.4s, v22.4s -str q14, [x0, #688] -sqrdmulh v14.4S, v11.4S, v29.s[0] -str q3, [x0, #752] -ldr q3, [x0, #640] -mul v11.4S, v11.4S,v30.s[0] -ldr q22, [x0, #704] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -str q12, [x0, #816] -sqrdmulh v12.4S, v8.4S, v29.s[0] -str q9, [x0, #880] -mul v8.4S, v8.4S,v30.s[0] -ldr q9, [x0, #0] -sub v15.4s, v19.4s, v2.4s -mla v11.4S, v14.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -str q20, [x0, #944] -sqrdmulh v20.4S, v10.4S, v29.s[0] -str q1, [x0, #1008] -mul v10.4S, v10.4S,v30.s[0] -ldr q1, [x0, #64] -sub v2.4s, v13.4s, v11.4s -mla v8.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v29.s[0] -ldr q12, [x0, #128] -mul v17.4S, v17.4S,v30.s[0] -sub v14.4s, v16.4s, v8.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v0.4S, v29.s[0] -ldr q20, [x0, #192] -mul v0.4S, v0.4S,v30.s[0] -sub v18.4s, v21.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -sub v11.4s, v9.4s, v17.4s -mla v0.4S, v8.4S, v31.s[0] -add v9.4s, v9.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v8.4s, v1.4s, v0.4s -mla v3.4S, v10.4S, v31.s[0] -add v1.4s, v1.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v10.4s, v12.4s, v3.4s -mla v22.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v17.4s, v20.4s, v22.4s -mla v16.4S, v0.4S, v31.s[0] -add v20.4s, v20.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -sub v0.4s, v12.4s, v16.4s -mla v21.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v3.4s, v20.4s, v21.4s -mla v19.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v22.4s, v9.4s, v19.4s -mla v13.4S, v16.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v29.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v16.4s, v1.4s, v13.4s -mla v14.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v13.4s -sqrdmulh v13.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v21.4s, v10.4s, v14.4s -mla v18.4S, v19.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v19.4s, v17.4s, v18.4s -mla v15.4S, v13.4S, v31.s[0] -add v17.4s, v17.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v13.4s, v11.4s, v15.4s -mla v2.4S, v14.4S, v31.s[0] -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v27.s[0] -mul v20.4S, v20.4S,v28.s[0] -sub v14.4s, v8.4s, v2.4s -mla v12.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v2.4s -sqrdmulh v2.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v18.4s, v9.4s, v12.4s -mla v20.4S, v15.4S, v31.s[0] -add v9.4s, v9.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -sub v15.4s, v1.4s, v20.4s -mla v0.4S, v2.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -sub v2.4s, v22.4s, v0.4s -mla v3.4S, v12.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v12.4s, v16.4s, v3.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v11.4s, v10.4s -mla v17.4S, v0.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -sqrdmulh v10.4S, v19.4S, v27.s[3] -mul v19.4S, v19.4S,v28.s[3] -sub v0.4s, v8.4s, v17.4s -mla v21.4S, v3.4S, v31.s[0] -add v8.4s, v8.4s, v17.4s -sqrdmulh v17.4S, v1.4S, v25.s[0] -mul v1.4S, v1.4S,v26.s[0] -sub v3.4s, v13.4s, v21.4s -mla v19.4S, v10.4S, v31.s[0] -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v15.4S, v25.s[1] -mul v15.4S, v15.4S,v26.s[1] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v17.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v17.4s, v9.4s, v1.4s -mla v15.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -sub v21.4s, v18.4s, v15.4s -mla v16.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -str q9, [x0, #0] -sqrdmulh v9.4S, v8.4S, v23.s[0] -str q17, [x0, #64] -mul v8.4S, v8.4S,v24.s[0] -ldr q17, [x0, #784] -sub v15.4s, v22.4s, v16.4s -ldr q19, [x0, #848] -mla v12.4S, v1.4S, v31.s[0] -add v22.4s, v22.4s, v16.4s -str q18, [x0, #128] -sqrdmulh v18.4S, v0.4S, v23.s[1] -str q21, [x0, #192] -mul v0.4S, v0.4S,v24.s[1] -ldr q21, [x0, #912] -sub v16.4s, v2.4s, v12.4s -ldr q1, [x0, #976] -mla v8.4S, v9.4S, v31.s[0] -add v2.4s, v2.4s, v12.4s -str q22, [x0, #256] -sqrdmulh v22.4S, v14.4S, v23.s[2] -str q15, [x0, #320] -mul v14.4S, v14.4S,v24.s[2] -ldr q15, [x0, #272] -sub v12.4s, v11.4s, v8.4s -ldr q9, [x0, #336] -mla v0.4S, v18.4S, v31.s[0] -add v11.4s, v11.4s, v8.4s -str q2, [x0, #384] -sqrdmulh v2.4S, v10.4S, v23.s[3] -str q16, [x0, #448] -mul v10.4S, v10.4S,v24.s[3] -ldr q16, [x0, #400] -sub v8.4s, v20.4s, v0.4s -ldr q18, [x0, #464] -mla v14.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v0.4s -str q11, [x0, #512] -sqrdmulh v11.4S, v17.4S, v29.s[0] -str q12, [x0, #576] -ldr q12, [x0, #528] -mul v17.4S, v17.4S,v30.s[0] -ldr q0, [x0, #592] -sub v22.4s, v13.4s, v14.4s -mla v10.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v14.4s -str q20, [x0, #640] -sqrdmulh v20.4S, v19.4S, v29.s[0] -str q8, [x0, #704] -ldr q8, [x0, #656] -mul v19.4S, v19.4S,v30.s[0] -ldr q14, [x0, #720] -sub v2.4s, v3.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v3.4s, v3.4s, v10.4s -str q13, [x0, #768] -sqrdmulh v13.4S, v21.4S, v29.s[0] -str q22, [x0, #832] -mul v21.4S, v21.4S,v30.s[0] -ldr q22, [x0, #16] -sub v10.4s, v15.4s, v17.4s -mla v19.4S, v20.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -str q3, [x0, #896] -sqrdmulh v3.4S, v1.4S, v29.s[0] -str q2, [x0, #960] -mul v1.4S, v1.4S,v30.s[0] -ldr q2, [x0, #80] -sub v17.4s, v9.4s, v19.4s -mla v21.4S, v13.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v12.4S, v29.s[0] -ldr q13, [x0, #144] -mul v12.4S, v12.4S,v30.s[0] -sub v20.4s, v16.4s, v21.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v0.4S, v29.s[0] -ldr q3, [x0, #208] -mul v0.4S, v0.4S,v30.s[0] -sub v11.4s, v18.4s, v1.4s -mla v12.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v1.4s -sqrdmulh v1.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v19.4s, v22.4s, v12.4s -mla v0.4S, v21.4S, v31.s[0] -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v2.4s, v0.4s -mla v8.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v1.4s, v13.4s, v8.4s -mla v14.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v12.4s, v3.4s, v14.4s -mla v16.4S, v0.4S, v31.s[0] -add v3.4s, v3.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -sub v0.4s, v13.4s, v16.4s -mla v18.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -sqrdmulh v16.4S, v9.4S, v29.s[1] -mul v9.4S, v9.4S,v30.s[1] -sub v8.4s, v3.4s, v18.4s -mla v15.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v14.4s, v22.4s, v15.4s -mla v9.4S, v16.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v16.4s, v2.4s, v9.4s -mla v20.4S, v18.4S, v31.s[0] -add v2.4s, v2.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v20.4s -mla v11.4S, v15.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v15.4s, v12.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v12.4s, v12.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v27.s[0] -mul v13.4S, v13.4S,v28.s[0] -sub v9.4s, v19.4s, v10.4s -mla v17.4S, v20.4S, v31.s[0] -add v19.4s, v19.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v27.s[0] -mul v3.4S, v3.4S,v28.s[0] -sub v20.4s, v21.4s, v17.4s -mla v13.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v11.4s, v22.4s, v13.4s -mla v3.4S, v10.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -sub v10.4s, v2.4s, v3.4s -mla v0.4S, v17.4S, v31.s[0] -add v2.4s, v2.4s, v3.4s -sqrdmulh v3.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v17.4s, v14.4s, v0.4s -mla v8.4S, v13.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[2] -mul v12.4S, v12.4S,v28.s[2] -sub v13.4s, v16.4s, v8.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v3.4s, v19.4s, v1.4s -mla v12.4S, v0.4S, v31.s[0] -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v0.4s, v21.4s, v12.4s -mla v18.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v10.4S, v25.s[1] -mul v10.4S, v10.4S,v26.s[1] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v12.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v12.4s, v22.4s, v2.4s -mla v10.4S, v18.4S, v31.s[0] -add v22.4s, v22.4s, v2.4s -sqrdmulh v2.4S, v13.4S, v25.s[3] -mul v13.4S, v13.4S,v26.s[3] -sub v18.4s, v11.4s, v10.4s -mla v16.4S, v15.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -str q22, [x0, #16] -sqrdmulh v22.4S, v21.4S, v23.s[0] -str q12, [x0, #80] -mul v21.4S, v21.4S,v24.s[0] -sub v12.4s, v14.4s, v16.4s -mla v13.4S, v2.4S, v31.s[0] -add v14.4s, v14.4s, v16.4s -str q11, [x0, #144] -sqrdmulh v11.4S, v0.4S, v23.s[1] -str q18, [x0, #208] -mul v0.4S, v0.4S,v24.s[1] -sub v18.4s, v17.4s, v13.4s -mla v21.4S, v22.4S, v31.s[0] -add v17.4s, v17.4s, v13.4s -str q14, [x0, #272] -sqrdmulh v14.4S, v20.4S, v23.s[2] -str q12, [x0, #336] -mul v20.4S, v20.4S,v24.s[2] -sub v12.4s, v19.4s, v21.4s -mla v0.4S, v11.4S, v31.s[0] -add v19.4s, v19.4s, v21.4s -str q17, [x0, #400] -sqrdmulh v17.4S, v1.4S, v23.s[3] -str q18, [x0, #464] -mul v1.4S, v1.4S,v24.s[3] -sub v18.4s, v3.4s, v0.4s -mla v20.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v0.4s -str q19, [x0, #528] -str q12, [x0, #592] -sub v12.4s, v9.4s, v20.4s -mla v1.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v20.4s -str q3, [x0, #656] -str q18, [x0, #720] -sub v18.4s, v8.4s, v1.4s -add v8.4s, v8.4s, v1.4s -str q9, [x0, #784] -str q12, [x0, #848] -str q8, [x0, #912] -str q18, [x0, #976] -ldr q4, [x17, #+128] -ldr q5, [x17, #+144] -ldr q6, [x17, #+160] -ldr q7, [x17, #+176] -ldr q15, [x17, #+192] -ldr q10, [x17, #+208] -ldr q2, [x17, #+224] -ldr q16, [x17, #+240] -ldr q22, [x0, #32] -ldr q13, [x0, #48] -ldr q11, [x0, #0] -ldr q21, [x0, #16] -sqrdmulh v14.4S, v22.4S, v5.s[0] -mul v22.4S, v22.4S,v4.s[0] -mla v22.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v13.4S, v5.s[0] -mul v13.4S, v13.4S,v4.s[0] -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v21.4S, v5.s[1] -mul v21.4S, v21.4S,v4.s[1] -mla v21.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v21.4s -add v11.4s, v11.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v5.s[2] -mul v22.4S, v22.4S,v4.s[2] -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v22.4s -add v14.4s, v14.4s, v22.4s -trn1 v22.4S, v11.4S, v13.4S -trn2 v0.4S, v11.4S, v13.4S -trn1 v19.4S, v14.4S, v21.4S -trn2 v17.4S, v14.4S, v21.4S -trn2 v14.2D, v22.2D, v19.2D -trn2 v21.2D, v0.2D, v17.2D -trn1 v11.2D, v22.2D, v19.2D -trn1 v13.2D, v0.2D, v17.2D -sqrdmulh v17.4S, v14.4S, v7.4S -mul v14.4S, v14.4S,v6.4S -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v7.4S -mul v21.4S, v21.4S,v6.4S -mla v21.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v21.4s -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v13.4S, v10.4S -mul v13.4S, v13.4S,v15.4S -mla v13.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v14.4S, v16.4S -mul v14.4S, v14.4S,v2.4S -mla v14.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -str q11, [x0, #0] -str q21, [x0, #16] -str q17, [x0, #32] -str q13, [x0, #48] -ldr q16, [x17, #+1152] -ldr q2, [x17, #+1168] -ldr q10, [x17, #+1184] -ldr q15, [x17, #+1200] -ldr q7, [x17, #+1216] -ldr q6, [x17, #+1232] -ldr q5, [x17, #+1248] -ldr q4, [x17, #+1264] -ldr q13, [x0, #544] -ldr q17, [x0, #560] -ldr q21, [x0, #512] -ldr q11, [x0, #528] -sqrdmulh v14.4S, v13.4S, v2.s[0] -mul v13.4S, v13.4S,v16.s[0] -mla v13.4S, v14.4S, v31.s[0] -sub v14.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v17.4S, v2.s[0] -mul v17.4S, v17.4S,v16.s[0] -mla v17.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v17.4s -add v11.4s, v11.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v2.s[1] -mul v11.4S, v11.4S,v16.s[1] -mla v11.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v2.s[2] -mul v13.4S, v13.4S,v16.s[2] -mla v13.4S, v11.4S, v31.s[0] -sub v11.4s, v14.4s, v13.4s -add v14.4s, v14.4s, v13.4s -trn1 v13.4S, v21.4S, v17.4S -trn2 v0.4S, v21.4S, v17.4S -trn1 v19.4S, v14.4S, v11.4S -trn2 v22.4S, v14.4S, v11.4S -trn2 v14.2D, v13.2D, v19.2D -trn2 v11.2D, v0.2D, v22.2D -trn1 v21.2D, v13.2D, v19.2D -trn1 v17.2D, v0.2D, v22.2D -sqrdmulh v22.4S, v14.4S, v15.4S -mul v14.4S, v14.4S,v10.4S -mla v14.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v15.4S -mul v11.4S, v11.4S,v10.4S -mla v11.4S, v14.4S, v31.s[0] -sub v14.4s, v17.4s, v11.4s -add v17.4s, v17.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v6.4S -mul v17.4S, v17.4S,v7.4S -mla v17.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v17.4s -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v14.4S, v4.4S -mul v14.4S, v14.4S,v5.4S -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v14.4s -add v22.4s, v22.4s, v14.4s -str q21, [x0, #512] -str q11, [x0, #528] -str q22, [x0, #544] -str q17, [x0, #560] -ldr q4, [x17, #+256] -ldr q5, [x17, #+272] -ldr q6, [x17, #+288] -ldr q7, [x17, #+304] -ldr q15, [x17, #+320] -ldr q10, [x17, #+336] -ldr q2, [x17, #+352] -ldr q16, [x17, #+368] -ldr q17, [x0, #96] -ldr q22, [x0, #112] -ldr q11, [x0, #64] -ldr q21, [x0, #80] -sqrdmulh v14.4S, v17.4S, v5.s[0] -mul v17.4S, v17.4S,v4.s[0] -mla v17.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v17.4s -add v11.4s, v11.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v5.s[0] -mul v22.4S, v22.4S,v4.s[0] -mla v22.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v5.s[1] -mul v21.4S, v21.4S,v4.s[1] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v21.4s -add v11.4s, v11.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v5.s[2] -mul v17.4S, v17.4S,v4.s[2] -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v17.4s -add v14.4s, v14.4s, v17.4s -trn1 v17.4S, v11.4S, v22.4S -trn2 v0.4S, v11.4S, v22.4S -trn1 v19.4S, v14.4S, v21.4S -trn2 v13.4S, v14.4S, v21.4S -trn2 v14.2D, v17.2D, v19.2D -trn2 v21.2D, v0.2D, v13.2D -trn1 v11.2D, v17.2D, v19.2D -trn1 v22.2D, v0.2D, v13.2D -sqrdmulh v13.4S, v14.4S, v7.4S -mul v14.4S, v14.4S,v6.4S -mla v14.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v7.4S -mul v21.4S, v21.4S,v6.4S -mla v21.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v21.4s -add v22.4s, v22.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v10.4S -mul v22.4S, v22.4S,v15.4S -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v14.4S, v16.4S -mul v14.4S, v14.4S,v2.4S -mla v14.4S, v22.4S, v31.s[0] -sub v22.4s, v13.4s, v14.4s -add v13.4s, v13.4s, v14.4s -str q11, [x0, #64] -str q21, [x0, #80] -str q13, [x0, #96] -str q22, [x0, #112] -ldr q16, [x17, #+1280] -ldr q2, [x17, #+1296] -ldr q10, [x17, #+1312] -ldr q15, [x17, #+1328] -ldr q7, [x17, #+1344] -ldr q6, [x17, #+1360] -ldr q5, [x17, #+1376] -ldr q4, [x17, #+1392] -ldr q22, [x0, #608] -ldr q13, [x0, #624] -ldr q21, [x0, #576] -ldr q11, [x0, #592] -sqrdmulh v14.4S, v22.4S, v2.s[0] -mul v22.4S, v22.4S,v16.s[0] -mla v22.4S, v14.4S, v31.s[0] -sub v14.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v13.4S, v2.s[0] -mul v13.4S, v13.4S,v16.s[0] -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v11.4S, v2.s[1] -mul v11.4S, v11.4S,v16.s[1] -mla v11.4S, v13.4S, v31.s[0] -sub v13.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v2.s[2] -mul v22.4S, v22.4S,v16.s[2] -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v14.4s, v22.4s -add v14.4s, v14.4s, v22.4s -trn1 v22.4S, v21.4S, v13.4S -trn2 v0.4S, v21.4S, v13.4S -trn1 v19.4S, v14.4S, v11.4S -trn2 v17.4S, v14.4S, v11.4S -trn2 v14.2D, v22.2D, v19.2D -trn2 v11.2D, v0.2D, v17.2D -trn1 v21.2D, v22.2D, v19.2D -trn1 v13.2D, v0.2D, v17.2D -sqrdmulh v17.4S, v14.4S, v15.4S -mul v14.4S, v14.4S,v10.4S -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v15.4S -mul v11.4S, v11.4S,v10.4S -mla v11.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v11.4s -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v6.4S -mul v13.4S, v13.4S,v7.4S -mla v13.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v14.4S, v4.4S -mul v14.4S, v14.4S,v5.4S -mla v14.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -str q21, [x0, #576] -str q11, [x0, #592] -str q17, [x0, #608] -str q13, [x0, #624] -ldr q4, [x17, #+384] -ldr q5, [x17, #+400] -ldr q6, [x17, #+416] -ldr q7, [x17, #+432] -ldr q15, [x17, #+448] -ldr q10, [x17, #+464] -ldr q2, [x17, #+480] -ldr q16, [x17, #+496] -ldr q13, [x0, #160] -ldr q17, [x0, #176] -ldr q11, [x0, #128] -ldr q21, [x0, #144] -sqrdmulh v14.4S, v13.4S, v5.s[0] -mul v13.4S, v13.4S,v4.s[0] -mla v13.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v17.4S, v5.s[0] -mul v17.4S, v17.4S,v4.s[0] -mla v17.4S, v13.4S, v31.s[0] -sub v13.4s, v21.4s, v17.4s -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v5.s[1] -mul v21.4S, v21.4S,v4.s[1] -mla v21.4S, v17.4S, v31.s[0] -sub v17.4s, v11.4s, v21.4s -add v11.4s, v11.4s, v21.4s -sqrdmulh v21.4S, v13.4S, v5.s[2] -mul v13.4S, v13.4S,v4.s[2] -mla v13.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v13.4s -add v14.4s, v14.4s, v13.4s -trn1 v13.4S, v11.4S, v17.4S -trn2 v0.4S, v11.4S, v17.4S -trn1 v19.4S, v14.4S, v21.4S -trn2 v22.4S, v14.4S, v21.4S -trn2 v14.2D, v13.2D, v19.2D -trn2 v21.2D, v0.2D, v22.2D -trn1 v11.2D, v13.2D, v19.2D -trn1 v17.2D, v0.2D, v22.2D -sqrdmulh v22.4S, v14.4S, v7.4S -mul v14.4S, v14.4S,v6.4S -mla v14.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v7.4S -mul v21.4S, v21.4S,v6.4S -mla v21.4S, v14.4S, v31.s[0] -sub v14.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v10.4S -mul v17.4S, v17.4S,v15.4S -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v17.4s -add v11.4s, v11.4s, v17.4s -sqrdmulh v17.4S, v14.4S, v16.4S -mul v14.4S, v14.4S,v2.4S -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v14.4s -add v22.4s, v22.4s, v14.4s -str q11, [x0, #128] -str q21, [x0, #144] -str q22, [x0, #160] -str q17, [x0, #176] -ldr q16, [x17, #+1408] -ldr q2, [x17, #+1424] -ldr q10, [x17, #+1440] -ldr q15, [x17, #+1456] -ldr q7, [x17, #+1472] -ldr q6, [x17, #+1488] -ldr q5, [x17, #+1504] -ldr q4, [x17, #+1520] -ldr q17, [x0, #672] -ldr q22, [x0, #688] -ldr q21, [x0, #640] -ldr q11, [x0, #656] -sqrdmulh v14.4S, v17.4S, v2.s[0] -mul v17.4S, v17.4S,v16.s[0] -mla v17.4S, v14.4S, v31.s[0] -sub v14.4s, v21.4s, v17.4s -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v2.s[0] -mul v22.4S, v22.4S,v16.s[0] -mla v22.4S, v17.4S, v31.s[0] -sub v17.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v11.4S, v2.s[1] -mul v11.4S, v11.4S,v16.s[1] -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v2.s[2] -mul v17.4S, v17.4S,v16.s[2] -mla v17.4S, v11.4S, v31.s[0] -sub v11.4s, v14.4s, v17.4s -add v14.4s, v14.4s, v17.4s -trn1 v17.4S, v21.4S, v22.4S -trn2 v0.4S, v21.4S, v22.4S -trn1 v19.4S, v14.4S, v11.4S -trn2 v13.4S, v14.4S, v11.4S -trn2 v14.2D, v17.2D, v19.2D -trn2 v11.2D, v0.2D, v13.2D -trn1 v21.2D, v17.2D, v19.2D -trn1 v22.2D, v0.2D, v13.2D -sqrdmulh v13.4S, v14.4S, v15.4S -mul v14.4S, v14.4S,v10.4S -mla v14.4S, v13.4S, v31.s[0] -sub v13.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v15.4S -mul v11.4S, v11.4S,v10.4S -mla v11.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v11.4s -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v6.4S -mul v22.4S, v22.4S,v7.4S -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v14.4S, v4.4S -mul v14.4S, v14.4S,v5.4S -mla v14.4S, v22.4S, v31.s[0] -sub v22.4s, v13.4s, v14.4s -add v13.4s, v13.4s, v14.4s -str q21, [x0, #640] -str q11, [x0, #656] -str q13, [x0, #672] -str q22, [x0, #688] -ldr q4, [x17, #+512] -ldr q5, [x17, #+528] -ldr q6, [x17, #+544] -ldr q7, [x17, #+560] -ldr q15, [x17, #+576] -ldr q10, [x17, #+592] -ldr q2, [x17, #+608] -ldr q16, [x17, #+624] -ldr q22, [x0, #224] -ldr q13, [x0, #240] -ldr q11, [x0, #192] -ldr q21, [x0, #208] -sqrdmulh v14.4S, v22.4S, v5.s[0] -mul v22.4S, v22.4S,v4.s[0] -mla v22.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v13.4S, v5.s[0] -mul v13.4S, v13.4S,v4.s[0] -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v21.4S, v5.s[1] -mul v21.4S, v21.4S,v4.s[1] -mla v21.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v21.4s -add v11.4s, v11.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v5.s[2] -mul v22.4S, v22.4S,v4.s[2] -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v22.4s -add v14.4s, v14.4s, v22.4s -trn1 v22.4S, v11.4S, v13.4S -trn2 v0.4S, v11.4S, v13.4S -trn1 v19.4S, v14.4S, v21.4S -trn2 v17.4S, v14.4S, v21.4S -trn2 v14.2D, v22.2D, v19.2D -trn2 v21.2D, v0.2D, v17.2D -trn1 v11.2D, v22.2D, v19.2D -trn1 v13.2D, v0.2D, v17.2D -sqrdmulh v17.4S, v14.4S, v7.4S -mul v14.4S, v14.4S,v6.4S -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v7.4S -mul v21.4S, v21.4S,v6.4S -mla v21.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v21.4s -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v13.4S, v10.4S -mul v13.4S, v13.4S,v15.4S -mla v13.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v14.4S, v16.4S -mul v14.4S, v14.4S,v2.4S -mla v14.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -str q11, [x0, #192] -str q21, [x0, #208] -str q17, [x0, #224] -str q13, [x0, #240] -ldr q16, [x17, #+1536] -ldr q2, [x17, #+1552] -ldr q10, [x17, #+1568] -ldr q15, [x17, #+1584] -ldr q7, [x17, #+1600] -ldr q6, [x17, #+1616] -ldr q5, [x17, #+1632] -ldr q4, [x17, #+1648] -ldr q13, [x0, #736] -ldr q17, [x0, #752] -ldr q21, [x0, #704] -ldr q11, [x0, #720] -sqrdmulh v14.4S, v13.4S, v2.s[0] -mul v13.4S, v13.4S,v16.s[0] -mla v13.4S, v14.4S, v31.s[0] -sub v14.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v17.4S, v2.s[0] -mul v17.4S, v17.4S,v16.s[0] -mla v17.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v17.4s -add v11.4s, v11.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v2.s[1] -mul v11.4S, v11.4S,v16.s[1] -mla v11.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v2.s[2] -mul v13.4S, v13.4S,v16.s[2] -mla v13.4S, v11.4S, v31.s[0] -sub v11.4s, v14.4s, v13.4s -add v14.4s, v14.4s, v13.4s -trn1 v13.4S, v21.4S, v17.4S -trn2 v0.4S, v21.4S, v17.4S -trn1 v19.4S, v14.4S, v11.4S -trn2 v22.4S, v14.4S, v11.4S -trn2 v14.2D, v13.2D, v19.2D -trn2 v11.2D, v0.2D, v22.2D -trn1 v21.2D, v13.2D, v19.2D -trn1 v17.2D, v0.2D, v22.2D -sqrdmulh v22.4S, v14.4S, v15.4S -mul v14.4S, v14.4S,v10.4S -mla v14.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v15.4S -mul v11.4S, v11.4S,v10.4S -mla v11.4S, v14.4S, v31.s[0] -sub v14.4s, v17.4s, v11.4s -add v17.4s, v17.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v6.4S -mul v17.4S, v17.4S,v7.4S -mla v17.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v17.4s -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v14.4S, v4.4S -mul v14.4S, v14.4S,v5.4S -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v14.4s -add v22.4s, v22.4s, v14.4s -str q21, [x0, #704] -str q11, [x0, #720] -str q22, [x0, #736] -str q17, [x0, #752] -ldr q4, [x17, #+640] -ldr q5, [x17, #+656] -ldr q6, [x17, #+672] -ldr q7, [x17, #+688] -ldr q15, [x17, #+704] -ldr q10, [x17, #+720] -ldr q2, [x17, #+736] -ldr q16, [x17, #+752] -ldr q17, [x0, #288] -ldr q22, [x0, #304] -ldr q11, [x0, #256] -ldr q21, [x0, #272] -sqrdmulh v14.4S, v17.4S, v5.s[0] -mul v17.4S, v17.4S,v4.s[0] -mla v17.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v17.4s -add v11.4s, v11.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v5.s[0] -mul v22.4S, v22.4S,v4.s[0] -mla v22.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v5.s[1] -mul v21.4S, v21.4S,v4.s[1] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v21.4s -add v11.4s, v11.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v5.s[2] -mul v17.4S, v17.4S,v4.s[2] -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v17.4s -add v14.4s, v14.4s, v17.4s -trn1 v17.4S, v11.4S, v22.4S -trn2 v0.4S, v11.4S, v22.4S -trn1 v19.4S, v14.4S, v21.4S -trn2 v13.4S, v14.4S, v21.4S -trn2 v14.2D, v17.2D, v19.2D -trn2 v21.2D, v0.2D, v13.2D -trn1 v11.2D, v17.2D, v19.2D -trn1 v22.2D, v0.2D, v13.2D -sqrdmulh v13.4S, v14.4S, v7.4S -mul v14.4S, v14.4S,v6.4S -mla v14.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v7.4S -mul v21.4S, v21.4S,v6.4S -mla v21.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v21.4s -add v22.4s, v22.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v10.4S -mul v22.4S, v22.4S,v15.4S -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v14.4S, v16.4S -mul v14.4S, v14.4S,v2.4S -mla v14.4S, v22.4S, v31.s[0] -sub v22.4s, v13.4s, v14.4s -add v13.4s, v13.4s, v14.4s -str q11, [x0, #256] -str q21, [x0, #272] -str q13, [x0, #288] -str q22, [x0, #304] -ldr q16, [x17, #+1664] -ldr q2, [x17, #+1680] -ldr q10, [x17, #+1696] -ldr q15, [x17, #+1712] -ldr q7, [x17, #+1728] -ldr q6, [x17, #+1744] -ldr q5, [x17, #+1760] -ldr q4, [x17, #+1776] -ldr q22, [x0, #800] -ldr q13, [x0, #816] -ldr q21, [x0, #768] -ldr q11, [x0, #784] -sqrdmulh v14.4S, v22.4S, v2.s[0] -mul v22.4S, v22.4S,v16.s[0] -mla v22.4S, v14.4S, v31.s[0] -sub v14.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v13.4S, v2.s[0] -mul v13.4S, v13.4S,v16.s[0] -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v11.4S, v2.s[1] -mul v11.4S, v11.4S,v16.s[1] -mla v11.4S, v13.4S, v31.s[0] -sub v13.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v2.s[2] -mul v22.4S, v22.4S,v16.s[2] -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v14.4s, v22.4s -add v14.4s, v14.4s, v22.4s -trn1 v22.4S, v21.4S, v13.4S -trn2 v0.4S, v21.4S, v13.4S -trn1 v19.4S, v14.4S, v11.4S -trn2 v17.4S, v14.4S, v11.4S -trn2 v14.2D, v22.2D, v19.2D -trn2 v11.2D, v0.2D, v17.2D -trn1 v21.2D, v22.2D, v19.2D -trn1 v13.2D, v0.2D, v17.2D -sqrdmulh v17.4S, v14.4S, v15.4S -mul v14.4S, v14.4S,v10.4S -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v15.4S -mul v11.4S, v11.4S,v10.4S -mla v11.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v11.4s -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v6.4S -mul v13.4S, v13.4S,v7.4S -mla v13.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v14.4S, v4.4S -mul v14.4S, v14.4S,v5.4S -mla v14.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -str q21, [x0, #768] -str q11, [x0, #784] -str q17, [x0, #800] -str q13, [x0, #816] -ldr q4, [x17, #+768] -ldr q5, [x17, #+784] -ldr q6, [x17, #+800] -ldr q7, [x17, #+816] -ldr q15, [x17, #+832] -ldr q10, [x17, #+848] -ldr q2, [x17, #+864] -ldr q16, [x17, #+880] -ldr q13, [x0, #352] -ldr q17, [x0, #368] -ldr q11, [x0, #320] -ldr q21, [x0, #336] -sqrdmulh v14.4S, v13.4S, v5.s[0] -mul v13.4S, v13.4S,v4.s[0] -mla v13.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v17.4S, v5.s[0] -mul v17.4S, v17.4S,v4.s[0] -mla v17.4S, v13.4S, v31.s[0] -sub v13.4s, v21.4s, v17.4s -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v5.s[1] -mul v21.4S, v21.4S,v4.s[1] -mla v21.4S, v17.4S, v31.s[0] -sub v17.4s, v11.4s, v21.4s -add v11.4s, v11.4s, v21.4s -sqrdmulh v21.4S, v13.4S, v5.s[2] -mul v13.4S, v13.4S,v4.s[2] -mla v13.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v13.4s -add v14.4s, v14.4s, v13.4s -trn1 v13.4S, v11.4S, v17.4S -trn2 v0.4S, v11.4S, v17.4S -trn1 v19.4S, v14.4S, v21.4S -trn2 v22.4S, v14.4S, v21.4S -trn2 v14.2D, v13.2D, v19.2D -trn2 v21.2D, v0.2D, v22.2D -trn1 v11.2D, v13.2D, v19.2D -trn1 v17.2D, v0.2D, v22.2D -sqrdmulh v22.4S, v14.4S, v7.4S -mul v14.4S, v14.4S,v6.4S -mla v14.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v7.4S -mul v21.4S, v21.4S,v6.4S -mla v21.4S, v14.4S, v31.s[0] -sub v14.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v10.4S -mul v17.4S, v17.4S,v15.4S -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v17.4s -add v11.4s, v11.4s, v17.4s -sqrdmulh v17.4S, v14.4S, v16.4S -mul v14.4S, v14.4S,v2.4S -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v14.4s -add v22.4s, v22.4s, v14.4s -str q11, [x0, #320] -str q21, [x0, #336] -str q22, [x0, #352] -str q17, [x0, #368] -ldr q16, [x17, #+1792] -ldr q2, [x17, #+1808] -ldr q10, [x17, #+1824] -ldr q15, [x17, #+1840] -ldr q7, [x17, #+1856] -ldr q6, [x17, #+1872] -ldr q5, [x17, #+1888] -ldr q4, [x17, #+1904] -ldr q17, [x0, #864] -ldr q22, [x0, #880] -ldr q21, [x0, #832] -ldr q11, [x0, #848] -sqrdmulh v14.4S, v17.4S, v2.s[0] -mul v17.4S, v17.4S,v16.s[0] -mla v17.4S, v14.4S, v31.s[0] -sub v14.4s, v21.4s, v17.4s -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v2.s[0] -mul v22.4S, v22.4S,v16.s[0] -mla v22.4S, v17.4S, v31.s[0] -sub v17.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v11.4S, v2.s[1] -mul v11.4S, v11.4S,v16.s[1] -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v2.s[2] -mul v17.4S, v17.4S,v16.s[2] -mla v17.4S, v11.4S, v31.s[0] -sub v11.4s, v14.4s, v17.4s -add v14.4s, v14.4s, v17.4s -trn1 v17.4S, v21.4S, v22.4S -trn2 v0.4S, v21.4S, v22.4S -trn1 v19.4S, v14.4S, v11.4S -trn2 v13.4S, v14.4S, v11.4S -trn2 v14.2D, v17.2D, v19.2D -trn2 v11.2D, v0.2D, v13.2D -trn1 v21.2D, v17.2D, v19.2D -trn1 v22.2D, v0.2D, v13.2D -sqrdmulh v13.4S, v14.4S, v15.4S -mul v14.4S, v14.4S,v10.4S -mla v14.4S, v13.4S, v31.s[0] -sub v13.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v15.4S -mul v11.4S, v11.4S,v10.4S -mla v11.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v11.4s -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v6.4S -mul v22.4S, v22.4S,v7.4S -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v14.4S, v4.4S -mul v14.4S, v14.4S,v5.4S -mla v14.4S, v22.4S, v31.s[0] -sub v22.4s, v13.4s, v14.4s -add v13.4s, v13.4s, v14.4s -str q21, [x0, #832] -str q11, [x0, #848] -str q13, [x0, #864] -str q22, [x0, #880] -ldr q4, [x17, #+896] -ldr q5, [x17, #+912] -ldr q6, [x17, #+928] -ldr q7, [x17, #+944] -ldr q15, [x17, #+960] -ldr q10, [x17, #+976] -ldr q2, [x17, #+992] -ldr q16, [x17, #+1008] -ldr q22, [x0, #416] -ldr q13, [x0, #432] -ldr q11, [x0, #384] -ldr q21, [x0, #400] -sqrdmulh v14.4S, v22.4S, v5.s[0] -mul v22.4S, v22.4S,v4.s[0] -mla v22.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v13.4S, v5.s[0] -mul v13.4S, v13.4S,v4.s[0] -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v21.4S, v5.s[1] -mul v21.4S, v21.4S,v4.s[1] -mla v21.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v21.4s -add v11.4s, v11.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v5.s[2] -mul v22.4S, v22.4S,v4.s[2] -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v22.4s -add v14.4s, v14.4s, v22.4s -trn1 v22.4S, v11.4S, v13.4S -trn2 v0.4S, v11.4S, v13.4S -trn1 v19.4S, v14.4S, v21.4S -trn2 v17.4S, v14.4S, v21.4S -trn2 v14.2D, v22.2D, v19.2D -trn2 v21.2D, v0.2D, v17.2D -trn1 v11.2D, v22.2D, v19.2D -trn1 v13.2D, v0.2D, v17.2D -sqrdmulh v17.4S, v14.4S, v7.4S -mul v14.4S, v14.4S,v6.4S -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v7.4S -mul v21.4S, v21.4S,v6.4S -mla v21.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v21.4s -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v13.4S, v10.4S -mul v13.4S, v13.4S,v15.4S -mla v13.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v14.4S, v16.4S -mul v14.4S, v14.4S,v2.4S -mla v14.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -str q11, [x0, #384] -str q21, [x0, #400] -str q17, [x0, #416] -str q13, [x0, #432] -ldr q16, [x17, #+1920] -ldr q2, [x17, #+1936] -ldr q10, [x17, #+1952] -ldr q15, [x17, #+1968] -ldr q7, [x17, #+1984] -ldr q6, [x17, #+2000] -ldr q5, [x17, #+2016] -ldr q4, [x17, #+2032] -ldr q13, [x0, #928] -ldr q17, [x0, #944] -ldr q21, [x0, #896] -ldr q11, [x0, #912] -sqrdmulh v14.4S, v13.4S, v2.s[0] -mul v13.4S, v13.4S,v16.s[0] -mla v13.4S, v14.4S, v31.s[0] -sub v14.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v17.4S, v2.s[0] -mul v17.4S, v17.4S,v16.s[0] -mla v17.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v17.4s -add v11.4s, v11.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v2.s[1] -mul v11.4S, v11.4S,v16.s[1] -mla v11.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v2.s[2] -mul v13.4S, v13.4S,v16.s[2] -mla v13.4S, v11.4S, v31.s[0] -sub v11.4s, v14.4s, v13.4s -add v14.4s, v14.4s, v13.4s -trn1 v13.4S, v21.4S, v17.4S -trn2 v0.4S, v21.4S, v17.4S -trn1 v19.4S, v14.4S, v11.4S -trn2 v22.4S, v14.4S, v11.4S -trn2 v14.2D, v13.2D, v19.2D -trn2 v11.2D, v0.2D, v22.2D -trn1 v21.2D, v13.2D, v19.2D -trn1 v17.2D, v0.2D, v22.2D -sqrdmulh v22.4S, v14.4S, v15.4S -mul v14.4S, v14.4S,v10.4S -mla v14.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v15.4S -mul v11.4S, v11.4S,v10.4S -mla v11.4S, v14.4S, v31.s[0] -sub v14.4s, v17.4s, v11.4s -add v17.4s, v17.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v6.4S -mul v17.4S, v17.4S,v7.4S -mla v17.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v17.4s -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v14.4S, v4.4S -mul v14.4S, v14.4S,v5.4S -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v14.4s -add v22.4s, v22.4s, v14.4s -str q21, [x0, #896] -str q11, [x0, #912] -str q22, [x0, #928] -str q17, [x0, #944] -ldr q4, [x17, #+1024] -ldr q5, [x17, #+1040] -ldr q6, [x17, #+1056] -ldr q7, [x17, #+1072] -ldr q15, [x17, #+1088] -ldr q10, [x17, #+1104] -ldr q2, [x17, #+1120] -ldr q16, [x17, #+1136] -ldr q17, [x0, #480] -ldr q22, [x0, #496] -ldr q11, [x0, #448] -ldr q21, [x0, #464] -sqrdmulh v14.4S, v17.4S, v5.s[0] -mul v17.4S, v17.4S,v4.s[0] -mla v17.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v17.4s -add v11.4s, v11.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v5.s[0] -mul v22.4S, v22.4S,v4.s[0] -mla v22.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v5.s[1] -mul v21.4S, v21.4S,v4.s[1] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v21.4s -add v11.4s, v11.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v5.s[2] -mul v17.4S, v17.4S,v4.s[2] -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v17.4s -add v14.4s, v14.4s, v17.4s -trn1 v17.4S, v11.4S, v22.4S -trn2 v0.4S, v11.4S, v22.4S -trn1 v19.4S, v14.4S, v21.4S -trn2 v13.4S, v14.4S, v21.4S -trn2 v14.2D, v17.2D, v19.2D -trn2 v21.2D, v0.2D, v13.2D -trn1 v11.2D, v17.2D, v19.2D -trn1 v22.2D, v0.2D, v13.2D -sqrdmulh v13.4S, v14.4S, v7.4S -mul v14.4S, v14.4S,v6.4S -mla v14.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v7.4S -mul v21.4S, v21.4S,v6.4S -mla v21.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v21.4s -add v22.4s, v22.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v10.4S -mul v22.4S, v22.4S,v15.4S -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v14.4S, v16.4S -mul v14.4S, v14.4S,v2.4S -mla v14.4S, v22.4S, v31.s[0] -sub v22.4s, v13.4s, v14.4s -add v13.4s, v13.4s, v14.4s -str q11, [x0, #448] -str q21, [x0, #464] -str q13, [x0, #480] -str q22, [x0, #496] -ldr q16, [x17, #+2048] -ldr q2, [x17, #+2064] -ldr q10, [x17, #+2080] -ldr q15, [x17, #+2096] -ldr q7, [x17, #+2112] -ldr q6, [x17, #+2128] -ldr q5, [x17, #+2144] -ldr q4, [x17, #+2160] -ldr q22, [x0, #992] -ldr q13, [x0, #1008] -ldr q21, [x0, #960] -ldr q11, [x0, #976] -sqrdmulh v14.4S, v22.4S, v2.s[0] -mul v22.4S, v22.4S,v16.s[0] -mla v22.4S, v14.4S, v31.s[0] -sub v14.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v13.4S, v2.s[0] -mul v13.4S, v13.4S,v16.s[0] -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v11.4S, v2.s[1] -mul v11.4S, v11.4S,v16.s[1] -mla v11.4S, v13.4S, v31.s[0] -sub v13.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v2.s[2] -mul v22.4S, v22.4S,v16.s[2] -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v14.4s, v22.4s -add v14.4s, v14.4s, v22.4s -trn1 v22.4S, v21.4S, v13.4S -trn2 v0.4S, v21.4S, v13.4S -trn1 v19.4S, v14.4S, v11.4S -trn2 v17.4S, v14.4S, v11.4S -trn2 v14.2D, v22.2D, v19.2D -trn2 v11.2D, v0.2D, v17.2D -trn1 v21.2D, v22.2D, v19.2D -trn1 v13.2D, v0.2D, v17.2D -sqrdmulh v17.4S, v14.4S, v15.4S -mul v14.4S, v14.4S,v10.4S -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v15.4S -mul v11.4S, v11.4S,v10.4S -mla v11.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v11.4s -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v6.4S -mul v13.4S, v13.4S,v7.4S -mla v13.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v14.4S, v4.4S -mul v14.4S, v14.4S,v5.4S -mla v14.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -str q21, [x0, #960] -str q11, [x0, #976] -str q17, [x0, #992] -str q13, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2392 -// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_1.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_1.s deleted file mode 100644 index e51be66..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_1.s +++ /dev/null @@ -1,2422 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_3_z2_1 -.global _ntt_u32_full_neon_asm_var_4_4_3_z2_1 -ntt_u32_full_neon_asm_var_4_4_3_z2_1: -_ntt_u32_full_neon_asm_var_4_4_3_z2_1: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #800] -ldr q21, [x0, #864] -ldr q20, [x0, #928] -ldr q19, [x0, #992] -ldr q18, [x0, #288] -ldr q17, [x0, #352] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -sqrdmulh v2.4S, v22.4S, v29.s[0] -ldr q1, [x0, #544] -mul v22.4S, v22.4S,v30.s[0] -ldr q0, [x0, #608] -sqrdmulh v15.4S, v21.4S, v29.s[0] -ldr q14, [x0, #672] -mul v21.4S, v21.4S,v30.s[0] -ldr q13, [x0, #736] -mla v22.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q12, [x0, #32] -sub v11.4s, v18.4s, v22.4s -mla v21.4S, v15.4S, v31.s[0] -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q15, [x0, #96] -sub v10.4s, v17.4s, v21.4s -mla v20.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v1.4S, v29.s[0] -ldr q2, [x0, #160] -mul v1.4S, v1.4S,v30.s[0] -sub v9.4s, v16.4s, v20.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v0.4S, v29.s[0] -ldr q22, [x0, #224] -mul v0.4S, v0.4S,v30.s[0] -sub v8.4s, v3.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v12.4s, v1.4s -mla v0.4S, v20.4S, v31.s[0] -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -sub v20.4s, v15.4s, v0.4s -mla v14.4S, v19.4S, v31.s[0] -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v19.4s, v2.4s, v14.4s -mla v13.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v1.4s, v22.4s, v13.4s -mla v16.4S, v0.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v0.4s, v2.4s, v16.4s -mla v3.4S, v14.4S, v31.s[0] -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v14.4s, v22.4s, v3.4s -mla v18.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v13.4s, v12.4s, v18.4s -mla v17.4S, v16.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v8.4S, v29.s[2] -mul v8.4S, v8.4S,v30.s[2] -sub v16.4s, v15.4s, v17.4s -mla v9.4S, v3.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v3.4s, v19.4s, v9.4s -mla v8.4S, v18.4S, v31.s[0] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v8.4s -mla v11.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v8.4s -sqrdmulh v8.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v17.4s, v21.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v27.s[0] -mul v22.4S, v22.4S,v28.s[0] -sub v9.4s, v20.4s, v10.4s -mla v2.4S, v8.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v8.4s, v12.4s, v2.4s -mla v22.4S, v11.4S, v31.s[0] -add v12.4s, v12.4s, v2.4s -sqrdmulh v2.4S, v14.4S, v27.s[1] -mul v14.4S, v14.4S,v28.s[1] -sub v11.4s, v15.4s, v22.4s -mla v0.4S, v10.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v27.s[2] -mul v19.4S, v19.4S,v28.s[2] -sub v10.4s, v13.4s, v0.4s -mla v14.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v2.4s, v16.4s, v14.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v27.s[3] -mul v3.4S, v3.4S,v28.s[3] -sub v22.4s, v21.4s, v19.4s -mla v1.4S, v0.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v0.4s, v20.4s, v1.4s -mla v3.4S, v14.4S, v31.s[0] -add v20.4s, v20.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v25.s[0] -mul v15.4S, v15.4S,v26.s[0] -sub v14.4s, v17.4s, v3.4s -mla v18.4S, v19.4S, v31.s[0] -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v11.4S, v25.s[1] -mul v11.4S, v11.4S,v26.s[1] -sub v19.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v1.4s, v12.4s, v15.4s -mla v11.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v25.s[3] -mul v2.4S, v2.4S,v26.s[3] -sub v3.4s, v8.4s, v11.4s -mla v16.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v11.4s -str q12, [x0, #32] -sqrdmulh v12.4S, v20.4S, v23.s[0] -str q1, [x0, #96] -mul v20.4S, v20.4S,v24.s[0] -ldr q1, [x0, #816] -sub v11.4s, v13.4s, v16.4s -ldr q18, [x0, #880] -mla v2.4S, v15.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -str q8, [x0, #160] -sqrdmulh v8.4S, v0.4S, v23.s[1] -str q3, [x0, #224] -mul v0.4S, v0.4S,v24.s[1] -ldr q3, [x0, #944] -sub v16.4s, v10.4s, v2.4s -ldr q15, [x0, #1008] -mla v20.4S, v12.4S, v31.s[0] -add v10.4s, v10.4s, v2.4s -str q13, [x0, #288] -sqrdmulh v13.4S, v9.4S, v23.s[2] -str q11, [x0, #352] -mul v9.4S, v9.4S,v24.s[2] -ldr q11, [x0, #304] -sub v2.4s, v21.4s, v20.4s -ldr q12, [x0, #368] -mla v0.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v20.4s -str q10, [x0, #416] -sqrdmulh v10.4S, v19.4S, v23.s[3] -str q16, [x0, #480] -mul v19.4S, v19.4S,v24.s[3] -ldr q16, [x0, #432] -sub v20.4s, v22.4s, v0.4s -ldr q8, [x0, #496] -mla v9.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -str q21, [x0, #544] -sqrdmulh v21.4S, v1.4S, v29.s[0] -str q2, [x0, #608] -ldr q2, [x0, #560] -mul v1.4S, v1.4S,v30.s[0] -ldr q0, [x0, #624] -sub v13.4s, v17.4s, v9.4s -mla v19.4S, v10.4S, v31.s[0] -add v17.4s, v17.4s, v9.4s -str q22, [x0, #672] -sqrdmulh v22.4S, v18.4S, v29.s[0] -str q20, [x0, #736] -ldr q20, [x0, #688] -mul v18.4S, v18.4S,v30.s[0] -ldr q9, [x0, #752] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -str q17, [x0, #800] -sqrdmulh v17.4S, v3.4S, v29.s[0] -str q13, [x0, #864] -mul v3.4S, v3.4S,v30.s[0] -ldr q13, [x0, #48] -sub v19.4s, v11.4s, v1.4s -mla v18.4S, v22.4S, v31.s[0] -add v11.4s, v11.4s, v1.4s -str q14, [x0, #928] -sqrdmulh v14.4S, v15.4S, v29.s[0] -str q10, [x0, #992] -mul v15.4S, v15.4S,v30.s[0] -ldr q10, [x0, #112] -sub v1.4s, v12.4s, v18.4s -mla v3.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v29.s[0] -ldr q17, [x0, #176] -mul v2.4S, v2.4S,v30.s[0] -sub v22.4s, v16.4s, v3.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v0.4S, v29.s[0] -ldr q14, [x0, #240] -mul v0.4S, v0.4S,v30.s[0] -sub v21.4s, v8.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -sub v18.4s, v13.4s, v2.4s -mla v0.4S, v3.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v9.4S, v29.s[0] -mul v9.4S, v9.4S,v30.s[0] -sub v3.4s, v10.4s, v0.4s -mla v20.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v15.4s, v17.4s, v20.4s -mla v9.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sub v2.4s, v14.4s, v9.4s -mla v16.4S, v0.4S, v31.s[0] -add v14.4s, v14.4s, v9.4s -sqrdmulh v9.4S, v11.4S, v29.s[1] -mul v11.4S, v11.4S,v30.s[1] -sub v0.4s, v17.4s, v16.4s -mla v8.4S, v20.4S, v31.s[0] -add v17.4s, v17.4s, v16.4s -sqrdmulh v16.4S, v12.4S, v29.s[1] -mul v12.4S, v12.4S,v30.s[1] -sub v20.4s, v14.4s, v8.4s -mla v11.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v8.4s -sqrdmulh v8.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v9.4s, v13.4s, v11.4s -mla v12.4S, v16.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v16.4s, v10.4s, v12.4s -mla v22.4S, v8.4S, v31.s[0] -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v8.4s, v15.4s, v22.4s -mla v21.4S, v11.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -sub v11.4s, v2.4s, v21.4s -mla v19.4S, v12.4S, v31.s[0] -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[0] -mul v17.4S, v17.4S,v28.s[0] -sub v12.4s, v18.4s, v19.4s -mla v1.4S, v22.4S, v31.s[0] -add v18.4s, v18.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v22.4s, v3.4s, v1.4s -mla v17.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v21.4s, v13.4s, v17.4s -mla v14.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v20.4S, v27.s[1] -mul v20.4S, v20.4S,v28.s[1] -sub v19.4s, v10.4s, v14.4s -mla v0.4S, v1.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v27.s[2] -mul v15.4S, v15.4S,v28.s[2] -sub v1.4s, v9.4s, v0.4s -mla v20.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v0.4s -sqrdmulh v0.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v17.4s, v16.4s, v20.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v14.4s, v18.4s, v15.4s -mla v2.4S, v0.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v27.s[3] -mul v11.4S, v11.4S,v28.s[3] -sub v0.4s, v3.4s, v2.4s -mla v8.4S, v20.4S, v31.s[0] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v10.4S, v25.s[0] -mul v10.4S, v10.4S,v26.s[0] -sub v20.4s, v12.4s, v8.4s -mla v11.4S, v15.4S, v31.s[0] -add v12.4s, v12.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v15.4s, v22.4s, v11.4s -mla v10.4S, v2.4S, v31.s[0] -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v2.4s, v13.4s, v10.4s -mla v19.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v10.4s -sqrdmulh v10.4S, v17.4S, v25.s[3] -mul v17.4S, v17.4S,v26.s[3] -sub v8.4s, v21.4s, v19.4s -mla v16.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -str q13, [x0, #48] -sqrdmulh v13.4S, v3.4S, v23.s[0] -str q2, [x0, #112] -mul v3.4S, v3.4S,v24.s[0] -ldr q2, [x0, #768] -sub v19.4s, v9.4s, v16.4s -ldr q11, [x0, #832] -mla v17.4S, v10.4S, v31.s[0] -add v9.4s, v9.4s, v16.4s -str q21, [x0, #176] -sqrdmulh v21.4S, v0.4S, v23.s[1] -str q8, [x0, #240] -mul v0.4S, v0.4S,v24.s[1] -ldr q8, [x0, #896] -sub v16.4s, v1.4s, v17.4s -ldr q10, [x0, #960] -mla v3.4S, v13.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -str q9, [x0, #304] -sqrdmulh v9.4S, v22.4S, v23.s[2] -str q19, [x0, #368] -mul v22.4S, v22.4S,v24.s[2] -ldr q19, [x0, #256] -sub v17.4s, v18.4s, v3.4s -ldr q13, [x0, #320] -mla v0.4S, v21.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -str q1, [x0, #432] -sqrdmulh v1.4S, v15.4S, v23.s[3] -str q16, [x0, #496] -mul v15.4S, v15.4S,v24.s[3] -ldr q16, [x0, #384] -sub v3.4s, v14.4s, v0.4s -ldr q21, [x0, #448] -mla v22.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -str q18, [x0, #560] -sqrdmulh v18.4S, v2.4S, v29.s[0] -str q17, [x0, #624] -ldr q17, [x0, #512] -mul v2.4S, v2.4S,v30.s[0] -ldr q0, [x0, #576] -sub v9.4s, v12.4s, v22.4s -mla v15.4S, v1.4S, v31.s[0] -add v12.4s, v12.4s, v22.4s -str q14, [x0, #688] -sqrdmulh v14.4S, v11.4S, v29.s[0] -str q3, [x0, #752] -ldr q3, [x0, #640] -mul v11.4S, v11.4S,v30.s[0] -ldr q22, [x0, #704] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -str q12, [x0, #816] -sqrdmulh v12.4S, v8.4S, v29.s[0] -str q9, [x0, #880] -mul v8.4S, v8.4S,v30.s[0] -ldr q9, [x0, #0] -sub v15.4s, v19.4s, v2.4s -mla v11.4S, v14.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -str q20, [x0, #944] -sqrdmulh v20.4S, v10.4S, v29.s[0] -str q1, [x0, #1008] -mul v10.4S, v10.4S,v30.s[0] -ldr q1, [x0, #64] -sub v2.4s, v13.4s, v11.4s -mla v8.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v29.s[0] -ldr q12, [x0, #128] -mul v17.4S, v17.4S,v30.s[0] -sub v14.4s, v16.4s, v8.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v0.4S, v29.s[0] -ldr q20, [x0, #192] -mul v0.4S, v0.4S,v30.s[0] -sub v18.4s, v21.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -sub v11.4s, v9.4s, v17.4s -mla v0.4S, v8.4S, v31.s[0] -add v9.4s, v9.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v8.4s, v1.4s, v0.4s -mla v3.4S, v10.4S, v31.s[0] -add v1.4s, v1.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v10.4s, v12.4s, v3.4s -mla v22.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v17.4s, v20.4s, v22.4s -mla v16.4S, v0.4S, v31.s[0] -add v20.4s, v20.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -sub v0.4s, v12.4s, v16.4s -mla v21.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v3.4s, v20.4s, v21.4s -mla v19.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v22.4s, v9.4s, v19.4s -mla v13.4S, v16.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v29.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v16.4s, v1.4s, v13.4s -mla v14.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v13.4s -sqrdmulh v13.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v21.4s, v10.4s, v14.4s -mla v18.4S, v19.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v19.4s, v17.4s, v18.4s -mla v15.4S, v13.4S, v31.s[0] -add v17.4s, v17.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v13.4s, v11.4s, v15.4s -mla v2.4S, v14.4S, v31.s[0] -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v27.s[0] -mul v20.4S, v20.4S,v28.s[0] -sub v14.4s, v8.4s, v2.4s -mla v12.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v2.4s -sqrdmulh v2.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v18.4s, v9.4s, v12.4s -mla v20.4S, v15.4S, v31.s[0] -add v9.4s, v9.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -sub v15.4s, v1.4s, v20.4s -mla v0.4S, v2.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -sub v2.4s, v22.4s, v0.4s -mla v3.4S, v12.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v12.4s, v16.4s, v3.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v11.4s, v10.4s -mla v17.4S, v0.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -sqrdmulh v10.4S, v19.4S, v27.s[3] -mul v19.4S, v19.4S,v28.s[3] -sub v0.4s, v8.4s, v17.4s -mla v21.4S, v3.4S, v31.s[0] -add v8.4s, v8.4s, v17.4s -sqrdmulh v17.4S, v1.4S, v25.s[0] -mul v1.4S, v1.4S,v26.s[0] -sub v3.4s, v13.4s, v21.4s -mla v19.4S, v10.4S, v31.s[0] -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v15.4S, v25.s[1] -mul v15.4S, v15.4S,v26.s[1] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v17.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v17.4s, v9.4s, v1.4s -mla v15.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -sub v21.4s, v18.4s, v15.4s -mla v16.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -str q9, [x0, #0] -sqrdmulh v9.4S, v8.4S, v23.s[0] -str q17, [x0, #64] -mul v8.4S, v8.4S,v24.s[0] -ldr q17, [x0, #784] -sub v15.4s, v22.4s, v16.4s -ldr q19, [x0, #848] -mla v12.4S, v1.4S, v31.s[0] -add v22.4s, v22.4s, v16.4s -str q18, [x0, #128] -sqrdmulh v18.4S, v0.4S, v23.s[1] -str q21, [x0, #192] -mul v0.4S, v0.4S,v24.s[1] -ldr q21, [x0, #912] -sub v16.4s, v2.4s, v12.4s -ldr q1, [x0, #976] -mla v8.4S, v9.4S, v31.s[0] -add v2.4s, v2.4s, v12.4s -str q22, [x0, #256] -sqrdmulh v22.4S, v14.4S, v23.s[2] -str q15, [x0, #320] -mul v14.4S, v14.4S,v24.s[2] -ldr q15, [x0, #272] -sub v12.4s, v11.4s, v8.4s -ldr q9, [x0, #336] -mla v0.4S, v18.4S, v31.s[0] -add v11.4s, v11.4s, v8.4s -str q2, [x0, #384] -sqrdmulh v2.4S, v10.4S, v23.s[3] -str q16, [x0, #448] -mul v10.4S, v10.4S,v24.s[3] -ldr q16, [x0, #400] -sub v8.4s, v20.4s, v0.4s -ldr q18, [x0, #464] -mla v14.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v0.4s -str q11, [x0, #512] -sqrdmulh v11.4S, v17.4S, v29.s[0] -str q12, [x0, #576] -ldr q12, [x0, #528] -mul v17.4S, v17.4S,v30.s[0] -ldr q0, [x0, #592] -sub v22.4s, v13.4s, v14.4s -mla v10.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v14.4s -str q20, [x0, #640] -sqrdmulh v20.4S, v19.4S, v29.s[0] -str q8, [x0, #704] -ldr q8, [x0, #656] -mul v19.4S, v19.4S,v30.s[0] -ldr q14, [x0, #720] -sub v2.4s, v3.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v3.4s, v3.4s, v10.4s -str q13, [x0, #768] -sqrdmulh v13.4S, v21.4S, v29.s[0] -str q22, [x0, #832] -mul v21.4S, v21.4S,v30.s[0] -ldr q22, [x0, #16] -sub v10.4s, v15.4s, v17.4s -mla v19.4S, v20.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -str q3, [x0, #896] -sqrdmulh v3.4S, v1.4S, v29.s[0] -str q2, [x0, #960] -mul v1.4S, v1.4S,v30.s[0] -ldr q2, [x0, #80] -sub v17.4s, v9.4s, v19.4s -mla v21.4S, v13.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v12.4S, v29.s[0] -ldr q13, [x0, #144] -mul v12.4S, v12.4S,v30.s[0] -sub v20.4s, v16.4s, v21.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v0.4S, v29.s[0] -ldr q3, [x0, #208] -mul v0.4S, v0.4S,v30.s[0] -sub v11.4s, v18.4s, v1.4s -mla v12.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v1.4s -sqrdmulh v1.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v19.4s, v22.4s, v12.4s -mla v0.4S, v21.4S, v31.s[0] -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v2.4s, v0.4s -mla v8.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v1.4s, v13.4s, v8.4s -mla v14.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v12.4s, v3.4s, v14.4s -mla v16.4S, v0.4S, v31.s[0] -add v3.4s, v3.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -sub v0.4s, v13.4s, v16.4s -mla v18.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -sqrdmulh v16.4S, v9.4S, v29.s[1] -mul v9.4S, v9.4S,v30.s[1] -sub v8.4s, v3.4s, v18.4s -mla v15.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v14.4s, v22.4s, v15.4s -mla v9.4S, v16.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v16.4s, v2.4s, v9.4s -mla v20.4S, v18.4S, v31.s[0] -add v2.4s, v2.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v20.4s -mla v11.4S, v15.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v15.4s, v12.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v12.4s, v12.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v27.s[0] -mul v13.4S, v13.4S,v28.s[0] -sub v9.4s, v19.4s, v10.4s -mla v17.4S, v20.4S, v31.s[0] -add v19.4s, v19.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v27.s[0] -mul v3.4S, v3.4S,v28.s[0] -sub v20.4s, v21.4s, v17.4s -mla v13.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v11.4s, v22.4s, v13.4s -mla v3.4S, v10.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -sub v10.4s, v2.4s, v3.4s -mla v0.4S, v17.4S, v31.s[0] -add v2.4s, v2.4s, v3.4s -sqrdmulh v3.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v17.4s, v14.4s, v0.4s -mla v8.4S, v13.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[2] -mul v12.4S, v12.4S,v28.s[2] -sub v13.4s, v16.4s, v8.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v3.4s, v19.4s, v1.4s -mla v12.4S, v0.4S, v31.s[0] -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v0.4s, v21.4s, v12.4s -mla v18.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v10.4S, v25.s[1] -mul v10.4S, v10.4S,v26.s[1] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v12.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v12.4s, v22.4s, v2.4s -mla v10.4S, v18.4S, v31.s[0] -add v22.4s, v22.4s, v2.4s -sqrdmulh v2.4S, v13.4S, v25.s[3] -mul v13.4S, v13.4S,v26.s[3] -sub v18.4s, v11.4s, v10.4s -mla v16.4S, v15.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -str q22, [x0, #16] -sqrdmulh v22.4S, v21.4S, v23.s[0] -str q12, [x0, #80] -mul v21.4S, v21.4S,v24.s[0] -sub v12.4s, v14.4s, v16.4s -mla v13.4S, v2.4S, v31.s[0] -add v14.4s, v14.4s, v16.4s -str q11, [x0, #144] -sqrdmulh v11.4S, v0.4S, v23.s[1] -str q18, [x0, #208] -mul v0.4S, v0.4S,v24.s[1] -sub v18.4s, v17.4s, v13.4s -mla v21.4S, v22.4S, v31.s[0] -add v17.4s, v17.4s, v13.4s -str q14, [x0, #272] -sqrdmulh v14.4S, v20.4S, v23.s[2] -str q12, [x0, #336] -mul v20.4S, v20.4S,v24.s[2] -sub v12.4s, v19.4s, v21.4s -mla v0.4S, v11.4S, v31.s[0] -add v19.4s, v19.4s, v21.4s -str q17, [x0, #400] -sqrdmulh v17.4S, v1.4S, v23.s[3] -str q18, [x0, #464] -mul v1.4S, v1.4S,v24.s[3] -sub v18.4s, v3.4s, v0.4s -mla v20.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v0.4s -str q19, [x0, #528] -str q12, [x0, #592] -sub v12.4s, v9.4s, v20.4s -mla v1.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v20.4s -str q3, [x0, #656] -str q18, [x0, #720] -sub v18.4s, v8.4s, v1.4s -add v8.4s, v8.4s, v1.4s -str q9, [x0, #784] -str q12, [x0, #848] -str q8, [x0, #912] -str q18, [x0, #976] -ldr q4, [x17, #+128] -ldr q5, [x17, #+144] -ldr q6, [x17, #+160] -ldr q7, [x17, #+176] -ldr q15, [x17, #+192] -ldr q10, [x17, #+208] -ldr q2, [x17, #+224] -ldr q16, [x17, #+240] -ldr q22, [x0, #32] -ldr q13, [x0, #48] -ldr q11, [x0, #0] -ldr q21, [x0, #16] -ldr q14, [x17, #+1152] -ldr q0, [x17, #+1168] -ldr q19, [x17, #+1184] -ldr q17, [x17, #+1200] -ldr q20, [x17, #+1216] -ldr q3, [x17, #+1232] -ldr q1, [x17, #+1248] -ldr q9, [x17, #+1264] -ldr q12, [x0, #544] -ldr q8, [x0, #560] -ldr q18, [x0, #512] -ldr q30, [x0, #528] -sqrdmulh v29.4S, v22.4S, v5.s[0] -mul v22.4S, v22.4S,v4.s[0] -mla v22.4S, v29.4S, v31.s[0] -sub v29.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v12.4S, v0.s[0] -mul v12.4S, v12.4S,v14.s[0] -mla v12.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v12.4s -add v18.4s, v18.4s, v12.4s -sqrdmulh v12.4S, v13.4S, v5.s[0] -mul v13.4S, v13.4S,v4.s[0] -mla v13.4S, v12.4S, v31.s[0] -sub v12.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v8.4S, v0.s[0] -mul v8.4S, v8.4S,v14.s[0] -mla v8.4S, v13.4S, v31.s[0] -sub v13.4s, v30.4s, v8.4s -add v30.4s, v30.4s, v8.4s -sqrdmulh v8.4S, v21.4S, v5.s[1] -mul v21.4S, v21.4S,v4.s[1] -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v11.4s, v21.4s -add v11.4s, v11.4s, v21.4s -sqrdmulh v21.4S, v30.4S, v0.s[1] -mul v30.4S, v30.4S,v14.s[1] -mla v30.4S, v21.4S, v31.s[0] -sub v21.4s, v18.4s, v30.4s -add v18.4s, v18.4s, v30.4s -sqrdmulh v30.4S, v12.4S, v5.s[2] -mul v12.4S, v12.4S,v4.s[2] -mla v12.4S, v30.4S, v31.s[0] -sub v30.4s, v29.4s, v12.4s -add v29.4s, v29.4s, v12.4s -sqrdmulh v12.4S, v13.4S, v0.s[2] -mul v13.4S, v13.4S,v14.s[2] -mla v13.4S, v12.4S, v31.s[0] -sub v12.4s, v22.4s, v13.4s -add v22.4s, v22.4s, v13.4s -trn1 v13.4S, v11.4S, v8.4S -trn2 v28.4S, v11.4S, v8.4S -trn1 v27.4S, v29.4S, v30.4S -trn2 v26.4S, v29.4S, v30.4S -trn2 v29.2D, v13.2D, v27.2D -trn2 v30.2D, v28.2D, v26.2D -trn1 v11.2D, v13.2D, v27.2D -trn1 v8.2D, v28.2D, v26.2D -trn1 v26.4S, v18.4S, v21.4S -trn2 v28.4S, v18.4S, v21.4S -trn1 v27.4S, v22.4S, v12.4S -trn2 v13.4S, v22.4S, v12.4S -trn2 v22.2D, v26.2D, v27.2D -trn2 v12.2D, v28.2D, v13.2D -trn1 v18.2D, v26.2D, v27.2D -trn1 v21.2D, v28.2D, v13.2D -sqrdmulh v13.4S, v29.4S, v7.4S -mul v29.4S, v29.4S,v6.4S -mla v29.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v29.4s -add v11.4s, v11.4s, v29.4s -sqrdmulh v29.4S, v22.4S, v17.4S -mul v22.4S, v22.4S,v19.4S -mla v22.4S, v29.4S, v31.s[0] -sub v29.4s, v18.4s, v22.4s -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v30.4S, v7.4S -mul v30.4S, v30.4S,v6.4S -mla v30.4S, v22.4S, v31.s[0] -sub v22.4s, v8.4s, v30.4s -add v8.4s, v8.4s, v30.4s -sqrdmulh v30.4S, v12.4S, v17.4S -mul v12.4S, v12.4S,v19.4S -mla v12.4S, v30.4S, v31.s[0] -sub v30.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v8.4S, v10.4S -mul v8.4S, v8.4S,v15.4S -mla v8.4S, v12.4S, v31.s[0] -sub v12.4s, v11.4s, v8.4s -add v11.4s, v11.4s, v8.4s -sqrdmulh v8.4S, v21.4S, v3.4S -mul v21.4S, v21.4S,v20.4S -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v16.4S -mul v22.4S, v22.4S,v2.4S -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v13.4s, v22.4s -add v13.4s, v13.4s, v22.4s -sqrdmulh v22.4S, v30.4S, v9.4S -mul v30.4S, v30.4S,v1.4S -mla v30.4S, v22.4S, v31.s[0] -sub v22.4s, v29.4s, v30.4s -add v29.4s, v29.4s, v30.4s -str q11, [x0, #0] -str q12, [x0, #16] -str q13, [x0, #32] -str q21, [x0, #48] -str q18, [x0, #512] -str q8, [x0, #528] -str q29, [x0, #544] -str q22, [x0, #560] -ldr q9, [x17, #+256] -ldr q1, [x17, #+272] -ldr q3, [x17, #+288] -ldr q20, [x17, #+304] -ldr q17, [x17, #+320] -ldr q19, [x17, #+336] -ldr q0, [x17, #+352] -ldr q14, [x17, #+368] -ldr q22, [x0, #96] -ldr q29, [x0, #112] -ldr q8, [x0, #64] -ldr q18, [x0, #80] -ldr q16, [x17, #+1280] -ldr q2, [x17, #+1296] -ldr q10, [x17, #+1312] -ldr q15, [x17, #+1328] -ldr q7, [x17, #+1344] -ldr q6, [x17, #+1360] -ldr q5, [x17, #+1376] -ldr q4, [x17, #+1392] -ldr q21, [x0, #608] -ldr q13, [x0, #624] -ldr q12, [x0, #576] -ldr q11, [x0, #592] -sqrdmulh v30.4S, v22.4S, v1.s[0] -mul v22.4S, v22.4S,v9.s[0] -mla v22.4S, v30.4S, v31.s[0] -sub v30.4s, v8.4s, v22.4s -add v8.4s, v8.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v2.s[0] -mul v21.4S, v21.4S,v16.s[0] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v12.4s, v21.4s -add v12.4s, v12.4s, v21.4s -sqrdmulh v21.4S, v29.4S, v1.s[0] -mul v29.4S, v29.4S,v9.s[0] -mla v29.4S, v21.4S, v31.s[0] -sub v21.4s, v18.4s, v29.4s -add v18.4s, v18.4s, v29.4s -sqrdmulh v29.4S, v13.4S, v2.s[0] -mul v13.4S, v13.4S,v16.s[0] -mla v13.4S, v29.4S, v31.s[0] -sub v29.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v18.4S, v1.s[1] -mul v18.4S, v18.4S,v9.s[1] -mla v18.4S, v13.4S, v31.s[0] -sub v13.4s, v8.4s, v18.4s -add v8.4s, v8.4s, v18.4s -sqrdmulh v18.4S, v11.4S, v2.s[1] -mul v11.4S, v11.4S,v16.s[1] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v12.4s, v11.4s -add v12.4s, v12.4s, v11.4s -sqrdmulh v11.4S, v21.4S, v1.s[2] -mul v21.4S, v21.4S,v9.s[2] -mla v21.4S, v11.4S, v31.s[0] -sub v11.4s, v30.4s, v21.4s -add v30.4s, v30.4s, v21.4s -sqrdmulh v21.4S, v29.4S, v2.s[2] -mul v29.4S, v29.4S,v16.s[2] -mla v29.4S, v21.4S, v31.s[0] -sub v21.4s, v22.4s, v29.4s -add v22.4s, v22.4s, v29.4s -trn1 v29.4S, v8.4S, v13.4S -trn2 v28.4S, v8.4S, v13.4S -trn1 v27.4S, v30.4S, v11.4S -trn2 v26.4S, v30.4S, v11.4S -trn2 v30.2D, v29.2D, v27.2D -trn2 v11.2D, v28.2D, v26.2D -trn1 v8.2D, v29.2D, v27.2D -trn1 v13.2D, v28.2D, v26.2D -trn1 v26.4S, v12.4S, v18.4S -trn2 v28.4S, v12.4S, v18.4S -trn1 v27.4S, v22.4S, v21.4S -trn2 v29.4S, v22.4S, v21.4S -trn2 v22.2D, v26.2D, v27.2D -trn2 v21.2D, v28.2D, v29.2D -trn1 v12.2D, v26.2D, v27.2D -trn1 v18.2D, v28.2D, v29.2D -sqrdmulh v29.4S, v30.4S, v20.4S -mul v30.4S, v30.4S,v3.4S -mla v30.4S, v29.4S, v31.s[0] -sub v29.4s, v8.4s, v30.4s -add v8.4s, v8.4s, v30.4s -sqrdmulh v30.4S, v22.4S, v15.4S -mul v22.4S, v22.4S,v10.4S -mla v22.4S, v30.4S, v31.s[0] -sub v30.4s, v12.4s, v22.4s -add v12.4s, v12.4s, v22.4s -sqrdmulh v22.4S, v11.4S, v20.4S -mul v11.4S, v11.4S,v3.4S -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v13.4s, v11.4s -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v21.4S, v15.4S -mul v21.4S, v21.4S,v10.4S -mla v21.4S, v11.4S, v31.s[0] -sub v11.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v13.4S, v19.4S -mul v13.4S, v13.4S,v17.4S -mla v13.4S, v21.4S, v31.s[0] -sub v21.4s, v8.4s, v13.4s -add v8.4s, v8.4s, v13.4s -sqrdmulh v13.4S, v18.4S, v6.4S -mul v18.4S, v18.4S,v7.4S -mla v18.4S, v13.4S, v31.s[0] -sub v13.4s, v12.4s, v18.4s -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v22.4S, v14.4S -mul v22.4S, v22.4S,v0.4S -mla v22.4S, v18.4S, v31.s[0] -sub v18.4s, v29.4s, v22.4s -add v29.4s, v29.4s, v22.4s -sqrdmulh v22.4S, v11.4S, v4.4S -mul v11.4S, v11.4S,v5.4S -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v30.4s, v11.4s -add v30.4s, v30.4s, v11.4s -str q8, [x0, #64] -str q21, [x0, #80] -str q29, [x0, #96] -str q18, [x0, #112] -str q12, [x0, #576] -str q13, [x0, #592] -str q30, [x0, #608] -str q22, [x0, #624] -ldr q4, [x17, #+384] -ldr q5, [x17, #+400] -ldr q6, [x17, #+416] -ldr q7, [x17, #+432] -ldr q15, [x17, #+448] -ldr q10, [x17, #+464] -ldr q2, [x17, #+480] -ldr q16, [x17, #+496] -ldr q22, [x0, #160] -ldr q30, [x0, #176] -ldr q13, [x0, #128] -ldr q12, [x0, #144] -ldr q14, [x17, #+1408] -ldr q0, [x17, #+1424] -ldr q19, [x17, #+1440] -ldr q17, [x17, #+1456] -ldr q20, [x17, #+1472] -ldr q3, [x17, #+1488] -ldr q1, [x17, #+1504] -ldr q9, [x17, #+1520] -ldr q18, [x0, #672] -ldr q29, [x0, #688] -ldr q21, [x0, #640] -ldr q8, [x0, #656] -sqrdmulh v11.4S, v22.4S, v5.s[0] -mul v22.4S, v22.4S,v4.s[0] -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v13.4s, v22.4s -add v13.4s, v13.4s, v22.4s -sqrdmulh v22.4S, v18.4S, v0.s[0] -mul v18.4S, v18.4S,v14.s[0] -mla v18.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v18.4s -add v21.4s, v21.4s, v18.4s -sqrdmulh v18.4S, v30.4S, v5.s[0] -mul v30.4S, v30.4S,v4.s[0] -mla v30.4S, v18.4S, v31.s[0] -sub v18.4s, v12.4s, v30.4s -add v12.4s, v12.4s, v30.4s -sqrdmulh v30.4S, v29.4S, v0.s[0] -mul v29.4S, v29.4S,v14.s[0] -mla v29.4S, v30.4S, v31.s[0] -sub v30.4s, v8.4s, v29.4s -add v8.4s, v8.4s, v29.4s -sqrdmulh v29.4S, v12.4S, v5.s[1] -mul v12.4S, v12.4S,v4.s[1] -mla v12.4S, v29.4S, v31.s[0] -sub v29.4s, v13.4s, v12.4s -add v13.4s, v13.4s, v12.4s -sqrdmulh v12.4S, v8.4S, v0.s[1] -mul v8.4S, v8.4S,v14.s[1] -mla v8.4S, v12.4S, v31.s[0] -sub v12.4s, v21.4s, v8.4s -add v21.4s, v21.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v5.s[2] -mul v18.4S, v18.4S,v4.s[2] -mla v18.4S, v8.4S, v31.s[0] -sub v8.4s, v11.4s, v18.4s -add v11.4s, v11.4s, v18.4s -sqrdmulh v18.4S, v30.4S, v0.s[2] -mul v30.4S, v30.4S,v14.s[2] -mla v30.4S, v18.4S, v31.s[0] -sub v18.4s, v22.4s, v30.4s -add v22.4s, v22.4s, v30.4s -trn1 v30.4S, v13.4S, v29.4S -trn2 v28.4S, v13.4S, v29.4S -trn1 v27.4S, v11.4S, v8.4S -trn2 v26.4S, v11.4S, v8.4S -trn2 v11.2D, v30.2D, v27.2D -trn2 v8.2D, v28.2D, v26.2D -trn1 v13.2D, v30.2D, v27.2D -trn1 v29.2D, v28.2D, v26.2D -trn1 v26.4S, v21.4S, v12.4S -trn2 v28.4S, v21.4S, v12.4S -trn1 v27.4S, v22.4S, v18.4S -trn2 v30.4S, v22.4S, v18.4S -trn2 v22.2D, v26.2D, v27.2D -trn2 v18.2D, v28.2D, v30.2D -trn1 v21.2D, v26.2D, v27.2D -trn1 v12.2D, v28.2D, v30.2D -sqrdmulh v30.4S, v11.4S, v7.4S -mul v11.4S, v11.4S,v6.4S -mla v11.4S, v30.4S, v31.s[0] -sub v30.4s, v13.4s, v11.4s -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v17.4S -mul v22.4S, v22.4S,v19.4S -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v8.4S, v7.4S -mul v8.4S, v8.4S,v6.4S -mla v8.4S, v22.4S, v31.s[0] -sub v22.4s, v29.4s, v8.4s -add v29.4s, v29.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v17.4S -mul v18.4S, v18.4S,v19.4S -mla v18.4S, v8.4S, v31.s[0] -sub v8.4s, v12.4s, v18.4s -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v29.4S, v10.4S -mul v29.4S, v29.4S,v15.4S -mla v29.4S, v18.4S, v31.s[0] -sub v18.4s, v13.4s, v29.4s -add v13.4s, v13.4s, v29.4s -sqrdmulh v29.4S, v12.4S, v3.4S -mul v12.4S, v12.4S,v20.4S -mla v12.4S, v29.4S, v31.s[0] -sub v29.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v22.4S, v16.4S -mul v22.4S, v22.4S,v2.4S -mla v22.4S, v12.4S, v31.s[0] -sub v12.4s, v30.4s, v22.4s -add v30.4s, v30.4s, v22.4s -sqrdmulh v22.4S, v8.4S, v9.4S -mul v8.4S, v8.4S,v1.4S -mla v8.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v8.4s -add v11.4s, v11.4s, v8.4s -str q13, [x0, #128] -str q18, [x0, #144] -str q30, [x0, #160] -str q12, [x0, #176] -str q21, [x0, #640] -str q29, [x0, #656] -str q11, [x0, #672] -str q22, [x0, #688] -ldr q9, [x17, #+512] -ldr q1, [x17, #+528] -ldr q3, [x17, #+544] -ldr q20, [x17, #+560] -ldr q17, [x17, #+576] -ldr q19, [x17, #+592] -ldr q0, [x17, #+608] -ldr q14, [x17, #+624] -ldr q22, [x0, #224] -ldr q11, [x0, #240] -ldr q29, [x0, #192] -ldr q21, [x0, #208] -ldr q16, [x17, #+1536] -ldr q2, [x17, #+1552] -ldr q10, [x17, #+1568] -ldr q15, [x17, #+1584] -ldr q7, [x17, #+1600] -ldr q6, [x17, #+1616] -ldr q5, [x17, #+1632] -ldr q4, [x17, #+1648] -ldr q12, [x0, #736] -ldr q30, [x0, #752] -ldr q18, [x0, #704] -ldr q13, [x0, #720] -sqrdmulh v8.4S, v22.4S, v1.s[0] -mul v22.4S, v22.4S,v9.s[0] -mla v22.4S, v8.4S, v31.s[0] -sub v8.4s, v29.4s, v22.4s -add v29.4s, v29.4s, v22.4s -sqrdmulh v22.4S, v12.4S, v2.s[0] -mul v12.4S, v12.4S,v16.s[0] -mla v12.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v12.4s -add v18.4s, v18.4s, v12.4s -sqrdmulh v12.4S, v11.4S, v1.s[0] -mul v11.4S, v11.4S,v9.s[0] -mla v11.4S, v12.4S, v31.s[0] -sub v12.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v30.4S, v2.s[0] -mul v30.4S, v30.4S,v16.s[0] -mla v30.4S, v11.4S, v31.s[0] -sub v11.4s, v13.4s, v30.4s -add v13.4s, v13.4s, v30.4s -sqrdmulh v30.4S, v21.4S, v1.s[1] -mul v21.4S, v21.4S,v9.s[1] -mla v21.4S, v30.4S, v31.s[0] -sub v30.4s, v29.4s, v21.4s -add v29.4s, v29.4s, v21.4s -sqrdmulh v21.4S, v13.4S, v2.s[1] -mul v13.4S, v13.4S,v16.s[1] -mla v13.4S, v21.4S, v31.s[0] -sub v21.4s, v18.4s, v13.4s -add v18.4s, v18.4s, v13.4s -sqrdmulh v13.4S, v12.4S, v1.s[2] -mul v12.4S, v12.4S,v9.s[2] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v8.4s, v12.4s -add v8.4s, v8.4s, v12.4s -sqrdmulh v12.4S, v11.4S, v2.s[2] -mul v11.4S, v11.4S,v16.s[2] -mla v11.4S, v12.4S, v31.s[0] -sub v12.4s, v22.4s, v11.4s -add v22.4s, v22.4s, v11.4s -trn1 v11.4S, v29.4S, v30.4S -trn2 v28.4S, v29.4S, v30.4S -trn1 v27.4S, v8.4S, v13.4S -trn2 v26.4S, v8.4S, v13.4S -trn2 v8.2D, v11.2D, v27.2D -trn2 v13.2D, v28.2D, v26.2D -trn1 v29.2D, v11.2D, v27.2D -trn1 v30.2D, v28.2D, v26.2D -trn1 v26.4S, v18.4S, v21.4S -trn2 v28.4S, v18.4S, v21.4S -trn1 v27.4S, v22.4S, v12.4S -trn2 v11.4S, v22.4S, v12.4S -trn2 v22.2D, v26.2D, v27.2D -trn2 v12.2D, v28.2D, v11.2D -trn1 v18.2D, v26.2D, v27.2D -trn1 v21.2D, v28.2D, v11.2D -sqrdmulh v11.4S, v8.4S, v20.4S -mul v8.4S, v8.4S,v3.4S -mla v8.4S, v11.4S, v31.s[0] -sub v11.4s, v29.4s, v8.4s -add v29.4s, v29.4s, v8.4s -sqrdmulh v8.4S, v22.4S, v15.4S -mul v22.4S, v22.4S,v10.4S -mla v22.4S, v8.4S, v31.s[0] -sub v8.4s, v18.4s, v22.4s -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v13.4S, v20.4S -mul v13.4S, v13.4S,v3.4S -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v30.4s, v13.4s -add v30.4s, v30.4s, v13.4s -sqrdmulh v13.4S, v12.4S, v15.4S -mul v12.4S, v12.4S,v10.4S -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v30.4S, v19.4S -mul v30.4S, v30.4S,v17.4S -mla v30.4S, v12.4S, v31.s[0] -sub v12.4s, v29.4s, v30.4s -add v29.4s, v29.4s, v30.4s -sqrdmulh v30.4S, v21.4S, v6.4S -mul v21.4S, v21.4S,v7.4S -mla v21.4S, v30.4S, v31.s[0] -sub v30.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v14.4S -mul v22.4S, v22.4S,v0.4S -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v13.4S, v4.4S -mul v13.4S, v13.4S,v5.4S -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v8.4s, v13.4s -add v8.4s, v8.4s, v13.4s -str q29, [x0, #192] -str q12, [x0, #208] -str q11, [x0, #224] -str q21, [x0, #240] -str q18, [x0, #704] -str q30, [x0, #720] -str q8, [x0, #736] -str q22, [x0, #752] -ldr q4, [x17, #+640] -ldr q5, [x17, #+656] -ldr q6, [x17, #+672] -ldr q7, [x17, #+688] -ldr q15, [x17, #+704] -ldr q10, [x17, #+720] -ldr q2, [x17, #+736] -ldr q16, [x17, #+752] -ldr q22, [x0, #288] -ldr q8, [x0, #304] -ldr q30, [x0, #256] -ldr q18, [x0, #272] -ldr q14, [x17, #+1664] -ldr q0, [x17, #+1680] -ldr q19, [x17, #+1696] -ldr q17, [x17, #+1712] -ldr q20, [x17, #+1728] -ldr q3, [x17, #+1744] -ldr q1, [x17, #+1760] -ldr q9, [x17, #+1776] -ldr q21, [x0, #800] -ldr q11, [x0, #816] -ldr q12, [x0, #768] -ldr q29, [x0, #784] -sqrdmulh v13.4S, v22.4S, v5.s[0] -mul v22.4S, v22.4S,v4.s[0] -mla v22.4S, v13.4S, v31.s[0] -sub v13.4s, v30.4s, v22.4s -add v30.4s, v30.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v0.s[0] -mul v21.4S, v21.4S,v14.s[0] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v12.4s, v21.4s -add v12.4s, v12.4s, v21.4s -sqrdmulh v21.4S, v8.4S, v5.s[0] -mul v8.4S, v8.4S,v4.s[0] -mla v8.4S, v21.4S, v31.s[0] -sub v21.4s, v18.4s, v8.4s -add v18.4s, v18.4s, v8.4s -sqrdmulh v8.4S, v11.4S, v0.s[0] -mul v11.4S, v11.4S,v14.s[0] -mla v11.4S, v8.4S, v31.s[0] -sub v8.4s, v29.4s, v11.4s -add v29.4s, v29.4s, v11.4s -sqrdmulh v11.4S, v18.4S, v5.s[1] -mul v18.4S, v18.4S,v4.s[1] -mla v18.4S, v11.4S, v31.s[0] -sub v11.4s, v30.4s, v18.4s -add v30.4s, v30.4s, v18.4s -sqrdmulh v18.4S, v29.4S, v0.s[1] -mul v29.4S, v29.4S,v14.s[1] -mla v29.4S, v18.4S, v31.s[0] -sub v18.4s, v12.4s, v29.4s -add v12.4s, v12.4s, v29.4s -sqrdmulh v29.4S, v21.4S, v5.s[2] -mul v21.4S, v21.4S,v4.s[2] -mla v21.4S, v29.4S, v31.s[0] -sub v29.4s, v13.4s, v21.4s -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v8.4S, v0.s[2] -mul v8.4S, v8.4S,v14.s[2] -mla v8.4S, v21.4S, v31.s[0] -sub v21.4s, v22.4s, v8.4s -add v22.4s, v22.4s, v8.4s -trn1 v8.4S, v30.4S, v11.4S -trn2 v28.4S, v30.4S, v11.4S -trn1 v27.4S, v13.4S, v29.4S -trn2 v26.4S, v13.4S, v29.4S -trn2 v13.2D, v8.2D, v27.2D -trn2 v29.2D, v28.2D, v26.2D -trn1 v30.2D, v8.2D, v27.2D -trn1 v11.2D, v28.2D, v26.2D -trn1 v26.4S, v12.4S, v18.4S -trn2 v28.4S, v12.4S, v18.4S -trn1 v27.4S, v22.4S, v21.4S -trn2 v8.4S, v22.4S, v21.4S -trn2 v22.2D, v26.2D, v27.2D -trn2 v21.2D, v28.2D, v8.2D -trn1 v12.2D, v26.2D, v27.2D -trn1 v18.2D, v28.2D, v8.2D -sqrdmulh v8.4S, v13.4S, v7.4S -mul v13.4S, v13.4S,v6.4S -mla v13.4S, v8.4S, v31.s[0] -sub v8.4s, v30.4s, v13.4s -add v30.4s, v30.4s, v13.4s -sqrdmulh v13.4S, v22.4S, v17.4S -mul v22.4S, v22.4S,v19.4S -mla v22.4S, v13.4S, v31.s[0] -sub v13.4s, v12.4s, v22.4s -add v12.4s, v12.4s, v22.4s -sqrdmulh v22.4S, v29.4S, v7.4S -mul v29.4S, v29.4S,v6.4S -mla v29.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v29.4s -add v11.4s, v11.4s, v29.4s -sqrdmulh v29.4S, v21.4S, v17.4S -mul v21.4S, v21.4S,v19.4S -mla v21.4S, v29.4S, v31.s[0] -sub v29.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v11.4S, v10.4S -mul v11.4S, v11.4S,v15.4S -mla v11.4S, v21.4S, v31.s[0] -sub v21.4s, v30.4s, v11.4s -add v30.4s, v30.4s, v11.4s -sqrdmulh v11.4S, v18.4S, v3.4S -mul v18.4S, v18.4S,v20.4S -mla v18.4S, v11.4S, v31.s[0] -sub v11.4s, v12.4s, v18.4s -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v22.4S, v16.4S -mul v22.4S, v22.4S,v2.4S -mla v22.4S, v18.4S, v31.s[0] -sub v18.4s, v8.4s, v22.4s -add v8.4s, v8.4s, v22.4s -sqrdmulh v22.4S, v29.4S, v9.4S -mul v29.4S, v29.4S,v1.4S -mla v29.4S, v22.4S, v31.s[0] -sub v22.4s, v13.4s, v29.4s -add v13.4s, v13.4s, v29.4s -str q30, [x0, #256] -str q21, [x0, #272] -str q8, [x0, #288] -str q18, [x0, #304] -str q12, [x0, #768] -str q11, [x0, #784] -str q13, [x0, #800] -str q22, [x0, #816] -ldr q9, [x17, #+768] -ldr q1, [x17, #+784] -ldr q3, [x17, #+800] -ldr q20, [x17, #+816] -ldr q17, [x17, #+832] -ldr q19, [x17, #+848] -ldr q0, [x17, #+864] -ldr q14, [x17, #+880] -ldr q22, [x0, #352] -ldr q13, [x0, #368] -ldr q11, [x0, #320] -ldr q12, [x0, #336] -ldr q16, [x17, #+1792] -ldr q2, [x17, #+1808] -ldr q10, [x17, #+1824] -ldr q15, [x17, #+1840] -ldr q7, [x17, #+1856] -ldr q6, [x17, #+1872] -ldr q5, [x17, #+1888] -ldr q4, [x17, #+1904] -ldr q18, [x0, #864] -ldr q8, [x0, #880] -ldr q21, [x0, #832] -ldr q30, [x0, #848] -sqrdmulh v29.4S, v22.4S, v1.s[0] -mul v22.4S, v22.4S,v9.s[0] -mla v22.4S, v29.4S, v31.s[0] -sub v29.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v18.4S, v2.s[0] -mul v18.4S, v18.4S,v16.s[0] -mla v18.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v18.4s -add v21.4s, v21.4s, v18.4s -sqrdmulh v18.4S, v13.4S, v1.s[0] -mul v13.4S, v13.4S,v9.s[0] -mla v13.4S, v18.4S, v31.s[0] -sub v18.4s, v12.4s, v13.4s -add v12.4s, v12.4s, v13.4s -sqrdmulh v13.4S, v8.4S, v2.s[0] -mul v8.4S, v8.4S,v16.s[0] -mla v8.4S, v13.4S, v31.s[0] -sub v13.4s, v30.4s, v8.4s -add v30.4s, v30.4s, v8.4s -sqrdmulh v8.4S, v12.4S, v1.s[1] -mul v12.4S, v12.4S,v9.s[1] -mla v12.4S, v8.4S, v31.s[0] -sub v8.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -sqrdmulh v12.4S, v30.4S, v2.s[1] -mul v30.4S, v30.4S,v16.s[1] -mla v30.4S, v12.4S, v31.s[0] -sub v12.4s, v21.4s, v30.4s -add v21.4s, v21.4s, v30.4s -sqrdmulh v30.4S, v18.4S, v1.s[2] -mul v18.4S, v18.4S,v9.s[2] -mla v18.4S, v30.4S, v31.s[0] -sub v30.4s, v29.4s, v18.4s -add v29.4s, v29.4s, v18.4s -sqrdmulh v18.4S, v13.4S, v2.s[2] -mul v13.4S, v13.4S,v16.s[2] -mla v13.4S, v18.4S, v31.s[0] -sub v18.4s, v22.4s, v13.4s -add v22.4s, v22.4s, v13.4s -trn1 v13.4S, v11.4S, v8.4S -trn2 v28.4S, v11.4S, v8.4S -trn1 v27.4S, v29.4S, v30.4S -trn2 v26.4S, v29.4S, v30.4S -trn2 v29.2D, v13.2D, v27.2D -trn2 v30.2D, v28.2D, v26.2D -trn1 v11.2D, v13.2D, v27.2D -trn1 v8.2D, v28.2D, v26.2D -trn1 v26.4S, v21.4S, v12.4S -trn2 v28.4S, v21.4S, v12.4S -trn1 v27.4S, v22.4S, v18.4S -trn2 v13.4S, v22.4S, v18.4S -trn2 v22.2D, v26.2D, v27.2D -trn2 v18.2D, v28.2D, v13.2D -trn1 v21.2D, v26.2D, v27.2D -trn1 v12.2D, v28.2D, v13.2D -sqrdmulh v13.4S, v29.4S, v20.4S -mul v29.4S, v29.4S,v3.4S -mla v29.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v29.4s -add v11.4s, v11.4s, v29.4s -sqrdmulh v29.4S, v22.4S, v15.4S -mul v22.4S, v22.4S,v10.4S -mla v22.4S, v29.4S, v31.s[0] -sub v29.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v30.4S, v20.4S -mul v30.4S, v30.4S,v3.4S -mla v30.4S, v22.4S, v31.s[0] -sub v22.4s, v8.4s, v30.4s -add v8.4s, v8.4s, v30.4s -sqrdmulh v30.4S, v18.4S, v15.4S -mul v18.4S, v18.4S,v10.4S -mla v18.4S, v30.4S, v31.s[0] -sub v30.4s, v12.4s, v18.4s -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v8.4S, v19.4S -mul v8.4S, v8.4S,v17.4S -mla v8.4S, v18.4S, v31.s[0] -sub v18.4s, v11.4s, v8.4s -add v11.4s, v11.4s, v8.4s -sqrdmulh v8.4S, v12.4S, v6.4S -mul v12.4S, v12.4S,v7.4S -mla v12.4S, v8.4S, v31.s[0] -sub v8.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v22.4S, v14.4S -mul v22.4S, v22.4S,v0.4S -mla v22.4S, v12.4S, v31.s[0] -sub v12.4s, v13.4s, v22.4s -add v13.4s, v13.4s, v22.4s -sqrdmulh v22.4S, v30.4S, v4.4S -mul v30.4S, v30.4S,v5.4S -mla v30.4S, v22.4S, v31.s[0] -sub v22.4s, v29.4s, v30.4s -add v29.4s, v29.4s, v30.4s -str q11, [x0, #320] -str q18, [x0, #336] -str q13, [x0, #352] -str q12, [x0, #368] -str q21, [x0, #832] -str q8, [x0, #848] -str q29, [x0, #864] -str q22, [x0, #880] -ldr q4, [x17, #+896] -ldr q5, [x17, #+912] -ldr q6, [x17, #+928] -ldr q7, [x17, #+944] -ldr q15, [x17, #+960] -ldr q10, [x17, #+976] -ldr q2, [x17, #+992] -ldr q16, [x17, #+1008] -ldr q22, [x0, #416] -ldr q29, [x0, #432] -ldr q8, [x0, #384] -ldr q21, [x0, #400] -ldr q14, [x17, #+1920] -ldr q0, [x17, #+1936] -ldr q19, [x17, #+1952] -ldr q17, [x17, #+1968] -ldr q20, [x17, #+1984] -ldr q3, [x17, #+2000] -ldr q1, [x17, #+2016] -ldr q9, [x17, #+2032] -ldr q12, [x0, #928] -ldr q13, [x0, #944] -ldr q18, [x0, #896] -ldr q11, [x0, #912] -sqrdmulh v30.4S, v22.4S, v5.s[0] -mul v22.4S, v22.4S,v4.s[0] -mla v22.4S, v30.4S, v31.s[0] -sub v30.4s, v8.4s, v22.4s -add v8.4s, v8.4s, v22.4s -sqrdmulh v22.4S, v12.4S, v0.s[0] -mul v12.4S, v12.4S,v14.s[0] -mla v12.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v12.4s -add v18.4s, v18.4s, v12.4s -sqrdmulh v12.4S, v29.4S, v5.s[0] -mul v29.4S, v29.4S,v4.s[0] -mla v29.4S, v12.4S, v31.s[0] -sub v12.4s, v21.4s, v29.4s -add v21.4s, v21.4s, v29.4s -sqrdmulh v29.4S, v13.4S, v0.s[0] -mul v13.4S, v13.4S,v14.s[0] -mla v13.4S, v29.4S, v31.s[0] -sub v29.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v21.4S, v5.s[1] -mul v21.4S, v21.4S,v4.s[1] -mla v21.4S, v13.4S, v31.s[0] -sub v13.4s, v8.4s, v21.4s -add v8.4s, v8.4s, v21.4s -sqrdmulh v21.4S, v11.4S, v0.s[1] -mul v11.4S, v11.4S,v14.s[1] -mla v11.4S, v21.4S, v31.s[0] -sub v21.4s, v18.4s, v11.4s -add v18.4s, v18.4s, v11.4s -sqrdmulh v11.4S, v12.4S, v5.s[2] -mul v12.4S, v12.4S,v4.s[2] -mla v12.4S, v11.4S, v31.s[0] -sub v11.4s, v30.4s, v12.4s -add v30.4s, v30.4s, v12.4s -sqrdmulh v12.4S, v29.4S, v0.s[2] -mul v29.4S, v29.4S,v14.s[2] -mla v29.4S, v12.4S, v31.s[0] -sub v12.4s, v22.4s, v29.4s -add v22.4s, v22.4s, v29.4s -trn1 v29.4S, v8.4S, v13.4S -trn2 v28.4S, v8.4S, v13.4S -trn1 v27.4S, v30.4S, v11.4S -trn2 v26.4S, v30.4S, v11.4S -trn2 v30.2D, v29.2D, v27.2D -trn2 v11.2D, v28.2D, v26.2D -trn1 v8.2D, v29.2D, v27.2D -trn1 v13.2D, v28.2D, v26.2D -trn1 v26.4S, v18.4S, v21.4S -trn2 v28.4S, v18.4S, v21.4S -trn1 v27.4S, v22.4S, v12.4S -trn2 v29.4S, v22.4S, v12.4S -trn2 v22.2D, v26.2D, v27.2D -trn2 v12.2D, v28.2D, v29.2D -trn1 v18.2D, v26.2D, v27.2D -trn1 v21.2D, v28.2D, v29.2D -sqrdmulh v29.4S, v30.4S, v7.4S -mul v30.4S, v30.4S,v6.4S -mla v30.4S, v29.4S, v31.s[0] -sub v29.4s, v8.4s, v30.4s -add v8.4s, v8.4s, v30.4s -sqrdmulh v30.4S, v22.4S, v17.4S -mul v22.4S, v22.4S,v19.4S -mla v22.4S, v30.4S, v31.s[0] -sub v30.4s, v18.4s, v22.4s -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v11.4S, v7.4S -mul v11.4S, v11.4S,v6.4S -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v13.4s, v11.4s -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v12.4S, v17.4S -mul v12.4S, v12.4S,v19.4S -mla v12.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v13.4S, v10.4S -mul v13.4S, v13.4S,v15.4S -mla v13.4S, v12.4S, v31.s[0] -sub v12.4s, v8.4s, v13.4s -add v8.4s, v8.4s, v13.4s -sqrdmulh v13.4S, v21.4S, v3.4S -mul v21.4S, v21.4S,v20.4S -mla v21.4S, v13.4S, v31.s[0] -sub v13.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v16.4S -mul v22.4S, v22.4S,v2.4S -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v29.4s, v22.4s -add v29.4s, v29.4s, v22.4s -sqrdmulh v22.4S, v11.4S, v9.4S -mul v11.4S, v11.4S,v1.4S -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v30.4s, v11.4s -add v30.4s, v30.4s, v11.4s -str q8, [x0, #384] -str q12, [x0, #400] -str q29, [x0, #416] -str q21, [x0, #432] -str q18, [x0, #896] -str q13, [x0, #912] -str q30, [x0, #928] -str q22, [x0, #944] -ldr q9, [x17, #+1024] -ldr q1, [x17, #+1040] -ldr q3, [x17, #+1056] -ldr q20, [x17, #+1072] -ldr q17, [x17, #+1088] -ldr q19, [x17, #+1104] -ldr q0, [x17, #+1120] -ldr q14, [x17, #+1136] -ldr q22, [x0, #480] -ldr q30, [x0, #496] -ldr q13, [x0, #448] -ldr q18, [x0, #464] -ldr q16, [x17, #+2048] -ldr q2, [x17, #+2064] -ldr q10, [x17, #+2080] -ldr q15, [x17, #+2096] -ldr q7, [x17, #+2112] -ldr q6, [x17, #+2128] -ldr q5, [x17, #+2144] -ldr q4, [x17, #+2160] -ldr q21, [x0, #992] -ldr q29, [x0, #1008] -ldr q12, [x0, #960] -ldr q8, [x0, #976] -sqrdmulh v11.4S, v22.4S, v1.s[0] -mul v22.4S, v22.4S,v9.s[0] -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v13.4s, v22.4s -add v13.4s, v13.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v2.s[0] -mul v21.4S, v21.4S,v16.s[0] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v12.4s, v21.4s -add v12.4s, v12.4s, v21.4s -sqrdmulh v21.4S, v30.4S, v1.s[0] -mul v30.4S, v30.4S,v9.s[0] -mla v30.4S, v21.4S, v31.s[0] -sub v21.4s, v18.4s, v30.4s -add v18.4s, v18.4s, v30.4s -sqrdmulh v30.4S, v29.4S, v2.s[0] -mul v29.4S, v29.4S,v16.s[0] -mla v29.4S, v30.4S, v31.s[0] -sub v30.4s, v8.4s, v29.4s -add v8.4s, v8.4s, v29.4s -sqrdmulh v29.4S, v18.4S, v1.s[1] -mul v18.4S, v18.4S,v9.s[1] -mla v18.4S, v29.4S, v31.s[0] -sub v29.4s, v13.4s, v18.4s -add v13.4s, v13.4s, v18.4s -sqrdmulh v18.4S, v8.4S, v2.s[1] -mul v8.4S, v8.4S,v16.s[1] -mla v8.4S, v18.4S, v31.s[0] -sub v18.4s, v12.4s, v8.4s -add v12.4s, v12.4s, v8.4s -sqrdmulh v8.4S, v21.4S, v1.s[2] -mul v21.4S, v21.4S,v9.s[2] -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v11.4s, v21.4s -add v11.4s, v11.4s, v21.4s -sqrdmulh v21.4S, v30.4S, v2.s[2] -mul v30.4S, v30.4S,v16.s[2] -mla v30.4S, v21.4S, v31.s[0] -sub v21.4s, v22.4s, v30.4s -add v22.4s, v22.4s, v30.4s -trn1 v30.4S, v13.4S, v29.4S -trn2 v28.4S, v13.4S, v29.4S -trn1 v27.4S, v11.4S, v8.4S -trn2 v26.4S, v11.4S, v8.4S -trn2 v11.2D, v30.2D, v27.2D -trn2 v8.2D, v28.2D, v26.2D -trn1 v13.2D, v30.2D, v27.2D -trn1 v29.2D, v28.2D, v26.2D -trn1 v26.4S, v12.4S, v18.4S -trn2 v28.4S, v12.4S, v18.4S -trn1 v27.4S, v22.4S, v21.4S -trn2 v30.4S, v22.4S, v21.4S -trn2 v22.2D, v26.2D, v27.2D -trn2 v21.2D, v28.2D, v30.2D -trn1 v12.2D, v26.2D, v27.2D -trn1 v18.2D, v28.2D, v30.2D -sqrdmulh v30.4S, v11.4S, v20.4S -mul v11.4S, v11.4S,v3.4S -mla v11.4S, v30.4S, v31.s[0] -sub v30.4s, v13.4s, v11.4s -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v15.4S -mul v22.4S, v22.4S,v10.4S -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v12.4s, v22.4s -add v12.4s, v12.4s, v22.4s -sqrdmulh v22.4S, v8.4S, v20.4S -mul v8.4S, v8.4S,v3.4S -mla v8.4S, v22.4S, v31.s[0] -sub v22.4s, v29.4s, v8.4s -add v29.4s, v29.4s, v8.4s -sqrdmulh v8.4S, v21.4S, v15.4S -mul v21.4S, v21.4S,v10.4S -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v29.4S, v19.4S -mul v29.4S, v29.4S,v17.4S -mla v29.4S, v21.4S, v31.s[0] -sub v21.4s, v13.4s, v29.4s -add v13.4s, v13.4s, v29.4s -sqrdmulh v29.4S, v18.4S, v6.4S -mul v18.4S, v18.4S,v7.4S -mla v18.4S, v29.4S, v31.s[0] -sub v29.4s, v12.4s, v18.4s -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v22.4S, v14.4S -mul v22.4S, v22.4S,v0.4S -mla v22.4S, v18.4S, v31.s[0] -sub v18.4s, v30.4s, v22.4s -add v30.4s, v30.4s, v22.4s -sqrdmulh v22.4S, v8.4S, v4.4S -mul v8.4S, v8.4S,v5.4S -mla v8.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v8.4s -add v11.4s, v11.4s, v8.4s -str q13, [x0, #448] -str q21, [x0, #464] -str q30, [x0, #480] -str q18, [x0, #496] -str q12, [x0, #960] -str q29, [x0, #976] -str q11, [x0, #992] -str q22, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2392 -// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_2.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_2.s deleted file mode 100644 index 61444d0..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_2.s +++ /dev/null @@ -1,2422 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_3_z2_2 -.global _ntt_u32_full_neon_asm_var_4_4_3_z2_2 -ntt_u32_full_neon_asm_var_4_4_3_z2_2: -_ntt_u32_full_neon_asm_var_4_4_3_z2_2: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #800] -ldr q21, [x0, #864] -ldr q20, [x0, #928] -ldr q19, [x0, #992] -ldr q18, [x0, #288] -ldr q17, [x0, #352] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -sqrdmulh v2.4S, v22.4S, v29.s[0] -ldr q1, [x0, #544] -mul v22.4S, v22.4S,v30.s[0] -ldr q0, [x0, #608] -sqrdmulh v15.4S, v21.4S, v29.s[0] -ldr q14, [x0, #672] -mul v21.4S, v21.4S,v30.s[0] -ldr q13, [x0, #736] -mla v22.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q12, [x0, #32] -sub v11.4s, v18.4s, v22.4s -mla v21.4S, v15.4S, v31.s[0] -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q15, [x0, #96] -sub v10.4s, v17.4s, v21.4s -mla v20.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v1.4S, v29.s[0] -ldr q2, [x0, #160] -mul v1.4S, v1.4S,v30.s[0] -sub v9.4s, v16.4s, v20.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v0.4S, v29.s[0] -ldr q22, [x0, #224] -mul v0.4S, v0.4S,v30.s[0] -sub v8.4s, v3.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v12.4s, v1.4s -mla v0.4S, v20.4S, v31.s[0] -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -sub v20.4s, v15.4s, v0.4s -mla v14.4S, v19.4S, v31.s[0] -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v19.4s, v2.4s, v14.4s -mla v13.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v1.4s, v22.4s, v13.4s -mla v16.4S, v0.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v0.4s, v2.4s, v16.4s -mla v3.4S, v14.4S, v31.s[0] -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v14.4s, v22.4s, v3.4s -mla v18.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v13.4s, v12.4s, v18.4s -mla v17.4S, v16.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v8.4S, v29.s[2] -mul v8.4S, v8.4S,v30.s[2] -sub v16.4s, v15.4s, v17.4s -mla v9.4S, v3.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v3.4s, v19.4s, v9.4s -mla v8.4S, v18.4S, v31.s[0] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v8.4s -mla v11.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v8.4s -sqrdmulh v8.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v17.4s, v21.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v27.s[0] -mul v22.4S, v22.4S,v28.s[0] -sub v9.4s, v20.4s, v10.4s -mla v2.4S, v8.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v8.4s, v12.4s, v2.4s -mla v22.4S, v11.4S, v31.s[0] -add v12.4s, v12.4s, v2.4s -sqrdmulh v2.4S, v14.4S, v27.s[1] -mul v14.4S, v14.4S,v28.s[1] -sub v11.4s, v15.4s, v22.4s -mla v0.4S, v10.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v27.s[2] -mul v19.4S, v19.4S,v28.s[2] -sub v10.4s, v13.4s, v0.4s -mla v14.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v2.4s, v16.4s, v14.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v27.s[3] -mul v3.4S, v3.4S,v28.s[3] -sub v22.4s, v21.4s, v19.4s -mla v1.4S, v0.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v0.4s, v20.4s, v1.4s -mla v3.4S, v14.4S, v31.s[0] -add v20.4s, v20.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v25.s[0] -mul v15.4S, v15.4S,v26.s[0] -sub v14.4s, v17.4s, v3.4s -mla v18.4S, v19.4S, v31.s[0] -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v11.4S, v25.s[1] -mul v11.4S, v11.4S,v26.s[1] -sub v19.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v1.4s, v12.4s, v15.4s -mla v11.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v25.s[3] -mul v2.4S, v2.4S,v26.s[3] -sub v3.4s, v8.4s, v11.4s -mla v16.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v11.4s -str q12, [x0, #32] -sqrdmulh v12.4S, v20.4S, v23.s[0] -str q1, [x0, #96] -mul v20.4S, v20.4S,v24.s[0] -ldr q1, [x0, #816] -sub v11.4s, v13.4s, v16.4s -ldr q18, [x0, #880] -mla v2.4S, v15.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -str q8, [x0, #160] -sqrdmulh v8.4S, v0.4S, v23.s[1] -str q3, [x0, #224] -mul v0.4S, v0.4S,v24.s[1] -ldr q3, [x0, #944] -sub v16.4s, v10.4s, v2.4s -ldr q15, [x0, #1008] -mla v20.4S, v12.4S, v31.s[0] -add v10.4s, v10.4s, v2.4s -str q13, [x0, #288] -sqrdmulh v13.4S, v9.4S, v23.s[2] -str q11, [x0, #352] -mul v9.4S, v9.4S,v24.s[2] -ldr q11, [x0, #304] -sub v2.4s, v21.4s, v20.4s -ldr q12, [x0, #368] -mla v0.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v20.4s -str q10, [x0, #416] -sqrdmulh v10.4S, v19.4S, v23.s[3] -str q16, [x0, #480] -mul v19.4S, v19.4S,v24.s[3] -ldr q16, [x0, #432] -sub v20.4s, v22.4s, v0.4s -ldr q8, [x0, #496] -mla v9.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -str q21, [x0, #544] -sqrdmulh v21.4S, v1.4S, v29.s[0] -str q2, [x0, #608] -ldr q2, [x0, #560] -mul v1.4S, v1.4S,v30.s[0] -ldr q0, [x0, #624] -sub v13.4s, v17.4s, v9.4s -mla v19.4S, v10.4S, v31.s[0] -add v17.4s, v17.4s, v9.4s -str q22, [x0, #672] -sqrdmulh v22.4S, v18.4S, v29.s[0] -str q20, [x0, #736] -ldr q20, [x0, #688] -mul v18.4S, v18.4S,v30.s[0] -ldr q9, [x0, #752] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -str q17, [x0, #800] -sqrdmulh v17.4S, v3.4S, v29.s[0] -str q13, [x0, #864] -mul v3.4S, v3.4S,v30.s[0] -ldr q13, [x0, #48] -sub v19.4s, v11.4s, v1.4s -mla v18.4S, v22.4S, v31.s[0] -add v11.4s, v11.4s, v1.4s -str q14, [x0, #928] -sqrdmulh v14.4S, v15.4S, v29.s[0] -str q10, [x0, #992] -mul v15.4S, v15.4S,v30.s[0] -ldr q10, [x0, #112] -sub v1.4s, v12.4s, v18.4s -mla v3.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v29.s[0] -ldr q17, [x0, #176] -mul v2.4S, v2.4S,v30.s[0] -sub v22.4s, v16.4s, v3.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v0.4S, v29.s[0] -ldr q14, [x0, #240] -mul v0.4S, v0.4S,v30.s[0] -sub v21.4s, v8.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -sub v18.4s, v13.4s, v2.4s -mla v0.4S, v3.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v9.4S, v29.s[0] -mul v9.4S, v9.4S,v30.s[0] -sub v3.4s, v10.4s, v0.4s -mla v20.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v15.4s, v17.4s, v20.4s -mla v9.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sub v2.4s, v14.4s, v9.4s -mla v16.4S, v0.4S, v31.s[0] -add v14.4s, v14.4s, v9.4s -sqrdmulh v9.4S, v11.4S, v29.s[1] -mul v11.4S, v11.4S,v30.s[1] -sub v0.4s, v17.4s, v16.4s -mla v8.4S, v20.4S, v31.s[0] -add v17.4s, v17.4s, v16.4s -sqrdmulh v16.4S, v12.4S, v29.s[1] -mul v12.4S, v12.4S,v30.s[1] -sub v20.4s, v14.4s, v8.4s -mla v11.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v8.4s -sqrdmulh v8.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v9.4s, v13.4s, v11.4s -mla v12.4S, v16.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v16.4s, v10.4s, v12.4s -mla v22.4S, v8.4S, v31.s[0] -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v8.4s, v15.4s, v22.4s -mla v21.4S, v11.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -sub v11.4s, v2.4s, v21.4s -mla v19.4S, v12.4S, v31.s[0] -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[0] -mul v17.4S, v17.4S,v28.s[0] -sub v12.4s, v18.4s, v19.4s -mla v1.4S, v22.4S, v31.s[0] -add v18.4s, v18.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v22.4s, v3.4s, v1.4s -mla v17.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v21.4s, v13.4s, v17.4s -mla v14.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v20.4S, v27.s[1] -mul v20.4S, v20.4S,v28.s[1] -sub v19.4s, v10.4s, v14.4s -mla v0.4S, v1.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v27.s[2] -mul v15.4S, v15.4S,v28.s[2] -sub v1.4s, v9.4s, v0.4s -mla v20.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v0.4s -sqrdmulh v0.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v17.4s, v16.4s, v20.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v14.4s, v18.4s, v15.4s -mla v2.4S, v0.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v27.s[3] -mul v11.4S, v11.4S,v28.s[3] -sub v0.4s, v3.4s, v2.4s -mla v8.4S, v20.4S, v31.s[0] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v10.4S, v25.s[0] -mul v10.4S, v10.4S,v26.s[0] -sub v20.4s, v12.4s, v8.4s -mla v11.4S, v15.4S, v31.s[0] -add v12.4s, v12.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v15.4s, v22.4s, v11.4s -mla v10.4S, v2.4S, v31.s[0] -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v2.4s, v13.4s, v10.4s -mla v19.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v10.4s -sqrdmulh v10.4S, v17.4S, v25.s[3] -mul v17.4S, v17.4S,v26.s[3] -sub v8.4s, v21.4s, v19.4s -mla v16.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -str q13, [x0, #48] -sqrdmulh v13.4S, v3.4S, v23.s[0] -str q2, [x0, #112] -mul v3.4S, v3.4S,v24.s[0] -ldr q2, [x0, #768] -sub v19.4s, v9.4s, v16.4s -ldr q11, [x0, #832] -mla v17.4S, v10.4S, v31.s[0] -add v9.4s, v9.4s, v16.4s -str q21, [x0, #176] -sqrdmulh v21.4S, v0.4S, v23.s[1] -str q8, [x0, #240] -mul v0.4S, v0.4S,v24.s[1] -ldr q8, [x0, #896] -sub v16.4s, v1.4s, v17.4s -ldr q10, [x0, #960] -mla v3.4S, v13.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -str q9, [x0, #304] -sqrdmulh v9.4S, v22.4S, v23.s[2] -str q19, [x0, #368] -mul v22.4S, v22.4S,v24.s[2] -ldr q19, [x0, #256] -sub v17.4s, v18.4s, v3.4s -ldr q13, [x0, #320] -mla v0.4S, v21.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -str q1, [x0, #432] -sqrdmulh v1.4S, v15.4S, v23.s[3] -str q16, [x0, #496] -mul v15.4S, v15.4S,v24.s[3] -ldr q16, [x0, #384] -sub v3.4s, v14.4s, v0.4s -ldr q21, [x0, #448] -mla v22.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -str q18, [x0, #560] -sqrdmulh v18.4S, v2.4S, v29.s[0] -str q17, [x0, #624] -ldr q17, [x0, #512] -mul v2.4S, v2.4S,v30.s[0] -ldr q0, [x0, #576] -sub v9.4s, v12.4s, v22.4s -mla v15.4S, v1.4S, v31.s[0] -add v12.4s, v12.4s, v22.4s -str q14, [x0, #688] -sqrdmulh v14.4S, v11.4S, v29.s[0] -str q3, [x0, #752] -ldr q3, [x0, #640] -mul v11.4S, v11.4S,v30.s[0] -ldr q22, [x0, #704] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -str q12, [x0, #816] -sqrdmulh v12.4S, v8.4S, v29.s[0] -str q9, [x0, #880] -mul v8.4S, v8.4S,v30.s[0] -ldr q9, [x0, #0] -sub v15.4s, v19.4s, v2.4s -mla v11.4S, v14.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -str q20, [x0, #944] -sqrdmulh v20.4S, v10.4S, v29.s[0] -str q1, [x0, #1008] -mul v10.4S, v10.4S,v30.s[0] -ldr q1, [x0, #64] -sub v2.4s, v13.4s, v11.4s -mla v8.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v29.s[0] -ldr q12, [x0, #128] -mul v17.4S, v17.4S,v30.s[0] -sub v14.4s, v16.4s, v8.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v0.4S, v29.s[0] -ldr q20, [x0, #192] -mul v0.4S, v0.4S,v30.s[0] -sub v18.4s, v21.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -sub v11.4s, v9.4s, v17.4s -mla v0.4S, v8.4S, v31.s[0] -add v9.4s, v9.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v8.4s, v1.4s, v0.4s -mla v3.4S, v10.4S, v31.s[0] -add v1.4s, v1.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v10.4s, v12.4s, v3.4s -mla v22.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v17.4s, v20.4s, v22.4s -mla v16.4S, v0.4S, v31.s[0] -add v20.4s, v20.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -sub v0.4s, v12.4s, v16.4s -mla v21.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v3.4s, v20.4s, v21.4s -mla v19.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v22.4s, v9.4s, v19.4s -mla v13.4S, v16.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v29.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v16.4s, v1.4s, v13.4s -mla v14.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v13.4s -sqrdmulh v13.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v21.4s, v10.4s, v14.4s -mla v18.4S, v19.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v19.4s, v17.4s, v18.4s -mla v15.4S, v13.4S, v31.s[0] -add v17.4s, v17.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v13.4s, v11.4s, v15.4s -mla v2.4S, v14.4S, v31.s[0] -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v27.s[0] -mul v20.4S, v20.4S,v28.s[0] -sub v14.4s, v8.4s, v2.4s -mla v12.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v2.4s -sqrdmulh v2.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v18.4s, v9.4s, v12.4s -mla v20.4S, v15.4S, v31.s[0] -add v9.4s, v9.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -sub v15.4s, v1.4s, v20.4s -mla v0.4S, v2.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -sub v2.4s, v22.4s, v0.4s -mla v3.4S, v12.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v12.4s, v16.4s, v3.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v11.4s, v10.4s -mla v17.4S, v0.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -sqrdmulh v10.4S, v19.4S, v27.s[3] -mul v19.4S, v19.4S,v28.s[3] -sub v0.4s, v8.4s, v17.4s -mla v21.4S, v3.4S, v31.s[0] -add v8.4s, v8.4s, v17.4s -sqrdmulh v17.4S, v1.4S, v25.s[0] -mul v1.4S, v1.4S,v26.s[0] -sub v3.4s, v13.4s, v21.4s -mla v19.4S, v10.4S, v31.s[0] -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v15.4S, v25.s[1] -mul v15.4S, v15.4S,v26.s[1] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v17.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v17.4s, v9.4s, v1.4s -mla v15.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -sub v21.4s, v18.4s, v15.4s -mla v16.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -str q9, [x0, #0] -sqrdmulh v9.4S, v8.4S, v23.s[0] -str q17, [x0, #64] -mul v8.4S, v8.4S,v24.s[0] -ldr q17, [x0, #784] -sub v15.4s, v22.4s, v16.4s -ldr q19, [x0, #848] -mla v12.4S, v1.4S, v31.s[0] -add v22.4s, v22.4s, v16.4s -str q18, [x0, #128] -sqrdmulh v18.4S, v0.4S, v23.s[1] -str q21, [x0, #192] -mul v0.4S, v0.4S,v24.s[1] -ldr q21, [x0, #912] -sub v16.4s, v2.4s, v12.4s -ldr q1, [x0, #976] -mla v8.4S, v9.4S, v31.s[0] -add v2.4s, v2.4s, v12.4s -str q22, [x0, #256] -sqrdmulh v22.4S, v14.4S, v23.s[2] -str q15, [x0, #320] -mul v14.4S, v14.4S,v24.s[2] -ldr q15, [x0, #272] -sub v12.4s, v11.4s, v8.4s -ldr q9, [x0, #336] -mla v0.4S, v18.4S, v31.s[0] -add v11.4s, v11.4s, v8.4s -str q2, [x0, #384] -sqrdmulh v2.4S, v10.4S, v23.s[3] -str q16, [x0, #448] -mul v10.4S, v10.4S,v24.s[3] -ldr q16, [x0, #400] -sub v8.4s, v20.4s, v0.4s -ldr q18, [x0, #464] -mla v14.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v0.4s -str q11, [x0, #512] -sqrdmulh v11.4S, v17.4S, v29.s[0] -str q12, [x0, #576] -ldr q12, [x0, #528] -mul v17.4S, v17.4S,v30.s[0] -ldr q0, [x0, #592] -sub v22.4s, v13.4s, v14.4s -mla v10.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v14.4s -str q20, [x0, #640] -sqrdmulh v20.4S, v19.4S, v29.s[0] -str q8, [x0, #704] -ldr q8, [x0, #656] -mul v19.4S, v19.4S,v30.s[0] -ldr q14, [x0, #720] -sub v2.4s, v3.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v3.4s, v3.4s, v10.4s -str q13, [x0, #768] -sqrdmulh v13.4S, v21.4S, v29.s[0] -str q22, [x0, #832] -mul v21.4S, v21.4S,v30.s[0] -ldr q22, [x0, #16] -sub v10.4s, v15.4s, v17.4s -mla v19.4S, v20.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -str q3, [x0, #896] -sqrdmulh v3.4S, v1.4S, v29.s[0] -str q2, [x0, #960] -mul v1.4S, v1.4S,v30.s[0] -ldr q2, [x0, #80] -sub v17.4s, v9.4s, v19.4s -mla v21.4S, v13.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v12.4S, v29.s[0] -ldr q13, [x0, #144] -mul v12.4S, v12.4S,v30.s[0] -sub v20.4s, v16.4s, v21.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v0.4S, v29.s[0] -ldr q3, [x0, #208] -mul v0.4S, v0.4S,v30.s[0] -sub v11.4s, v18.4s, v1.4s -mla v12.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v1.4s -sqrdmulh v1.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v19.4s, v22.4s, v12.4s -mla v0.4S, v21.4S, v31.s[0] -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v2.4s, v0.4s -mla v8.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v1.4s, v13.4s, v8.4s -mla v14.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v12.4s, v3.4s, v14.4s -mla v16.4S, v0.4S, v31.s[0] -add v3.4s, v3.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -sub v0.4s, v13.4s, v16.4s -mla v18.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -sqrdmulh v16.4S, v9.4S, v29.s[1] -mul v9.4S, v9.4S,v30.s[1] -sub v8.4s, v3.4s, v18.4s -mla v15.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v14.4s, v22.4s, v15.4s -mla v9.4S, v16.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v16.4s, v2.4s, v9.4s -mla v20.4S, v18.4S, v31.s[0] -add v2.4s, v2.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v20.4s -mla v11.4S, v15.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v15.4s, v12.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v12.4s, v12.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v27.s[0] -mul v13.4S, v13.4S,v28.s[0] -sub v9.4s, v19.4s, v10.4s -mla v17.4S, v20.4S, v31.s[0] -add v19.4s, v19.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v27.s[0] -mul v3.4S, v3.4S,v28.s[0] -sub v20.4s, v21.4s, v17.4s -mla v13.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v11.4s, v22.4s, v13.4s -mla v3.4S, v10.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -sub v10.4s, v2.4s, v3.4s -mla v0.4S, v17.4S, v31.s[0] -add v2.4s, v2.4s, v3.4s -sqrdmulh v3.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v17.4s, v14.4s, v0.4s -mla v8.4S, v13.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[2] -mul v12.4S, v12.4S,v28.s[2] -sub v13.4s, v16.4s, v8.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v3.4s, v19.4s, v1.4s -mla v12.4S, v0.4S, v31.s[0] -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v0.4s, v21.4s, v12.4s -mla v18.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v10.4S, v25.s[1] -mul v10.4S, v10.4S,v26.s[1] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v12.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v12.4s, v22.4s, v2.4s -mla v10.4S, v18.4S, v31.s[0] -add v22.4s, v22.4s, v2.4s -sqrdmulh v2.4S, v13.4S, v25.s[3] -mul v13.4S, v13.4S,v26.s[3] -sub v18.4s, v11.4s, v10.4s -mla v16.4S, v15.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -str q22, [x0, #16] -sqrdmulh v22.4S, v21.4S, v23.s[0] -str q12, [x0, #80] -mul v21.4S, v21.4S,v24.s[0] -sub v12.4s, v14.4s, v16.4s -mla v13.4S, v2.4S, v31.s[0] -add v14.4s, v14.4s, v16.4s -str q11, [x0, #144] -sqrdmulh v11.4S, v0.4S, v23.s[1] -str q18, [x0, #208] -mul v0.4S, v0.4S,v24.s[1] -sub v18.4s, v17.4s, v13.4s -mla v21.4S, v22.4S, v31.s[0] -add v17.4s, v17.4s, v13.4s -str q14, [x0, #272] -sqrdmulh v14.4S, v20.4S, v23.s[2] -str q12, [x0, #336] -mul v20.4S, v20.4S,v24.s[2] -sub v12.4s, v19.4s, v21.4s -mla v0.4S, v11.4S, v31.s[0] -add v19.4s, v19.4s, v21.4s -str q17, [x0, #400] -sqrdmulh v17.4S, v1.4S, v23.s[3] -str q18, [x0, #464] -mul v1.4S, v1.4S,v24.s[3] -sub v18.4s, v3.4s, v0.4s -mla v20.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v0.4s -str q19, [x0, #528] -str q12, [x0, #592] -sub v12.4s, v9.4s, v20.4s -mla v1.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v20.4s -str q3, [x0, #656] -str q18, [x0, #720] -sub v18.4s, v8.4s, v1.4s -add v8.4s, v8.4s, v1.4s -str q9, [x0, #784] -str q12, [x0, #848] -str q8, [x0, #912] -str q18, [x0, #976] -ldr q4, [x17, #+128] -ldr q5, [x17, #+144] -ldr q6, [x17, #+160] -ldr q7, [x17, #+176] -ldr q15, [x17, #+192] -ldr q10, [x17, #+208] -ldr q2, [x17, #+224] -ldr q16, [x17, #+240] -ldr q22, [x0, #32] -ldr q13, [x0, #48] -ldr q11, [x0, #0] -ldr q21, [x0, #16] -ldr q14, [x17, #+1152] -ldr q0, [x17, #+1168] -ldr q19, [x17, #+1184] -ldr q17, [x17, #+1200] -ldr q20, [x17, #+1216] -ldr q3, [x17, #+1232] -ldr q1, [x17, #+1248] -ldr q9, [x17, #+1264] -ldr q12, [x0, #544] -ldr q8, [x0, #560] -ldr q18, [x0, #512] -ldr q30, [x0, #528] -sqrdmulh v29.4S, v22.4S, v5.s[0] -mul v22.4S, v22.4S,v4.s[0] -mla v22.4S, v29.4S, v31.s[0] -sub v29.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v13.4S, v5.s[0] -mul v13.4S, v13.4S,v4.s[0] -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v12.4S, v0.s[0] -mul v12.4S, v12.4S,v14.s[0] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v18.4s, v12.4s -add v18.4s, v18.4s, v12.4s -sqrdmulh v12.4S, v8.4S, v0.s[0] -mul v8.4S, v8.4S,v14.s[0] -mla v8.4S, v12.4S, v31.s[0] -sub v12.4s, v30.4s, v8.4s -add v30.4s, v30.4s, v8.4s -sqrdmulh v8.4S, v21.4S, v5.s[1] -mul v21.4S, v21.4S,v4.s[1] -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v11.4s, v21.4s -add v11.4s, v11.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v5.s[2] -mul v22.4S, v22.4S,v4.s[2] -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v29.4s, v22.4s -add v29.4s, v29.4s, v22.4s -sqrdmulh v22.4S, v30.4S, v0.s[1] -mul v30.4S, v30.4S,v14.s[1] -mla v30.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v30.4s -add v18.4s, v18.4s, v30.4s -sqrdmulh v30.4S, v12.4S, v0.s[2] -mul v12.4S, v12.4S,v14.s[2] -mla v12.4S, v30.4S, v31.s[0] -sub v30.4s, v13.4s, v12.4s -add v13.4s, v13.4s, v12.4s -trn1 v12.4S, v11.4S, v8.4S -trn2 v28.4S, v11.4S, v8.4S -trn1 v27.4S, v29.4S, v21.4S -trn2 v26.4S, v29.4S, v21.4S -trn2 v29.2D, v12.2D, v27.2D -trn2 v21.2D, v28.2D, v26.2D -trn1 v11.2D, v12.2D, v27.2D -trn1 v8.2D, v28.2D, v26.2D -trn1 v26.4S, v18.4S, v22.4S -trn2 v28.4S, v18.4S, v22.4S -trn1 v27.4S, v13.4S, v30.4S -trn2 v12.4S, v13.4S, v30.4S -trn2 v13.2D, v26.2D, v27.2D -trn2 v30.2D, v28.2D, v12.2D -trn1 v18.2D, v26.2D, v27.2D -trn1 v22.2D, v28.2D, v12.2D -sqrdmulh v12.4S, v29.4S, v7.4S -mul v29.4S, v29.4S,v6.4S -mla v29.4S, v12.4S, v31.s[0] -sub v12.4s, v11.4s, v29.4s -add v11.4s, v11.4s, v29.4s -sqrdmulh v29.4S, v21.4S, v7.4S -mul v21.4S, v21.4S,v6.4S -mla v21.4S, v29.4S, v31.s[0] -sub v29.4s, v8.4s, v21.4s -add v8.4s, v8.4s, v21.4s -sqrdmulh v21.4S, v13.4S, v17.4S -mul v13.4S, v13.4S,v19.4S -mla v13.4S, v21.4S, v31.s[0] -sub v21.4s, v18.4s, v13.4s -add v18.4s, v18.4s, v13.4s -sqrdmulh v13.4S, v30.4S, v17.4S -mul v30.4S, v30.4S,v19.4S -mla v30.4S, v13.4S, v31.s[0] -sub v13.4s, v22.4s, v30.4s -add v22.4s, v22.4s, v30.4s -sqrdmulh v30.4S, v8.4S, v10.4S -mul v8.4S, v8.4S,v15.4S -mla v8.4S, v30.4S, v31.s[0] -sub v30.4s, v11.4s, v8.4s -add v11.4s, v11.4s, v8.4s -sqrdmulh v8.4S, v29.4S, v16.4S -mul v29.4S, v29.4S,v2.4S -mla v29.4S, v8.4S, v31.s[0] -sub v8.4s, v12.4s, v29.4s -add v12.4s, v12.4s, v29.4s -sqrdmulh v29.4S, v22.4S, v3.4S -mul v22.4S, v22.4S,v20.4S -mla v22.4S, v29.4S, v31.s[0] -sub v29.4s, v18.4s, v22.4s -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v13.4S, v9.4S -mul v13.4S, v13.4S,v1.4S -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -str q11, [x0, #0] -str q30, [x0, #16] -str q12, [x0, #32] -str q8, [x0, #48] -str q18, [x0, #512] -str q29, [x0, #528] -str q21, [x0, #544] -str q22, [x0, #560] -ldr q9, [x17, #+256] -ldr q1, [x17, #+272] -ldr q3, [x17, #+288] -ldr q20, [x17, #+304] -ldr q17, [x17, #+320] -ldr q19, [x17, #+336] -ldr q0, [x17, #+352] -ldr q14, [x17, #+368] -ldr q22, [x0, #96] -ldr q21, [x0, #112] -ldr q29, [x0, #64] -ldr q18, [x0, #80] -ldr q16, [x17, #+1280] -ldr q2, [x17, #+1296] -ldr q10, [x17, #+1312] -ldr q15, [x17, #+1328] -ldr q7, [x17, #+1344] -ldr q6, [x17, #+1360] -ldr q5, [x17, #+1376] -ldr q4, [x17, #+1392] -ldr q8, [x0, #608] -ldr q12, [x0, #624] -ldr q30, [x0, #576] -ldr q11, [x0, #592] -sqrdmulh v13.4S, v22.4S, v1.s[0] -mul v22.4S, v22.4S,v9.s[0] -mla v22.4S, v13.4S, v31.s[0] -sub v13.4s, v29.4s, v22.4s -add v29.4s, v29.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v1.s[0] -mul v21.4S, v21.4S,v9.s[0] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v8.4S, v2.s[0] -mul v8.4S, v8.4S,v16.s[0] -mla v8.4S, v21.4S, v31.s[0] -sub v21.4s, v30.4s, v8.4s -add v30.4s, v30.4s, v8.4s -sqrdmulh v8.4S, v12.4S, v2.s[0] -mul v12.4S, v12.4S,v16.s[0] -mla v12.4S, v8.4S, v31.s[0] -sub v8.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -sqrdmulh v12.4S, v18.4S, v1.s[1] -mul v18.4S, v18.4S,v9.s[1] -mla v18.4S, v12.4S, v31.s[0] -sub v12.4s, v29.4s, v18.4s -add v29.4s, v29.4s, v18.4s -sqrdmulh v18.4S, v22.4S, v1.s[2] -mul v22.4S, v22.4S,v9.s[2] -mla v22.4S, v18.4S, v31.s[0] -sub v18.4s, v13.4s, v22.4s -add v13.4s, v13.4s, v22.4s -sqrdmulh v22.4S, v11.4S, v2.s[1] -mul v11.4S, v11.4S,v16.s[1] -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v30.4s, v11.4s -add v30.4s, v30.4s, v11.4s -sqrdmulh v11.4S, v8.4S, v2.s[2] -mul v8.4S, v8.4S,v16.s[2] -mla v8.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v8.4s -add v21.4s, v21.4s, v8.4s -trn1 v8.4S, v29.4S, v12.4S -trn2 v28.4S, v29.4S, v12.4S -trn1 v27.4S, v13.4S, v18.4S -trn2 v26.4S, v13.4S, v18.4S -trn2 v13.2D, v8.2D, v27.2D -trn2 v18.2D, v28.2D, v26.2D -trn1 v29.2D, v8.2D, v27.2D -trn1 v12.2D, v28.2D, v26.2D -trn1 v26.4S, v30.4S, v22.4S -trn2 v28.4S, v30.4S, v22.4S -trn1 v27.4S, v21.4S, v11.4S -trn2 v8.4S, v21.4S, v11.4S -trn2 v21.2D, v26.2D, v27.2D -trn2 v11.2D, v28.2D, v8.2D -trn1 v30.2D, v26.2D, v27.2D -trn1 v22.2D, v28.2D, v8.2D -sqrdmulh v8.4S, v13.4S, v20.4S -mul v13.4S, v13.4S,v3.4S -mla v13.4S, v8.4S, v31.s[0] -sub v8.4s, v29.4s, v13.4s -add v29.4s, v29.4s, v13.4s -sqrdmulh v13.4S, v18.4S, v20.4S -mul v18.4S, v18.4S,v3.4S -mla v18.4S, v13.4S, v31.s[0] -sub v13.4s, v12.4s, v18.4s -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v21.4S, v15.4S -mul v21.4S, v21.4S,v10.4S -mla v21.4S, v18.4S, v31.s[0] -sub v18.4s, v30.4s, v21.4s -add v30.4s, v30.4s, v21.4s -sqrdmulh v21.4S, v11.4S, v15.4S -mul v11.4S, v11.4S,v10.4S -mla v11.4S, v21.4S, v31.s[0] -sub v21.4s, v22.4s, v11.4s -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v12.4S, v19.4S -mul v12.4S, v12.4S,v17.4S -mla v12.4S, v11.4S, v31.s[0] -sub v11.4s, v29.4s, v12.4s -add v29.4s, v29.4s, v12.4s -sqrdmulh v12.4S, v13.4S, v14.4S -mul v13.4S, v13.4S,v0.4S -mla v13.4S, v12.4S, v31.s[0] -sub v12.4s, v8.4s, v13.4s -add v8.4s, v8.4s, v13.4s -sqrdmulh v13.4S, v22.4S, v6.4S -mul v22.4S, v22.4S,v7.4S -mla v22.4S, v13.4S, v31.s[0] -sub v13.4s, v30.4s, v22.4s -add v30.4s, v30.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v4.4S -mul v21.4S, v21.4S,v5.4S -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -str q29, [x0, #64] -str q11, [x0, #80] -str q8, [x0, #96] -str q12, [x0, #112] -str q30, [x0, #576] -str q13, [x0, #592] -str q18, [x0, #608] -str q22, [x0, #624] -ldr q4, [x17, #+384] -ldr q5, [x17, #+400] -ldr q6, [x17, #+416] -ldr q7, [x17, #+432] -ldr q15, [x17, #+448] -ldr q10, [x17, #+464] -ldr q2, [x17, #+480] -ldr q16, [x17, #+496] -ldr q22, [x0, #160] -ldr q18, [x0, #176] -ldr q13, [x0, #128] -ldr q30, [x0, #144] -ldr q14, [x17, #+1408] -ldr q0, [x17, #+1424] -ldr q19, [x17, #+1440] -ldr q17, [x17, #+1456] -ldr q20, [x17, #+1472] -ldr q3, [x17, #+1488] -ldr q1, [x17, #+1504] -ldr q9, [x17, #+1520] -ldr q12, [x0, #672] -ldr q8, [x0, #688] -ldr q11, [x0, #640] -ldr q29, [x0, #656] -sqrdmulh v21.4S, v22.4S, v5.s[0] -mul v22.4S, v22.4S,v4.s[0] -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v13.4s, v22.4s -add v13.4s, v13.4s, v22.4s -sqrdmulh v22.4S, v18.4S, v5.s[0] -mul v18.4S, v18.4S,v4.s[0] -mla v18.4S, v22.4S, v31.s[0] -sub v22.4s, v30.4s, v18.4s -add v30.4s, v30.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v0.s[0] -mul v12.4S, v12.4S,v14.s[0] -mla v12.4S, v18.4S, v31.s[0] -sub v18.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -sqrdmulh v12.4S, v8.4S, v0.s[0] -mul v8.4S, v8.4S,v14.s[0] -mla v8.4S, v12.4S, v31.s[0] -sub v12.4s, v29.4s, v8.4s -add v29.4s, v29.4s, v8.4s -sqrdmulh v8.4S, v30.4S, v5.s[1] -mul v30.4S, v30.4S,v4.s[1] -mla v30.4S, v8.4S, v31.s[0] -sub v8.4s, v13.4s, v30.4s -add v13.4s, v13.4s, v30.4s -sqrdmulh v30.4S, v22.4S, v5.s[2] -mul v22.4S, v22.4S,v4.s[2] -mla v22.4S, v30.4S, v31.s[0] -sub v30.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v29.4S, v0.s[1] -mul v29.4S, v29.4S,v14.s[1] -mla v29.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v29.4s -add v11.4s, v11.4s, v29.4s -sqrdmulh v29.4S, v12.4S, v0.s[2] -mul v12.4S, v12.4S,v14.s[2] -mla v12.4S, v29.4S, v31.s[0] -sub v29.4s, v18.4s, v12.4s -add v18.4s, v18.4s, v12.4s -trn1 v12.4S, v13.4S, v8.4S -trn2 v28.4S, v13.4S, v8.4S -trn1 v27.4S, v21.4S, v30.4S -trn2 v26.4S, v21.4S, v30.4S -trn2 v21.2D, v12.2D, v27.2D -trn2 v30.2D, v28.2D, v26.2D -trn1 v13.2D, v12.2D, v27.2D -trn1 v8.2D, v28.2D, v26.2D -trn1 v26.4S, v11.4S, v22.4S -trn2 v28.4S, v11.4S, v22.4S -trn1 v27.4S, v18.4S, v29.4S -trn2 v12.4S, v18.4S, v29.4S -trn2 v18.2D, v26.2D, v27.2D -trn2 v29.2D, v28.2D, v12.2D -trn1 v11.2D, v26.2D, v27.2D -trn1 v22.2D, v28.2D, v12.2D -sqrdmulh v12.4S, v21.4S, v7.4S -mul v21.4S, v21.4S,v6.4S -mla v21.4S, v12.4S, v31.s[0] -sub v12.4s, v13.4s, v21.4s -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v30.4S, v7.4S -mul v30.4S, v30.4S,v6.4S -mla v30.4S, v21.4S, v31.s[0] -sub v21.4s, v8.4s, v30.4s -add v8.4s, v8.4s, v30.4s -sqrdmulh v30.4S, v18.4S, v17.4S -mul v18.4S, v18.4S,v19.4S -mla v18.4S, v30.4S, v31.s[0] -sub v30.4s, v11.4s, v18.4s -add v11.4s, v11.4s, v18.4s -sqrdmulh v18.4S, v29.4S, v17.4S -mul v29.4S, v29.4S,v19.4S -mla v29.4S, v18.4S, v31.s[0] -sub v18.4s, v22.4s, v29.4s -add v22.4s, v22.4s, v29.4s -sqrdmulh v29.4S, v8.4S, v10.4S -mul v8.4S, v8.4S,v15.4S -mla v8.4S, v29.4S, v31.s[0] -sub v29.4s, v13.4s, v8.4s -add v13.4s, v13.4s, v8.4s -sqrdmulh v8.4S, v21.4S, v16.4S -mul v21.4S, v21.4S,v2.4S -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v12.4s, v21.4s -add v12.4s, v12.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v3.4S -mul v22.4S, v22.4S,v20.4S -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v18.4S, v9.4S -mul v18.4S, v18.4S,v1.4S -mla v18.4S, v22.4S, v31.s[0] -sub v22.4s, v30.4s, v18.4s -add v30.4s, v30.4s, v18.4s -str q13, [x0, #128] -str q29, [x0, #144] -str q12, [x0, #160] -str q8, [x0, #176] -str q11, [x0, #640] -str q21, [x0, #656] -str q30, [x0, #672] -str q22, [x0, #688] -ldr q9, [x17, #+512] -ldr q1, [x17, #+528] -ldr q3, [x17, #+544] -ldr q20, [x17, #+560] -ldr q17, [x17, #+576] -ldr q19, [x17, #+592] -ldr q0, [x17, #+608] -ldr q14, [x17, #+624] -ldr q22, [x0, #224] -ldr q30, [x0, #240] -ldr q21, [x0, #192] -ldr q11, [x0, #208] -ldr q16, [x17, #+1536] -ldr q2, [x17, #+1552] -ldr q10, [x17, #+1568] -ldr q15, [x17, #+1584] -ldr q7, [x17, #+1600] -ldr q6, [x17, #+1616] -ldr q5, [x17, #+1632] -ldr q4, [x17, #+1648] -ldr q8, [x0, #736] -ldr q12, [x0, #752] -ldr q29, [x0, #704] -ldr q13, [x0, #720] -sqrdmulh v18.4S, v22.4S, v1.s[0] -mul v22.4S, v22.4S,v9.s[0] -mla v22.4S, v18.4S, v31.s[0] -sub v18.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v30.4S, v1.s[0] -mul v30.4S, v30.4S,v9.s[0] -mla v30.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v30.4s -add v11.4s, v11.4s, v30.4s -sqrdmulh v30.4S, v8.4S, v2.s[0] -mul v8.4S, v8.4S,v16.s[0] -mla v8.4S, v30.4S, v31.s[0] -sub v30.4s, v29.4s, v8.4s -add v29.4s, v29.4s, v8.4s -sqrdmulh v8.4S, v12.4S, v2.s[0] -mul v12.4S, v12.4S,v16.s[0] -mla v12.4S, v8.4S, v31.s[0] -sub v8.4s, v13.4s, v12.4s -add v13.4s, v13.4s, v12.4s -sqrdmulh v12.4S, v11.4S, v1.s[1] -mul v11.4S, v11.4S,v9.s[1] -mla v11.4S, v12.4S, v31.s[0] -sub v12.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v1.s[2] -mul v22.4S, v22.4S,v9.s[2] -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v18.4s, v22.4s -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v13.4S, v2.s[1] -mul v13.4S, v13.4S,v16.s[1] -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v29.4s, v13.4s -add v29.4s, v29.4s, v13.4s -sqrdmulh v13.4S, v8.4S, v2.s[2] -mul v8.4S, v8.4S,v16.s[2] -mla v8.4S, v13.4S, v31.s[0] -sub v13.4s, v30.4s, v8.4s -add v30.4s, v30.4s, v8.4s -trn1 v8.4S, v21.4S, v12.4S -trn2 v28.4S, v21.4S, v12.4S -trn1 v27.4S, v18.4S, v11.4S -trn2 v26.4S, v18.4S, v11.4S -trn2 v18.2D, v8.2D, v27.2D -trn2 v11.2D, v28.2D, v26.2D -trn1 v21.2D, v8.2D, v27.2D -trn1 v12.2D, v28.2D, v26.2D -trn1 v26.4S, v29.4S, v22.4S -trn2 v28.4S, v29.4S, v22.4S -trn1 v27.4S, v30.4S, v13.4S -trn2 v8.4S, v30.4S, v13.4S -trn2 v30.2D, v26.2D, v27.2D -trn2 v13.2D, v28.2D, v8.2D -trn1 v29.2D, v26.2D, v27.2D -trn1 v22.2D, v28.2D, v8.2D -sqrdmulh v8.4S, v18.4S, v20.4S -mul v18.4S, v18.4S,v3.4S -mla v18.4S, v8.4S, v31.s[0] -sub v8.4s, v21.4s, v18.4s -add v21.4s, v21.4s, v18.4s -sqrdmulh v18.4S, v11.4S, v20.4S -mul v11.4S, v11.4S,v3.4S -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v12.4s, v11.4s -add v12.4s, v12.4s, v11.4s -sqrdmulh v11.4S, v30.4S, v15.4S -mul v30.4S, v30.4S,v10.4S -mla v30.4S, v11.4S, v31.s[0] -sub v11.4s, v29.4s, v30.4s -add v29.4s, v29.4s, v30.4s -sqrdmulh v30.4S, v13.4S, v15.4S -mul v13.4S, v13.4S,v10.4S -mla v13.4S, v30.4S, v31.s[0] -sub v30.4s, v22.4s, v13.4s -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v12.4S, v19.4S -mul v12.4S, v12.4S,v17.4S -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v18.4S, v14.4S -mul v18.4S, v18.4S,v0.4S -mla v18.4S, v12.4S, v31.s[0] -sub v12.4s, v8.4s, v18.4s -add v8.4s, v8.4s, v18.4s -sqrdmulh v18.4S, v22.4S, v6.4S -mul v22.4S, v22.4S,v7.4S -mla v22.4S, v18.4S, v31.s[0] -sub v18.4s, v29.4s, v22.4s -add v29.4s, v29.4s, v22.4s -sqrdmulh v22.4S, v30.4S, v4.4S -mul v30.4S, v30.4S,v5.4S -mla v30.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v30.4s -add v11.4s, v11.4s, v30.4s -str q21, [x0, #192] -str q13, [x0, #208] -str q8, [x0, #224] -str q12, [x0, #240] -str q29, [x0, #704] -str q18, [x0, #720] -str q11, [x0, #736] -str q22, [x0, #752] -ldr q4, [x17, #+640] -ldr q5, [x17, #+656] -ldr q6, [x17, #+672] -ldr q7, [x17, #+688] -ldr q15, [x17, #+704] -ldr q10, [x17, #+720] -ldr q2, [x17, #+736] -ldr q16, [x17, #+752] -ldr q22, [x0, #288] -ldr q11, [x0, #304] -ldr q18, [x0, #256] -ldr q29, [x0, #272] -ldr q14, [x17, #+1664] -ldr q0, [x17, #+1680] -ldr q19, [x17, #+1696] -ldr q17, [x17, #+1712] -ldr q20, [x17, #+1728] -ldr q3, [x17, #+1744] -ldr q1, [x17, #+1760] -ldr q9, [x17, #+1776] -ldr q12, [x0, #800] -ldr q8, [x0, #816] -ldr q13, [x0, #768] -ldr q21, [x0, #784] -sqrdmulh v30.4S, v22.4S, v5.s[0] -mul v22.4S, v22.4S,v4.s[0] -mla v22.4S, v30.4S, v31.s[0] -sub v30.4s, v18.4s, v22.4s -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v11.4S, v5.s[0] -mul v11.4S, v11.4S,v4.s[0] -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v29.4s, v11.4s -add v29.4s, v29.4s, v11.4s -sqrdmulh v11.4S, v12.4S, v0.s[0] -mul v12.4S, v12.4S,v14.s[0] -mla v12.4S, v11.4S, v31.s[0] -sub v11.4s, v13.4s, v12.4s -add v13.4s, v13.4s, v12.4s -sqrdmulh v12.4S, v8.4S, v0.s[0] -mul v8.4S, v8.4S,v14.s[0] -mla v8.4S, v12.4S, v31.s[0] -sub v12.4s, v21.4s, v8.4s -add v21.4s, v21.4s, v8.4s -sqrdmulh v8.4S, v29.4S, v5.s[1] -mul v29.4S, v29.4S,v4.s[1] -mla v29.4S, v8.4S, v31.s[0] -sub v8.4s, v18.4s, v29.4s -add v18.4s, v18.4s, v29.4s -sqrdmulh v29.4S, v22.4S, v5.s[2] -mul v22.4S, v22.4S,v4.s[2] -mla v22.4S, v29.4S, v31.s[0] -sub v29.4s, v30.4s, v22.4s -add v30.4s, v30.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v0.s[1] -mul v21.4S, v21.4S,v14.s[1] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v13.4s, v21.4s -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v12.4S, v0.s[2] -mul v12.4S, v12.4S,v14.s[2] -mla v12.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -trn1 v12.4S, v18.4S, v8.4S -trn2 v28.4S, v18.4S, v8.4S -trn1 v27.4S, v30.4S, v29.4S -trn2 v26.4S, v30.4S, v29.4S -trn2 v30.2D, v12.2D, v27.2D -trn2 v29.2D, v28.2D, v26.2D -trn1 v18.2D, v12.2D, v27.2D -trn1 v8.2D, v28.2D, v26.2D -trn1 v26.4S, v13.4S, v22.4S -trn2 v28.4S, v13.4S, v22.4S -trn1 v27.4S, v11.4S, v21.4S -trn2 v12.4S, v11.4S, v21.4S -trn2 v11.2D, v26.2D, v27.2D -trn2 v21.2D, v28.2D, v12.2D -trn1 v13.2D, v26.2D, v27.2D -trn1 v22.2D, v28.2D, v12.2D -sqrdmulh v12.4S, v30.4S, v7.4S -mul v30.4S, v30.4S,v6.4S -mla v30.4S, v12.4S, v31.s[0] -sub v12.4s, v18.4s, v30.4s -add v18.4s, v18.4s, v30.4s -sqrdmulh v30.4S, v29.4S, v7.4S -mul v29.4S, v29.4S,v6.4S -mla v29.4S, v30.4S, v31.s[0] -sub v30.4s, v8.4s, v29.4s -add v8.4s, v8.4s, v29.4s -sqrdmulh v29.4S, v11.4S, v17.4S -mul v11.4S, v11.4S,v19.4S -mla v11.4S, v29.4S, v31.s[0] -sub v29.4s, v13.4s, v11.4s -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v21.4S, v17.4S -mul v21.4S, v21.4S,v19.4S -mla v21.4S, v11.4S, v31.s[0] -sub v11.4s, v22.4s, v21.4s -add v22.4s, v22.4s, v21.4s -sqrdmulh v21.4S, v8.4S, v10.4S -mul v8.4S, v8.4S,v15.4S -mla v8.4S, v21.4S, v31.s[0] -sub v21.4s, v18.4s, v8.4s -add v18.4s, v18.4s, v8.4s -sqrdmulh v8.4S, v30.4S, v16.4S -mul v30.4S, v30.4S,v2.4S -mla v30.4S, v8.4S, v31.s[0] -sub v8.4s, v12.4s, v30.4s -add v12.4s, v12.4s, v30.4s -sqrdmulh v30.4S, v22.4S, v3.4S -mul v22.4S, v22.4S,v20.4S -mla v22.4S, v30.4S, v31.s[0] -sub v30.4s, v13.4s, v22.4s -add v13.4s, v13.4s, v22.4s -sqrdmulh v22.4S, v11.4S, v9.4S -mul v11.4S, v11.4S,v1.4S -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v29.4s, v11.4s -add v29.4s, v29.4s, v11.4s -str q18, [x0, #256] -str q21, [x0, #272] -str q12, [x0, #288] -str q8, [x0, #304] -str q13, [x0, #768] -str q30, [x0, #784] -str q29, [x0, #800] -str q22, [x0, #816] -ldr q9, [x17, #+768] -ldr q1, [x17, #+784] -ldr q3, [x17, #+800] -ldr q20, [x17, #+816] -ldr q17, [x17, #+832] -ldr q19, [x17, #+848] -ldr q0, [x17, #+864] -ldr q14, [x17, #+880] -ldr q22, [x0, #352] -ldr q29, [x0, #368] -ldr q30, [x0, #320] -ldr q13, [x0, #336] -ldr q16, [x17, #+1792] -ldr q2, [x17, #+1808] -ldr q10, [x17, #+1824] -ldr q15, [x17, #+1840] -ldr q7, [x17, #+1856] -ldr q6, [x17, #+1872] -ldr q5, [x17, #+1888] -ldr q4, [x17, #+1904] -ldr q8, [x0, #864] -ldr q12, [x0, #880] -ldr q21, [x0, #832] -ldr q18, [x0, #848] -sqrdmulh v11.4S, v22.4S, v1.s[0] -mul v22.4S, v22.4S,v9.s[0] -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v30.4s, v22.4s -add v30.4s, v30.4s, v22.4s -sqrdmulh v22.4S, v29.4S, v1.s[0] -mul v29.4S, v29.4S,v9.s[0] -mla v29.4S, v22.4S, v31.s[0] -sub v22.4s, v13.4s, v29.4s -add v13.4s, v13.4s, v29.4s -sqrdmulh v29.4S, v8.4S, v2.s[0] -mul v8.4S, v8.4S,v16.s[0] -mla v8.4S, v29.4S, v31.s[0] -sub v29.4s, v21.4s, v8.4s -add v21.4s, v21.4s, v8.4s -sqrdmulh v8.4S, v12.4S, v2.s[0] -mul v12.4S, v12.4S,v16.s[0] -mla v12.4S, v8.4S, v31.s[0] -sub v8.4s, v18.4s, v12.4s -add v18.4s, v18.4s, v12.4s -sqrdmulh v12.4S, v13.4S, v1.s[1] -mul v13.4S, v13.4S,v9.s[1] -mla v13.4S, v12.4S, v31.s[0] -sub v12.4s, v30.4s, v13.4s -add v30.4s, v30.4s, v13.4s -sqrdmulh v13.4S, v22.4S, v1.s[2] -mul v22.4S, v22.4S,v9.s[2] -mla v22.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v18.4S, v2.s[1] -mul v18.4S, v18.4S,v16.s[1] -mla v18.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v18.4s -add v21.4s, v21.4s, v18.4s -sqrdmulh v18.4S, v8.4S, v2.s[2] -mul v8.4S, v8.4S,v16.s[2] -mla v8.4S, v18.4S, v31.s[0] -sub v18.4s, v29.4s, v8.4s -add v29.4s, v29.4s, v8.4s -trn1 v8.4S, v30.4S, v12.4S -trn2 v28.4S, v30.4S, v12.4S -trn1 v27.4S, v11.4S, v13.4S -trn2 v26.4S, v11.4S, v13.4S -trn2 v11.2D, v8.2D, v27.2D -trn2 v13.2D, v28.2D, v26.2D -trn1 v30.2D, v8.2D, v27.2D -trn1 v12.2D, v28.2D, v26.2D -trn1 v26.4S, v21.4S, v22.4S -trn2 v28.4S, v21.4S, v22.4S -trn1 v27.4S, v29.4S, v18.4S -trn2 v8.4S, v29.4S, v18.4S -trn2 v29.2D, v26.2D, v27.2D -trn2 v18.2D, v28.2D, v8.2D -trn1 v21.2D, v26.2D, v27.2D -trn1 v22.2D, v28.2D, v8.2D -sqrdmulh v8.4S, v11.4S, v20.4S -mul v11.4S, v11.4S,v3.4S -mla v11.4S, v8.4S, v31.s[0] -sub v8.4s, v30.4s, v11.4s -add v30.4s, v30.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v20.4S -mul v13.4S, v13.4S,v3.4S -mla v13.4S, v11.4S, v31.s[0] -sub v11.4s, v12.4s, v13.4s -add v12.4s, v12.4s, v13.4s -sqrdmulh v13.4S, v29.4S, v15.4S -mul v29.4S, v29.4S,v10.4S -mla v29.4S, v13.4S, v31.s[0] -sub v13.4s, v21.4s, v29.4s -add v21.4s, v21.4s, v29.4s -sqrdmulh v29.4S, v18.4S, v15.4S -mul v18.4S, v18.4S,v10.4S -mla v18.4S, v29.4S, v31.s[0] -sub v29.4s, v22.4s, v18.4s -add v22.4s, v22.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v19.4S -mul v12.4S, v12.4S,v17.4S -mla v12.4S, v18.4S, v31.s[0] -sub v18.4s, v30.4s, v12.4s -add v30.4s, v30.4s, v12.4s -sqrdmulh v12.4S, v11.4S, v14.4S -mul v11.4S, v11.4S,v0.4S -mla v11.4S, v12.4S, v31.s[0] -sub v12.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v6.4S -mul v22.4S, v22.4S,v7.4S -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v29.4S, v4.4S -mul v29.4S, v29.4S,v5.4S -mla v29.4S, v22.4S, v31.s[0] -sub v22.4s, v13.4s, v29.4s -add v13.4s, v13.4s, v29.4s -str q30, [x0, #320] -str q18, [x0, #336] -str q8, [x0, #352] -str q12, [x0, #368] -str q21, [x0, #832] -str q11, [x0, #848] -str q13, [x0, #864] -str q22, [x0, #880] -ldr q4, [x17, #+896] -ldr q5, [x17, #+912] -ldr q6, [x17, #+928] -ldr q7, [x17, #+944] -ldr q15, [x17, #+960] -ldr q10, [x17, #+976] -ldr q2, [x17, #+992] -ldr q16, [x17, #+1008] -ldr q22, [x0, #416] -ldr q13, [x0, #432] -ldr q11, [x0, #384] -ldr q21, [x0, #400] -ldr q14, [x17, #+1920] -ldr q0, [x17, #+1936] -ldr q19, [x17, #+1952] -ldr q17, [x17, #+1968] -ldr q20, [x17, #+1984] -ldr q3, [x17, #+2000] -ldr q1, [x17, #+2016] -ldr q9, [x17, #+2032] -ldr q12, [x0, #928] -ldr q8, [x0, #944] -ldr q18, [x0, #896] -ldr q30, [x0, #912] -sqrdmulh v29.4S, v22.4S, v5.s[0] -mul v22.4S, v22.4S,v4.s[0] -mla v22.4S, v29.4S, v31.s[0] -sub v29.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v13.4S, v5.s[0] -mul v13.4S, v13.4S,v4.s[0] -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v12.4S, v0.s[0] -mul v12.4S, v12.4S,v14.s[0] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v18.4s, v12.4s -add v18.4s, v18.4s, v12.4s -sqrdmulh v12.4S, v8.4S, v0.s[0] -mul v8.4S, v8.4S,v14.s[0] -mla v8.4S, v12.4S, v31.s[0] -sub v12.4s, v30.4s, v8.4s -add v30.4s, v30.4s, v8.4s -sqrdmulh v8.4S, v21.4S, v5.s[1] -mul v21.4S, v21.4S,v4.s[1] -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v11.4s, v21.4s -add v11.4s, v11.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v5.s[2] -mul v22.4S, v22.4S,v4.s[2] -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v29.4s, v22.4s -add v29.4s, v29.4s, v22.4s -sqrdmulh v22.4S, v30.4S, v0.s[1] -mul v30.4S, v30.4S,v14.s[1] -mla v30.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v30.4s -add v18.4s, v18.4s, v30.4s -sqrdmulh v30.4S, v12.4S, v0.s[2] -mul v12.4S, v12.4S,v14.s[2] -mla v12.4S, v30.4S, v31.s[0] -sub v30.4s, v13.4s, v12.4s -add v13.4s, v13.4s, v12.4s -trn1 v12.4S, v11.4S, v8.4S -trn2 v28.4S, v11.4S, v8.4S -trn1 v27.4S, v29.4S, v21.4S -trn2 v26.4S, v29.4S, v21.4S -trn2 v29.2D, v12.2D, v27.2D -trn2 v21.2D, v28.2D, v26.2D -trn1 v11.2D, v12.2D, v27.2D -trn1 v8.2D, v28.2D, v26.2D -trn1 v26.4S, v18.4S, v22.4S -trn2 v28.4S, v18.4S, v22.4S -trn1 v27.4S, v13.4S, v30.4S -trn2 v12.4S, v13.4S, v30.4S -trn2 v13.2D, v26.2D, v27.2D -trn2 v30.2D, v28.2D, v12.2D -trn1 v18.2D, v26.2D, v27.2D -trn1 v22.2D, v28.2D, v12.2D -sqrdmulh v12.4S, v29.4S, v7.4S -mul v29.4S, v29.4S,v6.4S -mla v29.4S, v12.4S, v31.s[0] -sub v12.4s, v11.4s, v29.4s -add v11.4s, v11.4s, v29.4s -sqrdmulh v29.4S, v21.4S, v7.4S -mul v21.4S, v21.4S,v6.4S -mla v21.4S, v29.4S, v31.s[0] -sub v29.4s, v8.4s, v21.4s -add v8.4s, v8.4s, v21.4s -sqrdmulh v21.4S, v13.4S, v17.4S -mul v13.4S, v13.4S,v19.4S -mla v13.4S, v21.4S, v31.s[0] -sub v21.4s, v18.4s, v13.4s -add v18.4s, v18.4s, v13.4s -sqrdmulh v13.4S, v30.4S, v17.4S -mul v30.4S, v30.4S,v19.4S -mla v30.4S, v13.4S, v31.s[0] -sub v13.4s, v22.4s, v30.4s -add v22.4s, v22.4s, v30.4s -sqrdmulh v30.4S, v8.4S, v10.4S -mul v8.4S, v8.4S,v15.4S -mla v8.4S, v30.4S, v31.s[0] -sub v30.4s, v11.4s, v8.4s -add v11.4s, v11.4s, v8.4s -sqrdmulh v8.4S, v29.4S, v16.4S -mul v29.4S, v29.4S,v2.4S -mla v29.4S, v8.4S, v31.s[0] -sub v8.4s, v12.4s, v29.4s -add v12.4s, v12.4s, v29.4s -sqrdmulh v29.4S, v22.4S, v3.4S -mul v22.4S, v22.4S,v20.4S -mla v22.4S, v29.4S, v31.s[0] -sub v29.4s, v18.4s, v22.4s -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v13.4S, v9.4S -mul v13.4S, v13.4S,v1.4S -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -str q11, [x0, #384] -str q30, [x0, #400] -str q12, [x0, #416] -str q8, [x0, #432] -str q18, [x0, #896] -str q29, [x0, #912] -str q21, [x0, #928] -str q22, [x0, #944] -ldr q9, [x17, #+1024] -ldr q1, [x17, #+1040] -ldr q3, [x17, #+1056] -ldr q20, [x17, #+1072] -ldr q17, [x17, #+1088] -ldr q19, [x17, #+1104] -ldr q0, [x17, #+1120] -ldr q14, [x17, #+1136] -ldr q22, [x0, #480] -ldr q21, [x0, #496] -ldr q29, [x0, #448] -ldr q18, [x0, #464] -ldr q16, [x17, #+2048] -ldr q2, [x17, #+2064] -ldr q10, [x17, #+2080] -ldr q15, [x17, #+2096] -ldr q7, [x17, #+2112] -ldr q6, [x17, #+2128] -ldr q5, [x17, #+2144] -ldr q4, [x17, #+2160] -ldr q8, [x0, #992] -ldr q12, [x0, #1008] -ldr q30, [x0, #960] -ldr q11, [x0, #976] -sqrdmulh v13.4S, v22.4S, v1.s[0] -mul v22.4S, v22.4S,v9.s[0] -mla v22.4S, v13.4S, v31.s[0] -sub v13.4s, v29.4s, v22.4s -add v29.4s, v29.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v1.s[0] -mul v21.4S, v21.4S,v9.s[0] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v8.4S, v2.s[0] -mul v8.4S, v8.4S,v16.s[0] -mla v8.4S, v21.4S, v31.s[0] -sub v21.4s, v30.4s, v8.4s -add v30.4s, v30.4s, v8.4s -sqrdmulh v8.4S, v12.4S, v2.s[0] -mul v12.4S, v12.4S,v16.s[0] -mla v12.4S, v8.4S, v31.s[0] -sub v8.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -sqrdmulh v12.4S, v18.4S, v1.s[1] -mul v18.4S, v18.4S,v9.s[1] -mla v18.4S, v12.4S, v31.s[0] -sub v12.4s, v29.4s, v18.4s -add v29.4s, v29.4s, v18.4s -sqrdmulh v18.4S, v22.4S, v1.s[2] -mul v22.4S, v22.4S,v9.s[2] -mla v22.4S, v18.4S, v31.s[0] -sub v18.4s, v13.4s, v22.4s -add v13.4s, v13.4s, v22.4s -sqrdmulh v22.4S, v11.4S, v2.s[1] -mul v11.4S, v11.4S,v16.s[1] -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v30.4s, v11.4s -add v30.4s, v30.4s, v11.4s -sqrdmulh v11.4S, v8.4S, v2.s[2] -mul v8.4S, v8.4S,v16.s[2] -mla v8.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v8.4s -add v21.4s, v21.4s, v8.4s -trn1 v8.4S, v29.4S, v12.4S -trn2 v28.4S, v29.4S, v12.4S -trn1 v27.4S, v13.4S, v18.4S -trn2 v26.4S, v13.4S, v18.4S -trn2 v13.2D, v8.2D, v27.2D -trn2 v18.2D, v28.2D, v26.2D -trn1 v29.2D, v8.2D, v27.2D -trn1 v12.2D, v28.2D, v26.2D -trn1 v26.4S, v30.4S, v22.4S -trn2 v28.4S, v30.4S, v22.4S -trn1 v27.4S, v21.4S, v11.4S -trn2 v8.4S, v21.4S, v11.4S -trn2 v21.2D, v26.2D, v27.2D -trn2 v11.2D, v28.2D, v8.2D -trn1 v30.2D, v26.2D, v27.2D -trn1 v22.2D, v28.2D, v8.2D -sqrdmulh v8.4S, v13.4S, v20.4S -mul v13.4S, v13.4S,v3.4S -mla v13.4S, v8.4S, v31.s[0] -sub v8.4s, v29.4s, v13.4s -add v29.4s, v29.4s, v13.4s -sqrdmulh v13.4S, v18.4S, v20.4S -mul v18.4S, v18.4S,v3.4S -mla v18.4S, v13.4S, v31.s[0] -sub v13.4s, v12.4s, v18.4s -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v21.4S, v15.4S -mul v21.4S, v21.4S,v10.4S -mla v21.4S, v18.4S, v31.s[0] -sub v18.4s, v30.4s, v21.4s -add v30.4s, v30.4s, v21.4s -sqrdmulh v21.4S, v11.4S, v15.4S -mul v11.4S, v11.4S,v10.4S -mla v11.4S, v21.4S, v31.s[0] -sub v21.4s, v22.4s, v11.4s -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v12.4S, v19.4S -mul v12.4S, v12.4S,v17.4S -mla v12.4S, v11.4S, v31.s[0] -sub v11.4s, v29.4s, v12.4s -add v29.4s, v29.4s, v12.4s -sqrdmulh v12.4S, v13.4S, v14.4S -mul v13.4S, v13.4S,v0.4S -mla v13.4S, v12.4S, v31.s[0] -sub v12.4s, v8.4s, v13.4s -add v8.4s, v8.4s, v13.4s -sqrdmulh v13.4S, v22.4S, v6.4S -mul v22.4S, v22.4S,v7.4S -mla v22.4S, v13.4S, v31.s[0] -sub v13.4s, v30.4s, v22.4s -add v30.4s, v30.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v4.4S -mul v21.4S, v21.4S,v5.4S -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -str q29, [x0, #448] -str q11, [x0, #464] -str q8, [x0, #480] -str q12, [x0, #496] -str q30, [x0, #960] -str q13, [x0, #976] -str q18, [x0, #992] -str q22, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2392 -// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_3.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_3.s deleted file mode 100644 index 097a1c9..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_3.s +++ /dev/null @@ -1,2422 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_3_z2_3 -.global _ntt_u32_full_neon_asm_var_4_4_3_z2_3 -ntt_u32_full_neon_asm_var_4_4_3_z2_3: -_ntt_u32_full_neon_asm_var_4_4_3_z2_3: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #800] -ldr q21, [x0, #864] -ldr q20, [x0, #928] -ldr q19, [x0, #992] -ldr q18, [x0, #288] -ldr q17, [x0, #352] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -sqrdmulh v2.4S, v22.4S, v29.s[0] -ldr q1, [x0, #544] -mul v22.4S, v22.4S,v30.s[0] -ldr q0, [x0, #608] -sqrdmulh v15.4S, v21.4S, v29.s[0] -ldr q14, [x0, #672] -mul v21.4S, v21.4S,v30.s[0] -ldr q13, [x0, #736] -mla v22.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q12, [x0, #32] -sub v11.4s, v18.4s, v22.4s -mla v21.4S, v15.4S, v31.s[0] -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q15, [x0, #96] -sub v10.4s, v17.4s, v21.4s -mla v20.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v1.4S, v29.s[0] -ldr q2, [x0, #160] -mul v1.4S, v1.4S,v30.s[0] -sub v9.4s, v16.4s, v20.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v0.4S, v29.s[0] -ldr q22, [x0, #224] -mul v0.4S, v0.4S,v30.s[0] -sub v8.4s, v3.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v12.4s, v1.4s -mla v0.4S, v20.4S, v31.s[0] -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -sub v20.4s, v15.4s, v0.4s -mla v14.4S, v19.4S, v31.s[0] -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v19.4s, v2.4s, v14.4s -mla v13.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v1.4s, v22.4s, v13.4s -mla v16.4S, v0.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v0.4s, v2.4s, v16.4s -mla v3.4S, v14.4S, v31.s[0] -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v14.4s, v22.4s, v3.4s -mla v18.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v13.4s, v12.4s, v18.4s -mla v17.4S, v16.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v8.4S, v29.s[2] -mul v8.4S, v8.4S,v30.s[2] -sub v16.4s, v15.4s, v17.4s -mla v9.4S, v3.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v3.4s, v19.4s, v9.4s -mla v8.4S, v18.4S, v31.s[0] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v8.4s -mla v11.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v8.4s -sqrdmulh v8.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v17.4s, v21.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v27.s[0] -mul v22.4S, v22.4S,v28.s[0] -sub v9.4s, v20.4s, v10.4s -mla v2.4S, v8.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v8.4s, v12.4s, v2.4s -mla v22.4S, v11.4S, v31.s[0] -add v12.4s, v12.4s, v2.4s -sqrdmulh v2.4S, v14.4S, v27.s[1] -mul v14.4S, v14.4S,v28.s[1] -sub v11.4s, v15.4s, v22.4s -mla v0.4S, v10.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v27.s[2] -mul v19.4S, v19.4S,v28.s[2] -sub v10.4s, v13.4s, v0.4s -mla v14.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v2.4s, v16.4s, v14.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v27.s[3] -mul v3.4S, v3.4S,v28.s[3] -sub v22.4s, v21.4s, v19.4s -mla v1.4S, v0.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v0.4s, v20.4s, v1.4s -mla v3.4S, v14.4S, v31.s[0] -add v20.4s, v20.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v25.s[0] -mul v15.4S, v15.4S,v26.s[0] -sub v14.4s, v17.4s, v3.4s -mla v18.4S, v19.4S, v31.s[0] -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v11.4S, v25.s[1] -mul v11.4S, v11.4S,v26.s[1] -sub v19.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v1.4s, v12.4s, v15.4s -mla v11.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v25.s[3] -mul v2.4S, v2.4S,v26.s[3] -sub v3.4s, v8.4s, v11.4s -mla v16.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v11.4s -str q12, [x0, #32] -sqrdmulh v12.4S, v20.4S, v23.s[0] -str q1, [x0, #96] -mul v20.4S, v20.4S,v24.s[0] -ldr q1, [x0, #816] -sub v11.4s, v13.4s, v16.4s -ldr q18, [x0, #880] -mla v2.4S, v15.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -str q8, [x0, #160] -sqrdmulh v8.4S, v0.4S, v23.s[1] -str q3, [x0, #224] -mul v0.4S, v0.4S,v24.s[1] -ldr q3, [x0, #944] -sub v16.4s, v10.4s, v2.4s -ldr q15, [x0, #1008] -mla v20.4S, v12.4S, v31.s[0] -add v10.4s, v10.4s, v2.4s -str q13, [x0, #288] -sqrdmulh v13.4S, v9.4S, v23.s[2] -str q11, [x0, #352] -mul v9.4S, v9.4S,v24.s[2] -ldr q11, [x0, #304] -sub v2.4s, v21.4s, v20.4s -ldr q12, [x0, #368] -mla v0.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v20.4s -str q10, [x0, #416] -sqrdmulh v10.4S, v19.4S, v23.s[3] -str q16, [x0, #480] -mul v19.4S, v19.4S,v24.s[3] -ldr q16, [x0, #432] -sub v20.4s, v22.4s, v0.4s -ldr q8, [x0, #496] -mla v9.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -str q21, [x0, #544] -sqrdmulh v21.4S, v1.4S, v29.s[0] -str q2, [x0, #608] -ldr q2, [x0, #560] -mul v1.4S, v1.4S,v30.s[0] -ldr q0, [x0, #624] -sub v13.4s, v17.4s, v9.4s -mla v19.4S, v10.4S, v31.s[0] -add v17.4s, v17.4s, v9.4s -str q22, [x0, #672] -sqrdmulh v22.4S, v18.4S, v29.s[0] -str q20, [x0, #736] -ldr q20, [x0, #688] -mul v18.4S, v18.4S,v30.s[0] -ldr q9, [x0, #752] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -str q17, [x0, #800] -sqrdmulh v17.4S, v3.4S, v29.s[0] -str q13, [x0, #864] -mul v3.4S, v3.4S,v30.s[0] -ldr q13, [x0, #48] -sub v19.4s, v11.4s, v1.4s -mla v18.4S, v22.4S, v31.s[0] -add v11.4s, v11.4s, v1.4s -str q14, [x0, #928] -sqrdmulh v14.4S, v15.4S, v29.s[0] -str q10, [x0, #992] -mul v15.4S, v15.4S,v30.s[0] -ldr q10, [x0, #112] -sub v1.4s, v12.4s, v18.4s -mla v3.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v29.s[0] -ldr q17, [x0, #176] -mul v2.4S, v2.4S,v30.s[0] -sub v22.4s, v16.4s, v3.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v0.4S, v29.s[0] -ldr q14, [x0, #240] -mul v0.4S, v0.4S,v30.s[0] -sub v21.4s, v8.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -sub v18.4s, v13.4s, v2.4s -mla v0.4S, v3.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v9.4S, v29.s[0] -mul v9.4S, v9.4S,v30.s[0] -sub v3.4s, v10.4s, v0.4s -mla v20.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v15.4s, v17.4s, v20.4s -mla v9.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sub v2.4s, v14.4s, v9.4s -mla v16.4S, v0.4S, v31.s[0] -add v14.4s, v14.4s, v9.4s -sqrdmulh v9.4S, v11.4S, v29.s[1] -mul v11.4S, v11.4S,v30.s[1] -sub v0.4s, v17.4s, v16.4s -mla v8.4S, v20.4S, v31.s[0] -add v17.4s, v17.4s, v16.4s -sqrdmulh v16.4S, v12.4S, v29.s[1] -mul v12.4S, v12.4S,v30.s[1] -sub v20.4s, v14.4s, v8.4s -mla v11.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v8.4s -sqrdmulh v8.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v9.4s, v13.4s, v11.4s -mla v12.4S, v16.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v16.4s, v10.4s, v12.4s -mla v22.4S, v8.4S, v31.s[0] -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v8.4s, v15.4s, v22.4s -mla v21.4S, v11.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -sub v11.4s, v2.4s, v21.4s -mla v19.4S, v12.4S, v31.s[0] -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[0] -mul v17.4S, v17.4S,v28.s[0] -sub v12.4s, v18.4s, v19.4s -mla v1.4S, v22.4S, v31.s[0] -add v18.4s, v18.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v22.4s, v3.4s, v1.4s -mla v17.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v21.4s, v13.4s, v17.4s -mla v14.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v20.4S, v27.s[1] -mul v20.4S, v20.4S,v28.s[1] -sub v19.4s, v10.4s, v14.4s -mla v0.4S, v1.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v27.s[2] -mul v15.4S, v15.4S,v28.s[2] -sub v1.4s, v9.4s, v0.4s -mla v20.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v0.4s -sqrdmulh v0.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v17.4s, v16.4s, v20.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v14.4s, v18.4s, v15.4s -mla v2.4S, v0.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v27.s[3] -mul v11.4S, v11.4S,v28.s[3] -sub v0.4s, v3.4s, v2.4s -mla v8.4S, v20.4S, v31.s[0] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v10.4S, v25.s[0] -mul v10.4S, v10.4S,v26.s[0] -sub v20.4s, v12.4s, v8.4s -mla v11.4S, v15.4S, v31.s[0] -add v12.4s, v12.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v15.4s, v22.4s, v11.4s -mla v10.4S, v2.4S, v31.s[0] -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v2.4s, v13.4s, v10.4s -mla v19.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v10.4s -sqrdmulh v10.4S, v17.4S, v25.s[3] -mul v17.4S, v17.4S,v26.s[3] -sub v8.4s, v21.4s, v19.4s -mla v16.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -str q13, [x0, #48] -sqrdmulh v13.4S, v3.4S, v23.s[0] -str q2, [x0, #112] -mul v3.4S, v3.4S,v24.s[0] -ldr q2, [x0, #768] -sub v19.4s, v9.4s, v16.4s -ldr q11, [x0, #832] -mla v17.4S, v10.4S, v31.s[0] -add v9.4s, v9.4s, v16.4s -str q21, [x0, #176] -sqrdmulh v21.4S, v0.4S, v23.s[1] -str q8, [x0, #240] -mul v0.4S, v0.4S,v24.s[1] -ldr q8, [x0, #896] -sub v16.4s, v1.4s, v17.4s -ldr q10, [x0, #960] -mla v3.4S, v13.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -str q9, [x0, #304] -sqrdmulh v9.4S, v22.4S, v23.s[2] -str q19, [x0, #368] -mul v22.4S, v22.4S,v24.s[2] -ldr q19, [x0, #256] -sub v17.4s, v18.4s, v3.4s -ldr q13, [x0, #320] -mla v0.4S, v21.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -str q1, [x0, #432] -sqrdmulh v1.4S, v15.4S, v23.s[3] -str q16, [x0, #496] -mul v15.4S, v15.4S,v24.s[3] -ldr q16, [x0, #384] -sub v3.4s, v14.4s, v0.4s -ldr q21, [x0, #448] -mla v22.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -str q18, [x0, #560] -sqrdmulh v18.4S, v2.4S, v29.s[0] -str q17, [x0, #624] -ldr q17, [x0, #512] -mul v2.4S, v2.4S,v30.s[0] -ldr q0, [x0, #576] -sub v9.4s, v12.4s, v22.4s -mla v15.4S, v1.4S, v31.s[0] -add v12.4s, v12.4s, v22.4s -str q14, [x0, #688] -sqrdmulh v14.4S, v11.4S, v29.s[0] -str q3, [x0, #752] -ldr q3, [x0, #640] -mul v11.4S, v11.4S,v30.s[0] -ldr q22, [x0, #704] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -str q12, [x0, #816] -sqrdmulh v12.4S, v8.4S, v29.s[0] -str q9, [x0, #880] -mul v8.4S, v8.4S,v30.s[0] -ldr q9, [x0, #0] -sub v15.4s, v19.4s, v2.4s -mla v11.4S, v14.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -str q20, [x0, #944] -sqrdmulh v20.4S, v10.4S, v29.s[0] -str q1, [x0, #1008] -mul v10.4S, v10.4S,v30.s[0] -ldr q1, [x0, #64] -sub v2.4s, v13.4s, v11.4s -mla v8.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v29.s[0] -ldr q12, [x0, #128] -mul v17.4S, v17.4S,v30.s[0] -sub v14.4s, v16.4s, v8.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v0.4S, v29.s[0] -ldr q20, [x0, #192] -mul v0.4S, v0.4S,v30.s[0] -sub v18.4s, v21.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -sub v11.4s, v9.4s, v17.4s -mla v0.4S, v8.4S, v31.s[0] -add v9.4s, v9.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v8.4s, v1.4s, v0.4s -mla v3.4S, v10.4S, v31.s[0] -add v1.4s, v1.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v10.4s, v12.4s, v3.4s -mla v22.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v17.4s, v20.4s, v22.4s -mla v16.4S, v0.4S, v31.s[0] -add v20.4s, v20.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -sub v0.4s, v12.4s, v16.4s -mla v21.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v3.4s, v20.4s, v21.4s -mla v19.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v22.4s, v9.4s, v19.4s -mla v13.4S, v16.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v29.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v16.4s, v1.4s, v13.4s -mla v14.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v13.4s -sqrdmulh v13.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v21.4s, v10.4s, v14.4s -mla v18.4S, v19.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v19.4s, v17.4s, v18.4s -mla v15.4S, v13.4S, v31.s[0] -add v17.4s, v17.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v13.4s, v11.4s, v15.4s -mla v2.4S, v14.4S, v31.s[0] -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v27.s[0] -mul v20.4S, v20.4S,v28.s[0] -sub v14.4s, v8.4s, v2.4s -mla v12.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v2.4s -sqrdmulh v2.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v18.4s, v9.4s, v12.4s -mla v20.4S, v15.4S, v31.s[0] -add v9.4s, v9.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -sub v15.4s, v1.4s, v20.4s -mla v0.4S, v2.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -sub v2.4s, v22.4s, v0.4s -mla v3.4S, v12.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v12.4s, v16.4s, v3.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v11.4s, v10.4s -mla v17.4S, v0.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -sqrdmulh v10.4S, v19.4S, v27.s[3] -mul v19.4S, v19.4S,v28.s[3] -sub v0.4s, v8.4s, v17.4s -mla v21.4S, v3.4S, v31.s[0] -add v8.4s, v8.4s, v17.4s -sqrdmulh v17.4S, v1.4S, v25.s[0] -mul v1.4S, v1.4S,v26.s[0] -sub v3.4s, v13.4s, v21.4s -mla v19.4S, v10.4S, v31.s[0] -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v15.4S, v25.s[1] -mul v15.4S, v15.4S,v26.s[1] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v17.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v17.4s, v9.4s, v1.4s -mla v15.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -sub v21.4s, v18.4s, v15.4s -mla v16.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -str q9, [x0, #0] -sqrdmulh v9.4S, v8.4S, v23.s[0] -str q17, [x0, #64] -mul v8.4S, v8.4S,v24.s[0] -ldr q17, [x0, #784] -sub v15.4s, v22.4s, v16.4s -ldr q19, [x0, #848] -mla v12.4S, v1.4S, v31.s[0] -add v22.4s, v22.4s, v16.4s -str q18, [x0, #128] -sqrdmulh v18.4S, v0.4S, v23.s[1] -str q21, [x0, #192] -mul v0.4S, v0.4S,v24.s[1] -ldr q21, [x0, #912] -sub v16.4s, v2.4s, v12.4s -ldr q1, [x0, #976] -mla v8.4S, v9.4S, v31.s[0] -add v2.4s, v2.4s, v12.4s -str q22, [x0, #256] -sqrdmulh v22.4S, v14.4S, v23.s[2] -str q15, [x0, #320] -mul v14.4S, v14.4S,v24.s[2] -ldr q15, [x0, #272] -sub v12.4s, v11.4s, v8.4s -ldr q9, [x0, #336] -mla v0.4S, v18.4S, v31.s[0] -add v11.4s, v11.4s, v8.4s -str q2, [x0, #384] -sqrdmulh v2.4S, v10.4S, v23.s[3] -str q16, [x0, #448] -mul v10.4S, v10.4S,v24.s[3] -ldr q16, [x0, #400] -sub v8.4s, v20.4s, v0.4s -ldr q18, [x0, #464] -mla v14.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v0.4s -str q11, [x0, #512] -sqrdmulh v11.4S, v17.4S, v29.s[0] -str q12, [x0, #576] -ldr q12, [x0, #528] -mul v17.4S, v17.4S,v30.s[0] -ldr q0, [x0, #592] -sub v22.4s, v13.4s, v14.4s -mla v10.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v14.4s -str q20, [x0, #640] -sqrdmulh v20.4S, v19.4S, v29.s[0] -str q8, [x0, #704] -ldr q8, [x0, #656] -mul v19.4S, v19.4S,v30.s[0] -ldr q14, [x0, #720] -sub v2.4s, v3.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v3.4s, v3.4s, v10.4s -str q13, [x0, #768] -sqrdmulh v13.4S, v21.4S, v29.s[0] -str q22, [x0, #832] -mul v21.4S, v21.4S,v30.s[0] -ldr q22, [x0, #16] -sub v10.4s, v15.4s, v17.4s -mla v19.4S, v20.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -str q3, [x0, #896] -sqrdmulh v3.4S, v1.4S, v29.s[0] -str q2, [x0, #960] -mul v1.4S, v1.4S,v30.s[0] -ldr q2, [x0, #80] -sub v17.4s, v9.4s, v19.4s -mla v21.4S, v13.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v12.4S, v29.s[0] -ldr q13, [x0, #144] -mul v12.4S, v12.4S,v30.s[0] -sub v20.4s, v16.4s, v21.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v0.4S, v29.s[0] -ldr q3, [x0, #208] -mul v0.4S, v0.4S,v30.s[0] -sub v11.4s, v18.4s, v1.4s -mla v12.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v1.4s -sqrdmulh v1.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v19.4s, v22.4s, v12.4s -mla v0.4S, v21.4S, v31.s[0] -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v2.4s, v0.4s -mla v8.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v1.4s, v13.4s, v8.4s -mla v14.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v12.4s, v3.4s, v14.4s -mla v16.4S, v0.4S, v31.s[0] -add v3.4s, v3.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -sub v0.4s, v13.4s, v16.4s -mla v18.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -sqrdmulh v16.4S, v9.4S, v29.s[1] -mul v9.4S, v9.4S,v30.s[1] -sub v8.4s, v3.4s, v18.4s -mla v15.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v14.4s, v22.4s, v15.4s -mla v9.4S, v16.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v16.4s, v2.4s, v9.4s -mla v20.4S, v18.4S, v31.s[0] -add v2.4s, v2.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v20.4s -mla v11.4S, v15.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v15.4s, v12.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v12.4s, v12.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v27.s[0] -mul v13.4S, v13.4S,v28.s[0] -sub v9.4s, v19.4s, v10.4s -mla v17.4S, v20.4S, v31.s[0] -add v19.4s, v19.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v27.s[0] -mul v3.4S, v3.4S,v28.s[0] -sub v20.4s, v21.4s, v17.4s -mla v13.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v11.4s, v22.4s, v13.4s -mla v3.4S, v10.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -sub v10.4s, v2.4s, v3.4s -mla v0.4S, v17.4S, v31.s[0] -add v2.4s, v2.4s, v3.4s -sqrdmulh v3.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v17.4s, v14.4s, v0.4s -mla v8.4S, v13.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[2] -mul v12.4S, v12.4S,v28.s[2] -sub v13.4s, v16.4s, v8.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v3.4s, v19.4s, v1.4s -mla v12.4S, v0.4S, v31.s[0] -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v0.4s, v21.4s, v12.4s -mla v18.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v10.4S, v25.s[1] -mul v10.4S, v10.4S,v26.s[1] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v12.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v12.4s, v22.4s, v2.4s -mla v10.4S, v18.4S, v31.s[0] -add v22.4s, v22.4s, v2.4s -sqrdmulh v2.4S, v13.4S, v25.s[3] -mul v13.4S, v13.4S,v26.s[3] -sub v18.4s, v11.4s, v10.4s -mla v16.4S, v15.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -str q22, [x0, #16] -sqrdmulh v22.4S, v21.4S, v23.s[0] -str q12, [x0, #80] -mul v21.4S, v21.4S,v24.s[0] -sub v12.4s, v14.4s, v16.4s -mla v13.4S, v2.4S, v31.s[0] -add v14.4s, v14.4s, v16.4s -str q11, [x0, #144] -sqrdmulh v11.4S, v0.4S, v23.s[1] -str q18, [x0, #208] -mul v0.4S, v0.4S,v24.s[1] -sub v18.4s, v17.4s, v13.4s -mla v21.4S, v22.4S, v31.s[0] -add v17.4s, v17.4s, v13.4s -str q14, [x0, #272] -sqrdmulh v14.4S, v20.4S, v23.s[2] -str q12, [x0, #336] -mul v20.4S, v20.4S,v24.s[2] -sub v12.4s, v19.4s, v21.4s -mla v0.4S, v11.4S, v31.s[0] -add v19.4s, v19.4s, v21.4s -str q17, [x0, #400] -sqrdmulh v17.4S, v1.4S, v23.s[3] -str q18, [x0, #464] -mul v1.4S, v1.4S,v24.s[3] -sub v18.4s, v3.4s, v0.4s -mla v20.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v0.4s -str q19, [x0, #528] -str q12, [x0, #592] -sub v12.4s, v9.4s, v20.4s -mla v1.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v20.4s -str q3, [x0, #656] -str q18, [x0, #720] -sub v18.4s, v8.4s, v1.4s -add v8.4s, v8.4s, v1.4s -str q9, [x0, #784] -str q12, [x0, #848] -str q8, [x0, #912] -str q18, [x0, #976] -ldr q4, [x0, #32] -ldr q5, [x0, #48] -ldr q6, [x17, #+128] -ldr q7, [x17, #+144] -ldr q15, [x0, #0] -ldr q10, [x0, #16] -ldr q2, [x17, #+1152] -ldr q16, [x17, #+1168] -sqrdmulh v22.4S, v4.4S, v7.s[0] -ldr q13, [x0, #544] -mul v4.4S, v4.4S,v6.s[0] -ldr q11, [x0, #560] -mla v4.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v4.4s -add v15.4s, v15.4s, v4.4s -sqrdmulh v4.4S, v5.4S, v7.s[0] -ldr q21, [x0, #512] -mul v5.4S, v5.4S,v6.s[0] -ldr q14, [x0, #528] -mla v5.4S, v4.4S, v31.s[0] -sub v4.4s, v10.4s, v5.4s -add v10.4s, v10.4s, v5.4s -sqrdmulh v5.4S, v13.4S, v16.s[0] -mul v13.4S, v13.4S,v2.s[0] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v11.4S, v16.s[0] -mul v11.4S, v11.4S,v2.s[0] -mla v11.4S, v13.4S, v31.s[0] -sub v13.4s, v14.4s, v11.4s -add v14.4s, v14.4s, v11.4s -sqrdmulh v11.4S, v10.4S, v7.s[1] -mul v10.4S, v10.4S,v6.s[1] -mla v10.4S, v11.4S, v31.s[0] -sub v11.4s, v15.4s, v10.4s -add v15.4s, v15.4s, v10.4s -sqrdmulh v10.4S, v4.4S, v7.s[2] -mul v4.4S, v4.4S,v6.s[2] -mla v4.4S, v10.4S, v31.s[0] -sub v10.4s, v22.4s, v4.4s -add v22.4s, v22.4s, v4.4s -sqrdmulh v4.4S, v14.4S, v16.s[1] -mul v14.4S, v14.4S,v2.s[1] -mla v14.4S, v4.4S, v31.s[0] -sub v4.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v13.4S, v16.s[2] -mul v13.4S, v13.4S,v2.s[2] -mla v13.4S, v14.4S, v31.s[0] -sub v14.4s, v5.4s, v13.4s -add v5.4s, v5.4s, v13.4s -trn1 v13.4S, v15.4S, v11.4S -trn2 v0.4S, v15.4S, v11.4S -trn1 v19.4S, v22.4S, v10.4S -trn2 v17.4S, v22.4S, v10.4S -trn2 v22.2D, v13.2D, v19.2D -trn2 v10.2D, v0.2D, v17.2D -trn1 v15.2D, v13.2D, v19.2D -trn1 v11.2D, v0.2D, v17.2D -ldr q17, [x17, #+160] -ldr q0, [x17, #+176] -trn1 v19.4S, v21.4S, v4.4S -trn2 v13.4S, v21.4S, v4.4S -trn1 v20.4S, v5.4S, v14.4S -trn2 v3.4S, v5.4S, v14.4S -trn2 v5.2D, v19.2D, v20.2D -trn2 v14.2D, v13.2D, v3.2D -trn1 v21.2D, v19.2D, v20.2D -trn1 v4.2D, v13.2D, v3.2D -ldr q3, [x17, #+1184] -ldr q13, [x17, #+1200] -sqrdmulh v20.4S, v22.4S, v0.4S -mul v22.4S, v22.4S,v17.4S -mla v22.4S, v20.4S, v31.s[0] -sub v20.4s, v15.4s, v22.4s -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v10.4S, v0.4S -mul v10.4S, v10.4S,v17.4S -mla v10.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v10.4s -add v11.4s, v11.4s, v10.4s -sqrdmulh v10.4S, v5.4S, v13.4S -mul v5.4S, v5.4S,v3.4S -mla v5.4S, v10.4S, v31.s[0] -sub v10.4s, v21.4s, v5.4s -add v21.4s, v21.4s, v5.4s -ldr q5, [x17, #+192] -ldr q19, [x17, #+208] -sqrdmulh v1.4S, v14.4S, v13.4S -mul v14.4S, v14.4S,v3.4S -mla v14.4S, v1.4S, v31.s[0] -sub v1.4s, v4.4s, v14.4s -add v4.4s, v4.4s, v14.4s -ldr q14, [x17, #+224] -ldr q9, [x17, #+240] -sqrdmulh v12.4S, v11.4S, v19.4S -mul v11.4S, v11.4S,v5.4S -mla v11.4S, v12.4S, v31.s[0] -sub v12.4s, v15.4s, v11.4s -add v15.4s, v15.4s, v11.4s -ldr q11, [x17, #+1216] -ldr q8, [x17, #+1232] -sqrdmulh v18.4S, v22.4S, v9.4S -mul v22.4S, v22.4S,v14.4S -mla v22.4S, v18.4S, v31.s[0] -sub v18.4s, v20.4s, v22.4s -add v20.4s, v20.4s, v22.4s -ldr q22, [x17, #+1248] -ldr q30, [x17, #+1264] -sqrdmulh v29.4S, v4.4S, v8.4S -ldr q28, [x0, #96] -mul v4.4S, v4.4S,v11.4S -mla v4.4S, v29.4S, v31.s[0] -sub v29.4s, v21.4s, v4.4s -add v21.4s, v21.4s, v4.4s -sqrdmulh v4.4S, v1.4S, v30.4S -ldr q27, [x0, #112] -mul v1.4S, v1.4S,v22.4S -mla v1.4S, v4.4S, v31.s[0] -sub v4.4s, v10.4s, v1.4s -add v10.4s, v10.4s, v1.4s -str q15, [x0, #0] -str q12, [x0, #16] -str q20, [x0, #32] -str q18, [x0, #48] -str q21, [x0, #512] -str q29, [x0, #528] -str q10, [x0, #544] -str q4, [x0, #560] -ldr q30, [x17, #+256] -ldr q22, [x17, #+272] -ldr q8, [x0, #64] -ldr q11, [x0, #80] -ldr q13, [x17, #+1280] -ldr q3, [x17, #+1296] -sqrdmulh v16.4S, v28.4S, v22.s[0] -ldr q2, [x0, #608] -mul v28.4S, v28.4S,v30.s[0] -ldr q4, [x0, #624] -mla v28.4S, v16.4S, v31.s[0] -sub v16.4s, v8.4s, v28.4s -add v8.4s, v8.4s, v28.4s -sqrdmulh v28.4S, v27.4S, v22.s[0] -ldr q10, [x0, #576] -mul v27.4S, v27.4S,v30.s[0] -ldr q29, [x0, #592] -mla v27.4S, v28.4S, v31.s[0] -sub v28.4s, v11.4s, v27.4s -add v11.4s, v11.4s, v27.4s -sqrdmulh v27.4S, v2.4S, v3.s[0] -mul v2.4S, v2.4S,v13.s[0] -mla v2.4S, v27.4S, v31.s[0] -sub v27.4s, v10.4s, v2.4s -add v10.4s, v10.4s, v2.4s -sqrdmulh v2.4S, v4.4S, v3.s[0] -mul v4.4S, v4.4S,v13.s[0] -mla v4.4S, v2.4S, v31.s[0] -sub v2.4s, v29.4s, v4.4s -add v29.4s, v29.4s, v4.4s -sqrdmulh v4.4S, v11.4S, v22.s[1] -mul v11.4S, v11.4S,v30.s[1] -mla v11.4S, v4.4S, v31.s[0] -sub v4.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -sqrdmulh v11.4S, v28.4S, v22.s[2] -mul v28.4S, v28.4S,v30.s[2] -mla v28.4S, v11.4S, v31.s[0] -sub v11.4s, v16.4s, v28.4s -add v16.4s, v16.4s, v28.4s -sqrdmulh v28.4S, v29.4S, v3.s[1] -mul v29.4S, v29.4S,v13.s[1] -mla v29.4S, v28.4S, v31.s[0] -sub v28.4s, v10.4s, v29.4s -add v10.4s, v10.4s, v29.4s -sqrdmulh v29.4S, v2.4S, v3.s[2] -mul v2.4S, v2.4S,v13.s[2] -mla v2.4S, v29.4S, v31.s[0] -sub v29.4s, v27.4s, v2.4s -add v27.4s, v27.4s, v2.4s -trn1 v2.4S, v8.4S, v4.4S -trn2 v21.4S, v8.4S, v4.4S -trn1 v9.4S, v16.4S, v11.4S -trn2 v14.4S, v16.4S, v11.4S -trn2 v16.2D, v2.2D, v9.2D -trn2 v11.2D, v21.2D, v14.2D -trn1 v8.2D, v2.2D, v9.2D -trn1 v4.2D, v21.2D, v14.2D -ldr q14, [x17, #+288] -ldr q21, [x17, #+304] -trn1 v9.4S, v10.4S, v28.4S -trn2 v2.4S, v10.4S, v28.4S -trn1 v19.4S, v27.4S, v29.4S -trn2 v5.4S, v27.4S, v29.4S -trn2 v27.2D, v9.2D, v19.2D -trn2 v29.2D, v2.2D, v5.2D -trn1 v10.2D, v9.2D, v19.2D -trn1 v28.2D, v2.2D, v5.2D -ldr q5, [x17, #+1312] -ldr q2, [x17, #+1328] -sqrdmulh v19.4S, v16.4S, v21.4S -mul v16.4S, v16.4S,v14.4S -mla v16.4S, v19.4S, v31.s[0] -sub v19.4s, v8.4s, v16.4s -add v8.4s, v8.4s, v16.4s -sqrdmulh v16.4S, v11.4S, v21.4S -mul v11.4S, v11.4S,v14.4S -mla v11.4S, v16.4S, v31.s[0] -sub v16.4s, v4.4s, v11.4s -add v4.4s, v4.4s, v11.4s -sqrdmulh v11.4S, v27.4S, v2.4S -mul v27.4S, v27.4S,v5.4S -mla v27.4S, v11.4S, v31.s[0] -sub v11.4s, v10.4s, v27.4s -add v10.4s, v10.4s, v27.4s -ldr q27, [x17, #+320] -ldr q9, [x17, #+336] -sqrdmulh v0.4S, v29.4S, v2.4S -mul v29.4S, v29.4S,v5.4S -mla v29.4S, v0.4S, v31.s[0] -sub v0.4s, v28.4s, v29.4s -add v28.4s, v28.4s, v29.4s -ldr q29, [x17, #+352] -ldr q17, [x17, #+368] -sqrdmulh v7.4S, v4.4S, v9.4S -mul v4.4S, v4.4S,v27.4S -mla v4.4S, v7.4S, v31.s[0] -sub v7.4s, v8.4s, v4.4s -add v8.4s, v8.4s, v4.4s -ldr q4, [x17, #+1344] -ldr q6, [x17, #+1360] -sqrdmulh v18.4S, v16.4S, v17.4S -mul v16.4S, v16.4S,v29.4S -mla v16.4S, v18.4S, v31.s[0] -sub v18.4s, v19.4s, v16.4s -add v19.4s, v19.4s, v16.4s -ldr q16, [x17, #+1376] -ldr q20, [x17, #+1392] -sqrdmulh v12.4S, v28.4S, v6.4S -ldr q15, [x0, #160] -mul v28.4S, v28.4S,v4.4S -mla v28.4S, v12.4S, v31.s[0] -sub v12.4s, v10.4s, v28.4s -add v10.4s, v10.4s, v28.4s -sqrdmulh v28.4S, v0.4S, v20.4S -ldr q1, [x0, #176] -mul v0.4S, v0.4S,v16.4S -mla v0.4S, v28.4S, v31.s[0] -sub v28.4s, v11.4s, v0.4s -add v11.4s, v11.4s, v0.4s -str q8, [x0, #64] -str q7, [x0, #80] -str q19, [x0, #96] -str q18, [x0, #112] -str q10, [x0, #576] -str q12, [x0, #592] -str q11, [x0, #608] -str q28, [x0, #624] -ldr q20, [x17, #+384] -ldr q16, [x17, #+400] -ldr q6, [x0, #128] -ldr q4, [x0, #144] -ldr q2, [x17, #+1408] -ldr q5, [x17, #+1424] -sqrdmulh v3.4S, v15.4S, v16.s[0] -ldr q13, [x0, #672] -mul v15.4S, v15.4S,v20.s[0] -ldr q28, [x0, #688] -mla v15.4S, v3.4S, v31.s[0] -sub v3.4s, v6.4s, v15.4s -add v6.4s, v6.4s, v15.4s -sqrdmulh v15.4S, v1.4S, v16.s[0] -ldr q11, [x0, #640] -mul v1.4S, v1.4S,v20.s[0] -ldr q12, [x0, #656] -mla v1.4S, v15.4S, v31.s[0] -sub v15.4s, v4.4s, v1.4s -add v4.4s, v4.4s, v1.4s -sqrdmulh v1.4S, v13.4S, v5.s[0] -mul v13.4S, v13.4S,v2.s[0] -mla v13.4S, v1.4S, v31.s[0] -sub v1.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v28.4S, v5.s[0] -mul v28.4S, v28.4S,v2.s[0] -mla v28.4S, v13.4S, v31.s[0] -sub v13.4s, v12.4s, v28.4s -add v12.4s, v12.4s, v28.4s -sqrdmulh v28.4S, v4.4S, v16.s[1] -mul v4.4S, v4.4S,v20.s[1] -mla v4.4S, v28.4S, v31.s[0] -sub v28.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -sqrdmulh v4.4S, v15.4S, v16.s[2] -mul v15.4S, v15.4S,v20.s[2] -mla v15.4S, v4.4S, v31.s[0] -sub v4.4s, v3.4s, v15.4s -add v3.4s, v3.4s, v15.4s -sqrdmulh v15.4S, v12.4S, v5.s[1] -mul v12.4S, v12.4S,v2.s[1] -mla v12.4S, v15.4S, v31.s[0] -sub v15.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -sqrdmulh v12.4S, v13.4S, v5.s[2] -mul v13.4S, v13.4S,v2.s[2] -mla v13.4S, v12.4S, v31.s[0] -sub v12.4s, v1.4s, v13.4s -add v1.4s, v1.4s, v13.4s -trn1 v13.4S, v6.4S, v28.4S -trn2 v10.4S, v6.4S, v28.4S -trn1 v17.4S, v3.4S, v4.4S -trn2 v29.4S, v3.4S, v4.4S -trn2 v3.2D, v13.2D, v17.2D -trn2 v4.2D, v10.2D, v29.2D -trn1 v6.2D, v13.2D, v17.2D -trn1 v28.2D, v10.2D, v29.2D -ldr q29, [x17, #+416] -ldr q10, [x17, #+432] -trn1 v17.4S, v11.4S, v15.4S -trn2 v13.4S, v11.4S, v15.4S -trn1 v9.4S, v1.4S, v12.4S -trn2 v27.4S, v1.4S, v12.4S -trn2 v1.2D, v17.2D, v9.2D -trn2 v12.2D, v13.2D, v27.2D -trn1 v11.2D, v17.2D, v9.2D -trn1 v15.2D, v13.2D, v27.2D -ldr q27, [x17, #+1440] -ldr q13, [x17, #+1456] -sqrdmulh v9.4S, v3.4S, v10.4S -mul v3.4S, v3.4S,v29.4S -mla v3.4S, v9.4S, v31.s[0] -sub v9.4s, v6.4s, v3.4s -add v6.4s, v6.4s, v3.4s -sqrdmulh v3.4S, v4.4S, v10.4S -mul v4.4S, v4.4S,v29.4S -mla v4.4S, v3.4S, v31.s[0] -sub v3.4s, v28.4s, v4.4s -add v28.4s, v28.4s, v4.4s -sqrdmulh v4.4S, v1.4S, v13.4S -mul v1.4S, v1.4S,v27.4S -mla v1.4S, v4.4S, v31.s[0] -sub v4.4s, v11.4s, v1.4s -add v11.4s, v11.4s, v1.4s -ldr q1, [x17, #+448] -ldr q17, [x17, #+464] -sqrdmulh v21.4S, v12.4S, v13.4S -mul v12.4S, v12.4S,v27.4S -mla v12.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v12.4s -add v15.4s, v15.4s, v12.4s -ldr q12, [x17, #+480] -ldr q14, [x17, #+496] -sqrdmulh v22.4S, v28.4S, v17.4S -mul v28.4S, v28.4S,v1.4S -mla v28.4S, v22.4S, v31.s[0] -sub v22.4s, v6.4s, v28.4s -add v6.4s, v6.4s, v28.4s -ldr q28, [x17, #+1472] -ldr q30, [x17, #+1488] -sqrdmulh v18.4S, v3.4S, v14.4S -mul v3.4S, v3.4S,v12.4S -mla v3.4S, v18.4S, v31.s[0] -sub v18.4s, v9.4s, v3.4s -add v9.4s, v9.4s, v3.4s -ldr q3, [x17, #+1504] -ldr q19, [x17, #+1520] -sqrdmulh v7.4S, v15.4S, v30.4S -ldr q8, [x0, #224] -mul v15.4S, v15.4S,v28.4S -mla v15.4S, v7.4S, v31.s[0] -sub v7.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v21.4S, v19.4S -ldr q0, [x0, #240] -mul v21.4S, v21.4S,v3.4S -mla v21.4S, v15.4S, v31.s[0] -sub v15.4s, v4.4s, v21.4s -add v4.4s, v4.4s, v21.4s -str q6, [x0, #128] -str q22, [x0, #144] -str q9, [x0, #160] -str q18, [x0, #176] -str q11, [x0, #640] -str q7, [x0, #656] -str q4, [x0, #672] -str q15, [x0, #688] -ldr q19, [x17, #+512] -ldr q3, [x17, #+528] -ldr q30, [x0, #192] -ldr q28, [x0, #208] -ldr q13, [x17, #+1536] -ldr q27, [x17, #+1552] -sqrdmulh v5.4S, v8.4S, v3.s[0] -ldr q2, [x0, #736] -mul v8.4S, v8.4S,v19.s[0] -ldr q15, [x0, #752] -mla v8.4S, v5.4S, v31.s[0] -sub v5.4s, v30.4s, v8.4s -add v30.4s, v30.4s, v8.4s -sqrdmulh v8.4S, v0.4S, v3.s[0] -ldr q4, [x0, #704] -mul v0.4S, v0.4S,v19.s[0] -ldr q7, [x0, #720] -mla v0.4S, v8.4S, v31.s[0] -sub v8.4s, v28.4s, v0.4s -add v28.4s, v28.4s, v0.4s -sqrdmulh v0.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v13.s[0] -mla v2.4S, v0.4S, v31.s[0] -sub v0.4s, v4.4s, v2.4s -add v4.4s, v4.4s, v2.4s -sqrdmulh v2.4S, v15.4S, v27.s[0] -mul v15.4S, v15.4S,v13.s[0] -mla v15.4S, v2.4S, v31.s[0] -sub v2.4s, v7.4s, v15.4s -add v7.4s, v7.4s, v15.4s -sqrdmulh v15.4S, v28.4S, v3.s[1] -mul v28.4S, v28.4S,v19.s[1] -mla v28.4S, v15.4S, v31.s[0] -sub v15.4s, v30.4s, v28.4s -add v30.4s, v30.4s, v28.4s -sqrdmulh v28.4S, v8.4S, v3.s[2] -mul v8.4S, v8.4S,v19.s[2] -mla v8.4S, v28.4S, v31.s[0] -sub v28.4s, v5.4s, v8.4s -add v5.4s, v5.4s, v8.4s -sqrdmulh v8.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v13.s[1] -mla v7.4S, v8.4S, v31.s[0] -sub v8.4s, v4.4s, v7.4s -add v4.4s, v4.4s, v7.4s -sqrdmulh v7.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v13.s[2] -mla v2.4S, v7.4S, v31.s[0] -sub v7.4s, v0.4s, v2.4s -add v0.4s, v0.4s, v2.4s -trn1 v2.4S, v30.4S, v15.4S -trn2 v11.4S, v30.4S, v15.4S -trn1 v14.4S, v5.4S, v28.4S -trn2 v12.4S, v5.4S, v28.4S -trn2 v5.2D, v2.2D, v14.2D -trn2 v28.2D, v11.2D, v12.2D -trn1 v30.2D, v2.2D, v14.2D -trn1 v15.2D, v11.2D, v12.2D -ldr q12, [x17, #+544] -ldr q11, [x17, #+560] -trn1 v14.4S, v4.4S, v8.4S -trn2 v2.4S, v4.4S, v8.4S -trn1 v17.4S, v0.4S, v7.4S -trn2 v1.4S, v0.4S, v7.4S -trn2 v0.2D, v14.2D, v17.2D -trn2 v7.2D, v2.2D, v1.2D -trn1 v4.2D, v14.2D, v17.2D -trn1 v8.2D, v2.2D, v1.2D -ldr q1, [x17, #+1568] -ldr q2, [x17, #+1584] -sqrdmulh v17.4S, v5.4S, v11.4S -mul v5.4S, v5.4S,v12.4S -mla v5.4S, v17.4S, v31.s[0] -sub v17.4s, v30.4s, v5.4s -add v30.4s, v30.4s, v5.4s -sqrdmulh v5.4S, v28.4S, v11.4S -mul v28.4S, v28.4S,v12.4S -mla v28.4S, v5.4S, v31.s[0] -sub v5.4s, v15.4s, v28.4s -add v15.4s, v15.4s, v28.4s -sqrdmulh v28.4S, v0.4S, v2.4S -mul v0.4S, v0.4S,v1.4S -mla v0.4S, v28.4S, v31.s[0] -sub v28.4s, v4.4s, v0.4s -add v4.4s, v4.4s, v0.4s -ldr q0, [x17, #+576] -ldr q14, [x17, #+592] -sqrdmulh v10.4S, v7.4S, v2.4S -mul v7.4S, v7.4S,v1.4S -mla v7.4S, v10.4S, v31.s[0] -sub v10.4s, v8.4s, v7.4s -add v8.4s, v8.4s, v7.4s -ldr q7, [x17, #+608] -ldr q29, [x17, #+624] -sqrdmulh v16.4S, v15.4S, v14.4S -mul v15.4S, v15.4S,v0.4S -mla v15.4S, v16.4S, v31.s[0] -sub v16.4s, v30.4s, v15.4s -add v30.4s, v30.4s, v15.4s -ldr q15, [x17, #+1600] -ldr q20, [x17, #+1616] -sqrdmulh v18.4S, v5.4S, v29.4S -mul v5.4S, v5.4S,v7.4S -mla v5.4S, v18.4S, v31.s[0] -sub v18.4s, v17.4s, v5.4s -add v17.4s, v17.4s, v5.4s -ldr q5, [x17, #+1632] -ldr q9, [x17, #+1648] -sqrdmulh v22.4S, v8.4S, v20.4S -ldr q6, [x0, #288] -mul v8.4S, v8.4S,v15.4S -mla v8.4S, v22.4S, v31.s[0] -sub v22.4s, v4.4s, v8.4s -add v4.4s, v4.4s, v8.4s -sqrdmulh v8.4S, v10.4S, v9.4S -ldr q21, [x0, #304] -mul v10.4S, v10.4S,v5.4S -mla v10.4S, v8.4S, v31.s[0] -sub v8.4s, v28.4s, v10.4s -add v28.4s, v28.4s, v10.4s -str q30, [x0, #192] -str q16, [x0, #208] -str q17, [x0, #224] -str q18, [x0, #240] -str q4, [x0, #704] -str q22, [x0, #720] -str q28, [x0, #736] -str q8, [x0, #752] -ldr q9, [x17, #+640] -ldr q5, [x17, #+656] -ldr q20, [x0, #256] -ldr q15, [x0, #272] -ldr q2, [x17, #+1664] -ldr q1, [x17, #+1680] -sqrdmulh v27.4S, v6.4S, v5.s[0] -ldr q13, [x0, #800] -mul v6.4S, v6.4S,v9.s[0] -ldr q8, [x0, #816] -mla v6.4S, v27.4S, v31.s[0] -sub v27.4s, v20.4s, v6.4s -add v20.4s, v20.4s, v6.4s -sqrdmulh v6.4S, v21.4S, v5.s[0] -ldr q28, [x0, #768] -mul v21.4S, v21.4S,v9.s[0] -ldr q22, [x0, #784] -mla v21.4S, v6.4S, v31.s[0] -sub v6.4s, v15.4s, v21.4s -add v15.4s, v15.4s, v21.4s -sqrdmulh v21.4S, v13.4S, v1.s[0] -mul v13.4S, v13.4S,v2.s[0] -mla v13.4S, v21.4S, v31.s[0] -sub v21.4s, v28.4s, v13.4s -add v28.4s, v28.4s, v13.4s -sqrdmulh v13.4S, v8.4S, v1.s[0] -mul v8.4S, v8.4S,v2.s[0] -mla v8.4S, v13.4S, v31.s[0] -sub v13.4s, v22.4s, v8.4s -add v22.4s, v22.4s, v8.4s -sqrdmulh v8.4S, v15.4S, v5.s[1] -mul v15.4S, v15.4S,v9.s[1] -mla v15.4S, v8.4S, v31.s[0] -sub v8.4s, v20.4s, v15.4s -add v20.4s, v20.4s, v15.4s -sqrdmulh v15.4S, v6.4S, v5.s[2] -mul v6.4S, v6.4S,v9.s[2] -mla v6.4S, v15.4S, v31.s[0] -sub v15.4s, v27.4s, v6.4s -add v27.4s, v27.4s, v6.4s -sqrdmulh v6.4S, v22.4S, v1.s[1] -mul v22.4S, v22.4S,v2.s[1] -mla v22.4S, v6.4S, v31.s[0] -sub v6.4s, v28.4s, v22.4s -add v28.4s, v28.4s, v22.4s -sqrdmulh v22.4S, v13.4S, v1.s[2] -mul v13.4S, v13.4S,v2.s[2] -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -trn1 v13.4S, v20.4S, v8.4S -trn2 v4.4S, v20.4S, v8.4S -trn1 v29.4S, v27.4S, v15.4S -trn2 v7.4S, v27.4S, v15.4S -trn2 v27.2D, v13.2D, v29.2D -trn2 v15.2D, v4.2D, v7.2D -trn1 v20.2D, v13.2D, v29.2D -trn1 v8.2D, v4.2D, v7.2D -ldr q7, [x17, #+672] -ldr q4, [x17, #+688] -trn1 v29.4S, v28.4S, v6.4S -trn2 v13.4S, v28.4S, v6.4S -trn1 v14.4S, v21.4S, v22.4S -trn2 v0.4S, v21.4S, v22.4S -trn2 v21.2D, v29.2D, v14.2D -trn2 v22.2D, v13.2D, v0.2D -trn1 v28.2D, v29.2D, v14.2D -trn1 v6.2D, v13.2D, v0.2D -ldr q0, [x17, #+1696] -ldr q13, [x17, #+1712] -sqrdmulh v14.4S, v27.4S, v4.4S -mul v27.4S, v27.4S,v7.4S -mla v27.4S, v14.4S, v31.s[0] -sub v14.4s, v20.4s, v27.4s -add v20.4s, v20.4s, v27.4s -sqrdmulh v27.4S, v15.4S, v4.4S -mul v15.4S, v15.4S,v7.4S -mla v15.4S, v27.4S, v31.s[0] -sub v27.4s, v8.4s, v15.4s -add v8.4s, v8.4s, v15.4s -sqrdmulh v15.4S, v21.4S, v13.4S -mul v21.4S, v21.4S,v0.4S -mla v21.4S, v15.4S, v31.s[0] -sub v15.4s, v28.4s, v21.4s -add v28.4s, v28.4s, v21.4s -ldr q21, [x17, #+704] -ldr q29, [x17, #+720] -sqrdmulh v11.4S, v22.4S, v13.4S -mul v22.4S, v22.4S,v0.4S -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v6.4s, v22.4s -add v6.4s, v6.4s, v22.4s -ldr q22, [x17, #+736] -ldr q12, [x17, #+752] -sqrdmulh v3.4S, v8.4S, v29.4S -mul v8.4S, v8.4S,v21.4S -mla v8.4S, v3.4S, v31.s[0] -sub v3.4s, v20.4s, v8.4s -add v20.4s, v20.4s, v8.4s -ldr q8, [x17, #+1728] -ldr q19, [x17, #+1744] -sqrdmulh v18.4S, v27.4S, v12.4S -mul v27.4S, v27.4S,v22.4S -mla v27.4S, v18.4S, v31.s[0] -sub v18.4s, v14.4s, v27.4s -add v14.4s, v14.4s, v27.4s -ldr q27, [x17, #+1760] -ldr q17, [x17, #+1776] -sqrdmulh v16.4S, v6.4S, v19.4S -ldr q30, [x0, #352] -mul v6.4S, v6.4S,v8.4S -mla v6.4S, v16.4S, v31.s[0] -sub v16.4s, v28.4s, v6.4s -add v28.4s, v28.4s, v6.4s -sqrdmulh v6.4S, v11.4S, v17.4S -ldr q10, [x0, #368] -mul v11.4S, v11.4S,v27.4S -mla v11.4S, v6.4S, v31.s[0] -sub v6.4s, v15.4s, v11.4s -add v15.4s, v15.4s, v11.4s -str q20, [x0, #256] -str q3, [x0, #272] -str q14, [x0, #288] -str q18, [x0, #304] -str q28, [x0, #768] -str q16, [x0, #784] -str q15, [x0, #800] -str q6, [x0, #816] -ldr q17, [x17, #+768] -ldr q27, [x17, #+784] -ldr q19, [x0, #320] -ldr q8, [x0, #336] -ldr q13, [x17, #+1792] -ldr q0, [x17, #+1808] -sqrdmulh v1.4S, v30.4S, v27.s[0] -ldr q2, [x0, #864] -mul v30.4S, v30.4S,v17.s[0] -ldr q6, [x0, #880] -mla v30.4S, v1.4S, v31.s[0] -sub v1.4s, v19.4s, v30.4s -add v19.4s, v19.4s, v30.4s -sqrdmulh v30.4S, v10.4S, v27.s[0] -ldr q15, [x0, #832] -mul v10.4S, v10.4S,v17.s[0] -ldr q16, [x0, #848] -mla v10.4S, v30.4S, v31.s[0] -sub v30.4s, v8.4s, v10.4s -add v8.4s, v8.4s, v10.4s -sqrdmulh v10.4S, v2.4S, v0.s[0] -mul v2.4S, v2.4S,v13.s[0] -mla v2.4S, v10.4S, v31.s[0] -sub v10.4s, v15.4s, v2.4s -add v15.4s, v15.4s, v2.4s -sqrdmulh v2.4S, v6.4S, v0.s[0] -mul v6.4S, v6.4S,v13.s[0] -mla v6.4S, v2.4S, v31.s[0] -sub v2.4s, v16.4s, v6.4s -add v16.4s, v16.4s, v6.4s -sqrdmulh v6.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v17.s[1] -mla v8.4S, v6.4S, v31.s[0] -sub v6.4s, v19.4s, v8.4s -add v19.4s, v19.4s, v8.4s -sqrdmulh v8.4S, v30.4S, v27.s[2] -mul v30.4S, v30.4S,v17.s[2] -mla v30.4S, v8.4S, v31.s[0] -sub v8.4s, v1.4s, v30.4s -add v1.4s, v1.4s, v30.4s -sqrdmulh v30.4S, v16.4S, v0.s[1] -mul v16.4S, v16.4S,v13.s[1] -mla v16.4S, v30.4S, v31.s[0] -sub v30.4s, v15.4s, v16.4s -add v15.4s, v15.4s, v16.4s -sqrdmulh v16.4S, v2.4S, v0.s[2] -mul v2.4S, v2.4S,v13.s[2] -mla v2.4S, v16.4S, v31.s[0] -sub v16.4s, v10.4s, v2.4s -add v10.4s, v10.4s, v2.4s -trn1 v2.4S, v19.4S, v6.4S -trn2 v28.4S, v19.4S, v6.4S -trn1 v12.4S, v1.4S, v8.4S -trn2 v22.4S, v1.4S, v8.4S -trn2 v1.2D, v2.2D, v12.2D -trn2 v8.2D, v28.2D, v22.2D -trn1 v19.2D, v2.2D, v12.2D -trn1 v6.2D, v28.2D, v22.2D -ldr q22, [x17, #+800] -ldr q28, [x17, #+816] -trn1 v12.4S, v15.4S, v30.4S -trn2 v2.4S, v15.4S, v30.4S -trn1 v29.4S, v10.4S, v16.4S -trn2 v21.4S, v10.4S, v16.4S -trn2 v10.2D, v12.2D, v29.2D -trn2 v16.2D, v2.2D, v21.2D -trn1 v15.2D, v12.2D, v29.2D -trn1 v30.2D, v2.2D, v21.2D -ldr q21, [x17, #+1824] -ldr q2, [x17, #+1840] -sqrdmulh v29.4S, v1.4S, v28.4S -mul v1.4S, v1.4S,v22.4S -mla v1.4S, v29.4S, v31.s[0] -sub v29.4s, v19.4s, v1.4s -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v8.4S, v28.4S -mul v8.4S, v8.4S,v22.4S -mla v8.4S, v1.4S, v31.s[0] -sub v1.4s, v6.4s, v8.4s -add v6.4s, v6.4s, v8.4s -sqrdmulh v8.4S, v10.4S, v2.4S -mul v10.4S, v10.4S,v21.4S -mla v10.4S, v8.4S, v31.s[0] -sub v8.4s, v15.4s, v10.4s -add v15.4s, v15.4s, v10.4s -ldr q10, [x17, #+832] -ldr q12, [x17, #+848] -sqrdmulh v4.4S, v16.4S, v2.4S -mul v16.4S, v16.4S,v21.4S -mla v16.4S, v4.4S, v31.s[0] -sub v4.4s, v30.4s, v16.4s -add v30.4s, v30.4s, v16.4s -ldr q16, [x17, #+864] -ldr q7, [x17, #+880] -sqrdmulh v5.4S, v6.4S, v12.4S -mul v6.4S, v6.4S,v10.4S -mla v6.4S, v5.4S, v31.s[0] -sub v5.4s, v19.4s, v6.4s -add v19.4s, v19.4s, v6.4s -ldr q6, [x17, #+1856] -ldr q9, [x17, #+1872] -sqrdmulh v18.4S, v1.4S, v7.4S -mul v1.4S, v1.4S,v16.4S -mla v1.4S, v18.4S, v31.s[0] -sub v18.4s, v29.4s, v1.4s -add v29.4s, v29.4s, v1.4s -ldr q1, [x17, #+1888] -ldr q14, [x17, #+1904] -sqrdmulh v3.4S, v30.4S, v9.4S -ldr q20, [x0, #416] -mul v30.4S, v30.4S,v6.4S -mla v30.4S, v3.4S, v31.s[0] -sub v3.4s, v15.4s, v30.4s -add v15.4s, v15.4s, v30.4s -sqrdmulh v30.4S, v4.4S, v14.4S -ldr q11, [x0, #432] -mul v4.4S, v4.4S,v1.4S -mla v4.4S, v30.4S, v31.s[0] -sub v30.4s, v8.4s, v4.4s -add v8.4s, v8.4s, v4.4s -str q19, [x0, #320] -str q5, [x0, #336] -str q29, [x0, #352] -str q18, [x0, #368] -str q15, [x0, #832] -str q3, [x0, #848] -str q8, [x0, #864] -str q30, [x0, #880] -ldr q14, [x17, #+896] -ldr q1, [x17, #+912] -ldr q9, [x0, #384] -ldr q6, [x0, #400] -ldr q2, [x17, #+1920] -ldr q21, [x17, #+1936] -sqrdmulh v0.4S, v20.4S, v1.s[0] -ldr q13, [x0, #928] -mul v20.4S, v20.4S,v14.s[0] -ldr q30, [x0, #944] -mla v20.4S, v0.4S, v31.s[0] -sub v0.4s, v9.4s, v20.4s -add v9.4s, v9.4s, v20.4s -sqrdmulh v20.4S, v11.4S, v1.s[0] -ldr q8, [x0, #896] -mul v11.4S, v11.4S,v14.s[0] -ldr q3, [x0, #912] -mla v11.4S, v20.4S, v31.s[0] -sub v20.4s, v6.4s, v11.4s -add v6.4s, v6.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v21.s[0] -mul v13.4S, v13.4S,v2.s[0] -mla v13.4S, v11.4S, v31.s[0] -sub v11.4s, v8.4s, v13.4s -add v8.4s, v8.4s, v13.4s -sqrdmulh v13.4S, v30.4S, v21.s[0] -mul v30.4S, v30.4S,v2.s[0] -mla v30.4S, v13.4S, v31.s[0] -sub v13.4s, v3.4s, v30.4s -add v3.4s, v3.4s, v30.4s -sqrdmulh v30.4S, v6.4S, v1.s[1] -mul v6.4S, v6.4S,v14.s[1] -mla v6.4S, v30.4S, v31.s[0] -sub v30.4s, v9.4s, v6.4s -add v9.4s, v9.4s, v6.4s -sqrdmulh v6.4S, v20.4S, v1.s[2] -mul v20.4S, v20.4S,v14.s[2] -mla v20.4S, v6.4S, v31.s[0] -sub v6.4s, v0.4s, v20.4s -add v0.4s, v0.4s, v20.4s -sqrdmulh v20.4S, v3.4S, v21.s[1] -mul v3.4S, v3.4S,v2.s[1] -mla v3.4S, v20.4S, v31.s[0] -sub v20.4s, v8.4s, v3.4s -add v8.4s, v8.4s, v3.4s -sqrdmulh v3.4S, v13.4S, v21.s[2] -mul v13.4S, v13.4S,v2.s[2] -mla v13.4S, v3.4S, v31.s[0] -sub v3.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -trn1 v13.4S, v9.4S, v30.4S -trn2 v15.4S, v9.4S, v30.4S -trn1 v7.4S, v0.4S, v6.4S -trn2 v16.4S, v0.4S, v6.4S -trn2 v0.2D, v13.2D, v7.2D -trn2 v6.2D, v15.2D, v16.2D -trn1 v9.2D, v13.2D, v7.2D -trn1 v30.2D, v15.2D, v16.2D -ldr q16, [x17, #+928] -ldr q15, [x17, #+944] -trn1 v7.4S, v8.4S, v20.4S -trn2 v13.4S, v8.4S, v20.4S -trn1 v12.4S, v11.4S, v3.4S -trn2 v10.4S, v11.4S, v3.4S -trn2 v11.2D, v7.2D, v12.2D -trn2 v3.2D, v13.2D, v10.2D -trn1 v8.2D, v7.2D, v12.2D -trn1 v20.2D, v13.2D, v10.2D -ldr q10, [x17, #+1952] -ldr q13, [x17, #+1968] -sqrdmulh v12.4S, v0.4S, v15.4S -mul v0.4S, v0.4S,v16.4S -mla v0.4S, v12.4S, v31.s[0] -sub v12.4s, v9.4s, v0.4s -add v9.4s, v9.4s, v0.4s -sqrdmulh v0.4S, v6.4S, v15.4S -mul v6.4S, v6.4S,v16.4S -mla v6.4S, v0.4S, v31.s[0] -sub v0.4s, v30.4s, v6.4s -add v30.4s, v30.4s, v6.4s -sqrdmulh v6.4S, v11.4S, v13.4S -mul v11.4S, v11.4S,v10.4S -mla v11.4S, v6.4S, v31.s[0] -sub v6.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -ldr q11, [x17, #+960] -ldr q7, [x17, #+976] -sqrdmulh v28.4S, v3.4S, v13.4S -mul v3.4S, v3.4S,v10.4S -mla v3.4S, v28.4S, v31.s[0] -sub v28.4s, v20.4s, v3.4s -add v20.4s, v20.4s, v3.4s -ldr q3, [x17, #+992] -ldr q22, [x17, #+1008] -sqrdmulh v27.4S, v30.4S, v7.4S -mul v30.4S, v30.4S,v11.4S -mla v30.4S, v27.4S, v31.s[0] -sub v27.4s, v9.4s, v30.4s -add v9.4s, v9.4s, v30.4s -ldr q30, [x17, #+1984] -ldr q17, [x17, #+2000] -sqrdmulh v18.4S, v0.4S, v22.4S -mul v0.4S, v0.4S,v3.4S -mla v0.4S, v18.4S, v31.s[0] -sub v18.4s, v12.4s, v0.4s -add v12.4s, v12.4s, v0.4s -ldr q0, [x17, #+2016] -ldr q29, [x17, #+2032] -sqrdmulh v5.4S, v20.4S, v17.4S -ldr q19, [x0, #480] -mul v20.4S, v20.4S,v30.4S -mla v20.4S, v5.4S, v31.s[0] -sub v5.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -sqrdmulh v20.4S, v28.4S, v29.4S -ldr q4, [x0, #496] -mul v28.4S, v28.4S,v0.4S -mla v28.4S, v20.4S, v31.s[0] -sub v20.4s, v6.4s, v28.4s -add v6.4s, v6.4s, v28.4s -str q9, [x0, #384] -str q27, [x0, #400] -str q12, [x0, #416] -str q18, [x0, #432] -str q8, [x0, #896] -str q5, [x0, #912] -str q6, [x0, #928] -str q20, [x0, #944] -ldr q29, [x17, #+1024] -ldr q0, [x17, #+1040] -ldr q17, [x0, #448] -ldr q30, [x0, #464] -ldr q13, [x17, #+2048] -ldr q10, [x17, #+2064] -sqrdmulh v21.4S, v19.4S, v0.s[0] -ldr q2, [x0, #992] -mul v19.4S, v19.4S,v29.s[0] -ldr q20, [x0, #1008] -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v17.4s, v19.4s -add v17.4s, v17.4s, v19.4s -sqrdmulh v19.4S, v4.4S, v0.s[0] -ldr q6, [x0, #960] -mul v4.4S, v4.4S,v29.s[0] -ldr q5, [x0, #976] -mla v4.4S, v19.4S, v31.s[0] -sub v19.4s, v30.4s, v4.4s -add v30.4s, v30.4s, v4.4s -sqrdmulh v4.4S, v2.4S, v10.s[0] -mul v2.4S, v2.4S,v13.s[0] -mla v2.4S, v4.4S, v31.s[0] -sub v4.4s, v6.4s, v2.4s -add v6.4s, v6.4s, v2.4s -sqrdmulh v2.4S, v20.4S, v10.s[0] -mul v20.4S, v20.4S,v13.s[0] -mla v20.4S, v2.4S, v31.s[0] -sub v2.4s, v5.4s, v20.4s -add v5.4s, v5.4s, v20.4s -sqrdmulh v20.4S, v30.4S, v0.s[1] -mul v30.4S, v30.4S,v29.s[1] -mla v30.4S, v20.4S, v31.s[0] -sub v20.4s, v17.4s, v30.4s -add v17.4s, v17.4s, v30.4s -sqrdmulh v30.4S, v19.4S, v0.s[2] -mul v19.4S, v19.4S,v29.s[2] -mla v19.4S, v30.4S, v31.s[0] -sub v30.4s, v21.4s, v19.4s -add v21.4s, v21.4s, v19.4s -sqrdmulh v19.4S, v5.4S, v10.s[1] -mul v5.4S, v5.4S,v13.s[1] -mla v5.4S, v19.4S, v31.s[0] -sub v19.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v2.4S, v10.s[2] -mul v2.4S, v2.4S,v13.s[2] -mla v2.4S, v5.4S, v31.s[0] -sub v5.4s, v4.4s, v2.4s -add v4.4s, v4.4s, v2.4s -trn1 v2.4S, v17.4S, v20.4S -trn2 v8.4S, v17.4S, v20.4S -trn1 v22.4S, v21.4S, v30.4S -trn2 v3.4S, v21.4S, v30.4S -trn2 v21.2D, v2.2D, v22.2D -trn2 v30.2D, v8.2D, v3.2D -trn1 v17.2D, v2.2D, v22.2D -trn1 v20.2D, v8.2D, v3.2D -ldr q3, [x17, #+1056] -ldr q8, [x17, #+1072] -trn1 v22.4S, v6.4S, v19.4S -trn2 v2.4S, v6.4S, v19.4S -trn1 v7.4S, v4.4S, v5.4S -trn2 v11.4S, v4.4S, v5.4S -trn2 v4.2D, v22.2D, v7.2D -trn2 v5.2D, v2.2D, v11.2D -trn1 v6.2D, v22.2D, v7.2D -trn1 v19.2D, v2.2D, v11.2D -ldr q11, [x17, #+2080] -ldr q2, [x17, #+2096] -sqrdmulh v7.4S, v21.4S, v8.4S -mul v21.4S, v21.4S,v3.4S -mla v21.4S, v7.4S, v31.s[0] -sub v7.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v30.4S, v8.4S -mul v30.4S, v30.4S,v3.4S -mla v30.4S, v21.4S, v31.s[0] -sub v21.4s, v20.4s, v30.4s -add v20.4s, v20.4s, v30.4s -sqrdmulh v30.4S, v4.4S, v2.4S -mul v4.4S, v4.4S,v11.4S -mla v4.4S, v30.4S, v31.s[0] -sub v30.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -ldr q4, [x17, #+1088] -ldr q22, [x17, #+1104] -sqrdmulh v15.4S, v5.4S, v2.4S -mul v5.4S, v5.4S,v11.4S -mla v5.4S, v15.4S, v31.s[0] -sub v15.4s, v19.4s, v5.4s -add v19.4s, v19.4s, v5.4s -ldr q5, [x17, #+1120] -ldr q16, [x17, #+1136] -sqrdmulh v1.4S, v20.4S, v22.4S -mul v20.4S, v20.4S,v4.4S -mla v20.4S, v1.4S, v31.s[0] -sub v1.4s, v17.4s, v20.4s -add v17.4s, v17.4s, v20.4s -ldr q20, [x17, #+2112] -ldr q14, [x17, #+2128] -sqrdmulh v18.4S, v21.4S, v16.4S -mul v21.4S, v21.4S,v5.4S -mla v21.4S, v18.4S, v31.s[0] -sub v18.4s, v7.4s, v21.4s -add v7.4s, v7.4s, v21.4s -ldr q21, [x17, #+2144] -ldr q12, [x17, #+2160] -sqrdmulh v27.4S, v19.4S, v14.4S -mul v19.4S, v19.4S,v20.4S -mla v19.4S, v27.4S, v31.s[0] -sub v27.4s, v6.4s, v19.4s -add v6.4s, v6.4s, v19.4s -sqrdmulh v19.4S, v15.4S, v12.4S -mul v15.4S, v15.4S,v21.4S -mla v15.4S, v19.4S, v31.s[0] -sub v19.4s, v30.4s, v15.4s -add v30.4s, v30.4s, v15.4s -str q17, [x0, #448] -str q1, [x0, #464] -str q7, [x0, #480] -str q18, [x0, #496] -str q6, [x0, #960] -str q27, [x0, #976] -str q30, [x0, #992] -str q19, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2392 -// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_4.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_4.s deleted file mode 100644 index b9b1089..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_4.s +++ /dev/null @@ -1,2422 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_3_z2_4 -.global _ntt_u32_full_neon_asm_var_4_4_3_z2_4 -ntt_u32_full_neon_asm_var_4_4_3_z2_4: -_ntt_u32_full_neon_asm_var_4_4_3_z2_4: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #800] -ldr q21, [x0, #864] -ldr q20, [x0, #928] -ldr q19, [x0, #992] -ldr q18, [x0, #288] -ldr q17, [x0, #352] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -sqrdmulh v2.4S, v22.4S, v29.s[0] -ldr q1, [x0, #544] -mul v22.4S, v22.4S,v30.s[0] -ldr q0, [x0, #608] -sqrdmulh v15.4S, v21.4S, v29.s[0] -ldr q14, [x0, #672] -mul v21.4S, v21.4S,v30.s[0] -ldr q13, [x0, #736] -mla v22.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q12, [x0, #32] -sub v11.4s, v18.4s, v22.4s -mla v21.4S, v15.4S, v31.s[0] -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q15, [x0, #96] -sub v10.4s, v17.4s, v21.4s -mla v20.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v1.4S, v29.s[0] -ldr q2, [x0, #160] -mul v1.4S, v1.4S,v30.s[0] -sub v9.4s, v16.4s, v20.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v0.4S, v29.s[0] -ldr q22, [x0, #224] -mul v0.4S, v0.4S,v30.s[0] -sub v8.4s, v3.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v12.4s, v1.4s -mla v0.4S, v20.4S, v31.s[0] -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -sub v20.4s, v15.4s, v0.4s -mla v14.4S, v19.4S, v31.s[0] -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v19.4s, v2.4s, v14.4s -mla v13.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v1.4s, v22.4s, v13.4s -mla v16.4S, v0.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v0.4s, v2.4s, v16.4s -mla v3.4S, v14.4S, v31.s[0] -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v14.4s, v22.4s, v3.4s -mla v18.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v13.4s, v12.4s, v18.4s -mla v17.4S, v16.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v8.4S, v29.s[2] -mul v8.4S, v8.4S,v30.s[2] -sub v16.4s, v15.4s, v17.4s -mla v9.4S, v3.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v3.4s, v19.4s, v9.4s -mla v8.4S, v18.4S, v31.s[0] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v8.4s -mla v11.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v8.4s -sqrdmulh v8.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v17.4s, v21.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v27.s[0] -mul v22.4S, v22.4S,v28.s[0] -sub v9.4s, v20.4s, v10.4s -mla v2.4S, v8.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v8.4s, v12.4s, v2.4s -mla v22.4S, v11.4S, v31.s[0] -add v12.4s, v12.4s, v2.4s -sqrdmulh v2.4S, v14.4S, v27.s[1] -mul v14.4S, v14.4S,v28.s[1] -sub v11.4s, v15.4s, v22.4s -mla v0.4S, v10.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v27.s[2] -mul v19.4S, v19.4S,v28.s[2] -sub v10.4s, v13.4s, v0.4s -mla v14.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v2.4s, v16.4s, v14.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v27.s[3] -mul v3.4S, v3.4S,v28.s[3] -sub v22.4s, v21.4s, v19.4s -mla v1.4S, v0.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v0.4s, v20.4s, v1.4s -mla v3.4S, v14.4S, v31.s[0] -add v20.4s, v20.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v25.s[0] -mul v15.4S, v15.4S,v26.s[0] -sub v14.4s, v17.4s, v3.4s -mla v18.4S, v19.4S, v31.s[0] -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v11.4S, v25.s[1] -mul v11.4S, v11.4S,v26.s[1] -sub v19.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v1.4s, v12.4s, v15.4s -mla v11.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v25.s[3] -mul v2.4S, v2.4S,v26.s[3] -sub v3.4s, v8.4s, v11.4s -mla v16.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v11.4s -str q12, [x0, #32] -sqrdmulh v12.4S, v20.4S, v23.s[0] -str q1, [x0, #96] -mul v20.4S, v20.4S,v24.s[0] -ldr q1, [x0, #816] -sub v11.4s, v13.4s, v16.4s -ldr q18, [x0, #880] -mla v2.4S, v15.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -str q8, [x0, #160] -sqrdmulh v8.4S, v0.4S, v23.s[1] -str q3, [x0, #224] -mul v0.4S, v0.4S,v24.s[1] -ldr q3, [x0, #944] -sub v16.4s, v10.4s, v2.4s -ldr q15, [x0, #1008] -mla v20.4S, v12.4S, v31.s[0] -add v10.4s, v10.4s, v2.4s -str q13, [x0, #288] -sqrdmulh v13.4S, v9.4S, v23.s[2] -str q11, [x0, #352] -mul v9.4S, v9.4S,v24.s[2] -ldr q11, [x0, #304] -sub v2.4s, v21.4s, v20.4s -ldr q12, [x0, #368] -mla v0.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v20.4s -str q10, [x0, #416] -sqrdmulh v10.4S, v19.4S, v23.s[3] -str q16, [x0, #480] -mul v19.4S, v19.4S,v24.s[3] -ldr q16, [x0, #432] -sub v20.4s, v22.4s, v0.4s -ldr q8, [x0, #496] -mla v9.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -str q21, [x0, #544] -sqrdmulh v21.4S, v1.4S, v29.s[0] -str q2, [x0, #608] -ldr q2, [x0, #560] -mul v1.4S, v1.4S,v30.s[0] -ldr q0, [x0, #624] -sub v13.4s, v17.4s, v9.4s -mla v19.4S, v10.4S, v31.s[0] -add v17.4s, v17.4s, v9.4s -str q22, [x0, #672] -sqrdmulh v22.4S, v18.4S, v29.s[0] -str q20, [x0, #736] -ldr q20, [x0, #688] -mul v18.4S, v18.4S,v30.s[0] -ldr q9, [x0, #752] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -str q17, [x0, #800] -sqrdmulh v17.4S, v3.4S, v29.s[0] -str q13, [x0, #864] -mul v3.4S, v3.4S,v30.s[0] -ldr q13, [x0, #48] -sub v19.4s, v11.4s, v1.4s -mla v18.4S, v22.4S, v31.s[0] -add v11.4s, v11.4s, v1.4s -str q14, [x0, #928] -sqrdmulh v14.4S, v15.4S, v29.s[0] -str q10, [x0, #992] -mul v15.4S, v15.4S,v30.s[0] -ldr q10, [x0, #112] -sub v1.4s, v12.4s, v18.4s -mla v3.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v29.s[0] -ldr q17, [x0, #176] -mul v2.4S, v2.4S,v30.s[0] -sub v22.4s, v16.4s, v3.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v0.4S, v29.s[0] -ldr q14, [x0, #240] -mul v0.4S, v0.4S,v30.s[0] -sub v21.4s, v8.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -sub v18.4s, v13.4s, v2.4s -mla v0.4S, v3.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v9.4S, v29.s[0] -mul v9.4S, v9.4S,v30.s[0] -sub v3.4s, v10.4s, v0.4s -mla v20.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v15.4s, v17.4s, v20.4s -mla v9.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sub v2.4s, v14.4s, v9.4s -mla v16.4S, v0.4S, v31.s[0] -add v14.4s, v14.4s, v9.4s -sqrdmulh v9.4S, v11.4S, v29.s[1] -mul v11.4S, v11.4S,v30.s[1] -sub v0.4s, v17.4s, v16.4s -mla v8.4S, v20.4S, v31.s[0] -add v17.4s, v17.4s, v16.4s -sqrdmulh v16.4S, v12.4S, v29.s[1] -mul v12.4S, v12.4S,v30.s[1] -sub v20.4s, v14.4s, v8.4s -mla v11.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v8.4s -sqrdmulh v8.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v9.4s, v13.4s, v11.4s -mla v12.4S, v16.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v16.4s, v10.4s, v12.4s -mla v22.4S, v8.4S, v31.s[0] -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v8.4s, v15.4s, v22.4s -mla v21.4S, v11.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -sub v11.4s, v2.4s, v21.4s -mla v19.4S, v12.4S, v31.s[0] -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[0] -mul v17.4S, v17.4S,v28.s[0] -sub v12.4s, v18.4s, v19.4s -mla v1.4S, v22.4S, v31.s[0] -add v18.4s, v18.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v22.4s, v3.4s, v1.4s -mla v17.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v21.4s, v13.4s, v17.4s -mla v14.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v20.4S, v27.s[1] -mul v20.4S, v20.4S,v28.s[1] -sub v19.4s, v10.4s, v14.4s -mla v0.4S, v1.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v27.s[2] -mul v15.4S, v15.4S,v28.s[2] -sub v1.4s, v9.4s, v0.4s -mla v20.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v0.4s -sqrdmulh v0.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v17.4s, v16.4s, v20.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v14.4s, v18.4s, v15.4s -mla v2.4S, v0.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v27.s[3] -mul v11.4S, v11.4S,v28.s[3] -sub v0.4s, v3.4s, v2.4s -mla v8.4S, v20.4S, v31.s[0] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v10.4S, v25.s[0] -mul v10.4S, v10.4S,v26.s[0] -sub v20.4s, v12.4s, v8.4s -mla v11.4S, v15.4S, v31.s[0] -add v12.4s, v12.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v15.4s, v22.4s, v11.4s -mla v10.4S, v2.4S, v31.s[0] -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v2.4s, v13.4s, v10.4s -mla v19.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v10.4s -sqrdmulh v10.4S, v17.4S, v25.s[3] -mul v17.4S, v17.4S,v26.s[3] -sub v8.4s, v21.4s, v19.4s -mla v16.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -str q13, [x0, #48] -sqrdmulh v13.4S, v3.4S, v23.s[0] -str q2, [x0, #112] -mul v3.4S, v3.4S,v24.s[0] -ldr q2, [x0, #768] -sub v19.4s, v9.4s, v16.4s -ldr q11, [x0, #832] -mla v17.4S, v10.4S, v31.s[0] -add v9.4s, v9.4s, v16.4s -str q21, [x0, #176] -sqrdmulh v21.4S, v0.4S, v23.s[1] -str q8, [x0, #240] -mul v0.4S, v0.4S,v24.s[1] -ldr q8, [x0, #896] -sub v16.4s, v1.4s, v17.4s -ldr q10, [x0, #960] -mla v3.4S, v13.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -str q9, [x0, #304] -sqrdmulh v9.4S, v22.4S, v23.s[2] -str q19, [x0, #368] -mul v22.4S, v22.4S,v24.s[2] -ldr q19, [x0, #256] -sub v17.4s, v18.4s, v3.4s -ldr q13, [x0, #320] -mla v0.4S, v21.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -str q1, [x0, #432] -sqrdmulh v1.4S, v15.4S, v23.s[3] -str q16, [x0, #496] -mul v15.4S, v15.4S,v24.s[3] -ldr q16, [x0, #384] -sub v3.4s, v14.4s, v0.4s -ldr q21, [x0, #448] -mla v22.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -str q18, [x0, #560] -sqrdmulh v18.4S, v2.4S, v29.s[0] -str q17, [x0, #624] -ldr q17, [x0, #512] -mul v2.4S, v2.4S,v30.s[0] -ldr q0, [x0, #576] -sub v9.4s, v12.4s, v22.4s -mla v15.4S, v1.4S, v31.s[0] -add v12.4s, v12.4s, v22.4s -str q14, [x0, #688] -sqrdmulh v14.4S, v11.4S, v29.s[0] -str q3, [x0, #752] -ldr q3, [x0, #640] -mul v11.4S, v11.4S,v30.s[0] -ldr q22, [x0, #704] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -str q12, [x0, #816] -sqrdmulh v12.4S, v8.4S, v29.s[0] -str q9, [x0, #880] -mul v8.4S, v8.4S,v30.s[0] -ldr q9, [x0, #0] -sub v15.4s, v19.4s, v2.4s -mla v11.4S, v14.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -str q20, [x0, #944] -sqrdmulh v20.4S, v10.4S, v29.s[0] -str q1, [x0, #1008] -mul v10.4S, v10.4S,v30.s[0] -ldr q1, [x0, #64] -sub v2.4s, v13.4s, v11.4s -mla v8.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v29.s[0] -ldr q12, [x0, #128] -mul v17.4S, v17.4S,v30.s[0] -sub v14.4s, v16.4s, v8.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v0.4S, v29.s[0] -ldr q20, [x0, #192] -mul v0.4S, v0.4S,v30.s[0] -sub v18.4s, v21.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -sub v11.4s, v9.4s, v17.4s -mla v0.4S, v8.4S, v31.s[0] -add v9.4s, v9.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v8.4s, v1.4s, v0.4s -mla v3.4S, v10.4S, v31.s[0] -add v1.4s, v1.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v10.4s, v12.4s, v3.4s -mla v22.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v17.4s, v20.4s, v22.4s -mla v16.4S, v0.4S, v31.s[0] -add v20.4s, v20.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -sub v0.4s, v12.4s, v16.4s -mla v21.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v3.4s, v20.4s, v21.4s -mla v19.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v22.4s, v9.4s, v19.4s -mla v13.4S, v16.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v29.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v16.4s, v1.4s, v13.4s -mla v14.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v13.4s -sqrdmulh v13.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v21.4s, v10.4s, v14.4s -mla v18.4S, v19.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v19.4s, v17.4s, v18.4s -mla v15.4S, v13.4S, v31.s[0] -add v17.4s, v17.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v13.4s, v11.4s, v15.4s -mla v2.4S, v14.4S, v31.s[0] -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v27.s[0] -mul v20.4S, v20.4S,v28.s[0] -sub v14.4s, v8.4s, v2.4s -mla v12.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v2.4s -sqrdmulh v2.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v18.4s, v9.4s, v12.4s -mla v20.4S, v15.4S, v31.s[0] -add v9.4s, v9.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -sub v15.4s, v1.4s, v20.4s -mla v0.4S, v2.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -sub v2.4s, v22.4s, v0.4s -mla v3.4S, v12.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v12.4s, v16.4s, v3.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v11.4s, v10.4s -mla v17.4S, v0.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -sqrdmulh v10.4S, v19.4S, v27.s[3] -mul v19.4S, v19.4S,v28.s[3] -sub v0.4s, v8.4s, v17.4s -mla v21.4S, v3.4S, v31.s[0] -add v8.4s, v8.4s, v17.4s -sqrdmulh v17.4S, v1.4S, v25.s[0] -mul v1.4S, v1.4S,v26.s[0] -sub v3.4s, v13.4s, v21.4s -mla v19.4S, v10.4S, v31.s[0] -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v15.4S, v25.s[1] -mul v15.4S, v15.4S,v26.s[1] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v17.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v17.4s, v9.4s, v1.4s -mla v15.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -sub v21.4s, v18.4s, v15.4s -mla v16.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -str q9, [x0, #0] -sqrdmulh v9.4S, v8.4S, v23.s[0] -str q17, [x0, #64] -mul v8.4S, v8.4S,v24.s[0] -ldr q17, [x0, #784] -sub v15.4s, v22.4s, v16.4s -ldr q19, [x0, #848] -mla v12.4S, v1.4S, v31.s[0] -add v22.4s, v22.4s, v16.4s -str q18, [x0, #128] -sqrdmulh v18.4S, v0.4S, v23.s[1] -str q21, [x0, #192] -mul v0.4S, v0.4S,v24.s[1] -ldr q21, [x0, #912] -sub v16.4s, v2.4s, v12.4s -ldr q1, [x0, #976] -mla v8.4S, v9.4S, v31.s[0] -add v2.4s, v2.4s, v12.4s -str q22, [x0, #256] -sqrdmulh v22.4S, v14.4S, v23.s[2] -str q15, [x0, #320] -mul v14.4S, v14.4S,v24.s[2] -ldr q15, [x0, #272] -sub v12.4s, v11.4s, v8.4s -ldr q9, [x0, #336] -mla v0.4S, v18.4S, v31.s[0] -add v11.4s, v11.4s, v8.4s -str q2, [x0, #384] -sqrdmulh v2.4S, v10.4S, v23.s[3] -str q16, [x0, #448] -mul v10.4S, v10.4S,v24.s[3] -ldr q16, [x0, #400] -sub v8.4s, v20.4s, v0.4s -ldr q18, [x0, #464] -mla v14.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v0.4s -str q11, [x0, #512] -sqrdmulh v11.4S, v17.4S, v29.s[0] -str q12, [x0, #576] -ldr q12, [x0, #528] -mul v17.4S, v17.4S,v30.s[0] -ldr q0, [x0, #592] -sub v22.4s, v13.4s, v14.4s -mla v10.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v14.4s -str q20, [x0, #640] -sqrdmulh v20.4S, v19.4S, v29.s[0] -str q8, [x0, #704] -ldr q8, [x0, #656] -mul v19.4S, v19.4S,v30.s[0] -ldr q14, [x0, #720] -sub v2.4s, v3.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v3.4s, v3.4s, v10.4s -str q13, [x0, #768] -sqrdmulh v13.4S, v21.4S, v29.s[0] -str q22, [x0, #832] -mul v21.4S, v21.4S,v30.s[0] -ldr q22, [x0, #16] -sub v10.4s, v15.4s, v17.4s -mla v19.4S, v20.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -str q3, [x0, #896] -sqrdmulh v3.4S, v1.4S, v29.s[0] -str q2, [x0, #960] -mul v1.4S, v1.4S,v30.s[0] -ldr q2, [x0, #80] -sub v17.4s, v9.4s, v19.4s -mla v21.4S, v13.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v12.4S, v29.s[0] -ldr q13, [x0, #144] -mul v12.4S, v12.4S,v30.s[0] -sub v20.4s, v16.4s, v21.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v0.4S, v29.s[0] -ldr q3, [x0, #208] -mul v0.4S, v0.4S,v30.s[0] -sub v11.4s, v18.4s, v1.4s -mla v12.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v1.4s -sqrdmulh v1.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v19.4s, v22.4s, v12.4s -mla v0.4S, v21.4S, v31.s[0] -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v2.4s, v0.4s -mla v8.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v1.4s, v13.4s, v8.4s -mla v14.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v12.4s, v3.4s, v14.4s -mla v16.4S, v0.4S, v31.s[0] -add v3.4s, v3.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -sub v0.4s, v13.4s, v16.4s -mla v18.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -sqrdmulh v16.4S, v9.4S, v29.s[1] -mul v9.4S, v9.4S,v30.s[1] -sub v8.4s, v3.4s, v18.4s -mla v15.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v14.4s, v22.4s, v15.4s -mla v9.4S, v16.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v16.4s, v2.4s, v9.4s -mla v20.4S, v18.4S, v31.s[0] -add v2.4s, v2.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v20.4s -mla v11.4S, v15.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v15.4s, v12.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v12.4s, v12.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v27.s[0] -mul v13.4S, v13.4S,v28.s[0] -sub v9.4s, v19.4s, v10.4s -mla v17.4S, v20.4S, v31.s[0] -add v19.4s, v19.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v27.s[0] -mul v3.4S, v3.4S,v28.s[0] -sub v20.4s, v21.4s, v17.4s -mla v13.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v11.4s, v22.4s, v13.4s -mla v3.4S, v10.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -sub v10.4s, v2.4s, v3.4s -mla v0.4S, v17.4S, v31.s[0] -add v2.4s, v2.4s, v3.4s -sqrdmulh v3.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v17.4s, v14.4s, v0.4s -mla v8.4S, v13.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[2] -mul v12.4S, v12.4S,v28.s[2] -sub v13.4s, v16.4s, v8.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v3.4s, v19.4s, v1.4s -mla v12.4S, v0.4S, v31.s[0] -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v0.4s, v21.4s, v12.4s -mla v18.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v10.4S, v25.s[1] -mul v10.4S, v10.4S,v26.s[1] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v12.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v12.4s, v22.4s, v2.4s -mla v10.4S, v18.4S, v31.s[0] -add v22.4s, v22.4s, v2.4s -sqrdmulh v2.4S, v13.4S, v25.s[3] -mul v13.4S, v13.4S,v26.s[3] -sub v18.4s, v11.4s, v10.4s -mla v16.4S, v15.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -str q22, [x0, #16] -sqrdmulh v22.4S, v21.4S, v23.s[0] -str q12, [x0, #80] -mul v21.4S, v21.4S,v24.s[0] -sub v12.4s, v14.4s, v16.4s -mla v13.4S, v2.4S, v31.s[0] -add v14.4s, v14.4s, v16.4s -str q11, [x0, #144] -sqrdmulh v11.4S, v0.4S, v23.s[1] -str q18, [x0, #208] -mul v0.4S, v0.4S,v24.s[1] -sub v18.4s, v17.4s, v13.4s -mla v21.4S, v22.4S, v31.s[0] -add v17.4s, v17.4s, v13.4s -str q14, [x0, #272] -sqrdmulh v14.4S, v20.4S, v23.s[2] -str q12, [x0, #336] -mul v20.4S, v20.4S,v24.s[2] -sub v12.4s, v19.4s, v21.4s -mla v0.4S, v11.4S, v31.s[0] -add v19.4s, v19.4s, v21.4s -str q17, [x0, #400] -sqrdmulh v17.4S, v1.4S, v23.s[3] -str q18, [x0, #464] -mul v1.4S, v1.4S,v24.s[3] -sub v18.4s, v3.4s, v0.4s -mla v20.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v0.4s -str q19, [x0, #528] -str q12, [x0, #592] -sub v12.4s, v9.4s, v20.4s -mla v1.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v20.4s -str q3, [x0, #656] -str q18, [x0, #720] -sub v18.4s, v8.4s, v1.4s -add v8.4s, v8.4s, v1.4s -str q9, [x0, #784] -str q12, [x0, #848] -str q8, [x0, #912] -str q18, [x0, #976] -ldr q4, [x0, #32] -ldr q5, [x0, #48] -ldr q6, [x17, #+128] -ldr q7, [x17, #+144] -ldr q15, [x0, #0] -ldr q10, [x0, #16] -ldr q2, [x17, #+1152] -ldr q16, [x17, #+1168] -sqrdmulh v22.4S, v4.4S, v7.s[0] -ldr q13, [x0, #544] -mul v4.4S, v4.4S,v6.s[0] -ldr q11, [x0, #560] -mla v4.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v4.4s -add v15.4s, v15.4s, v4.4s -sqrdmulh v4.4S, v5.4S, v7.s[0] -ldr q21, [x0, #512] -mul v5.4S, v5.4S,v6.s[0] -ldr q14, [x0, #528] -mla v5.4S, v4.4S, v31.s[0] -sub v4.4s, v10.4s, v5.4s -add v10.4s, v10.4s, v5.4s -sqrdmulh v5.4S, v13.4S, v16.s[0] -mul v13.4S, v13.4S,v2.s[0] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v11.4S, v16.s[0] -mul v11.4S, v11.4S,v2.s[0] -mla v11.4S, v13.4S, v31.s[0] -sub v13.4s, v14.4s, v11.4s -add v14.4s, v14.4s, v11.4s -sqrdmulh v11.4S, v10.4S, v7.s[1] -mul v10.4S, v10.4S,v6.s[1] -mla v10.4S, v11.4S, v31.s[0] -sub v11.4s, v15.4s, v10.4s -add v15.4s, v15.4s, v10.4s -sqrdmulh v10.4S, v4.4S, v7.s[2] -mul v4.4S, v4.4S,v6.s[2] -mla v4.4S, v10.4S, v31.s[0] -sub v10.4s, v22.4s, v4.4s -add v22.4s, v22.4s, v4.4s -sqrdmulh v4.4S, v14.4S, v16.s[1] -mul v14.4S, v14.4S,v2.s[1] -mla v14.4S, v4.4S, v31.s[0] -sub v4.4s, v21.4s, v14.4s -trn1 v0.4S, v15.4S, v11.4S -trn2 v19.4S, v15.4S, v11.4S -add v21.4s, v21.4s, v14.4s -trn1 v14.4S, v22.4S, v10.4S -trn2 v17.4S, v22.4S, v10.4S -sqrdmulh v20.4S, v13.4S, v16.s[2] -ldr q3, [x17, #+160] -mul v13.4S, v13.4S,v2.s[2] -ldr q1, [x17, #+176] -mla v13.4S, v20.4S, v31.s[0] -trn2 v22.2D, v0.2D, v14.2D -trn2 v10.2D, v19.2D, v17.2D -sub v20.4s, v5.4s, v13.4s -trn1 v15.2D, v0.2D, v14.2D -trn1 v11.2D, v19.2D, v17.2D -add v5.4s, v5.4s, v13.4s -sqrdmulh v13.4S, v22.4S, v1.4S -mul v22.4S, v22.4S,v3.4S -mla v22.4S, v13.4S, v31.s[0] -trn1 v13.4S, v21.4S, v4.4S -trn2 v17.4S, v21.4S, v4.4S -sub v19.4s, v15.4s, v22.4s -trn1 v14.4S, v5.4S, v20.4S -trn2 v0.4S, v5.4S, v20.4S -add v15.4s, v15.4s, v22.4s -trn2 v5.2D, v13.2D, v14.2D -trn2 v20.2D, v17.2D, v0.2D -sqrdmulh v22.4S, v10.4S, v1.4S -trn1 v21.2D, v13.2D, v14.2D -trn1 v4.2D, v17.2D, v0.2D -ldr q0, [x17, #+1184] -ldr q17, [x17, #+1200] -mul v10.4S, v10.4S,v3.4S -mla v10.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v10.4s -add v11.4s, v11.4s, v10.4s -sqrdmulh v10.4S, v5.4S, v17.4S -mul v5.4S, v5.4S,v0.4S -mla v5.4S, v10.4S, v31.s[0] -sub v10.4s, v21.4s, v5.4s -add v21.4s, v21.4s, v5.4s -ldr q5, [x17, #+192] -ldr q14, [x17, #+208] -sqrdmulh v13.4S, v20.4S, v17.4S -mul v20.4S, v20.4S,v0.4S -mla v20.4S, v13.4S, v31.s[0] -sub v13.4s, v4.4s, v20.4s -add v4.4s, v4.4s, v20.4s -ldr q20, [x17, #+224] -ldr q9, [x17, #+240] -sqrdmulh v12.4S, v11.4S, v14.4S -mul v11.4S, v11.4S,v5.4S -mla v11.4S, v12.4S, v31.s[0] -sub v12.4s, v15.4s, v11.4s -add v15.4s, v15.4s, v11.4s -ldr q11, [x17, #+1216] -ldr q8, [x17, #+1232] -sqrdmulh v18.4S, v22.4S, v9.4S -mul v22.4S, v22.4S,v20.4S -mla v22.4S, v18.4S, v31.s[0] -sub v18.4s, v19.4s, v22.4s -add v19.4s, v19.4s, v22.4s -ldr q22, [x17, #+1248] -ldr q30, [x17, #+1264] -sqrdmulh v29.4S, v4.4S, v8.4S -ldr q28, [x0, #96] -mul v4.4S, v4.4S,v11.4S -mla v4.4S, v29.4S, v31.s[0] -sub v29.4s, v21.4s, v4.4s -add v21.4s, v21.4s, v4.4s -sqrdmulh v4.4S, v13.4S, v30.4S -ldr q27, [x0, #112] -mul v13.4S, v13.4S,v22.4S -mla v13.4S, v4.4S, v31.s[0] -sub v4.4s, v10.4s, v13.4s -add v10.4s, v10.4s, v13.4s -str q15, [x0, #0] -str q12, [x0, #16] -str q19, [x0, #32] -str q18, [x0, #48] -str q21, [x0, #512] -str q29, [x0, #528] -str q10, [x0, #544] -str q4, [x0, #560] -ldr q30, [x17, #+256] -ldr q22, [x17, #+272] -ldr q8, [x0, #64] -ldr q11, [x0, #80] -ldr q17, [x17, #+1280] -ldr q0, [x17, #+1296] -sqrdmulh v16.4S, v28.4S, v22.s[0] -ldr q2, [x0, #608] -mul v28.4S, v28.4S,v30.s[0] -ldr q4, [x0, #624] -mla v28.4S, v16.4S, v31.s[0] -sub v16.4s, v8.4s, v28.4s -add v8.4s, v8.4s, v28.4s -sqrdmulh v28.4S, v27.4S, v22.s[0] -ldr q10, [x0, #576] -mul v27.4S, v27.4S,v30.s[0] -ldr q29, [x0, #592] -mla v27.4S, v28.4S, v31.s[0] -sub v28.4s, v11.4s, v27.4s -add v11.4s, v11.4s, v27.4s -sqrdmulh v27.4S, v2.4S, v0.s[0] -mul v2.4S, v2.4S,v17.s[0] -mla v2.4S, v27.4S, v31.s[0] -sub v27.4s, v10.4s, v2.4s -add v10.4s, v10.4s, v2.4s -sqrdmulh v2.4S, v4.4S, v0.s[0] -mul v4.4S, v4.4S,v17.s[0] -mla v4.4S, v2.4S, v31.s[0] -sub v2.4s, v29.4s, v4.4s -add v29.4s, v29.4s, v4.4s -sqrdmulh v4.4S, v11.4S, v22.s[1] -mul v11.4S, v11.4S,v30.s[1] -mla v11.4S, v4.4S, v31.s[0] -sub v4.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -sqrdmulh v11.4S, v28.4S, v22.s[2] -mul v28.4S, v28.4S,v30.s[2] -mla v28.4S, v11.4S, v31.s[0] -sub v11.4s, v16.4s, v28.4s -add v16.4s, v16.4s, v28.4s -sqrdmulh v28.4S, v29.4S, v0.s[1] -mul v29.4S, v29.4S,v17.s[1] -mla v29.4S, v28.4S, v31.s[0] -sub v28.4s, v10.4s, v29.4s -trn1 v21.4S, v8.4S, v4.4S -trn2 v9.4S, v8.4S, v4.4S -add v10.4s, v10.4s, v29.4s -trn1 v29.4S, v16.4S, v11.4S -trn2 v20.4S, v16.4S, v11.4S -sqrdmulh v14.4S, v2.4S, v0.s[2] -ldr q5, [x17, #+288] -mul v2.4S, v2.4S,v17.s[2] -ldr q1, [x17, #+304] -mla v2.4S, v14.4S, v31.s[0] -trn2 v16.2D, v21.2D, v29.2D -trn2 v11.2D, v9.2D, v20.2D -sub v14.4s, v27.4s, v2.4s -trn1 v8.2D, v21.2D, v29.2D -trn1 v4.2D, v9.2D, v20.2D -add v27.4s, v27.4s, v2.4s -sqrdmulh v2.4S, v16.4S, v1.4S -mul v16.4S, v16.4S,v5.4S -mla v16.4S, v2.4S, v31.s[0] -trn1 v2.4S, v10.4S, v28.4S -trn2 v20.4S, v10.4S, v28.4S -sub v9.4s, v8.4s, v16.4s -trn1 v29.4S, v27.4S, v14.4S -trn2 v21.4S, v27.4S, v14.4S -add v8.4s, v8.4s, v16.4s -trn2 v27.2D, v2.2D, v29.2D -trn2 v14.2D, v20.2D, v21.2D -sqrdmulh v16.4S, v11.4S, v1.4S -trn1 v10.2D, v2.2D, v29.2D -trn1 v28.2D, v20.2D, v21.2D -ldr q21, [x17, #+1312] -ldr q20, [x17, #+1328] -mul v11.4S, v11.4S,v5.4S -mla v11.4S, v16.4S, v31.s[0] -sub v16.4s, v4.4s, v11.4s -add v4.4s, v4.4s, v11.4s -sqrdmulh v11.4S, v27.4S, v20.4S -mul v27.4S, v27.4S,v21.4S -mla v27.4S, v11.4S, v31.s[0] -sub v11.4s, v10.4s, v27.4s -add v10.4s, v10.4s, v27.4s -ldr q27, [x17, #+320] -ldr q29, [x17, #+336] -sqrdmulh v2.4S, v14.4S, v20.4S -mul v14.4S, v14.4S,v21.4S -mla v14.4S, v2.4S, v31.s[0] -sub v2.4s, v28.4s, v14.4s -add v28.4s, v28.4s, v14.4s -ldr q14, [x17, #+352] -ldr q3, [x17, #+368] -sqrdmulh v7.4S, v4.4S, v29.4S -mul v4.4S, v4.4S,v27.4S -mla v4.4S, v7.4S, v31.s[0] -sub v7.4s, v8.4s, v4.4s -add v8.4s, v8.4s, v4.4s -ldr q4, [x17, #+1344] -ldr q6, [x17, #+1360] -sqrdmulh v18.4S, v16.4S, v3.4S -mul v16.4S, v16.4S,v14.4S -mla v16.4S, v18.4S, v31.s[0] -sub v18.4s, v9.4s, v16.4s -add v9.4s, v9.4s, v16.4s -ldr q16, [x17, #+1376] -ldr q19, [x17, #+1392] -sqrdmulh v12.4S, v28.4S, v6.4S -ldr q15, [x0, #160] -mul v28.4S, v28.4S,v4.4S -mla v28.4S, v12.4S, v31.s[0] -sub v12.4s, v10.4s, v28.4s -add v10.4s, v10.4s, v28.4s -sqrdmulh v28.4S, v2.4S, v19.4S -ldr q13, [x0, #176] -mul v2.4S, v2.4S,v16.4S -mla v2.4S, v28.4S, v31.s[0] -sub v28.4s, v11.4s, v2.4s -add v11.4s, v11.4s, v2.4s -str q8, [x0, #64] -str q7, [x0, #80] -str q9, [x0, #96] -str q18, [x0, #112] -str q10, [x0, #576] -str q12, [x0, #592] -str q11, [x0, #608] -str q28, [x0, #624] -ldr q19, [x17, #+384] -ldr q16, [x17, #+400] -ldr q6, [x0, #128] -ldr q4, [x0, #144] -ldr q20, [x17, #+1408] -ldr q21, [x17, #+1424] -sqrdmulh v0.4S, v15.4S, v16.s[0] -ldr q17, [x0, #672] -mul v15.4S, v15.4S,v19.s[0] -ldr q28, [x0, #688] -mla v15.4S, v0.4S, v31.s[0] -sub v0.4s, v6.4s, v15.4s -add v6.4s, v6.4s, v15.4s -sqrdmulh v15.4S, v13.4S, v16.s[0] -ldr q11, [x0, #640] -mul v13.4S, v13.4S,v19.s[0] -ldr q12, [x0, #656] -mla v13.4S, v15.4S, v31.s[0] -sub v15.4s, v4.4s, v13.4s -add v4.4s, v4.4s, v13.4s -sqrdmulh v13.4S, v17.4S, v21.s[0] -mul v17.4S, v17.4S,v20.s[0] -mla v17.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v17.4s -add v11.4s, v11.4s, v17.4s -sqrdmulh v17.4S, v28.4S, v21.s[0] -mul v28.4S, v28.4S,v20.s[0] -mla v28.4S, v17.4S, v31.s[0] -sub v17.4s, v12.4s, v28.4s -add v12.4s, v12.4s, v28.4s -sqrdmulh v28.4S, v4.4S, v16.s[1] -mul v4.4S, v4.4S,v19.s[1] -mla v4.4S, v28.4S, v31.s[0] -sub v28.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -sqrdmulh v4.4S, v15.4S, v16.s[2] -mul v15.4S, v15.4S,v19.s[2] -mla v15.4S, v4.4S, v31.s[0] -sub v4.4s, v0.4s, v15.4s -add v0.4s, v0.4s, v15.4s -sqrdmulh v15.4S, v12.4S, v21.s[1] -mul v12.4S, v12.4S,v20.s[1] -mla v12.4S, v15.4S, v31.s[0] -sub v15.4s, v11.4s, v12.4s -trn1 v10.4S, v6.4S, v28.4S -trn2 v3.4S, v6.4S, v28.4S -add v11.4s, v11.4s, v12.4s -trn1 v12.4S, v0.4S, v4.4S -trn2 v14.4S, v0.4S, v4.4S -sqrdmulh v29.4S, v17.4S, v21.s[2] -ldr q27, [x17, #+416] -mul v17.4S, v17.4S,v20.s[2] -ldr q1, [x17, #+432] -mla v17.4S, v29.4S, v31.s[0] -trn2 v0.2D, v10.2D, v12.2D -trn2 v4.2D, v3.2D, v14.2D -sub v29.4s, v13.4s, v17.4s -trn1 v6.2D, v10.2D, v12.2D -trn1 v28.2D, v3.2D, v14.2D -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v1.4S -mul v0.4S, v0.4S,v27.4S -mla v0.4S, v17.4S, v31.s[0] -trn1 v17.4S, v11.4S, v15.4S -trn2 v14.4S, v11.4S, v15.4S -sub v3.4s, v6.4s, v0.4s -trn1 v12.4S, v13.4S, v29.4S -trn2 v10.4S, v13.4S, v29.4S -add v6.4s, v6.4s, v0.4s -trn2 v13.2D, v17.2D, v12.2D -trn2 v29.2D, v14.2D, v10.2D -sqrdmulh v0.4S, v4.4S, v1.4S -trn1 v11.2D, v17.2D, v12.2D -trn1 v15.2D, v14.2D, v10.2D -ldr q10, [x17, #+1440] -ldr q14, [x17, #+1456] -mul v4.4S, v4.4S,v27.4S -mla v4.4S, v0.4S, v31.s[0] -sub v0.4s, v28.4s, v4.4s -add v28.4s, v28.4s, v4.4s -sqrdmulh v4.4S, v13.4S, v14.4S -mul v13.4S, v13.4S,v10.4S -mla v13.4S, v4.4S, v31.s[0] -sub v4.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -ldr q13, [x17, #+448] -ldr q12, [x17, #+464] -sqrdmulh v17.4S, v29.4S, v14.4S -mul v29.4S, v29.4S,v10.4S -mla v29.4S, v17.4S, v31.s[0] -sub v17.4s, v15.4s, v29.4s -add v15.4s, v15.4s, v29.4s -ldr q29, [x17, #+480] -ldr q5, [x17, #+496] -sqrdmulh v22.4S, v28.4S, v12.4S -mul v28.4S, v28.4S,v13.4S -mla v28.4S, v22.4S, v31.s[0] -sub v22.4s, v6.4s, v28.4s -add v6.4s, v6.4s, v28.4s -ldr q28, [x17, #+1472] -ldr q30, [x17, #+1488] -sqrdmulh v18.4S, v0.4S, v5.4S -mul v0.4S, v0.4S,v29.4S -mla v0.4S, v18.4S, v31.s[0] -sub v18.4s, v3.4s, v0.4s -add v3.4s, v3.4s, v0.4s -ldr q0, [x17, #+1504] -ldr q9, [x17, #+1520] -sqrdmulh v7.4S, v15.4S, v30.4S -ldr q8, [x0, #224] -mul v15.4S, v15.4S,v28.4S -mla v15.4S, v7.4S, v31.s[0] -sub v7.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v17.4S, v9.4S -ldr q2, [x0, #240] -mul v17.4S, v17.4S,v0.4S -mla v17.4S, v15.4S, v31.s[0] -sub v15.4s, v4.4s, v17.4s -add v4.4s, v4.4s, v17.4s -str q6, [x0, #128] -str q22, [x0, #144] -str q3, [x0, #160] -str q18, [x0, #176] -str q11, [x0, #640] -str q7, [x0, #656] -str q4, [x0, #672] -str q15, [x0, #688] -ldr q9, [x17, #+512] -ldr q0, [x17, #+528] -ldr q30, [x0, #192] -ldr q28, [x0, #208] -ldr q14, [x17, #+1536] -ldr q10, [x17, #+1552] -sqrdmulh v21.4S, v8.4S, v0.s[0] -ldr q20, [x0, #736] -mul v8.4S, v8.4S,v9.s[0] -ldr q15, [x0, #752] -mla v8.4S, v21.4S, v31.s[0] -sub v21.4s, v30.4s, v8.4s -add v30.4s, v30.4s, v8.4s -sqrdmulh v8.4S, v2.4S, v0.s[0] -ldr q4, [x0, #704] -mul v2.4S, v2.4S,v9.s[0] -ldr q7, [x0, #720] -mla v2.4S, v8.4S, v31.s[0] -sub v8.4s, v28.4s, v2.4s -add v28.4s, v28.4s, v2.4s -sqrdmulh v2.4S, v20.4S, v10.s[0] -mul v20.4S, v20.4S,v14.s[0] -mla v20.4S, v2.4S, v31.s[0] -sub v2.4s, v4.4s, v20.4s -add v4.4s, v4.4s, v20.4s -sqrdmulh v20.4S, v15.4S, v10.s[0] -mul v15.4S, v15.4S,v14.s[0] -mla v15.4S, v20.4S, v31.s[0] -sub v20.4s, v7.4s, v15.4s -add v7.4s, v7.4s, v15.4s -sqrdmulh v15.4S, v28.4S, v0.s[1] -mul v28.4S, v28.4S,v9.s[1] -mla v28.4S, v15.4S, v31.s[0] -sub v15.4s, v30.4s, v28.4s -add v30.4s, v30.4s, v28.4s -sqrdmulh v28.4S, v8.4S, v0.s[2] -mul v8.4S, v8.4S,v9.s[2] -mla v8.4S, v28.4S, v31.s[0] -sub v28.4s, v21.4s, v8.4s -add v21.4s, v21.4s, v8.4s -sqrdmulh v8.4S, v7.4S, v10.s[1] -mul v7.4S, v7.4S,v14.s[1] -mla v7.4S, v8.4S, v31.s[0] -sub v8.4s, v4.4s, v7.4s -trn1 v11.4S, v30.4S, v15.4S -trn2 v5.4S, v30.4S, v15.4S -add v4.4s, v4.4s, v7.4s -trn1 v7.4S, v21.4S, v28.4S -trn2 v29.4S, v21.4S, v28.4S -sqrdmulh v12.4S, v20.4S, v10.s[2] -ldr q13, [x17, #+544] -mul v20.4S, v20.4S,v14.s[2] -ldr q1, [x17, #+560] -mla v20.4S, v12.4S, v31.s[0] -trn2 v21.2D, v11.2D, v7.2D -trn2 v28.2D, v5.2D, v29.2D -sub v12.4s, v2.4s, v20.4s -trn1 v30.2D, v11.2D, v7.2D -trn1 v15.2D, v5.2D, v29.2D -add v2.4s, v2.4s, v20.4s -sqrdmulh v20.4S, v21.4S, v1.4S -mul v21.4S, v21.4S,v13.4S -mla v21.4S, v20.4S, v31.s[0] -trn1 v20.4S, v4.4S, v8.4S -trn2 v29.4S, v4.4S, v8.4S -sub v5.4s, v30.4s, v21.4s -trn1 v7.4S, v2.4S, v12.4S -trn2 v11.4S, v2.4S, v12.4S -add v30.4s, v30.4s, v21.4s -trn2 v2.2D, v20.2D, v7.2D -trn2 v12.2D, v29.2D, v11.2D -sqrdmulh v21.4S, v28.4S, v1.4S -trn1 v4.2D, v20.2D, v7.2D -trn1 v8.2D, v29.2D, v11.2D -ldr q11, [x17, #+1568] -ldr q29, [x17, #+1584] -mul v28.4S, v28.4S,v13.4S -mla v28.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v28.4s -add v15.4s, v15.4s, v28.4s -sqrdmulh v28.4S, v2.4S, v29.4S -mul v2.4S, v2.4S,v11.4S -mla v2.4S, v28.4S, v31.s[0] -sub v28.4s, v4.4s, v2.4s -add v4.4s, v4.4s, v2.4s -ldr q2, [x17, #+576] -ldr q7, [x17, #+592] -sqrdmulh v20.4S, v12.4S, v29.4S -mul v12.4S, v12.4S,v11.4S -mla v12.4S, v20.4S, v31.s[0] -sub v20.4s, v8.4s, v12.4s -add v8.4s, v8.4s, v12.4s -ldr q12, [x17, #+608] -ldr q27, [x17, #+624] -sqrdmulh v16.4S, v15.4S, v7.4S -mul v15.4S, v15.4S,v2.4S -mla v15.4S, v16.4S, v31.s[0] -sub v16.4s, v30.4s, v15.4s -add v30.4s, v30.4s, v15.4s -ldr q15, [x17, #+1600] -ldr q19, [x17, #+1616] -sqrdmulh v18.4S, v21.4S, v27.4S -mul v21.4S, v21.4S,v12.4S -mla v21.4S, v18.4S, v31.s[0] -sub v18.4s, v5.4s, v21.4s -add v5.4s, v5.4s, v21.4s -ldr q21, [x17, #+1632] -ldr q3, [x17, #+1648] -sqrdmulh v22.4S, v8.4S, v19.4S -ldr q6, [x0, #288] -mul v8.4S, v8.4S,v15.4S -mla v8.4S, v22.4S, v31.s[0] -sub v22.4s, v4.4s, v8.4s -add v4.4s, v4.4s, v8.4s -sqrdmulh v8.4S, v20.4S, v3.4S -ldr q17, [x0, #304] -mul v20.4S, v20.4S,v21.4S -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v28.4s, v20.4s -add v28.4s, v28.4s, v20.4s -str q30, [x0, #192] -str q16, [x0, #208] -str q5, [x0, #224] -str q18, [x0, #240] -str q4, [x0, #704] -str q22, [x0, #720] -str q28, [x0, #736] -str q8, [x0, #752] -ldr q3, [x17, #+640] -ldr q21, [x17, #+656] -ldr q19, [x0, #256] -ldr q15, [x0, #272] -ldr q29, [x17, #+1664] -ldr q11, [x17, #+1680] -sqrdmulh v10.4S, v6.4S, v21.s[0] -ldr q14, [x0, #800] -mul v6.4S, v6.4S,v3.s[0] -ldr q8, [x0, #816] -mla v6.4S, v10.4S, v31.s[0] -sub v10.4s, v19.4s, v6.4s -add v19.4s, v19.4s, v6.4s -sqrdmulh v6.4S, v17.4S, v21.s[0] -ldr q28, [x0, #768] -mul v17.4S, v17.4S,v3.s[0] -ldr q22, [x0, #784] -mla v17.4S, v6.4S, v31.s[0] -sub v6.4s, v15.4s, v17.4s -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v14.4S, v11.s[0] -mul v14.4S, v14.4S,v29.s[0] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v28.4s, v14.4s -add v28.4s, v28.4s, v14.4s -sqrdmulh v14.4S, v8.4S, v11.s[0] -mul v8.4S, v8.4S,v29.s[0] -mla v8.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v8.4s -add v22.4s, v22.4s, v8.4s -sqrdmulh v8.4S, v15.4S, v21.s[1] -mul v15.4S, v15.4S,v3.s[1] -mla v15.4S, v8.4S, v31.s[0] -sub v8.4s, v19.4s, v15.4s -add v19.4s, v19.4s, v15.4s -sqrdmulh v15.4S, v6.4S, v21.s[2] -mul v6.4S, v6.4S,v3.s[2] -mla v6.4S, v15.4S, v31.s[0] -sub v15.4s, v10.4s, v6.4s -add v10.4s, v10.4s, v6.4s -sqrdmulh v6.4S, v22.4S, v11.s[1] -mul v22.4S, v22.4S,v29.s[1] -mla v22.4S, v6.4S, v31.s[0] -sub v6.4s, v28.4s, v22.4s -trn1 v4.4S, v19.4S, v8.4S -trn2 v27.4S, v19.4S, v8.4S -add v28.4s, v28.4s, v22.4s -trn1 v22.4S, v10.4S, v15.4S -trn2 v12.4S, v10.4S, v15.4S -sqrdmulh v7.4S, v14.4S, v11.s[2] -ldr q2, [x17, #+672] -mul v14.4S, v14.4S,v29.s[2] -ldr q1, [x17, #+688] -mla v14.4S, v7.4S, v31.s[0] -trn2 v10.2D, v4.2D, v22.2D -trn2 v15.2D, v27.2D, v12.2D -sub v7.4s, v17.4s, v14.4s -trn1 v19.2D, v4.2D, v22.2D -trn1 v8.2D, v27.2D, v12.2D -add v17.4s, v17.4s, v14.4s -sqrdmulh v14.4S, v10.4S, v1.4S -mul v10.4S, v10.4S,v2.4S -mla v10.4S, v14.4S, v31.s[0] -trn1 v14.4S, v28.4S, v6.4S -trn2 v12.4S, v28.4S, v6.4S -sub v27.4s, v19.4s, v10.4s -trn1 v22.4S, v17.4S, v7.4S -trn2 v4.4S, v17.4S, v7.4S -add v19.4s, v19.4s, v10.4s -trn2 v17.2D, v14.2D, v22.2D -trn2 v7.2D, v12.2D, v4.2D -sqrdmulh v10.4S, v15.4S, v1.4S -trn1 v28.2D, v14.2D, v22.2D -trn1 v6.2D, v12.2D, v4.2D -ldr q4, [x17, #+1696] -ldr q12, [x17, #+1712] -mul v15.4S, v15.4S,v2.4S -mla v15.4S, v10.4S, v31.s[0] -sub v10.4s, v8.4s, v15.4s -add v8.4s, v8.4s, v15.4s -sqrdmulh v15.4S, v17.4S, v12.4S -mul v17.4S, v17.4S,v4.4S -mla v17.4S, v15.4S, v31.s[0] -sub v15.4s, v28.4s, v17.4s -add v28.4s, v28.4s, v17.4s -ldr q17, [x17, #+704] -ldr q22, [x17, #+720] -sqrdmulh v14.4S, v7.4S, v12.4S -mul v7.4S, v7.4S,v4.4S -mla v7.4S, v14.4S, v31.s[0] -sub v14.4s, v6.4s, v7.4s -add v6.4s, v6.4s, v7.4s -ldr q7, [x17, #+736] -ldr q13, [x17, #+752] -sqrdmulh v0.4S, v8.4S, v22.4S -mul v8.4S, v8.4S,v17.4S -mla v8.4S, v0.4S, v31.s[0] -sub v0.4s, v19.4s, v8.4s -add v19.4s, v19.4s, v8.4s -ldr q8, [x17, #+1728] -ldr q9, [x17, #+1744] -sqrdmulh v18.4S, v10.4S, v13.4S -mul v10.4S, v10.4S,v7.4S -mla v10.4S, v18.4S, v31.s[0] -sub v18.4s, v27.4s, v10.4s -add v27.4s, v27.4s, v10.4s -ldr q10, [x17, #+1760] -ldr q5, [x17, #+1776] -sqrdmulh v16.4S, v6.4S, v9.4S -ldr q30, [x0, #352] -mul v6.4S, v6.4S,v8.4S -mla v6.4S, v16.4S, v31.s[0] -sub v16.4s, v28.4s, v6.4s -add v28.4s, v28.4s, v6.4s -sqrdmulh v6.4S, v14.4S, v5.4S -ldr q20, [x0, #368] -mul v14.4S, v14.4S,v10.4S -mla v14.4S, v6.4S, v31.s[0] -sub v6.4s, v15.4s, v14.4s -add v15.4s, v15.4s, v14.4s -str q19, [x0, #256] -str q0, [x0, #272] -str q27, [x0, #288] -str q18, [x0, #304] -str q28, [x0, #768] -str q16, [x0, #784] -str q15, [x0, #800] -str q6, [x0, #816] -ldr q5, [x17, #+768] -ldr q10, [x17, #+784] -ldr q9, [x0, #320] -ldr q8, [x0, #336] -ldr q12, [x17, #+1792] -ldr q4, [x17, #+1808] -sqrdmulh v11.4S, v30.4S, v10.s[0] -ldr q29, [x0, #864] -mul v30.4S, v30.4S,v5.s[0] -ldr q6, [x0, #880] -mla v30.4S, v11.4S, v31.s[0] -sub v11.4s, v9.4s, v30.4s -add v9.4s, v9.4s, v30.4s -sqrdmulh v30.4S, v20.4S, v10.s[0] -ldr q15, [x0, #832] -mul v20.4S, v20.4S,v5.s[0] -ldr q16, [x0, #848] -mla v20.4S, v30.4S, v31.s[0] -sub v30.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -sqrdmulh v20.4S, v29.4S, v4.s[0] -mul v29.4S, v29.4S,v12.s[0] -mla v29.4S, v20.4S, v31.s[0] -sub v20.4s, v15.4s, v29.4s -add v15.4s, v15.4s, v29.4s -sqrdmulh v29.4S, v6.4S, v4.s[0] -mul v6.4S, v6.4S,v12.s[0] -mla v6.4S, v29.4S, v31.s[0] -sub v29.4s, v16.4s, v6.4s -add v16.4s, v16.4s, v6.4s -sqrdmulh v6.4S, v8.4S, v10.s[1] -mul v8.4S, v8.4S,v5.s[1] -mla v8.4S, v6.4S, v31.s[0] -sub v6.4s, v9.4s, v8.4s -add v9.4s, v9.4s, v8.4s -sqrdmulh v8.4S, v30.4S, v10.s[2] -mul v30.4S, v30.4S,v5.s[2] -mla v30.4S, v8.4S, v31.s[0] -sub v8.4s, v11.4s, v30.4s -add v11.4s, v11.4s, v30.4s -sqrdmulh v30.4S, v16.4S, v4.s[1] -mul v16.4S, v16.4S,v12.s[1] -mla v16.4S, v30.4S, v31.s[0] -sub v30.4s, v15.4s, v16.4s -trn1 v28.4S, v9.4S, v6.4S -trn2 v13.4S, v9.4S, v6.4S -add v15.4s, v15.4s, v16.4s -trn1 v16.4S, v11.4S, v8.4S -trn2 v7.4S, v11.4S, v8.4S -sqrdmulh v22.4S, v29.4S, v4.s[2] -ldr q17, [x17, #+800] -mul v29.4S, v29.4S,v12.s[2] -ldr q1, [x17, #+816] -mla v29.4S, v22.4S, v31.s[0] -trn2 v11.2D, v28.2D, v16.2D -trn2 v8.2D, v13.2D, v7.2D -sub v22.4s, v20.4s, v29.4s -trn1 v9.2D, v28.2D, v16.2D -trn1 v6.2D, v13.2D, v7.2D -add v20.4s, v20.4s, v29.4s -sqrdmulh v29.4S, v11.4S, v1.4S -mul v11.4S, v11.4S,v17.4S -mla v11.4S, v29.4S, v31.s[0] -trn1 v29.4S, v15.4S, v30.4S -trn2 v7.4S, v15.4S, v30.4S -sub v13.4s, v9.4s, v11.4s -trn1 v16.4S, v20.4S, v22.4S -trn2 v28.4S, v20.4S, v22.4S -add v9.4s, v9.4s, v11.4s -trn2 v20.2D, v29.2D, v16.2D -trn2 v22.2D, v7.2D, v28.2D -sqrdmulh v11.4S, v8.4S, v1.4S -trn1 v15.2D, v29.2D, v16.2D -trn1 v30.2D, v7.2D, v28.2D -ldr q28, [x17, #+1824] -ldr q7, [x17, #+1840] -mul v8.4S, v8.4S,v17.4S -mla v8.4S, v11.4S, v31.s[0] -sub v11.4s, v6.4s, v8.4s -add v6.4s, v6.4s, v8.4s -sqrdmulh v8.4S, v20.4S, v7.4S -mul v20.4S, v20.4S,v28.4S -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v15.4s, v20.4s -add v15.4s, v15.4s, v20.4s -ldr q20, [x17, #+832] -ldr q16, [x17, #+848] -sqrdmulh v29.4S, v22.4S, v7.4S -mul v22.4S, v22.4S,v28.4S -mla v22.4S, v29.4S, v31.s[0] -sub v29.4s, v30.4s, v22.4s -add v30.4s, v30.4s, v22.4s -ldr q22, [x17, #+864] -ldr q2, [x17, #+880] -sqrdmulh v21.4S, v6.4S, v16.4S -mul v6.4S, v6.4S,v20.4S -mla v6.4S, v21.4S, v31.s[0] -sub v21.4s, v9.4s, v6.4s -add v9.4s, v9.4s, v6.4s -ldr q6, [x17, #+1856] -ldr q3, [x17, #+1872] -sqrdmulh v18.4S, v11.4S, v2.4S -mul v11.4S, v11.4S,v22.4S -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v13.4s, v11.4s -add v13.4s, v13.4s, v11.4s -ldr q11, [x17, #+1888] -ldr q27, [x17, #+1904] -sqrdmulh v0.4S, v30.4S, v3.4S -ldr q19, [x0, #416] -mul v30.4S, v30.4S,v6.4S -mla v30.4S, v0.4S, v31.s[0] -sub v0.4s, v15.4s, v30.4s -add v15.4s, v15.4s, v30.4s -sqrdmulh v30.4S, v29.4S, v27.4S -ldr q14, [x0, #432] -mul v29.4S, v29.4S,v11.4S -mla v29.4S, v30.4S, v31.s[0] -sub v30.4s, v8.4s, v29.4s -add v8.4s, v8.4s, v29.4s -str q9, [x0, #320] -str q21, [x0, #336] -str q13, [x0, #352] -str q18, [x0, #368] -str q15, [x0, #832] -str q0, [x0, #848] -str q8, [x0, #864] -str q30, [x0, #880] -ldr q27, [x17, #+896] -ldr q11, [x17, #+912] -ldr q3, [x0, #384] -ldr q6, [x0, #400] -ldr q7, [x17, #+1920] -ldr q28, [x17, #+1936] -sqrdmulh v4.4S, v19.4S, v11.s[0] -ldr q12, [x0, #928] -mul v19.4S, v19.4S,v27.s[0] -ldr q30, [x0, #944] -mla v19.4S, v4.4S, v31.s[0] -sub v4.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v11.s[0] -ldr q8, [x0, #896] -mul v14.4S, v14.4S,v27.s[0] -ldr q0, [x0, #912] -mla v14.4S, v19.4S, v31.s[0] -sub v19.4s, v6.4s, v14.4s -add v6.4s, v6.4s, v14.4s -sqrdmulh v14.4S, v12.4S, v28.s[0] -mul v12.4S, v12.4S,v7.s[0] -mla v12.4S, v14.4S, v31.s[0] -sub v14.4s, v8.4s, v12.4s -add v8.4s, v8.4s, v12.4s -sqrdmulh v12.4S, v30.4S, v28.s[0] -mul v30.4S, v30.4S,v7.s[0] -mla v30.4S, v12.4S, v31.s[0] -sub v12.4s, v0.4s, v30.4s -add v0.4s, v0.4s, v30.4s -sqrdmulh v30.4S, v6.4S, v11.s[1] -mul v6.4S, v6.4S,v27.s[1] -mla v6.4S, v30.4S, v31.s[0] -sub v30.4s, v3.4s, v6.4s -add v3.4s, v3.4s, v6.4s -sqrdmulh v6.4S, v19.4S, v11.s[2] -mul v19.4S, v19.4S,v27.s[2] -mla v19.4S, v6.4S, v31.s[0] -sub v6.4s, v4.4s, v19.4s -add v4.4s, v4.4s, v19.4s -sqrdmulh v19.4S, v0.4S, v28.s[1] -mul v0.4S, v0.4S,v7.s[1] -mla v0.4S, v19.4S, v31.s[0] -sub v19.4s, v8.4s, v0.4s -trn1 v15.4S, v3.4S, v30.4S -trn2 v2.4S, v3.4S, v30.4S -add v8.4s, v8.4s, v0.4s -trn1 v0.4S, v4.4S, v6.4S -trn2 v22.4S, v4.4S, v6.4S -sqrdmulh v16.4S, v12.4S, v28.s[2] -ldr q20, [x17, #+928] -mul v12.4S, v12.4S,v7.s[2] -ldr q1, [x17, #+944] -mla v12.4S, v16.4S, v31.s[0] -trn2 v4.2D, v15.2D, v0.2D -trn2 v6.2D, v2.2D, v22.2D -sub v16.4s, v14.4s, v12.4s -trn1 v3.2D, v15.2D, v0.2D -trn1 v30.2D, v2.2D, v22.2D -add v14.4s, v14.4s, v12.4s -sqrdmulh v12.4S, v4.4S, v1.4S -mul v4.4S, v4.4S,v20.4S -mla v4.4S, v12.4S, v31.s[0] -trn1 v12.4S, v8.4S, v19.4S -trn2 v22.4S, v8.4S, v19.4S -sub v2.4s, v3.4s, v4.4s -trn1 v0.4S, v14.4S, v16.4S -trn2 v15.4S, v14.4S, v16.4S -add v3.4s, v3.4s, v4.4s -trn2 v14.2D, v12.2D, v0.2D -trn2 v16.2D, v22.2D, v15.2D -sqrdmulh v4.4S, v6.4S, v1.4S -trn1 v8.2D, v12.2D, v0.2D -trn1 v19.2D, v22.2D, v15.2D -ldr q15, [x17, #+1952] -ldr q22, [x17, #+1968] -mul v6.4S, v6.4S,v20.4S -mla v6.4S, v4.4S, v31.s[0] -sub v4.4s, v30.4s, v6.4s -add v30.4s, v30.4s, v6.4s -sqrdmulh v6.4S, v14.4S, v22.4S -mul v14.4S, v14.4S,v15.4S -mla v14.4S, v6.4S, v31.s[0] -sub v6.4s, v8.4s, v14.4s -add v8.4s, v8.4s, v14.4s -ldr q14, [x17, #+960] -ldr q0, [x17, #+976] -sqrdmulh v12.4S, v16.4S, v22.4S -mul v16.4S, v16.4S,v15.4S -mla v16.4S, v12.4S, v31.s[0] -sub v12.4s, v19.4s, v16.4s -add v19.4s, v19.4s, v16.4s -ldr q16, [x17, #+992] -ldr q17, [x17, #+1008] -sqrdmulh v10.4S, v30.4S, v0.4S -mul v30.4S, v30.4S,v14.4S -mla v30.4S, v10.4S, v31.s[0] -sub v10.4s, v3.4s, v30.4s -add v3.4s, v3.4s, v30.4s -ldr q30, [x17, #+1984] -ldr q5, [x17, #+2000] -sqrdmulh v18.4S, v4.4S, v17.4S -mul v4.4S, v4.4S,v16.4S -mla v4.4S, v18.4S, v31.s[0] -sub v18.4s, v2.4s, v4.4s -add v2.4s, v2.4s, v4.4s -ldr q4, [x17, #+2016] -ldr q13, [x17, #+2032] -sqrdmulh v21.4S, v19.4S, v5.4S -ldr q9, [x0, #480] -mul v19.4S, v19.4S,v30.4S -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v8.4s, v19.4s -add v8.4s, v8.4s, v19.4s -sqrdmulh v19.4S, v12.4S, v13.4S -ldr q29, [x0, #496] -mul v12.4S, v12.4S,v4.4S -mla v12.4S, v19.4S, v31.s[0] -sub v19.4s, v6.4s, v12.4s -add v6.4s, v6.4s, v12.4s -str q3, [x0, #384] -str q10, [x0, #400] -str q2, [x0, #416] -str q18, [x0, #432] -str q8, [x0, #896] -str q21, [x0, #912] -str q6, [x0, #928] -str q19, [x0, #944] -ldr q13, [x17, #+1024] -ldr q4, [x17, #+1040] -ldr q5, [x0, #448] -ldr q30, [x0, #464] -ldr q22, [x17, #+2048] -ldr q15, [x17, #+2064] -sqrdmulh v28.4S, v9.4S, v4.s[0] -ldr q7, [x0, #992] -mul v9.4S, v9.4S,v13.s[0] -ldr q19, [x0, #1008] -mla v9.4S, v28.4S, v31.s[0] -sub v28.4s, v5.4s, v9.4s -add v5.4s, v5.4s, v9.4s -sqrdmulh v9.4S, v29.4S, v4.s[0] -ldr q6, [x0, #960] -mul v29.4S, v29.4S,v13.s[0] -ldr q21, [x0, #976] -mla v29.4S, v9.4S, v31.s[0] -sub v9.4s, v30.4s, v29.4s -add v30.4s, v30.4s, v29.4s -sqrdmulh v29.4S, v7.4S, v15.s[0] -mul v7.4S, v7.4S,v22.s[0] -mla v7.4S, v29.4S, v31.s[0] -sub v29.4s, v6.4s, v7.4s -add v6.4s, v6.4s, v7.4s -sqrdmulh v7.4S, v19.4S, v15.s[0] -mul v19.4S, v19.4S,v22.s[0] -mla v19.4S, v7.4S, v31.s[0] -sub v7.4s, v21.4s, v19.4s -add v21.4s, v21.4s, v19.4s -sqrdmulh v19.4S, v30.4S, v4.s[1] -mul v30.4S, v30.4S,v13.s[1] -mla v30.4S, v19.4S, v31.s[0] -sub v19.4s, v5.4s, v30.4s -add v5.4s, v5.4s, v30.4s -sqrdmulh v30.4S, v9.4S, v4.s[2] -mul v9.4S, v9.4S,v13.s[2] -mla v9.4S, v30.4S, v31.s[0] -sub v30.4s, v28.4s, v9.4s -add v28.4s, v28.4s, v9.4s -sqrdmulh v9.4S, v21.4S, v15.s[1] -mul v21.4S, v21.4S,v22.s[1] -mla v21.4S, v9.4S, v31.s[0] -sub v9.4s, v6.4s, v21.4s -trn1 v8.4S, v5.4S, v19.4S -trn2 v17.4S, v5.4S, v19.4S -add v6.4s, v6.4s, v21.4s -trn1 v21.4S, v28.4S, v30.4S -trn2 v16.4S, v28.4S, v30.4S -sqrdmulh v0.4S, v7.4S, v15.s[2] -ldr q14, [x17, #+1056] -mul v7.4S, v7.4S,v22.s[2] -ldr q1, [x17, #+1072] -mla v7.4S, v0.4S, v31.s[0] -trn2 v28.2D, v8.2D, v21.2D -trn2 v30.2D, v17.2D, v16.2D -sub v0.4s, v29.4s, v7.4s -trn1 v5.2D, v8.2D, v21.2D -trn1 v19.2D, v17.2D, v16.2D -add v29.4s, v29.4s, v7.4s -sqrdmulh v7.4S, v28.4S, v1.4S -mul v28.4S, v28.4S,v14.4S -mla v28.4S, v7.4S, v31.s[0] -trn1 v7.4S, v6.4S, v9.4S -trn2 v16.4S, v6.4S, v9.4S -sub v17.4s, v5.4s, v28.4s -trn1 v21.4S, v29.4S, v0.4S -trn2 v8.4S, v29.4S, v0.4S -add v5.4s, v5.4s, v28.4s -trn2 v29.2D, v7.2D, v21.2D -trn2 v0.2D, v16.2D, v8.2D -sqrdmulh v28.4S, v30.4S, v1.4S -trn1 v6.2D, v7.2D, v21.2D -trn1 v9.2D, v16.2D, v8.2D -ldr q8, [x17, #+2080] -ldr q16, [x17, #+2096] -mul v30.4S, v30.4S,v14.4S -mla v30.4S, v28.4S, v31.s[0] -sub v28.4s, v19.4s, v30.4s -add v19.4s, v19.4s, v30.4s -sqrdmulh v30.4S, v29.4S, v16.4S -mul v29.4S, v29.4S,v8.4S -mla v29.4S, v30.4S, v31.s[0] -sub v30.4s, v6.4s, v29.4s -add v6.4s, v6.4s, v29.4s -ldr q29, [x17, #+1088] -ldr q21, [x17, #+1104] -sqrdmulh v7.4S, v0.4S, v16.4S -mul v0.4S, v0.4S,v8.4S -mla v0.4S, v7.4S, v31.s[0] -sub v7.4s, v9.4s, v0.4s -add v9.4s, v9.4s, v0.4s -ldr q0, [x17, #+1120] -ldr q20, [x17, #+1136] -sqrdmulh v11.4S, v19.4S, v21.4S -mul v19.4S, v19.4S,v29.4S -mla v19.4S, v11.4S, v31.s[0] -sub v11.4s, v5.4s, v19.4s -add v5.4s, v5.4s, v19.4s -ldr q19, [x17, #+2112] -ldr q27, [x17, #+2128] -sqrdmulh v18.4S, v28.4S, v20.4S -mul v28.4S, v28.4S,v0.4S -mla v28.4S, v18.4S, v31.s[0] -sub v18.4s, v17.4s, v28.4s -add v17.4s, v17.4s, v28.4s -ldr q28, [x17, #+2144] -ldr q2, [x17, #+2160] -sqrdmulh v10.4S, v9.4S, v27.4S -mul v9.4S, v9.4S,v19.4S -mla v9.4S, v10.4S, v31.s[0] -sub v10.4s, v6.4s, v9.4s -add v6.4s, v6.4s, v9.4s -sqrdmulh v9.4S, v7.4S, v2.4S -mul v7.4S, v7.4S,v28.4S -mla v7.4S, v9.4S, v31.s[0] -sub v9.4s, v30.4s, v7.4s -add v30.4s, v30.4s, v7.4s -str q5, [x0, #448] -str q11, [x0, #464] -str q17, [x0, #480] -str q18, [x0, #496] -str q6, [x0, #960] -str q10, [x0, #976] -str q30, [x0, #992] -str q9, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2392 -// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_5.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_5.s deleted file mode 100644 index 64b3010..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z2_5.s +++ /dev/null @@ -1,2422 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_3_z2_5 -.global _ntt_u32_full_neon_asm_var_4_4_3_z2_5 -ntt_u32_full_neon_asm_var_4_4_3_z2_5: -_ntt_u32_full_neon_asm_var_4_4_3_z2_5: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #800] -ldr q21, [x0, #864] -ldr q20, [x0, #928] -ldr q19, [x0, #992] -ldr q18, [x0, #288] -ldr q17, [x0, #352] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -sqrdmulh v2.4S, v22.4S, v29.s[0] -ldr q1, [x0, #544] -mul v22.4S, v22.4S,v30.s[0] -ldr q0, [x0, #608] -sqrdmulh v15.4S, v21.4S, v29.s[0] -ldr q14, [x0, #672] -mul v21.4S, v21.4S,v30.s[0] -ldr q13, [x0, #736] -mla v22.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q12, [x0, #32] -sub v11.4s, v18.4s, v22.4s -mla v21.4S, v15.4S, v31.s[0] -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q15, [x0, #96] -sub v10.4s, v17.4s, v21.4s -mla v20.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v1.4S, v29.s[0] -ldr q2, [x0, #160] -mul v1.4S, v1.4S,v30.s[0] -sub v9.4s, v16.4s, v20.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v0.4S, v29.s[0] -ldr q22, [x0, #224] -mul v0.4S, v0.4S,v30.s[0] -sub v8.4s, v3.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v12.4s, v1.4s -mla v0.4S, v20.4S, v31.s[0] -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -sub v20.4s, v15.4s, v0.4s -mla v14.4S, v19.4S, v31.s[0] -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v19.4s, v2.4s, v14.4s -mla v13.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v1.4s, v22.4s, v13.4s -mla v16.4S, v0.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v0.4s, v2.4s, v16.4s -mla v3.4S, v14.4S, v31.s[0] -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v14.4s, v22.4s, v3.4s -mla v18.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v13.4s, v12.4s, v18.4s -mla v17.4S, v16.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v8.4S, v29.s[2] -mul v8.4S, v8.4S,v30.s[2] -sub v16.4s, v15.4s, v17.4s -mla v9.4S, v3.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v3.4s, v19.4s, v9.4s -mla v8.4S, v18.4S, v31.s[0] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v8.4s -mla v11.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v8.4s -sqrdmulh v8.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v17.4s, v21.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v27.s[0] -mul v22.4S, v22.4S,v28.s[0] -sub v9.4s, v20.4s, v10.4s -mla v2.4S, v8.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v8.4s, v12.4s, v2.4s -mla v22.4S, v11.4S, v31.s[0] -add v12.4s, v12.4s, v2.4s -sqrdmulh v2.4S, v14.4S, v27.s[1] -mul v14.4S, v14.4S,v28.s[1] -sub v11.4s, v15.4s, v22.4s -mla v0.4S, v10.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v27.s[2] -mul v19.4S, v19.4S,v28.s[2] -sub v10.4s, v13.4s, v0.4s -mla v14.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v2.4s, v16.4s, v14.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v27.s[3] -mul v3.4S, v3.4S,v28.s[3] -sub v22.4s, v21.4s, v19.4s -mla v1.4S, v0.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v0.4s, v20.4s, v1.4s -mla v3.4S, v14.4S, v31.s[0] -add v20.4s, v20.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v25.s[0] -mul v15.4S, v15.4S,v26.s[0] -sub v14.4s, v17.4s, v3.4s -mla v18.4S, v19.4S, v31.s[0] -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v11.4S, v25.s[1] -mul v11.4S, v11.4S,v26.s[1] -sub v19.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v1.4s, v12.4s, v15.4s -mla v11.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v25.s[3] -mul v2.4S, v2.4S,v26.s[3] -sub v3.4s, v8.4s, v11.4s -mla v16.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v11.4s -str q12, [x0, #32] -sqrdmulh v12.4S, v20.4S, v23.s[0] -str q1, [x0, #96] -mul v20.4S, v20.4S,v24.s[0] -ldr q1, [x0, #816] -sub v11.4s, v13.4s, v16.4s -ldr q18, [x0, #880] -mla v2.4S, v15.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -str q8, [x0, #160] -sqrdmulh v8.4S, v0.4S, v23.s[1] -str q3, [x0, #224] -mul v0.4S, v0.4S,v24.s[1] -ldr q3, [x0, #944] -sub v16.4s, v10.4s, v2.4s -ldr q15, [x0, #1008] -mla v20.4S, v12.4S, v31.s[0] -add v10.4s, v10.4s, v2.4s -str q13, [x0, #288] -sqrdmulh v13.4S, v9.4S, v23.s[2] -str q11, [x0, #352] -mul v9.4S, v9.4S,v24.s[2] -ldr q11, [x0, #304] -sub v2.4s, v21.4s, v20.4s -ldr q12, [x0, #368] -mla v0.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v20.4s -str q10, [x0, #416] -sqrdmulh v10.4S, v19.4S, v23.s[3] -str q16, [x0, #480] -mul v19.4S, v19.4S,v24.s[3] -ldr q16, [x0, #432] -sub v20.4s, v22.4s, v0.4s -ldr q8, [x0, #496] -mla v9.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -str q21, [x0, #544] -sqrdmulh v21.4S, v1.4S, v29.s[0] -str q2, [x0, #608] -ldr q2, [x0, #560] -mul v1.4S, v1.4S,v30.s[0] -ldr q0, [x0, #624] -sub v13.4s, v17.4s, v9.4s -mla v19.4S, v10.4S, v31.s[0] -add v17.4s, v17.4s, v9.4s -str q22, [x0, #672] -sqrdmulh v22.4S, v18.4S, v29.s[0] -str q20, [x0, #736] -ldr q20, [x0, #688] -mul v18.4S, v18.4S,v30.s[0] -ldr q9, [x0, #752] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -str q17, [x0, #800] -sqrdmulh v17.4S, v3.4S, v29.s[0] -str q13, [x0, #864] -mul v3.4S, v3.4S,v30.s[0] -ldr q13, [x0, #48] -sub v19.4s, v11.4s, v1.4s -mla v18.4S, v22.4S, v31.s[0] -add v11.4s, v11.4s, v1.4s -str q14, [x0, #928] -sqrdmulh v14.4S, v15.4S, v29.s[0] -str q10, [x0, #992] -mul v15.4S, v15.4S,v30.s[0] -ldr q10, [x0, #112] -sub v1.4s, v12.4s, v18.4s -mla v3.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v29.s[0] -ldr q17, [x0, #176] -mul v2.4S, v2.4S,v30.s[0] -sub v22.4s, v16.4s, v3.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v0.4S, v29.s[0] -ldr q14, [x0, #240] -mul v0.4S, v0.4S,v30.s[0] -sub v21.4s, v8.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -sub v18.4s, v13.4s, v2.4s -mla v0.4S, v3.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v9.4S, v29.s[0] -mul v9.4S, v9.4S,v30.s[0] -sub v3.4s, v10.4s, v0.4s -mla v20.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v15.4s, v17.4s, v20.4s -mla v9.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sub v2.4s, v14.4s, v9.4s -mla v16.4S, v0.4S, v31.s[0] -add v14.4s, v14.4s, v9.4s -sqrdmulh v9.4S, v11.4S, v29.s[1] -mul v11.4S, v11.4S,v30.s[1] -sub v0.4s, v17.4s, v16.4s -mla v8.4S, v20.4S, v31.s[0] -add v17.4s, v17.4s, v16.4s -sqrdmulh v16.4S, v12.4S, v29.s[1] -mul v12.4S, v12.4S,v30.s[1] -sub v20.4s, v14.4s, v8.4s -mla v11.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v8.4s -sqrdmulh v8.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v9.4s, v13.4s, v11.4s -mla v12.4S, v16.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v16.4s, v10.4s, v12.4s -mla v22.4S, v8.4S, v31.s[0] -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v8.4s, v15.4s, v22.4s -mla v21.4S, v11.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -sub v11.4s, v2.4s, v21.4s -mla v19.4S, v12.4S, v31.s[0] -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[0] -mul v17.4S, v17.4S,v28.s[0] -sub v12.4s, v18.4s, v19.4s -mla v1.4S, v22.4S, v31.s[0] -add v18.4s, v18.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v22.4s, v3.4s, v1.4s -mla v17.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v21.4s, v13.4s, v17.4s -mla v14.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v20.4S, v27.s[1] -mul v20.4S, v20.4S,v28.s[1] -sub v19.4s, v10.4s, v14.4s -mla v0.4S, v1.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v27.s[2] -mul v15.4S, v15.4S,v28.s[2] -sub v1.4s, v9.4s, v0.4s -mla v20.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v0.4s -sqrdmulh v0.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v17.4s, v16.4s, v20.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v14.4s, v18.4s, v15.4s -mla v2.4S, v0.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v27.s[3] -mul v11.4S, v11.4S,v28.s[3] -sub v0.4s, v3.4s, v2.4s -mla v8.4S, v20.4S, v31.s[0] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v10.4S, v25.s[0] -mul v10.4S, v10.4S,v26.s[0] -sub v20.4s, v12.4s, v8.4s -mla v11.4S, v15.4S, v31.s[0] -add v12.4s, v12.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v15.4s, v22.4s, v11.4s -mla v10.4S, v2.4S, v31.s[0] -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v2.4s, v13.4s, v10.4s -mla v19.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v10.4s -sqrdmulh v10.4S, v17.4S, v25.s[3] -mul v17.4S, v17.4S,v26.s[3] -sub v8.4s, v21.4s, v19.4s -mla v16.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -str q13, [x0, #48] -sqrdmulh v13.4S, v3.4S, v23.s[0] -str q2, [x0, #112] -mul v3.4S, v3.4S,v24.s[0] -ldr q2, [x0, #768] -sub v19.4s, v9.4s, v16.4s -ldr q11, [x0, #832] -mla v17.4S, v10.4S, v31.s[0] -add v9.4s, v9.4s, v16.4s -str q21, [x0, #176] -sqrdmulh v21.4S, v0.4S, v23.s[1] -str q8, [x0, #240] -mul v0.4S, v0.4S,v24.s[1] -ldr q8, [x0, #896] -sub v16.4s, v1.4s, v17.4s -ldr q10, [x0, #960] -mla v3.4S, v13.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -str q9, [x0, #304] -sqrdmulh v9.4S, v22.4S, v23.s[2] -str q19, [x0, #368] -mul v22.4S, v22.4S,v24.s[2] -ldr q19, [x0, #256] -sub v17.4s, v18.4s, v3.4s -ldr q13, [x0, #320] -mla v0.4S, v21.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -str q1, [x0, #432] -sqrdmulh v1.4S, v15.4S, v23.s[3] -str q16, [x0, #496] -mul v15.4S, v15.4S,v24.s[3] -ldr q16, [x0, #384] -sub v3.4s, v14.4s, v0.4s -ldr q21, [x0, #448] -mla v22.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -str q18, [x0, #560] -sqrdmulh v18.4S, v2.4S, v29.s[0] -str q17, [x0, #624] -ldr q17, [x0, #512] -mul v2.4S, v2.4S,v30.s[0] -ldr q0, [x0, #576] -sub v9.4s, v12.4s, v22.4s -mla v15.4S, v1.4S, v31.s[0] -add v12.4s, v12.4s, v22.4s -str q14, [x0, #688] -sqrdmulh v14.4S, v11.4S, v29.s[0] -str q3, [x0, #752] -ldr q3, [x0, #640] -mul v11.4S, v11.4S,v30.s[0] -ldr q22, [x0, #704] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -str q12, [x0, #816] -sqrdmulh v12.4S, v8.4S, v29.s[0] -str q9, [x0, #880] -mul v8.4S, v8.4S,v30.s[0] -ldr q9, [x0, #0] -sub v15.4s, v19.4s, v2.4s -mla v11.4S, v14.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -str q20, [x0, #944] -sqrdmulh v20.4S, v10.4S, v29.s[0] -str q1, [x0, #1008] -mul v10.4S, v10.4S,v30.s[0] -ldr q1, [x0, #64] -sub v2.4s, v13.4s, v11.4s -mla v8.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v29.s[0] -ldr q12, [x0, #128] -mul v17.4S, v17.4S,v30.s[0] -sub v14.4s, v16.4s, v8.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v0.4S, v29.s[0] -ldr q20, [x0, #192] -mul v0.4S, v0.4S,v30.s[0] -sub v18.4s, v21.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -sub v11.4s, v9.4s, v17.4s -mla v0.4S, v8.4S, v31.s[0] -add v9.4s, v9.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v8.4s, v1.4s, v0.4s -mla v3.4S, v10.4S, v31.s[0] -add v1.4s, v1.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v10.4s, v12.4s, v3.4s -mla v22.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v17.4s, v20.4s, v22.4s -mla v16.4S, v0.4S, v31.s[0] -add v20.4s, v20.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -sub v0.4s, v12.4s, v16.4s -mla v21.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v3.4s, v20.4s, v21.4s -mla v19.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v22.4s, v9.4s, v19.4s -mla v13.4S, v16.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v29.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v16.4s, v1.4s, v13.4s -mla v14.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v13.4s -sqrdmulh v13.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v21.4s, v10.4s, v14.4s -mla v18.4S, v19.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v19.4s, v17.4s, v18.4s -mla v15.4S, v13.4S, v31.s[0] -add v17.4s, v17.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v13.4s, v11.4s, v15.4s -mla v2.4S, v14.4S, v31.s[0] -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v27.s[0] -mul v20.4S, v20.4S,v28.s[0] -sub v14.4s, v8.4s, v2.4s -mla v12.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v2.4s -sqrdmulh v2.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v18.4s, v9.4s, v12.4s -mla v20.4S, v15.4S, v31.s[0] -add v9.4s, v9.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -sub v15.4s, v1.4s, v20.4s -mla v0.4S, v2.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -sub v2.4s, v22.4s, v0.4s -mla v3.4S, v12.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v12.4s, v16.4s, v3.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v11.4s, v10.4s -mla v17.4S, v0.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -sqrdmulh v10.4S, v19.4S, v27.s[3] -mul v19.4S, v19.4S,v28.s[3] -sub v0.4s, v8.4s, v17.4s -mla v21.4S, v3.4S, v31.s[0] -add v8.4s, v8.4s, v17.4s -sqrdmulh v17.4S, v1.4S, v25.s[0] -mul v1.4S, v1.4S,v26.s[0] -sub v3.4s, v13.4s, v21.4s -mla v19.4S, v10.4S, v31.s[0] -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v15.4S, v25.s[1] -mul v15.4S, v15.4S,v26.s[1] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v17.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v17.4s, v9.4s, v1.4s -mla v15.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -sub v21.4s, v18.4s, v15.4s -mla v16.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -str q9, [x0, #0] -sqrdmulh v9.4S, v8.4S, v23.s[0] -str q17, [x0, #64] -mul v8.4S, v8.4S,v24.s[0] -ldr q17, [x0, #784] -sub v15.4s, v22.4s, v16.4s -ldr q19, [x0, #848] -mla v12.4S, v1.4S, v31.s[0] -add v22.4s, v22.4s, v16.4s -str q18, [x0, #128] -sqrdmulh v18.4S, v0.4S, v23.s[1] -str q21, [x0, #192] -mul v0.4S, v0.4S,v24.s[1] -ldr q21, [x0, #912] -sub v16.4s, v2.4s, v12.4s -ldr q1, [x0, #976] -mla v8.4S, v9.4S, v31.s[0] -add v2.4s, v2.4s, v12.4s -str q22, [x0, #256] -sqrdmulh v22.4S, v14.4S, v23.s[2] -str q15, [x0, #320] -mul v14.4S, v14.4S,v24.s[2] -ldr q15, [x0, #272] -sub v12.4s, v11.4s, v8.4s -ldr q9, [x0, #336] -mla v0.4S, v18.4S, v31.s[0] -add v11.4s, v11.4s, v8.4s -str q2, [x0, #384] -sqrdmulh v2.4S, v10.4S, v23.s[3] -str q16, [x0, #448] -mul v10.4S, v10.4S,v24.s[3] -ldr q16, [x0, #400] -sub v8.4s, v20.4s, v0.4s -ldr q18, [x0, #464] -mla v14.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v0.4s -str q11, [x0, #512] -sqrdmulh v11.4S, v17.4S, v29.s[0] -str q12, [x0, #576] -ldr q12, [x0, #528] -mul v17.4S, v17.4S,v30.s[0] -ldr q0, [x0, #592] -sub v22.4s, v13.4s, v14.4s -mla v10.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v14.4s -str q20, [x0, #640] -sqrdmulh v20.4S, v19.4S, v29.s[0] -str q8, [x0, #704] -ldr q8, [x0, #656] -mul v19.4S, v19.4S,v30.s[0] -ldr q14, [x0, #720] -sub v2.4s, v3.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v3.4s, v3.4s, v10.4s -str q13, [x0, #768] -sqrdmulh v13.4S, v21.4S, v29.s[0] -str q22, [x0, #832] -mul v21.4S, v21.4S,v30.s[0] -ldr q22, [x0, #16] -sub v10.4s, v15.4s, v17.4s -mla v19.4S, v20.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -str q3, [x0, #896] -sqrdmulh v3.4S, v1.4S, v29.s[0] -str q2, [x0, #960] -mul v1.4S, v1.4S,v30.s[0] -ldr q2, [x0, #80] -sub v17.4s, v9.4s, v19.4s -mla v21.4S, v13.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v12.4S, v29.s[0] -ldr q13, [x0, #144] -mul v12.4S, v12.4S,v30.s[0] -sub v20.4s, v16.4s, v21.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v0.4S, v29.s[0] -ldr q3, [x0, #208] -mul v0.4S, v0.4S,v30.s[0] -sub v11.4s, v18.4s, v1.4s -mla v12.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v1.4s -sqrdmulh v1.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v19.4s, v22.4s, v12.4s -mla v0.4S, v21.4S, v31.s[0] -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v2.4s, v0.4s -mla v8.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v1.4s, v13.4s, v8.4s -mla v14.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v12.4s, v3.4s, v14.4s -mla v16.4S, v0.4S, v31.s[0] -add v3.4s, v3.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -sub v0.4s, v13.4s, v16.4s -mla v18.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -sqrdmulh v16.4S, v9.4S, v29.s[1] -mul v9.4S, v9.4S,v30.s[1] -sub v8.4s, v3.4s, v18.4s -mla v15.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v14.4s, v22.4s, v15.4s -mla v9.4S, v16.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v16.4s, v2.4s, v9.4s -mla v20.4S, v18.4S, v31.s[0] -add v2.4s, v2.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v20.4s -mla v11.4S, v15.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v15.4s, v12.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v12.4s, v12.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v27.s[0] -mul v13.4S, v13.4S,v28.s[0] -sub v9.4s, v19.4s, v10.4s -mla v17.4S, v20.4S, v31.s[0] -add v19.4s, v19.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v27.s[0] -mul v3.4S, v3.4S,v28.s[0] -sub v20.4s, v21.4s, v17.4s -mla v13.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v11.4s, v22.4s, v13.4s -mla v3.4S, v10.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -sub v10.4s, v2.4s, v3.4s -mla v0.4S, v17.4S, v31.s[0] -add v2.4s, v2.4s, v3.4s -sqrdmulh v3.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v17.4s, v14.4s, v0.4s -mla v8.4S, v13.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[2] -mul v12.4S, v12.4S,v28.s[2] -sub v13.4s, v16.4s, v8.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v3.4s, v19.4s, v1.4s -mla v12.4S, v0.4S, v31.s[0] -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v0.4s, v21.4s, v12.4s -mla v18.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v10.4S, v25.s[1] -mul v10.4S, v10.4S,v26.s[1] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v12.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v12.4s, v22.4s, v2.4s -mla v10.4S, v18.4S, v31.s[0] -add v22.4s, v22.4s, v2.4s -sqrdmulh v2.4S, v13.4S, v25.s[3] -mul v13.4S, v13.4S,v26.s[3] -sub v18.4s, v11.4s, v10.4s -mla v16.4S, v15.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -str q22, [x0, #16] -sqrdmulh v22.4S, v21.4S, v23.s[0] -str q12, [x0, #80] -mul v21.4S, v21.4S,v24.s[0] -sub v12.4s, v14.4s, v16.4s -mla v13.4S, v2.4S, v31.s[0] -add v14.4s, v14.4s, v16.4s -str q11, [x0, #144] -sqrdmulh v11.4S, v0.4S, v23.s[1] -str q18, [x0, #208] -mul v0.4S, v0.4S,v24.s[1] -sub v18.4s, v17.4s, v13.4s -mla v21.4S, v22.4S, v31.s[0] -add v17.4s, v17.4s, v13.4s -str q14, [x0, #272] -sqrdmulh v14.4S, v20.4S, v23.s[2] -str q12, [x0, #336] -mul v20.4S, v20.4S,v24.s[2] -sub v12.4s, v19.4s, v21.4s -mla v0.4S, v11.4S, v31.s[0] -add v19.4s, v19.4s, v21.4s -str q17, [x0, #400] -sqrdmulh v17.4S, v1.4S, v23.s[3] -str q18, [x0, #464] -mul v1.4S, v1.4S,v24.s[3] -sub v18.4s, v3.4s, v0.4s -mla v20.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v0.4s -str q19, [x0, #528] -str q12, [x0, #592] -sub v12.4s, v9.4s, v20.4s -mla v1.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v20.4s -str q3, [x0, #656] -str q18, [x0, #720] -sub v18.4s, v8.4s, v1.4s -add v8.4s, v8.4s, v1.4s -str q9, [x0, #784] -str q12, [x0, #848] -str q8, [x0, #912] -str q18, [x0, #976] -ldr q4, [x0, #32] -ldr q5, [x0, #48] -ldr q6, [x17, #+128] -ldr q7, [x17, #+144] -ldr q15, [x0, #0] -ldr q10, [x0, #16] -ldr q2, [x17, #+1152] -ldr q16, [x17, #+1168] -sqrdmulh v22.4S, v4.4S, v7.s[0] -ldr q13, [x0, #544] -mul v4.4S, v4.4S,v6.s[0] -ldr q11, [x0, #560] -sqrdmulh v21.4S, v5.4S, v7.s[0] -ldr q14, [x0, #512] -mul v5.4S, v5.4S,v6.s[0] -ldr q0, [x0, #528] -mla v4.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v13.4S, v16.s[0] -mul v13.4S, v13.4S,v2.s[0] -mla v5.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v4.4s -add v15.4s, v15.4s, v4.4s -sqrdmulh v4.4S, v11.4S, v16.s[0] -mul v11.4S, v11.4S,v2.s[0] -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v10.4s, v5.4s -add v10.4s, v10.4s, v5.4s -sqrdmulh v5.4S, v10.4S, v7.s[1] -mul v10.4S, v10.4S,v6.s[1] -mla v11.4S, v4.4S, v31.s[0] -sub v4.4s, v14.4s, v13.4s -add v14.4s, v14.4s, v13.4s -sqrdmulh v13.4S, v22.4S, v7.s[2] -mul v22.4S, v22.4S,v6.s[2] -mla v10.4S, v5.4S, v31.s[0] -sub v5.4s, v0.4s, v11.4s -add v0.4s, v0.4s, v11.4s -sqrdmulh v11.4S, v0.4S, v16.s[1] -mul v0.4S, v0.4S,v2.s[1] -mla v22.4S, v13.4S, v31.s[0] -sub v13.4s, v15.4s, v10.4s -add v15.4s, v15.4s, v10.4s -sqrdmulh v10.4S, v5.4S, v16.s[2] -mul v5.4S, v5.4S,v2.s[2] -mla v0.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -trn1 v22.4S, v15.4S, v13.4S -trn2 v19.4S, v15.4S, v13.4S -trn1 v17.4S, v21.4S, v11.4S -trn2 v20.4S, v21.4S, v11.4S -trn2 v21.2D, v22.2D, v17.2D -trn2 v11.2D, v19.2D, v20.2D -trn1 v15.2D, v22.2D, v17.2D -trn1 v13.2D, v19.2D, v20.2D -ldr q20, [x17, #+160] -ldr q19, [x17, #+176] -sqrdmulh v17.4S, v21.4S, v19.4S -mul v21.4S, v21.4S,v20.4S -mla v5.4S, v10.4S, v31.s[0] -sub v10.4s, v14.4s, v0.4s -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v11.4S, v19.4S -mul v11.4S, v11.4S,v20.4S -mla v21.4S, v17.4S, v31.s[0] -sub v17.4s, v4.4s, v5.4s -add v4.4s, v4.4s, v5.4s -trn1 v5.4S, v14.4S, v10.4S -trn2 v22.4S, v14.4S, v10.4S -trn1 v3.4S, v4.4S, v17.4S -trn2 v1.4S, v4.4S, v17.4S -trn2 v4.2D, v5.2D, v3.2D -trn2 v17.2D, v22.2D, v1.2D -trn1 v14.2D, v5.2D, v3.2D -trn1 v10.2D, v22.2D, v1.2D -ldr q1, [x17, #+1184] -ldr q22, [x17, #+1200] -sqrdmulh v3.4S, v4.4S, v22.4S -ldr q5, [x17, #+192] -ldr q9, [x17, #+208] -mul v4.4S, v4.4S,v1.4S -mla v11.4S, v0.4S, v31.s[0] -sub v0.4s, v15.4s, v21.4s -add v15.4s, v15.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v22.4S -ldr q12, [x17, #+224] -ldr q8, [x17, #+240] -mul v17.4S, v17.4S,v1.4S -mla v4.4S, v3.4S, v31.s[0] -sub v3.4s, v13.4s, v11.4s -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v9.4S -ldr q18, [x17, #+1216] -ldr q30, [x17, #+1232] -mul v13.4S, v13.4S,v5.4S -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v4.4s -add v14.4s, v14.4s, v4.4s -sqrdmulh v4.4S, v3.4S, v8.4S -ldr q29, [x17, #+1248] -ldr q28, [x17, #+1264] -mul v3.4S, v3.4S,v12.4S -mla v13.4S, v11.4S, v31.s[0] -sub v11.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -sqrdmulh v17.4S, v10.4S, v30.4S -ldr q27, [x0, #96] -mul v10.4S, v10.4S,v18.4S -mla v3.4S, v4.4S, v31.s[0] -sub v4.4s, v15.4s, v13.4s -add v15.4s, v15.4s, v13.4s -sqrdmulh v13.4S, v11.4S, v28.4S -ldr q26, [x0, #112] -mul v11.4S, v11.4S,v29.4S -mla v10.4S, v17.4S, v31.s[0] -sub v17.4s, v0.4s, v3.4s -add v0.4s, v0.4s, v3.4s -mla v11.4S, v13.4S, v31.s[0] -sub v13.4s, v14.4s, v10.4s -add v14.4s, v14.4s, v10.4s -sub v10.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -str q15, [x0, #0] -str q4, [x0, #16] -str q0, [x0, #32] -str q17, [x0, #48] -str q14, [x0, #512] -str q13, [x0, #528] -str q21, [x0, #544] -str q10, [x0, #560] -ldr q28, [x17, #+256] -ldr q29, [x17, #+272] -ldr q30, [x0, #64] -ldr q18, [x0, #80] -ldr q22, [x17, #+1280] -ldr q1, [x17, #+1296] -sqrdmulh v16.4S, v27.4S, v29.s[0] -ldr q2, [x0, #608] -mul v27.4S, v27.4S,v28.s[0] -ldr q10, [x0, #624] -sqrdmulh v21.4S, v26.4S, v29.s[0] -ldr q13, [x0, #576] -mul v26.4S, v26.4S,v28.s[0] -ldr q14, [x0, #592] -mla v27.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v2.4S, v1.s[0] -mul v2.4S, v2.4S,v22.s[0] -mla v26.4S, v21.4S, v31.s[0] -sub v21.4s, v30.4s, v27.4s -add v30.4s, v30.4s, v27.4s -sqrdmulh v27.4S, v10.4S, v1.s[0] -mul v10.4S, v10.4S,v22.s[0] -mla v2.4S, v16.4S, v31.s[0] -sub v16.4s, v18.4s, v26.4s -add v18.4s, v18.4s, v26.4s -sqrdmulh v26.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v28.s[1] -mla v10.4S, v27.4S, v31.s[0] -sub v27.4s, v13.4s, v2.4s -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v28.s[2] -mla v18.4S, v26.4S, v31.s[0] -sub v26.4s, v14.4s, v10.4s -add v14.4s, v14.4s, v10.4s -sqrdmulh v10.4S, v14.4S, v1.s[1] -mul v14.4S, v14.4S,v22.s[1] -mla v16.4S, v2.4S, v31.s[0] -sub v2.4s, v30.4s, v18.4s -add v30.4s, v30.4s, v18.4s -sqrdmulh v18.4S, v26.4S, v1.s[2] -mul v26.4S, v26.4S,v22.s[2] -mla v14.4S, v10.4S, v31.s[0] -sub v10.4s, v21.4s, v16.4s -add v21.4s, v21.4s, v16.4s -trn1 v16.4S, v30.4S, v2.4S -trn2 v8.4S, v30.4S, v2.4S -trn1 v12.4S, v21.4S, v10.4S -trn2 v9.4S, v21.4S, v10.4S -trn2 v21.2D, v16.2D, v12.2D -trn2 v10.2D, v8.2D, v9.2D -trn1 v30.2D, v16.2D, v12.2D -trn1 v2.2D, v8.2D, v9.2D -ldr q9, [x17, #+288] -ldr q8, [x17, #+304] -sqrdmulh v12.4S, v21.4S, v8.4S -mul v21.4S, v21.4S,v9.4S -mla v26.4S, v18.4S, v31.s[0] -sub v18.4s, v13.4s, v14.4s -add v13.4s, v13.4s, v14.4s -sqrdmulh v14.4S, v10.4S, v8.4S -mul v10.4S, v10.4S,v9.4S -mla v21.4S, v12.4S, v31.s[0] -sub v12.4s, v27.4s, v26.4s -add v27.4s, v27.4s, v26.4s -trn1 v26.4S, v13.4S, v18.4S -trn2 v16.4S, v13.4S, v18.4S -trn1 v5.4S, v27.4S, v12.4S -trn2 v19.4S, v27.4S, v12.4S -trn2 v27.2D, v26.2D, v5.2D -trn2 v12.2D, v16.2D, v19.2D -trn1 v13.2D, v26.2D, v5.2D -trn1 v18.2D, v16.2D, v19.2D -ldr q19, [x17, #+1312] -ldr q16, [x17, #+1328] -sqrdmulh v5.4S, v27.4S, v16.4S -ldr q26, [x17, #+320] -ldr q20, [x17, #+336] -mul v27.4S, v27.4S,v19.4S -mla v10.4S, v14.4S, v31.s[0] -sub v14.4s, v30.4s, v21.4s -add v30.4s, v30.4s, v21.4s -sqrdmulh v21.4S, v12.4S, v16.4S -ldr q7, [x17, #+352] -ldr q6, [x17, #+368] -mul v12.4S, v12.4S,v19.4S -mla v27.4S, v5.4S, v31.s[0] -sub v5.4s, v2.4s, v10.4s -add v2.4s, v2.4s, v10.4s -sqrdmulh v10.4S, v2.4S, v20.4S -ldr q17, [x17, #+1344] -ldr q0, [x17, #+1360] -mul v2.4S, v2.4S,v26.4S -mla v12.4S, v21.4S, v31.s[0] -sub v21.4s, v13.4s, v27.4s -add v13.4s, v13.4s, v27.4s -sqrdmulh v27.4S, v5.4S, v6.4S -ldr q4, [x17, #+1376] -ldr q15, [x17, #+1392] -mul v5.4S, v5.4S,v7.4S -mla v2.4S, v10.4S, v31.s[0] -sub v10.4s, v18.4s, v12.4s -add v18.4s, v18.4s, v12.4s -sqrdmulh v12.4S, v18.4S, v0.4S -ldr q11, [x0, #160] -mul v18.4S, v18.4S,v17.4S -mla v5.4S, v27.4S, v31.s[0] -sub v27.4s, v30.4s, v2.4s -add v30.4s, v30.4s, v2.4s -sqrdmulh v2.4S, v10.4S, v15.4S -ldr q3, [x0, #176] -mul v10.4S, v10.4S,v4.4S -mla v18.4S, v12.4S, v31.s[0] -sub v12.4s, v14.4s, v5.4s -add v14.4s, v14.4s, v5.4s -mla v10.4S, v2.4S, v31.s[0] -sub v2.4s, v13.4s, v18.4s -add v13.4s, v13.4s, v18.4s -sub v18.4s, v21.4s, v10.4s -add v21.4s, v21.4s, v10.4s -str q30, [x0, #64] -str q27, [x0, #80] -str q14, [x0, #96] -str q12, [x0, #112] -str q13, [x0, #576] -str q2, [x0, #592] -str q21, [x0, #608] -str q18, [x0, #624] -ldr q15, [x17, #+384] -ldr q4, [x17, #+400] -ldr q0, [x0, #128] -ldr q17, [x0, #144] -ldr q16, [x17, #+1408] -ldr q19, [x17, #+1424] -sqrdmulh v1.4S, v11.4S, v4.s[0] -ldr q22, [x0, #672] -mul v11.4S, v11.4S,v15.s[0] -ldr q18, [x0, #688] -sqrdmulh v21.4S, v3.4S, v4.s[0] -ldr q2, [x0, #640] -mul v3.4S, v3.4S,v15.s[0] -ldr q13, [x0, #656] -mla v11.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v22.4S, v19.s[0] -mul v22.4S, v22.4S,v16.s[0] -mla v3.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v11.4s -add v0.4s, v0.4s, v11.4s -sqrdmulh v11.4S, v18.4S, v19.s[0] -mul v18.4S, v18.4S,v16.s[0] -mla v22.4S, v1.4S, v31.s[0] -sub v1.4s, v17.4s, v3.4s -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v17.4S, v4.s[1] -mul v17.4S, v17.4S,v15.s[1] -mla v18.4S, v11.4S, v31.s[0] -sub v11.4s, v2.4s, v22.4s -add v2.4s, v2.4s, v22.4s -sqrdmulh v22.4S, v1.4S, v4.s[2] -mul v1.4S, v1.4S,v15.s[2] -mla v17.4S, v3.4S, v31.s[0] -sub v3.4s, v13.4s, v18.4s -add v13.4s, v13.4s, v18.4s -sqrdmulh v18.4S, v13.4S, v19.s[1] -mul v13.4S, v13.4S,v16.s[1] -mla v1.4S, v22.4S, v31.s[0] -sub v22.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v3.4S, v19.s[2] -mul v3.4S, v3.4S,v16.s[2] -mla v13.4S, v18.4S, v31.s[0] -sub v18.4s, v21.4s, v1.4s -add v21.4s, v21.4s, v1.4s -trn1 v1.4S, v0.4S, v22.4S -trn2 v6.4S, v0.4S, v22.4S -trn1 v7.4S, v21.4S, v18.4S -trn2 v20.4S, v21.4S, v18.4S -trn2 v21.2D, v1.2D, v7.2D -trn2 v18.2D, v6.2D, v20.2D -trn1 v0.2D, v1.2D, v7.2D -trn1 v22.2D, v6.2D, v20.2D -ldr q20, [x17, #+416] -ldr q6, [x17, #+432] -sqrdmulh v7.4S, v21.4S, v6.4S -mul v21.4S, v21.4S,v20.4S -mla v3.4S, v17.4S, v31.s[0] -sub v17.4s, v2.4s, v13.4s -add v2.4s, v2.4s, v13.4s -sqrdmulh v13.4S, v18.4S, v6.4S -mul v18.4S, v18.4S,v20.4S -mla v21.4S, v7.4S, v31.s[0] -sub v7.4s, v11.4s, v3.4s -add v11.4s, v11.4s, v3.4s -trn1 v3.4S, v2.4S, v17.4S -trn2 v1.4S, v2.4S, v17.4S -trn1 v26.4S, v11.4S, v7.4S -trn2 v8.4S, v11.4S, v7.4S -trn2 v11.2D, v3.2D, v26.2D -trn2 v7.2D, v1.2D, v8.2D -trn1 v2.2D, v3.2D, v26.2D -trn1 v17.2D, v1.2D, v8.2D -ldr q8, [x17, #+1440] -ldr q1, [x17, #+1456] -sqrdmulh v26.4S, v11.4S, v1.4S -ldr q3, [x17, #+448] -ldr q9, [x17, #+464] -mul v11.4S, v11.4S,v8.4S -mla v18.4S, v13.4S, v31.s[0] -sub v13.4s, v0.4s, v21.4s -add v0.4s, v0.4s, v21.4s -sqrdmulh v21.4S, v7.4S, v1.4S -ldr q29, [x17, #+480] -ldr q28, [x17, #+496] -mul v7.4S, v7.4S,v8.4S -mla v11.4S, v26.4S, v31.s[0] -sub v26.4s, v22.4s, v18.4s -add v22.4s, v22.4s, v18.4s -sqrdmulh v18.4S, v22.4S, v9.4S -ldr q12, [x17, #+1472] -ldr q14, [x17, #+1488] -mul v22.4S, v22.4S,v3.4S -mla v7.4S, v21.4S, v31.s[0] -sub v21.4s, v2.4s, v11.4s -add v2.4s, v2.4s, v11.4s -sqrdmulh v11.4S, v26.4S, v28.4S -ldr q27, [x17, #+1504] -ldr q30, [x17, #+1520] -mul v26.4S, v26.4S,v29.4S -mla v22.4S, v18.4S, v31.s[0] -sub v18.4s, v17.4s, v7.4s -add v17.4s, v17.4s, v7.4s -sqrdmulh v7.4S, v17.4S, v14.4S -ldr q10, [x0, #224] -mul v17.4S, v17.4S,v12.4S -mla v26.4S, v11.4S, v31.s[0] -sub v11.4s, v0.4s, v22.4s -add v0.4s, v0.4s, v22.4s -sqrdmulh v22.4S, v18.4S, v30.4S -ldr q5, [x0, #240] -mul v18.4S, v18.4S,v27.4S -mla v17.4S, v7.4S, v31.s[0] -sub v7.4s, v13.4s, v26.4s -add v13.4s, v13.4s, v26.4s -mla v18.4S, v22.4S, v31.s[0] -sub v22.4s, v2.4s, v17.4s -add v2.4s, v2.4s, v17.4s -sub v17.4s, v21.4s, v18.4s -add v21.4s, v21.4s, v18.4s -str q0, [x0, #128] -str q11, [x0, #144] -str q13, [x0, #160] -str q7, [x0, #176] -str q2, [x0, #640] -str q22, [x0, #656] -str q21, [x0, #672] -str q17, [x0, #688] -ldr q30, [x17, #+512] -ldr q27, [x17, #+528] -ldr q14, [x0, #192] -ldr q12, [x0, #208] -ldr q1, [x17, #+1536] -ldr q8, [x17, #+1552] -sqrdmulh v19.4S, v10.4S, v27.s[0] -ldr q16, [x0, #736] -mul v10.4S, v10.4S,v30.s[0] -ldr q17, [x0, #752] -sqrdmulh v21.4S, v5.4S, v27.s[0] -ldr q22, [x0, #704] -mul v5.4S, v5.4S,v30.s[0] -ldr q2, [x0, #720] -mla v10.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v16.4S, v8.s[0] -mul v16.4S, v16.4S,v1.s[0] -mla v5.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v10.4s -add v14.4s, v14.4s, v10.4s -sqrdmulh v10.4S, v17.4S, v8.s[0] -mul v17.4S, v17.4S,v1.s[0] -mla v16.4S, v19.4S, v31.s[0] -sub v19.4s, v12.4s, v5.4s -add v12.4s, v12.4s, v5.4s -sqrdmulh v5.4S, v12.4S, v27.s[1] -mul v12.4S, v12.4S,v30.s[1] -mla v17.4S, v10.4S, v31.s[0] -sub v10.4s, v22.4s, v16.4s -add v22.4s, v22.4s, v16.4s -sqrdmulh v16.4S, v19.4S, v27.s[2] -mul v19.4S, v19.4S,v30.s[2] -mla v12.4S, v5.4S, v31.s[0] -sub v5.4s, v2.4s, v17.4s -add v2.4s, v2.4s, v17.4s -sqrdmulh v17.4S, v2.4S, v8.s[1] -mul v2.4S, v2.4S,v1.s[1] -mla v19.4S, v16.4S, v31.s[0] -sub v16.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -sqrdmulh v12.4S, v5.4S, v8.s[2] -mul v5.4S, v5.4S,v1.s[2] -mla v2.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v19.4s -add v21.4s, v21.4s, v19.4s -trn1 v19.4S, v14.4S, v16.4S -trn2 v28.4S, v14.4S, v16.4S -trn1 v29.4S, v21.4S, v17.4S -trn2 v9.4S, v21.4S, v17.4S -trn2 v21.2D, v19.2D, v29.2D -trn2 v17.2D, v28.2D, v9.2D -trn1 v14.2D, v19.2D, v29.2D -trn1 v16.2D, v28.2D, v9.2D -ldr q9, [x17, #+544] -ldr q28, [x17, #+560] -sqrdmulh v29.4S, v21.4S, v28.4S -mul v21.4S, v21.4S,v9.4S -mla v5.4S, v12.4S, v31.s[0] -sub v12.4s, v22.4s, v2.4s -add v22.4s, v22.4s, v2.4s -sqrdmulh v2.4S, v17.4S, v28.4S -mul v17.4S, v17.4S,v9.4S -mla v21.4S, v29.4S, v31.s[0] -sub v29.4s, v10.4s, v5.4s -add v10.4s, v10.4s, v5.4s -trn1 v5.4S, v22.4S, v12.4S -trn2 v19.4S, v22.4S, v12.4S -trn1 v3.4S, v10.4S, v29.4S -trn2 v6.4S, v10.4S, v29.4S -trn2 v10.2D, v5.2D, v3.2D -trn2 v29.2D, v19.2D, v6.2D -trn1 v22.2D, v5.2D, v3.2D -trn1 v12.2D, v19.2D, v6.2D -ldr q6, [x17, #+1568] -ldr q19, [x17, #+1584] -sqrdmulh v3.4S, v10.4S, v19.4S -ldr q5, [x17, #+576] -ldr q20, [x17, #+592] -mul v10.4S, v10.4S,v6.4S -mla v17.4S, v2.4S, v31.s[0] -sub v2.4s, v14.4s, v21.4s -add v14.4s, v14.4s, v21.4s -sqrdmulh v21.4S, v29.4S, v19.4S -ldr q4, [x17, #+608] -ldr q15, [x17, #+624] -mul v29.4S, v29.4S,v6.4S -mla v10.4S, v3.4S, v31.s[0] -sub v3.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -sqrdmulh v17.4S, v16.4S, v20.4S -ldr q7, [x17, #+1600] -ldr q13, [x17, #+1616] -mul v16.4S, v16.4S,v5.4S -mla v29.4S, v21.4S, v31.s[0] -sub v21.4s, v22.4s, v10.4s -add v22.4s, v22.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v15.4S -ldr q11, [x17, #+1632] -ldr q0, [x17, #+1648] -mul v3.4S, v3.4S,v4.4S -mla v16.4S, v17.4S, v31.s[0] -sub v17.4s, v12.4s, v29.4s -add v12.4s, v12.4s, v29.4s -sqrdmulh v29.4S, v12.4S, v13.4S -ldr q18, [x0, #288] -mul v12.4S, v12.4S,v7.4S -mla v3.4S, v10.4S, v31.s[0] -sub v10.4s, v14.4s, v16.4s -add v14.4s, v14.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v0.4S -ldr q26, [x0, #304] -mul v17.4S, v17.4S,v11.4S -mla v12.4S, v29.4S, v31.s[0] -sub v29.4s, v2.4s, v3.4s -add v2.4s, v2.4s, v3.4s -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sub v12.4s, v21.4s, v17.4s -add v21.4s, v21.4s, v17.4s -str q14, [x0, #192] -str q10, [x0, #208] -str q2, [x0, #224] -str q29, [x0, #240] -str q22, [x0, #704] -str q16, [x0, #720] -str q21, [x0, #736] -str q12, [x0, #752] -ldr q0, [x17, #+640] -ldr q11, [x17, #+656] -ldr q13, [x0, #256] -ldr q7, [x0, #272] -ldr q19, [x17, #+1664] -ldr q6, [x17, #+1680] -sqrdmulh v8.4S, v18.4S, v11.s[0] -ldr q1, [x0, #800] -mul v18.4S, v18.4S,v0.s[0] -ldr q12, [x0, #816] -sqrdmulh v21.4S, v26.4S, v11.s[0] -ldr q16, [x0, #768] -mul v26.4S, v26.4S,v0.s[0] -ldr q22, [x0, #784] -mla v18.4S, v8.4S, v31.s[0] -sqrdmulh v8.4S, v1.4S, v6.s[0] -mul v1.4S, v1.4S,v19.s[0] -mla v26.4S, v21.4S, v31.s[0] -sub v21.4s, v13.4s, v18.4s -add v13.4s, v13.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v6.s[0] -mul v12.4S, v12.4S,v19.s[0] -mla v1.4S, v8.4S, v31.s[0] -sub v8.4s, v7.4s, v26.4s -add v7.4s, v7.4s, v26.4s -sqrdmulh v26.4S, v7.4S, v11.s[1] -mul v7.4S, v7.4S,v0.s[1] -mla v12.4S, v18.4S, v31.s[0] -sub v18.4s, v16.4s, v1.4s -add v16.4s, v16.4s, v1.4s -sqrdmulh v1.4S, v8.4S, v11.s[2] -mul v8.4S, v8.4S,v0.s[2] -mla v7.4S, v26.4S, v31.s[0] -sub v26.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v22.4S, v6.s[1] -mul v22.4S, v22.4S,v19.s[1] -mla v8.4S, v1.4S, v31.s[0] -sub v1.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -sqrdmulh v7.4S, v26.4S, v6.s[2] -mul v26.4S, v26.4S,v19.s[2] -mla v22.4S, v12.4S, v31.s[0] -sub v12.4s, v21.4s, v8.4s -add v21.4s, v21.4s, v8.4s -trn1 v8.4S, v13.4S, v1.4S -trn2 v15.4S, v13.4S, v1.4S -trn1 v4.4S, v21.4S, v12.4S -trn2 v20.4S, v21.4S, v12.4S -trn2 v21.2D, v8.2D, v4.2D -trn2 v12.2D, v15.2D, v20.2D -trn1 v13.2D, v8.2D, v4.2D -trn1 v1.2D, v15.2D, v20.2D -ldr q20, [x17, #+672] -ldr q15, [x17, #+688] -sqrdmulh v4.4S, v21.4S, v15.4S -mul v21.4S, v21.4S,v20.4S -mla v26.4S, v7.4S, v31.s[0] -sub v7.4s, v16.4s, v22.4s -add v16.4s, v16.4s, v22.4s -sqrdmulh v22.4S, v12.4S, v15.4S -mul v12.4S, v12.4S,v20.4S -mla v21.4S, v4.4S, v31.s[0] -sub v4.4s, v18.4s, v26.4s -add v18.4s, v18.4s, v26.4s -trn1 v26.4S, v16.4S, v7.4S -trn2 v8.4S, v16.4S, v7.4S -trn1 v5.4S, v18.4S, v4.4S -trn2 v28.4S, v18.4S, v4.4S -trn2 v18.2D, v26.2D, v5.2D -trn2 v4.2D, v8.2D, v28.2D -trn1 v16.2D, v26.2D, v5.2D -trn1 v7.2D, v8.2D, v28.2D -ldr q28, [x17, #+1696] -ldr q8, [x17, #+1712] -sqrdmulh v5.4S, v18.4S, v8.4S -ldr q26, [x17, #+704] -ldr q9, [x17, #+720] -mul v18.4S, v18.4S,v28.4S -mla v12.4S, v22.4S, v31.s[0] -sub v22.4s, v13.4s, v21.4s -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v4.4S, v8.4S -ldr q27, [x17, #+736] -ldr q30, [x17, #+752] -mul v4.4S, v4.4S,v28.4S -mla v18.4S, v5.4S, v31.s[0] -sub v5.4s, v1.4s, v12.4s -add v1.4s, v1.4s, v12.4s -sqrdmulh v12.4S, v1.4S, v9.4S -ldr q29, [x17, #+1728] -ldr q2, [x17, #+1744] -mul v1.4S, v1.4S,v26.4S -mla v4.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v18.4s -add v16.4s, v16.4s, v18.4s -sqrdmulh v18.4S, v5.4S, v30.4S -ldr q10, [x17, #+1760] -ldr q14, [x17, #+1776] -mul v5.4S, v5.4S,v27.4S -mla v1.4S, v12.4S, v31.s[0] -sub v12.4s, v7.4s, v4.4s -add v7.4s, v7.4s, v4.4s -sqrdmulh v4.4S, v7.4S, v2.4S -ldr q17, [x0, #352] -mul v7.4S, v7.4S,v29.4S -mla v5.4S, v18.4S, v31.s[0] -sub v18.4s, v13.4s, v1.4s -add v13.4s, v13.4s, v1.4s -sqrdmulh v1.4S, v12.4S, v14.4S -ldr q3, [x0, #368] -mul v12.4S, v12.4S,v10.4S -mla v7.4S, v4.4S, v31.s[0] -sub v4.4s, v22.4s, v5.4s -add v22.4s, v22.4s, v5.4s -mla v12.4S, v1.4S, v31.s[0] -sub v1.4s, v16.4s, v7.4s -add v16.4s, v16.4s, v7.4s -sub v7.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -str q13, [x0, #256] -str q18, [x0, #272] -str q22, [x0, #288] -str q4, [x0, #304] -str q16, [x0, #768] -str q1, [x0, #784] -str q21, [x0, #800] -str q7, [x0, #816] -ldr q14, [x17, #+768] -ldr q10, [x17, #+784] -ldr q2, [x0, #320] -ldr q29, [x0, #336] -ldr q8, [x17, #+1792] -ldr q28, [x17, #+1808] -sqrdmulh v6.4S, v17.4S, v10.s[0] -ldr q19, [x0, #864] -mul v17.4S, v17.4S,v14.s[0] -ldr q7, [x0, #880] -sqrdmulh v21.4S, v3.4S, v10.s[0] -ldr q1, [x0, #832] -mul v3.4S, v3.4S,v14.s[0] -ldr q16, [x0, #848] -mla v17.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v19.4S, v28.s[0] -mul v19.4S, v19.4S,v8.s[0] -mla v3.4S, v21.4S, v31.s[0] -sub v21.4s, v2.4s, v17.4s -add v2.4s, v2.4s, v17.4s -sqrdmulh v17.4S, v7.4S, v28.s[0] -mul v7.4S, v7.4S,v8.s[0] -mla v19.4S, v6.4S, v31.s[0] -sub v6.4s, v29.4s, v3.4s -add v29.4s, v29.4s, v3.4s -sqrdmulh v3.4S, v29.4S, v10.s[1] -mul v29.4S, v29.4S,v14.s[1] -mla v7.4S, v17.4S, v31.s[0] -sub v17.4s, v1.4s, v19.4s -add v1.4s, v1.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v10.s[2] -mul v6.4S, v6.4S,v14.s[2] -mla v29.4S, v3.4S, v31.s[0] -sub v3.4s, v16.4s, v7.4s -add v16.4s, v16.4s, v7.4s -sqrdmulh v7.4S, v16.4S, v28.s[1] -mul v16.4S, v16.4S,v8.s[1] -mla v6.4S, v19.4S, v31.s[0] -sub v19.4s, v2.4s, v29.4s -add v2.4s, v2.4s, v29.4s -sqrdmulh v29.4S, v3.4S, v28.s[2] -mul v3.4S, v3.4S,v8.s[2] -mla v16.4S, v7.4S, v31.s[0] -sub v7.4s, v21.4s, v6.4s -add v21.4s, v21.4s, v6.4s -trn1 v6.4S, v2.4S, v19.4S -trn2 v30.4S, v2.4S, v19.4S -trn1 v27.4S, v21.4S, v7.4S -trn2 v9.4S, v21.4S, v7.4S -trn2 v21.2D, v6.2D, v27.2D -trn2 v7.2D, v30.2D, v9.2D -trn1 v2.2D, v6.2D, v27.2D -trn1 v19.2D, v30.2D, v9.2D -ldr q9, [x17, #+800] -ldr q30, [x17, #+816] -sqrdmulh v27.4S, v21.4S, v30.4S -mul v21.4S, v21.4S,v9.4S -mla v3.4S, v29.4S, v31.s[0] -sub v29.4s, v1.4s, v16.4s -add v1.4s, v1.4s, v16.4s -sqrdmulh v16.4S, v7.4S, v30.4S -mul v7.4S, v7.4S,v9.4S -mla v21.4S, v27.4S, v31.s[0] -sub v27.4s, v17.4s, v3.4s -add v17.4s, v17.4s, v3.4s -trn1 v3.4S, v1.4S, v29.4S -trn2 v6.4S, v1.4S, v29.4S -trn1 v26.4S, v17.4S, v27.4S -trn2 v15.4S, v17.4S, v27.4S -trn2 v17.2D, v3.2D, v26.2D -trn2 v27.2D, v6.2D, v15.2D -trn1 v1.2D, v3.2D, v26.2D -trn1 v29.2D, v6.2D, v15.2D -ldr q15, [x17, #+1824] -ldr q6, [x17, #+1840] -sqrdmulh v26.4S, v17.4S, v6.4S -ldr q3, [x17, #+832] -ldr q20, [x17, #+848] -mul v17.4S, v17.4S,v15.4S -mla v7.4S, v16.4S, v31.s[0] -sub v16.4s, v2.4s, v21.4s -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v27.4S, v6.4S -ldr q11, [x17, #+864] -ldr q0, [x17, #+880] -mul v27.4S, v27.4S,v15.4S -mla v17.4S, v26.4S, v31.s[0] -sub v26.4s, v19.4s, v7.4s -add v19.4s, v19.4s, v7.4s -sqrdmulh v7.4S, v19.4S, v20.4S -ldr q4, [x17, #+1856] -ldr q22, [x17, #+1872] -mul v19.4S, v19.4S,v3.4S -mla v27.4S, v21.4S, v31.s[0] -sub v21.4s, v1.4s, v17.4s -add v1.4s, v1.4s, v17.4s -sqrdmulh v17.4S, v26.4S, v0.4S -ldr q18, [x17, #+1888] -ldr q13, [x17, #+1904] -mul v26.4S, v26.4S,v11.4S -mla v19.4S, v7.4S, v31.s[0] -sub v7.4s, v29.4s, v27.4s -add v29.4s, v29.4s, v27.4s -sqrdmulh v27.4S, v29.4S, v22.4S -ldr q12, [x0, #416] -mul v29.4S, v29.4S,v4.4S -mla v26.4S, v17.4S, v31.s[0] -sub v17.4s, v2.4s, v19.4s -add v2.4s, v2.4s, v19.4s -sqrdmulh v19.4S, v7.4S, v13.4S -ldr q5, [x0, #432] -mul v7.4S, v7.4S,v18.4S -mla v29.4S, v27.4S, v31.s[0] -sub v27.4s, v16.4s, v26.4s -add v16.4s, v16.4s, v26.4s -mla v7.4S, v19.4S, v31.s[0] -sub v19.4s, v1.4s, v29.4s -add v1.4s, v1.4s, v29.4s -sub v29.4s, v21.4s, v7.4s -add v21.4s, v21.4s, v7.4s -str q2, [x0, #320] -str q17, [x0, #336] -str q16, [x0, #352] -str q27, [x0, #368] -str q1, [x0, #832] -str q19, [x0, #848] -str q21, [x0, #864] -str q29, [x0, #880] -ldr q13, [x17, #+896] -ldr q18, [x17, #+912] -ldr q22, [x0, #384] -ldr q4, [x0, #400] -ldr q6, [x17, #+1920] -ldr q15, [x17, #+1936] -sqrdmulh v28.4S, v12.4S, v18.s[0] -ldr q8, [x0, #928] -mul v12.4S, v12.4S,v13.s[0] -ldr q29, [x0, #944] -sqrdmulh v21.4S, v5.4S, v18.s[0] -ldr q19, [x0, #896] -mul v5.4S, v5.4S,v13.s[0] -ldr q1, [x0, #912] -mla v12.4S, v28.4S, v31.s[0] -sqrdmulh v28.4S, v8.4S, v15.s[0] -mul v8.4S, v8.4S,v6.s[0] -mla v5.4S, v21.4S, v31.s[0] -sub v21.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v29.4S, v15.s[0] -mul v29.4S, v29.4S,v6.s[0] -mla v8.4S, v28.4S, v31.s[0] -sub v28.4s, v4.4s, v5.4s -add v4.4s, v4.4s, v5.4s -sqrdmulh v5.4S, v4.4S, v18.s[1] -mul v4.4S, v4.4S,v13.s[1] -mla v29.4S, v12.4S, v31.s[0] -sub v12.4s, v19.4s, v8.4s -add v19.4s, v19.4s, v8.4s -sqrdmulh v8.4S, v28.4S, v18.s[2] -mul v28.4S, v28.4S,v13.s[2] -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v1.4s, v29.4s -add v1.4s, v1.4s, v29.4s -sqrdmulh v29.4S, v1.4S, v15.s[1] -mul v1.4S, v1.4S,v6.s[1] -mla v28.4S, v8.4S, v31.s[0] -sub v8.4s, v22.4s, v4.4s -add v22.4s, v22.4s, v4.4s -sqrdmulh v4.4S, v5.4S, v15.s[2] -mul v5.4S, v5.4S,v6.s[2] -mla v1.4S, v29.4S, v31.s[0] -sub v29.4s, v21.4s, v28.4s -add v21.4s, v21.4s, v28.4s -trn1 v28.4S, v22.4S, v8.4S -trn2 v0.4S, v22.4S, v8.4S -trn1 v11.4S, v21.4S, v29.4S -trn2 v20.4S, v21.4S, v29.4S -trn2 v21.2D, v28.2D, v11.2D -trn2 v29.2D, v0.2D, v20.2D -trn1 v22.2D, v28.2D, v11.2D -trn1 v8.2D, v0.2D, v20.2D -ldr q20, [x17, #+928] -ldr q0, [x17, #+944] -sqrdmulh v11.4S, v21.4S, v0.4S -mul v21.4S, v21.4S,v20.4S -mla v5.4S, v4.4S, v31.s[0] -sub v4.4s, v19.4s, v1.4s -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v29.4S, v0.4S -mul v29.4S, v29.4S,v20.4S -mla v21.4S, v11.4S, v31.s[0] -sub v11.4s, v12.4s, v5.4s -add v12.4s, v12.4s, v5.4s -trn1 v5.4S, v19.4S, v4.4S -trn2 v28.4S, v19.4S, v4.4S -trn1 v3.4S, v12.4S, v11.4S -trn2 v30.4S, v12.4S, v11.4S -trn2 v12.2D, v5.2D, v3.2D -trn2 v11.2D, v28.2D, v30.2D -trn1 v19.2D, v5.2D, v3.2D -trn1 v4.2D, v28.2D, v30.2D -ldr q30, [x17, #+1952] -ldr q28, [x17, #+1968] -sqrdmulh v3.4S, v12.4S, v28.4S -ldr q5, [x17, #+960] -ldr q9, [x17, #+976] -mul v12.4S, v12.4S,v30.4S -mla v29.4S, v1.4S, v31.s[0] -sub v1.4s, v22.4s, v21.4s -add v22.4s, v22.4s, v21.4s -sqrdmulh v21.4S, v11.4S, v28.4S -ldr q10, [x17, #+992] -ldr q14, [x17, #+1008] -mul v11.4S, v11.4S,v30.4S -mla v12.4S, v3.4S, v31.s[0] -sub v3.4s, v8.4s, v29.4s -add v8.4s, v8.4s, v29.4s -sqrdmulh v29.4S, v8.4S, v9.4S -ldr q27, [x17, #+1984] -ldr q16, [x17, #+2000] -mul v8.4S, v8.4S,v5.4S -mla v11.4S, v21.4S, v31.s[0] -sub v21.4s, v19.4s, v12.4s -add v19.4s, v19.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v14.4S -ldr q17, [x17, #+2016] -ldr q2, [x17, #+2032] -mul v3.4S, v3.4S,v10.4S -mla v8.4S, v29.4S, v31.s[0] -sub v29.4s, v4.4s, v11.4s -add v4.4s, v4.4s, v11.4s -sqrdmulh v11.4S, v4.4S, v16.4S -ldr q7, [x0, #480] -mul v4.4S, v4.4S,v27.4S -mla v3.4S, v12.4S, v31.s[0] -sub v12.4s, v22.4s, v8.4s -add v22.4s, v22.4s, v8.4s -sqrdmulh v8.4S, v29.4S, v2.4S -ldr q26, [x0, #496] -mul v29.4S, v29.4S,v17.4S -mla v4.4S, v11.4S, v31.s[0] -sub v11.4s, v1.4s, v3.4s -add v1.4s, v1.4s, v3.4s -mla v29.4S, v8.4S, v31.s[0] -sub v8.4s, v19.4s, v4.4s -add v19.4s, v19.4s, v4.4s -sub v4.4s, v21.4s, v29.4s -add v21.4s, v21.4s, v29.4s -str q22, [x0, #384] -str q12, [x0, #400] -str q1, [x0, #416] -str q11, [x0, #432] -str q19, [x0, #896] -str q8, [x0, #912] -str q21, [x0, #928] -str q4, [x0, #944] -ldr q2, [x17, #+1024] -ldr q17, [x17, #+1040] -ldr q16, [x0, #448] -ldr q27, [x0, #464] -ldr q28, [x17, #+2048] -ldr q30, [x17, #+2064] -sqrdmulh v15.4S, v7.4S, v17.s[0] -ldr q6, [x0, #992] -mul v7.4S, v7.4S,v2.s[0] -ldr q4, [x0, #1008] -sqrdmulh v21.4S, v26.4S, v17.s[0] -ldr q8, [x0, #960] -mul v26.4S, v26.4S,v2.s[0] -ldr q19, [x0, #976] -mla v7.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v6.4S, v30.s[0] -mul v6.4S, v6.4S,v28.s[0] -mla v26.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v7.4s -add v16.4s, v16.4s, v7.4s -sqrdmulh v7.4S, v4.4S, v30.s[0] -mul v4.4S, v4.4S,v28.s[0] -mla v6.4S, v15.4S, v31.s[0] -sub v15.4s, v27.4s, v26.4s -add v27.4s, v27.4s, v26.4s -sqrdmulh v26.4S, v27.4S, v17.s[1] -mul v27.4S, v27.4S,v2.s[1] -mla v4.4S, v7.4S, v31.s[0] -sub v7.4s, v8.4s, v6.4s -add v8.4s, v8.4s, v6.4s -sqrdmulh v6.4S, v15.4S, v17.s[2] -mul v15.4S, v15.4S,v2.s[2] -mla v27.4S, v26.4S, v31.s[0] -sub v26.4s, v19.4s, v4.4s -add v19.4s, v19.4s, v4.4s -sqrdmulh v4.4S, v19.4S, v30.s[1] -mul v19.4S, v19.4S,v28.s[1] -mla v15.4S, v6.4S, v31.s[0] -sub v6.4s, v16.4s, v27.4s -add v16.4s, v16.4s, v27.4s -sqrdmulh v27.4S, v26.4S, v30.s[2] -mul v26.4S, v26.4S,v28.s[2] -mla v19.4S, v4.4S, v31.s[0] -sub v4.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -trn1 v15.4S, v16.4S, v6.4S -trn2 v14.4S, v16.4S, v6.4S -trn1 v10.4S, v21.4S, v4.4S -trn2 v9.4S, v21.4S, v4.4S -trn2 v21.2D, v15.2D, v10.2D -trn2 v4.2D, v14.2D, v9.2D -trn1 v16.2D, v15.2D, v10.2D -trn1 v6.2D, v14.2D, v9.2D -ldr q9, [x17, #+1056] -ldr q14, [x17, #+1072] -sqrdmulh v10.4S, v21.4S, v14.4S -mul v21.4S, v21.4S,v9.4S -mla v26.4S, v27.4S, v31.s[0] -sub v27.4s, v8.4s, v19.4s -add v8.4s, v8.4s, v19.4s -sqrdmulh v19.4S, v4.4S, v14.4S -mul v4.4S, v4.4S,v9.4S -mla v21.4S, v10.4S, v31.s[0] -sub v10.4s, v7.4s, v26.4s -add v7.4s, v7.4s, v26.4s -trn1 v26.4S, v8.4S, v27.4S -trn2 v15.4S, v8.4S, v27.4S -trn1 v5.4S, v7.4S, v10.4S -trn2 v0.4S, v7.4S, v10.4S -trn2 v7.2D, v26.2D, v5.2D -trn2 v10.2D, v15.2D, v0.2D -trn1 v8.2D, v26.2D, v5.2D -trn1 v27.2D, v15.2D, v0.2D -ldr q0, [x17, #+2080] -ldr q15, [x17, #+2096] -sqrdmulh v5.4S, v7.4S, v15.4S -ldr q26, [x17, #+1088] -ldr q20, [x17, #+1104] -mul v7.4S, v7.4S,v0.4S -mla v4.4S, v19.4S, v31.s[0] -sub v19.4s, v16.4s, v21.4s -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v10.4S, v15.4S -ldr q18, [x17, #+1120] -ldr q13, [x17, #+1136] -mul v10.4S, v10.4S,v0.4S -mla v7.4S, v5.4S, v31.s[0] -sub v5.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -sqrdmulh v4.4S, v6.4S, v20.4S -ldr q11, [x17, #+2112] -ldr q1, [x17, #+2128] -mul v6.4S, v6.4S,v26.4S -mla v10.4S, v21.4S, v31.s[0] -sub v21.4s, v8.4s, v7.4s -add v8.4s, v8.4s, v7.4s -sqrdmulh v7.4S, v5.4S, v13.4S -ldr q12, [x17, #+2144] -ldr q22, [x17, #+2160] -mul v5.4S, v5.4S,v18.4S -mla v6.4S, v4.4S, v31.s[0] -sub v4.4s, v27.4s, v10.4s -add v27.4s, v27.4s, v10.4s -sqrdmulh v10.4S, v27.4S, v1.4S -mul v27.4S, v27.4S,v11.4S -mla v5.4S, v7.4S, v31.s[0] -sub v7.4s, v16.4s, v6.4s -add v16.4s, v16.4s, v6.4s -sqrdmulh v6.4S, v4.4S, v22.4S -mul v4.4S, v4.4S,v12.4S -mla v27.4S, v10.4S, v31.s[0] -sub v10.4s, v19.4s, v5.4s -add v19.4s, v19.4s, v5.4s -mla v4.4S, v6.4S, v31.s[0] -sub v6.4s, v8.4s, v27.4s -add v8.4s, v8.4s, v27.4s -sub v27.4s, v21.4s, v4.4s -add v21.4s, v21.4s, v4.4s -str q16, [x0, #448] -str q7, [x0, #464] -str q19, [x0, #480] -str q10, [x0, #496] -str q8, [x0, #960] -str q6, [x0, #976] -str q21, [x0, #992] -str q27, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2392 -// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_0.s deleted file mode 100644 index 7675702..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_0.s +++ /dev/null @@ -1,2422 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_3_z4_0 -.global _ntt_u32_full_neon_asm_var_4_4_3_z4_0 -ntt_u32_full_neon_asm_var_4_4_3_z4_0: -_ntt_u32_full_neon_asm_var_4_4_3_z4_0: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #800] -ldr q21, [x0, #864] -ldr q20, [x0, #928] -ldr q19, [x0, #992] -ldr q18, [x0, #288] -ldr q17, [x0, #352] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -sqrdmulh v2.4S, v22.4S, v29.s[0] -ldr q1, [x0, #544] -mul v22.4S, v22.4S,v30.s[0] -ldr q0, [x0, #608] -sqrdmulh v15.4S, v21.4S, v29.s[0] -ldr q14, [x0, #672] -mul v21.4S, v21.4S,v30.s[0] -ldr q13, [x0, #736] -mla v22.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q12, [x0, #32] -sub v11.4s, v18.4s, v22.4s -mla v21.4S, v15.4S, v31.s[0] -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q15, [x0, #96] -sub v10.4s, v17.4s, v21.4s -mla v20.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v1.4S, v29.s[0] -ldr q2, [x0, #160] -mul v1.4S, v1.4S,v30.s[0] -sub v9.4s, v16.4s, v20.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v0.4S, v29.s[0] -ldr q22, [x0, #224] -mul v0.4S, v0.4S,v30.s[0] -sub v8.4s, v3.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v12.4s, v1.4s -mla v0.4S, v20.4S, v31.s[0] -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -sub v20.4s, v15.4s, v0.4s -mla v14.4S, v19.4S, v31.s[0] -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v19.4s, v2.4s, v14.4s -mla v13.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v1.4s, v22.4s, v13.4s -mla v16.4S, v0.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v0.4s, v2.4s, v16.4s -mla v3.4S, v14.4S, v31.s[0] -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v14.4s, v22.4s, v3.4s -mla v18.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v13.4s, v12.4s, v18.4s -mla v17.4S, v16.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v8.4S, v29.s[2] -mul v8.4S, v8.4S,v30.s[2] -sub v16.4s, v15.4s, v17.4s -mla v9.4S, v3.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v3.4s, v19.4s, v9.4s -mla v8.4S, v18.4S, v31.s[0] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v8.4s -mla v11.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v8.4s -sqrdmulh v8.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v17.4s, v21.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v27.s[0] -mul v22.4S, v22.4S,v28.s[0] -sub v9.4s, v20.4s, v10.4s -mla v2.4S, v8.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v8.4s, v12.4s, v2.4s -mla v22.4S, v11.4S, v31.s[0] -add v12.4s, v12.4s, v2.4s -sqrdmulh v2.4S, v14.4S, v27.s[1] -mul v14.4S, v14.4S,v28.s[1] -sub v11.4s, v15.4s, v22.4s -mla v0.4S, v10.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v27.s[2] -mul v19.4S, v19.4S,v28.s[2] -sub v10.4s, v13.4s, v0.4s -mla v14.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v2.4s, v16.4s, v14.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v27.s[3] -mul v3.4S, v3.4S,v28.s[3] -sub v22.4s, v21.4s, v19.4s -mla v1.4S, v0.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v0.4s, v20.4s, v1.4s -mla v3.4S, v14.4S, v31.s[0] -add v20.4s, v20.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v25.s[0] -mul v15.4S, v15.4S,v26.s[0] -sub v14.4s, v17.4s, v3.4s -mla v18.4S, v19.4S, v31.s[0] -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v11.4S, v25.s[1] -mul v11.4S, v11.4S,v26.s[1] -sub v19.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v1.4s, v12.4s, v15.4s -mla v11.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v25.s[3] -mul v2.4S, v2.4S,v26.s[3] -sub v3.4s, v8.4s, v11.4s -mla v16.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v11.4s -str q12, [x0, #32] -sqrdmulh v12.4S, v20.4S, v23.s[0] -str q1, [x0, #96] -mul v20.4S, v20.4S,v24.s[0] -ldr q1, [x0, #816] -sub v11.4s, v13.4s, v16.4s -ldr q18, [x0, #880] -mla v2.4S, v15.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -str q8, [x0, #160] -sqrdmulh v8.4S, v0.4S, v23.s[1] -str q3, [x0, #224] -mul v0.4S, v0.4S,v24.s[1] -ldr q3, [x0, #944] -sub v16.4s, v10.4s, v2.4s -ldr q15, [x0, #1008] -mla v20.4S, v12.4S, v31.s[0] -add v10.4s, v10.4s, v2.4s -str q13, [x0, #288] -sqrdmulh v13.4S, v9.4S, v23.s[2] -str q11, [x0, #352] -mul v9.4S, v9.4S,v24.s[2] -ldr q11, [x0, #304] -sub v2.4s, v21.4s, v20.4s -ldr q12, [x0, #368] -mla v0.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v20.4s -str q10, [x0, #416] -sqrdmulh v10.4S, v19.4S, v23.s[3] -str q16, [x0, #480] -mul v19.4S, v19.4S,v24.s[3] -ldr q16, [x0, #432] -sub v20.4s, v22.4s, v0.4s -ldr q8, [x0, #496] -mla v9.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -str q21, [x0, #544] -sqrdmulh v21.4S, v1.4S, v29.s[0] -str q2, [x0, #608] -ldr q2, [x0, #560] -mul v1.4S, v1.4S,v30.s[0] -ldr q0, [x0, #624] -sub v13.4s, v17.4s, v9.4s -mla v19.4S, v10.4S, v31.s[0] -add v17.4s, v17.4s, v9.4s -str q22, [x0, #672] -sqrdmulh v22.4S, v18.4S, v29.s[0] -str q20, [x0, #736] -ldr q20, [x0, #688] -mul v18.4S, v18.4S,v30.s[0] -ldr q9, [x0, #752] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -str q17, [x0, #800] -sqrdmulh v17.4S, v3.4S, v29.s[0] -str q13, [x0, #864] -mul v3.4S, v3.4S,v30.s[0] -ldr q13, [x0, #48] -sub v19.4s, v11.4s, v1.4s -mla v18.4S, v22.4S, v31.s[0] -add v11.4s, v11.4s, v1.4s -str q14, [x0, #928] -sqrdmulh v14.4S, v15.4S, v29.s[0] -str q10, [x0, #992] -mul v15.4S, v15.4S,v30.s[0] -ldr q10, [x0, #112] -sub v1.4s, v12.4s, v18.4s -mla v3.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v29.s[0] -ldr q17, [x0, #176] -mul v2.4S, v2.4S,v30.s[0] -sub v22.4s, v16.4s, v3.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v0.4S, v29.s[0] -ldr q14, [x0, #240] -mul v0.4S, v0.4S,v30.s[0] -sub v21.4s, v8.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -sub v18.4s, v13.4s, v2.4s -mla v0.4S, v3.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v9.4S, v29.s[0] -mul v9.4S, v9.4S,v30.s[0] -sub v3.4s, v10.4s, v0.4s -mla v20.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v15.4s, v17.4s, v20.4s -mla v9.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sub v2.4s, v14.4s, v9.4s -mla v16.4S, v0.4S, v31.s[0] -add v14.4s, v14.4s, v9.4s -sqrdmulh v9.4S, v11.4S, v29.s[1] -mul v11.4S, v11.4S,v30.s[1] -sub v0.4s, v17.4s, v16.4s -mla v8.4S, v20.4S, v31.s[0] -add v17.4s, v17.4s, v16.4s -sqrdmulh v16.4S, v12.4S, v29.s[1] -mul v12.4S, v12.4S,v30.s[1] -sub v20.4s, v14.4s, v8.4s -mla v11.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v8.4s -sqrdmulh v8.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v9.4s, v13.4s, v11.4s -mla v12.4S, v16.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v16.4s, v10.4s, v12.4s -mla v22.4S, v8.4S, v31.s[0] -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v8.4s, v15.4s, v22.4s -mla v21.4S, v11.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -sub v11.4s, v2.4s, v21.4s -mla v19.4S, v12.4S, v31.s[0] -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[0] -mul v17.4S, v17.4S,v28.s[0] -sub v12.4s, v18.4s, v19.4s -mla v1.4S, v22.4S, v31.s[0] -add v18.4s, v18.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v22.4s, v3.4s, v1.4s -mla v17.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v21.4s, v13.4s, v17.4s -mla v14.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v20.4S, v27.s[1] -mul v20.4S, v20.4S,v28.s[1] -sub v19.4s, v10.4s, v14.4s -mla v0.4S, v1.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v27.s[2] -mul v15.4S, v15.4S,v28.s[2] -sub v1.4s, v9.4s, v0.4s -mla v20.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v0.4s -sqrdmulh v0.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v17.4s, v16.4s, v20.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v14.4s, v18.4s, v15.4s -mla v2.4S, v0.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v27.s[3] -mul v11.4S, v11.4S,v28.s[3] -sub v0.4s, v3.4s, v2.4s -mla v8.4S, v20.4S, v31.s[0] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v10.4S, v25.s[0] -mul v10.4S, v10.4S,v26.s[0] -sub v20.4s, v12.4s, v8.4s -mla v11.4S, v15.4S, v31.s[0] -add v12.4s, v12.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v15.4s, v22.4s, v11.4s -mla v10.4S, v2.4S, v31.s[0] -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v2.4s, v13.4s, v10.4s -mla v19.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v10.4s -sqrdmulh v10.4S, v17.4S, v25.s[3] -mul v17.4S, v17.4S,v26.s[3] -sub v8.4s, v21.4s, v19.4s -mla v16.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -str q13, [x0, #48] -sqrdmulh v13.4S, v3.4S, v23.s[0] -str q2, [x0, #112] -mul v3.4S, v3.4S,v24.s[0] -ldr q2, [x0, #768] -sub v19.4s, v9.4s, v16.4s -ldr q11, [x0, #832] -mla v17.4S, v10.4S, v31.s[0] -add v9.4s, v9.4s, v16.4s -str q21, [x0, #176] -sqrdmulh v21.4S, v0.4S, v23.s[1] -str q8, [x0, #240] -mul v0.4S, v0.4S,v24.s[1] -ldr q8, [x0, #896] -sub v16.4s, v1.4s, v17.4s -ldr q10, [x0, #960] -mla v3.4S, v13.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -str q9, [x0, #304] -sqrdmulh v9.4S, v22.4S, v23.s[2] -str q19, [x0, #368] -mul v22.4S, v22.4S,v24.s[2] -ldr q19, [x0, #256] -sub v17.4s, v18.4s, v3.4s -ldr q13, [x0, #320] -mla v0.4S, v21.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -str q1, [x0, #432] -sqrdmulh v1.4S, v15.4S, v23.s[3] -str q16, [x0, #496] -mul v15.4S, v15.4S,v24.s[3] -ldr q16, [x0, #384] -sub v3.4s, v14.4s, v0.4s -ldr q21, [x0, #448] -mla v22.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -str q18, [x0, #560] -sqrdmulh v18.4S, v2.4S, v29.s[0] -str q17, [x0, #624] -ldr q17, [x0, #512] -mul v2.4S, v2.4S,v30.s[0] -ldr q0, [x0, #576] -sub v9.4s, v12.4s, v22.4s -mla v15.4S, v1.4S, v31.s[0] -add v12.4s, v12.4s, v22.4s -str q14, [x0, #688] -sqrdmulh v14.4S, v11.4S, v29.s[0] -str q3, [x0, #752] -ldr q3, [x0, #640] -mul v11.4S, v11.4S,v30.s[0] -ldr q22, [x0, #704] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -str q12, [x0, #816] -sqrdmulh v12.4S, v8.4S, v29.s[0] -str q9, [x0, #880] -mul v8.4S, v8.4S,v30.s[0] -ldr q9, [x0, #0] -sub v15.4s, v19.4s, v2.4s -mla v11.4S, v14.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -str q20, [x0, #944] -sqrdmulh v20.4S, v10.4S, v29.s[0] -str q1, [x0, #1008] -mul v10.4S, v10.4S,v30.s[0] -ldr q1, [x0, #64] -sub v2.4s, v13.4s, v11.4s -mla v8.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v29.s[0] -ldr q12, [x0, #128] -mul v17.4S, v17.4S,v30.s[0] -sub v14.4s, v16.4s, v8.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v0.4S, v29.s[0] -ldr q20, [x0, #192] -mul v0.4S, v0.4S,v30.s[0] -sub v18.4s, v21.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -sub v11.4s, v9.4s, v17.4s -mla v0.4S, v8.4S, v31.s[0] -add v9.4s, v9.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v8.4s, v1.4s, v0.4s -mla v3.4S, v10.4S, v31.s[0] -add v1.4s, v1.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v10.4s, v12.4s, v3.4s -mla v22.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v17.4s, v20.4s, v22.4s -mla v16.4S, v0.4S, v31.s[0] -add v20.4s, v20.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -sub v0.4s, v12.4s, v16.4s -mla v21.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v3.4s, v20.4s, v21.4s -mla v19.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v22.4s, v9.4s, v19.4s -mla v13.4S, v16.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v29.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v16.4s, v1.4s, v13.4s -mla v14.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v13.4s -sqrdmulh v13.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v21.4s, v10.4s, v14.4s -mla v18.4S, v19.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v19.4s, v17.4s, v18.4s -mla v15.4S, v13.4S, v31.s[0] -add v17.4s, v17.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v13.4s, v11.4s, v15.4s -mla v2.4S, v14.4S, v31.s[0] -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v27.s[0] -mul v20.4S, v20.4S,v28.s[0] -sub v14.4s, v8.4s, v2.4s -mla v12.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v2.4s -sqrdmulh v2.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v18.4s, v9.4s, v12.4s -mla v20.4S, v15.4S, v31.s[0] -add v9.4s, v9.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -sub v15.4s, v1.4s, v20.4s -mla v0.4S, v2.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -sub v2.4s, v22.4s, v0.4s -mla v3.4S, v12.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v12.4s, v16.4s, v3.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v11.4s, v10.4s -mla v17.4S, v0.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -sqrdmulh v10.4S, v19.4S, v27.s[3] -mul v19.4S, v19.4S,v28.s[3] -sub v0.4s, v8.4s, v17.4s -mla v21.4S, v3.4S, v31.s[0] -add v8.4s, v8.4s, v17.4s -sqrdmulh v17.4S, v1.4S, v25.s[0] -mul v1.4S, v1.4S,v26.s[0] -sub v3.4s, v13.4s, v21.4s -mla v19.4S, v10.4S, v31.s[0] -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v15.4S, v25.s[1] -mul v15.4S, v15.4S,v26.s[1] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v17.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v17.4s, v9.4s, v1.4s -mla v15.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -sub v21.4s, v18.4s, v15.4s -mla v16.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -str q9, [x0, #0] -sqrdmulh v9.4S, v8.4S, v23.s[0] -str q17, [x0, #64] -mul v8.4S, v8.4S,v24.s[0] -ldr q17, [x0, #784] -sub v15.4s, v22.4s, v16.4s -ldr q19, [x0, #848] -mla v12.4S, v1.4S, v31.s[0] -add v22.4s, v22.4s, v16.4s -str q18, [x0, #128] -sqrdmulh v18.4S, v0.4S, v23.s[1] -str q21, [x0, #192] -mul v0.4S, v0.4S,v24.s[1] -ldr q21, [x0, #912] -sub v16.4s, v2.4s, v12.4s -ldr q1, [x0, #976] -mla v8.4S, v9.4S, v31.s[0] -add v2.4s, v2.4s, v12.4s -str q22, [x0, #256] -sqrdmulh v22.4S, v14.4S, v23.s[2] -str q15, [x0, #320] -mul v14.4S, v14.4S,v24.s[2] -ldr q15, [x0, #272] -sub v12.4s, v11.4s, v8.4s -ldr q9, [x0, #336] -mla v0.4S, v18.4S, v31.s[0] -add v11.4s, v11.4s, v8.4s -str q2, [x0, #384] -sqrdmulh v2.4S, v10.4S, v23.s[3] -str q16, [x0, #448] -mul v10.4S, v10.4S,v24.s[3] -ldr q16, [x0, #400] -sub v8.4s, v20.4s, v0.4s -ldr q18, [x0, #464] -mla v14.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v0.4s -str q11, [x0, #512] -sqrdmulh v11.4S, v17.4S, v29.s[0] -str q12, [x0, #576] -ldr q12, [x0, #528] -mul v17.4S, v17.4S,v30.s[0] -ldr q0, [x0, #592] -sub v22.4s, v13.4s, v14.4s -mla v10.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v14.4s -str q20, [x0, #640] -sqrdmulh v20.4S, v19.4S, v29.s[0] -str q8, [x0, #704] -ldr q8, [x0, #656] -mul v19.4S, v19.4S,v30.s[0] -ldr q14, [x0, #720] -sub v2.4s, v3.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v3.4s, v3.4s, v10.4s -str q13, [x0, #768] -sqrdmulh v13.4S, v21.4S, v29.s[0] -str q22, [x0, #832] -mul v21.4S, v21.4S,v30.s[0] -ldr q22, [x0, #16] -sub v10.4s, v15.4s, v17.4s -mla v19.4S, v20.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -str q3, [x0, #896] -sqrdmulh v3.4S, v1.4S, v29.s[0] -str q2, [x0, #960] -mul v1.4S, v1.4S,v30.s[0] -ldr q2, [x0, #80] -sub v17.4s, v9.4s, v19.4s -mla v21.4S, v13.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v12.4S, v29.s[0] -ldr q13, [x0, #144] -mul v12.4S, v12.4S,v30.s[0] -sub v20.4s, v16.4s, v21.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v0.4S, v29.s[0] -ldr q3, [x0, #208] -mul v0.4S, v0.4S,v30.s[0] -sub v11.4s, v18.4s, v1.4s -mla v12.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v1.4s -sqrdmulh v1.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v19.4s, v22.4s, v12.4s -mla v0.4S, v21.4S, v31.s[0] -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v2.4s, v0.4s -mla v8.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v1.4s, v13.4s, v8.4s -mla v14.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v12.4s, v3.4s, v14.4s -mla v16.4S, v0.4S, v31.s[0] -add v3.4s, v3.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -sub v0.4s, v13.4s, v16.4s -mla v18.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -sqrdmulh v16.4S, v9.4S, v29.s[1] -mul v9.4S, v9.4S,v30.s[1] -sub v8.4s, v3.4s, v18.4s -mla v15.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v14.4s, v22.4s, v15.4s -mla v9.4S, v16.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v16.4s, v2.4s, v9.4s -mla v20.4S, v18.4S, v31.s[0] -add v2.4s, v2.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v20.4s -mla v11.4S, v15.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v15.4s, v12.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v12.4s, v12.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v27.s[0] -mul v13.4S, v13.4S,v28.s[0] -sub v9.4s, v19.4s, v10.4s -mla v17.4S, v20.4S, v31.s[0] -add v19.4s, v19.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v27.s[0] -mul v3.4S, v3.4S,v28.s[0] -sub v20.4s, v21.4s, v17.4s -mla v13.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v11.4s, v22.4s, v13.4s -mla v3.4S, v10.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -sub v10.4s, v2.4s, v3.4s -mla v0.4S, v17.4S, v31.s[0] -add v2.4s, v2.4s, v3.4s -sqrdmulh v3.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v17.4s, v14.4s, v0.4s -mla v8.4S, v13.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[2] -mul v12.4S, v12.4S,v28.s[2] -sub v13.4s, v16.4s, v8.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v3.4s, v19.4s, v1.4s -mla v12.4S, v0.4S, v31.s[0] -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v0.4s, v21.4s, v12.4s -mla v18.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v10.4S, v25.s[1] -mul v10.4S, v10.4S,v26.s[1] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v12.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v12.4s, v22.4s, v2.4s -mla v10.4S, v18.4S, v31.s[0] -add v22.4s, v22.4s, v2.4s -sqrdmulh v2.4S, v13.4S, v25.s[3] -mul v13.4S, v13.4S,v26.s[3] -sub v18.4s, v11.4s, v10.4s -mla v16.4S, v15.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -str q22, [x0, #16] -sqrdmulh v22.4S, v21.4S, v23.s[0] -str q12, [x0, #80] -mul v21.4S, v21.4S,v24.s[0] -sub v12.4s, v14.4s, v16.4s -mla v13.4S, v2.4S, v31.s[0] -add v14.4s, v14.4s, v16.4s -str q11, [x0, #144] -sqrdmulh v11.4S, v0.4S, v23.s[1] -str q18, [x0, #208] -mul v0.4S, v0.4S,v24.s[1] -sub v18.4s, v17.4s, v13.4s -mla v21.4S, v22.4S, v31.s[0] -add v17.4s, v17.4s, v13.4s -str q14, [x0, #272] -sqrdmulh v14.4S, v20.4S, v23.s[2] -str q12, [x0, #336] -mul v20.4S, v20.4S,v24.s[2] -sub v12.4s, v19.4s, v21.4s -mla v0.4S, v11.4S, v31.s[0] -add v19.4s, v19.4s, v21.4s -str q17, [x0, #400] -sqrdmulh v17.4S, v1.4S, v23.s[3] -str q18, [x0, #464] -mul v1.4S, v1.4S,v24.s[3] -sub v18.4s, v3.4s, v0.4s -mla v20.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v0.4s -str q19, [x0, #528] -str q12, [x0, #592] -sub v12.4s, v9.4s, v20.4s -mla v1.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v20.4s -str q3, [x0, #656] -str q18, [x0, #720] -sub v18.4s, v8.4s, v1.4s -add v8.4s, v8.4s, v1.4s -str q9, [x0, #784] -str q12, [x0, #848] -str q8, [x0, #912] -str q18, [x0, #976] -ldr q4, [x17, #+128] -ldr q5, [x17, #+144] -ldr q6, [x17, #+160] -ldr q7, [x17, #+176] -ldr q15, [x17, #+192] -ldr q10, [x17, #+208] -ldr q2, [x17, #+224] -ldr q16, [x17, #+240] -ldr q22, [x0, #32] -ldr q13, [x0, #48] -ldr q11, [x0, #0] -ldr q21, [x0, #16] -sqrdmulh v14.4S, v22.4S, v5.s[0] -mul v22.4S, v22.4S,v4.s[0] -mla v22.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v13.4S, v5.s[0] -mul v13.4S, v13.4S,v4.s[0] -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v21.4S, v5.s[1] -mul v21.4S, v21.4S,v4.s[1] -mla v21.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v21.4s -add v11.4s, v11.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v5.s[2] -mul v22.4S, v22.4S,v4.s[2] -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v22.4s -add v14.4s, v14.4s, v22.4s -trn1 v22.4S, v11.4S, v13.4S -trn2 v0.4S, v11.4S, v13.4S -trn1 v19.4S, v14.4S, v21.4S -trn2 v17.4S, v14.4S, v21.4S -trn2 v14.2D, v22.2D, v19.2D -trn2 v21.2D, v0.2D, v17.2D -trn1 v11.2D, v22.2D, v19.2D -trn1 v13.2D, v0.2D, v17.2D -sqrdmulh v17.4S, v14.4S, v7.4S -mul v14.4S, v14.4S,v6.4S -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v7.4S -mul v21.4S, v21.4S,v6.4S -mla v21.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v21.4s -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v13.4S, v10.4S -mul v13.4S, v13.4S,v15.4S -mla v13.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v14.4S, v16.4S -mul v14.4S, v14.4S,v2.4S -mla v14.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -str q11, [x0, #0] -str q21, [x0, #16] -str q17, [x0, #32] -str q13, [x0, #48] -ldr q16, [x17, #+256] -ldr q2, [x17, #+272] -ldr q10, [x17, #+288] -ldr q15, [x17, #+304] -ldr q7, [x17, #+320] -ldr q6, [x17, #+336] -ldr q5, [x17, #+352] -ldr q4, [x17, #+368] -ldr q13, [x0, #96] -ldr q17, [x0, #112] -ldr q21, [x0, #64] -ldr q11, [x0, #80] -sqrdmulh v14.4S, v13.4S, v2.s[0] -mul v13.4S, v13.4S,v16.s[0] -mla v13.4S, v14.4S, v31.s[0] -sub v14.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v17.4S, v2.s[0] -mul v17.4S, v17.4S,v16.s[0] -mla v17.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v17.4s -add v11.4s, v11.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v2.s[1] -mul v11.4S, v11.4S,v16.s[1] -mla v11.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v2.s[2] -mul v13.4S, v13.4S,v16.s[2] -mla v13.4S, v11.4S, v31.s[0] -sub v11.4s, v14.4s, v13.4s -add v14.4s, v14.4s, v13.4s -trn1 v13.4S, v21.4S, v17.4S -trn2 v0.4S, v21.4S, v17.4S -trn1 v19.4S, v14.4S, v11.4S -trn2 v22.4S, v14.4S, v11.4S -trn2 v14.2D, v13.2D, v19.2D -trn2 v11.2D, v0.2D, v22.2D -trn1 v21.2D, v13.2D, v19.2D -trn1 v17.2D, v0.2D, v22.2D -sqrdmulh v22.4S, v14.4S, v15.4S -mul v14.4S, v14.4S,v10.4S -mla v14.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v15.4S -mul v11.4S, v11.4S,v10.4S -mla v11.4S, v14.4S, v31.s[0] -sub v14.4s, v17.4s, v11.4s -add v17.4s, v17.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v6.4S -mul v17.4S, v17.4S,v7.4S -mla v17.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v17.4s -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v14.4S, v4.4S -mul v14.4S, v14.4S,v5.4S -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v14.4s -add v22.4s, v22.4s, v14.4s -str q21, [x0, #64] -str q11, [x0, #80] -str q22, [x0, #96] -str q17, [x0, #112] -ldr q4, [x17, #+384] -ldr q5, [x17, #+400] -ldr q6, [x17, #+416] -ldr q7, [x17, #+432] -ldr q15, [x17, #+448] -ldr q10, [x17, #+464] -ldr q2, [x17, #+480] -ldr q16, [x17, #+496] -ldr q17, [x0, #160] -ldr q22, [x0, #176] -ldr q11, [x0, #128] -ldr q21, [x0, #144] -sqrdmulh v14.4S, v17.4S, v5.s[0] -mul v17.4S, v17.4S,v4.s[0] -mla v17.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v17.4s -add v11.4s, v11.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v5.s[0] -mul v22.4S, v22.4S,v4.s[0] -mla v22.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v5.s[1] -mul v21.4S, v21.4S,v4.s[1] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v21.4s -add v11.4s, v11.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v5.s[2] -mul v17.4S, v17.4S,v4.s[2] -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v17.4s -add v14.4s, v14.4s, v17.4s -trn1 v17.4S, v11.4S, v22.4S -trn2 v0.4S, v11.4S, v22.4S -trn1 v19.4S, v14.4S, v21.4S -trn2 v13.4S, v14.4S, v21.4S -trn2 v14.2D, v17.2D, v19.2D -trn2 v21.2D, v0.2D, v13.2D -trn1 v11.2D, v17.2D, v19.2D -trn1 v22.2D, v0.2D, v13.2D -sqrdmulh v13.4S, v14.4S, v7.4S -mul v14.4S, v14.4S,v6.4S -mla v14.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v7.4S -mul v21.4S, v21.4S,v6.4S -mla v21.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v21.4s -add v22.4s, v22.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v10.4S -mul v22.4S, v22.4S,v15.4S -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v14.4S, v16.4S -mul v14.4S, v14.4S,v2.4S -mla v14.4S, v22.4S, v31.s[0] -sub v22.4s, v13.4s, v14.4s -add v13.4s, v13.4s, v14.4s -str q11, [x0, #128] -str q21, [x0, #144] -str q13, [x0, #160] -str q22, [x0, #176] -ldr q16, [x17, #+512] -ldr q2, [x17, #+528] -ldr q10, [x17, #+544] -ldr q15, [x17, #+560] -ldr q7, [x17, #+576] -ldr q6, [x17, #+592] -ldr q5, [x17, #+608] -ldr q4, [x17, #+624] -ldr q22, [x0, #224] -ldr q13, [x0, #240] -ldr q21, [x0, #192] -ldr q11, [x0, #208] -sqrdmulh v14.4S, v22.4S, v2.s[0] -mul v22.4S, v22.4S,v16.s[0] -mla v22.4S, v14.4S, v31.s[0] -sub v14.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v13.4S, v2.s[0] -mul v13.4S, v13.4S,v16.s[0] -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v11.4S, v2.s[1] -mul v11.4S, v11.4S,v16.s[1] -mla v11.4S, v13.4S, v31.s[0] -sub v13.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v2.s[2] -mul v22.4S, v22.4S,v16.s[2] -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v14.4s, v22.4s -add v14.4s, v14.4s, v22.4s -trn1 v22.4S, v21.4S, v13.4S -trn2 v0.4S, v21.4S, v13.4S -trn1 v19.4S, v14.4S, v11.4S -trn2 v17.4S, v14.4S, v11.4S -trn2 v14.2D, v22.2D, v19.2D -trn2 v11.2D, v0.2D, v17.2D -trn1 v21.2D, v22.2D, v19.2D -trn1 v13.2D, v0.2D, v17.2D -sqrdmulh v17.4S, v14.4S, v15.4S -mul v14.4S, v14.4S,v10.4S -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v15.4S -mul v11.4S, v11.4S,v10.4S -mla v11.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v11.4s -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v6.4S -mul v13.4S, v13.4S,v7.4S -mla v13.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v14.4S, v4.4S -mul v14.4S, v14.4S,v5.4S -mla v14.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -str q21, [x0, #192] -str q11, [x0, #208] -str q17, [x0, #224] -str q13, [x0, #240] -ldr q4, [x17, #+640] -ldr q5, [x17, #+656] -ldr q6, [x17, #+672] -ldr q7, [x17, #+688] -ldr q15, [x17, #+704] -ldr q10, [x17, #+720] -ldr q2, [x17, #+736] -ldr q16, [x17, #+752] -ldr q13, [x0, #288] -ldr q17, [x0, #304] -ldr q11, [x0, #256] -ldr q21, [x0, #272] -sqrdmulh v14.4S, v13.4S, v5.s[0] -mul v13.4S, v13.4S,v4.s[0] -mla v13.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v17.4S, v5.s[0] -mul v17.4S, v17.4S,v4.s[0] -mla v17.4S, v13.4S, v31.s[0] -sub v13.4s, v21.4s, v17.4s -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v5.s[1] -mul v21.4S, v21.4S,v4.s[1] -mla v21.4S, v17.4S, v31.s[0] -sub v17.4s, v11.4s, v21.4s -add v11.4s, v11.4s, v21.4s -sqrdmulh v21.4S, v13.4S, v5.s[2] -mul v13.4S, v13.4S,v4.s[2] -mla v13.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v13.4s -add v14.4s, v14.4s, v13.4s -trn1 v13.4S, v11.4S, v17.4S -trn2 v0.4S, v11.4S, v17.4S -trn1 v19.4S, v14.4S, v21.4S -trn2 v22.4S, v14.4S, v21.4S -trn2 v14.2D, v13.2D, v19.2D -trn2 v21.2D, v0.2D, v22.2D -trn1 v11.2D, v13.2D, v19.2D -trn1 v17.2D, v0.2D, v22.2D -sqrdmulh v22.4S, v14.4S, v7.4S -mul v14.4S, v14.4S,v6.4S -mla v14.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v7.4S -mul v21.4S, v21.4S,v6.4S -mla v21.4S, v14.4S, v31.s[0] -sub v14.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v10.4S -mul v17.4S, v17.4S,v15.4S -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v17.4s -add v11.4s, v11.4s, v17.4s -sqrdmulh v17.4S, v14.4S, v16.4S -mul v14.4S, v14.4S,v2.4S -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v14.4s -add v22.4s, v22.4s, v14.4s -str q11, [x0, #256] -str q21, [x0, #272] -str q22, [x0, #288] -str q17, [x0, #304] -ldr q16, [x17, #+768] -ldr q2, [x17, #+784] -ldr q10, [x17, #+800] -ldr q15, [x17, #+816] -ldr q7, [x17, #+832] -ldr q6, [x17, #+848] -ldr q5, [x17, #+864] -ldr q4, [x17, #+880] -ldr q17, [x0, #352] -ldr q22, [x0, #368] -ldr q21, [x0, #320] -ldr q11, [x0, #336] -sqrdmulh v14.4S, v17.4S, v2.s[0] -mul v17.4S, v17.4S,v16.s[0] -mla v17.4S, v14.4S, v31.s[0] -sub v14.4s, v21.4s, v17.4s -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v2.s[0] -mul v22.4S, v22.4S,v16.s[0] -mla v22.4S, v17.4S, v31.s[0] -sub v17.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v11.4S, v2.s[1] -mul v11.4S, v11.4S,v16.s[1] -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v2.s[2] -mul v17.4S, v17.4S,v16.s[2] -mla v17.4S, v11.4S, v31.s[0] -sub v11.4s, v14.4s, v17.4s -add v14.4s, v14.4s, v17.4s -trn1 v17.4S, v21.4S, v22.4S -trn2 v0.4S, v21.4S, v22.4S -trn1 v19.4S, v14.4S, v11.4S -trn2 v13.4S, v14.4S, v11.4S -trn2 v14.2D, v17.2D, v19.2D -trn2 v11.2D, v0.2D, v13.2D -trn1 v21.2D, v17.2D, v19.2D -trn1 v22.2D, v0.2D, v13.2D -sqrdmulh v13.4S, v14.4S, v15.4S -mul v14.4S, v14.4S,v10.4S -mla v14.4S, v13.4S, v31.s[0] -sub v13.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v15.4S -mul v11.4S, v11.4S,v10.4S -mla v11.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v11.4s -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v6.4S -mul v22.4S, v22.4S,v7.4S -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v14.4S, v4.4S -mul v14.4S, v14.4S,v5.4S -mla v14.4S, v22.4S, v31.s[0] -sub v22.4s, v13.4s, v14.4s -add v13.4s, v13.4s, v14.4s -str q21, [x0, #320] -str q11, [x0, #336] -str q13, [x0, #352] -str q22, [x0, #368] -ldr q4, [x17, #+896] -ldr q5, [x17, #+912] -ldr q6, [x17, #+928] -ldr q7, [x17, #+944] -ldr q15, [x17, #+960] -ldr q10, [x17, #+976] -ldr q2, [x17, #+992] -ldr q16, [x17, #+1008] -ldr q22, [x0, #416] -ldr q13, [x0, #432] -ldr q11, [x0, #384] -ldr q21, [x0, #400] -sqrdmulh v14.4S, v22.4S, v5.s[0] -mul v22.4S, v22.4S,v4.s[0] -mla v22.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v13.4S, v5.s[0] -mul v13.4S, v13.4S,v4.s[0] -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v21.4S, v5.s[1] -mul v21.4S, v21.4S,v4.s[1] -mla v21.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v21.4s -add v11.4s, v11.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v5.s[2] -mul v22.4S, v22.4S,v4.s[2] -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v22.4s -add v14.4s, v14.4s, v22.4s -trn1 v22.4S, v11.4S, v13.4S -trn2 v0.4S, v11.4S, v13.4S -trn1 v19.4S, v14.4S, v21.4S -trn2 v17.4S, v14.4S, v21.4S -trn2 v14.2D, v22.2D, v19.2D -trn2 v21.2D, v0.2D, v17.2D -trn1 v11.2D, v22.2D, v19.2D -trn1 v13.2D, v0.2D, v17.2D -sqrdmulh v17.4S, v14.4S, v7.4S -mul v14.4S, v14.4S,v6.4S -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v7.4S -mul v21.4S, v21.4S,v6.4S -mla v21.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v21.4s -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v13.4S, v10.4S -mul v13.4S, v13.4S,v15.4S -mla v13.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v14.4S, v16.4S -mul v14.4S, v14.4S,v2.4S -mla v14.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -str q11, [x0, #384] -str q21, [x0, #400] -str q17, [x0, #416] -str q13, [x0, #432] -ldr q16, [x17, #+1024] -ldr q2, [x17, #+1040] -ldr q10, [x17, #+1056] -ldr q15, [x17, #+1072] -ldr q7, [x17, #+1088] -ldr q6, [x17, #+1104] -ldr q5, [x17, #+1120] -ldr q4, [x17, #+1136] -ldr q13, [x0, #480] -ldr q17, [x0, #496] -ldr q21, [x0, #448] -ldr q11, [x0, #464] -sqrdmulh v14.4S, v13.4S, v2.s[0] -mul v13.4S, v13.4S,v16.s[0] -mla v13.4S, v14.4S, v31.s[0] -sub v14.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v17.4S, v2.s[0] -mul v17.4S, v17.4S,v16.s[0] -mla v17.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v17.4s -add v11.4s, v11.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v2.s[1] -mul v11.4S, v11.4S,v16.s[1] -mla v11.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v2.s[2] -mul v13.4S, v13.4S,v16.s[2] -mla v13.4S, v11.4S, v31.s[0] -sub v11.4s, v14.4s, v13.4s -add v14.4s, v14.4s, v13.4s -trn1 v13.4S, v21.4S, v17.4S -trn2 v0.4S, v21.4S, v17.4S -trn1 v19.4S, v14.4S, v11.4S -trn2 v22.4S, v14.4S, v11.4S -trn2 v14.2D, v13.2D, v19.2D -trn2 v11.2D, v0.2D, v22.2D -trn1 v21.2D, v13.2D, v19.2D -trn1 v17.2D, v0.2D, v22.2D -sqrdmulh v22.4S, v14.4S, v15.4S -mul v14.4S, v14.4S,v10.4S -mla v14.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v15.4S -mul v11.4S, v11.4S,v10.4S -mla v11.4S, v14.4S, v31.s[0] -sub v14.4s, v17.4s, v11.4s -add v17.4s, v17.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v6.4S -mul v17.4S, v17.4S,v7.4S -mla v17.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v17.4s -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v14.4S, v4.4S -mul v14.4S, v14.4S,v5.4S -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v14.4s -add v22.4s, v22.4s, v14.4s -str q21, [x0, #448] -str q11, [x0, #464] -str q22, [x0, #480] -str q17, [x0, #496] -ldr q4, [x17, #+1152] -ldr q5, [x17, #+1168] -ldr q6, [x17, #+1184] -ldr q7, [x17, #+1200] -ldr q15, [x17, #+1216] -ldr q10, [x17, #+1232] -ldr q2, [x17, #+1248] -ldr q16, [x17, #+1264] -ldr q17, [x0, #544] -ldr q22, [x0, #560] -ldr q11, [x0, #512] -ldr q21, [x0, #528] -sqrdmulh v14.4S, v17.4S, v5.s[0] -mul v17.4S, v17.4S,v4.s[0] -mla v17.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v17.4s -add v11.4s, v11.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v5.s[0] -mul v22.4S, v22.4S,v4.s[0] -mla v22.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v5.s[1] -mul v21.4S, v21.4S,v4.s[1] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v21.4s -add v11.4s, v11.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v5.s[2] -mul v17.4S, v17.4S,v4.s[2] -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v17.4s -add v14.4s, v14.4s, v17.4s -trn1 v17.4S, v11.4S, v22.4S -trn2 v0.4S, v11.4S, v22.4S -trn1 v19.4S, v14.4S, v21.4S -trn2 v13.4S, v14.4S, v21.4S -trn2 v14.2D, v17.2D, v19.2D -trn2 v21.2D, v0.2D, v13.2D -trn1 v11.2D, v17.2D, v19.2D -trn1 v22.2D, v0.2D, v13.2D -sqrdmulh v13.4S, v14.4S, v7.4S -mul v14.4S, v14.4S,v6.4S -mla v14.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v7.4S -mul v21.4S, v21.4S,v6.4S -mla v21.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v21.4s -add v22.4s, v22.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v10.4S -mul v22.4S, v22.4S,v15.4S -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v14.4S, v16.4S -mul v14.4S, v14.4S,v2.4S -mla v14.4S, v22.4S, v31.s[0] -sub v22.4s, v13.4s, v14.4s -add v13.4s, v13.4s, v14.4s -str q11, [x0, #512] -str q21, [x0, #528] -str q13, [x0, #544] -str q22, [x0, #560] -ldr q16, [x17, #+1280] -ldr q2, [x17, #+1296] -ldr q10, [x17, #+1312] -ldr q15, [x17, #+1328] -ldr q7, [x17, #+1344] -ldr q6, [x17, #+1360] -ldr q5, [x17, #+1376] -ldr q4, [x17, #+1392] -ldr q22, [x0, #608] -ldr q13, [x0, #624] -ldr q21, [x0, #576] -ldr q11, [x0, #592] -sqrdmulh v14.4S, v22.4S, v2.s[0] -mul v22.4S, v22.4S,v16.s[0] -mla v22.4S, v14.4S, v31.s[0] -sub v14.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v13.4S, v2.s[0] -mul v13.4S, v13.4S,v16.s[0] -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v11.4S, v2.s[1] -mul v11.4S, v11.4S,v16.s[1] -mla v11.4S, v13.4S, v31.s[0] -sub v13.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v2.s[2] -mul v22.4S, v22.4S,v16.s[2] -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v14.4s, v22.4s -add v14.4s, v14.4s, v22.4s -trn1 v22.4S, v21.4S, v13.4S -trn2 v0.4S, v21.4S, v13.4S -trn1 v19.4S, v14.4S, v11.4S -trn2 v17.4S, v14.4S, v11.4S -trn2 v14.2D, v22.2D, v19.2D -trn2 v11.2D, v0.2D, v17.2D -trn1 v21.2D, v22.2D, v19.2D -trn1 v13.2D, v0.2D, v17.2D -sqrdmulh v17.4S, v14.4S, v15.4S -mul v14.4S, v14.4S,v10.4S -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v15.4S -mul v11.4S, v11.4S,v10.4S -mla v11.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v11.4s -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v6.4S -mul v13.4S, v13.4S,v7.4S -mla v13.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v14.4S, v4.4S -mul v14.4S, v14.4S,v5.4S -mla v14.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -str q21, [x0, #576] -str q11, [x0, #592] -str q17, [x0, #608] -str q13, [x0, #624] -ldr q4, [x17, #+1408] -ldr q5, [x17, #+1424] -ldr q6, [x17, #+1440] -ldr q7, [x17, #+1456] -ldr q15, [x17, #+1472] -ldr q10, [x17, #+1488] -ldr q2, [x17, #+1504] -ldr q16, [x17, #+1520] -ldr q13, [x0, #672] -ldr q17, [x0, #688] -ldr q11, [x0, #640] -ldr q21, [x0, #656] -sqrdmulh v14.4S, v13.4S, v5.s[0] -mul v13.4S, v13.4S,v4.s[0] -mla v13.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v17.4S, v5.s[0] -mul v17.4S, v17.4S,v4.s[0] -mla v17.4S, v13.4S, v31.s[0] -sub v13.4s, v21.4s, v17.4s -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v5.s[1] -mul v21.4S, v21.4S,v4.s[1] -mla v21.4S, v17.4S, v31.s[0] -sub v17.4s, v11.4s, v21.4s -add v11.4s, v11.4s, v21.4s -sqrdmulh v21.4S, v13.4S, v5.s[2] -mul v13.4S, v13.4S,v4.s[2] -mla v13.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v13.4s -add v14.4s, v14.4s, v13.4s -trn1 v13.4S, v11.4S, v17.4S -trn2 v0.4S, v11.4S, v17.4S -trn1 v19.4S, v14.4S, v21.4S -trn2 v22.4S, v14.4S, v21.4S -trn2 v14.2D, v13.2D, v19.2D -trn2 v21.2D, v0.2D, v22.2D -trn1 v11.2D, v13.2D, v19.2D -trn1 v17.2D, v0.2D, v22.2D -sqrdmulh v22.4S, v14.4S, v7.4S -mul v14.4S, v14.4S,v6.4S -mla v14.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v7.4S -mul v21.4S, v21.4S,v6.4S -mla v21.4S, v14.4S, v31.s[0] -sub v14.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v10.4S -mul v17.4S, v17.4S,v15.4S -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v17.4s -add v11.4s, v11.4s, v17.4s -sqrdmulh v17.4S, v14.4S, v16.4S -mul v14.4S, v14.4S,v2.4S -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v14.4s -add v22.4s, v22.4s, v14.4s -str q11, [x0, #640] -str q21, [x0, #656] -str q22, [x0, #672] -str q17, [x0, #688] -ldr q16, [x17, #+1536] -ldr q2, [x17, #+1552] -ldr q10, [x17, #+1568] -ldr q15, [x17, #+1584] -ldr q7, [x17, #+1600] -ldr q6, [x17, #+1616] -ldr q5, [x17, #+1632] -ldr q4, [x17, #+1648] -ldr q17, [x0, #736] -ldr q22, [x0, #752] -ldr q21, [x0, #704] -ldr q11, [x0, #720] -sqrdmulh v14.4S, v17.4S, v2.s[0] -mul v17.4S, v17.4S,v16.s[0] -mla v17.4S, v14.4S, v31.s[0] -sub v14.4s, v21.4s, v17.4s -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v2.s[0] -mul v22.4S, v22.4S,v16.s[0] -mla v22.4S, v17.4S, v31.s[0] -sub v17.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v11.4S, v2.s[1] -mul v11.4S, v11.4S,v16.s[1] -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v2.s[2] -mul v17.4S, v17.4S,v16.s[2] -mla v17.4S, v11.4S, v31.s[0] -sub v11.4s, v14.4s, v17.4s -add v14.4s, v14.4s, v17.4s -trn1 v17.4S, v21.4S, v22.4S -trn2 v0.4S, v21.4S, v22.4S -trn1 v19.4S, v14.4S, v11.4S -trn2 v13.4S, v14.4S, v11.4S -trn2 v14.2D, v17.2D, v19.2D -trn2 v11.2D, v0.2D, v13.2D -trn1 v21.2D, v17.2D, v19.2D -trn1 v22.2D, v0.2D, v13.2D -sqrdmulh v13.4S, v14.4S, v15.4S -mul v14.4S, v14.4S,v10.4S -mla v14.4S, v13.4S, v31.s[0] -sub v13.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v15.4S -mul v11.4S, v11.4S,v10.4S -mla v11.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v11.4s -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v6.4S -mul v22.4S, v22.4S,v7.4S -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v14.4S, v4.4S -mul v14.4S, v14.4S,v5.4S -mla v14.4S, v22.4S, v31.s[0] -sub v22.4s, v13.4s, v14.4s -add v13.4s, v13.4s, v14.4s -str q21, [x0, #704] -str q11, [x0, #720] -str q13, [x0, #736] -str q22, [x0, #752] -ldr q4, [x17, #+1664] -ldr q5, [x17, #+1680] -ldr q6, [x17, #+1696] -ldr q7, [x17, #+1712] -ldr q15, [x17, #+1728] -ldr q10, [x17, #+1744] -ldr q2, [x17, #+1760] -ldr q16, [x17, #+1776] -ldr q22, [x0, #800] -ldr q13, [x0, #816] -ldr q11, [x0, #768] -ldr q21, [x0, #784] -sqrdmulh v14.4S, v22.4S, v5.s[0] -mul v22.4S, v22.4S,v4.s[0] -mla v22.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v13.4S, v5.s[0] -mul v13.4S, v13.4S,v4.s[0] -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v21.4S, v5.s[1] -mul v21.4S, v21.4S,v4.s[1] -mla v21.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v21.4s -add v11.4s, v11.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v5.s[2] -mul v22.4S, v22.4S,v4.s[2] -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v22.4s -add v14.4s, v14.4s, v22.4s -trn1 v22.4S, v11.4S, v13.4S -trn2 v0.4S, v11.4S, v13.4S -trn1 v19.4S, v14.4S, v21.4S -trn2 v17.4S, v14.4S, v21.4S -trn2 v14.2D, v22.2D, v19.2D -trn2 v21.2D, v0.2D, v17.2D -trn1 v11.2D, v22.2D, v19.2D -trn1 v13.2D, v0.2D, v17.2D -sqrdmulh v17.4S, v14.4S, v7.4S -mul v14.4S, v14.4S,v6.4S -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v7.4S -mul v21.4S, v21.4S,v6.4S -mla v21.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v21.4s -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v13.4S, v10.4S -mul v13.4S, v13.4S,v15.4S -mla v13.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v14.4S, v16.4S -mul v14.4S, v14.4S,v2.4S -mla v14.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -str q11, [x0, #768] -str q21, [x0, #784] -str q17, [x0, #800] -str q13, [x0, #816] -ldr q16, [x17, #+1792] -ldr q2, [x17, #+1808] -ldr q10, [x17, #+1824] -ldr q15, [x17, #+1840] -ldr q7, [x17, #+1856] -ldr q6, [x17, #+1872] -ldr q5, [x17, #+1888] -ldr q4, [x17, #+1904] -ldr q13, [x0, #864] -ldr q17, [x0, #880] -ldr q21, [x0, #832] -ldr q11, [x0, #848] -sqrdmulh v14.4S, v13.4S, v2.s[0] -mul v13.4S, v13.4S,v16.s[0] -mla v13.4S, v14.4S, v31.s[0] -sub v14.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v17.4S, v2.s[0] -mul v17.4S, v17.4S,v16.s[0] -mla v17.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v17.4s -add v11.4s, v11.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v2.s[1] -mul v11.4S, v11.4S,v16.s[1] -mla v11.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v2.s[2] -mul v13.4S, v13.4S,v16.s[2] -mla v13.4S, v11.4S, v31.s[0] -sub v11.4s, v14.4s, v13.4s -add v14.4s, v14.4s, v13.4s -trn1 v13.4S, v21.4S, v17.4S -trn2 v0.4S, v21.4S, v17.4S -trn1 v19.4S, v14.4S, v11.4S -trn2 v22.4S, v14.4S, v11.4S -trn2 v14.2D, v13.2D, v19.2D -trn2 v11.2D, v0.2D, v22.2D -trn1 v21.2D, v13.2D, v19.2D -trn1 v17.2D, v0.2D, v22.2D -sqrdmulh v22.4S, v14.4S, v15.4S -mul v14.4S, v14.4S,v10.4S -mla v14.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v15.4S -mul v11.4S, v11.4S,v10.4S -mla v11.4S, v14.4S, v31.s[0] -sub v14.4s, v17.4s, v11.4s -add v17.4s, v17.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v6.4S -mul v17.4S, v17.4S,v7.4S -mla v17.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v17.4s -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v14.4S, v4.4S -mul v14.4S, v14.4S,v5.4S -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v14.4s -add v22.4s, v22.4s, v14.4s -str q21, [x0, #832] -str q11, [x0, #848] -str q22, [x0, #864] -str q17, [x0, #880] -ldr q4, [x17, #+1920] -ldr q5, [x17, #+1936] -ldr q6, [x17, #+1952] -ldr q7, [x17, #+1968] -ldr q15, [x17, #+1984] -ldr q10, [x17, #+2000] -ldr q2, [x17, #+2016] -ldr q16, [x17, #+2032] -ldr q17, [x0, #928] -ldr q22, [x0, #944] -ldr q11, [x0, #896] -ldr q21, [x0, #912] -sqrdmulh v14.4S, v17.4S, v5.s[0] -mul v17.4S, v17.4S,v4.s[0] -mla v17.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v17.4s -add v11.4s, v11.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v5.s[0] -mul v22.4S, v22.4S,v4.s[0] -mla v22.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v5.s[1] -mul v21.4S, v21.4S,v4.s[1] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v21.4s -add v11.4s, v11.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v5.s[2] -mul v17.4S, v17.4S,v4.s[2] -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v17.4s -add v14.4s, v14.4s, v17.4s -trn1 v17.4S, v11.4S, v22.4S -trn2 v0.4S, v11.4S, v22.4S -trn1 v19.4S, v14.4S, v21.4S -trn2 v13.4S, v14.4S, v21.4S -trn2 v14.2D, v17.2D, v19.2D -trn2 v21.2D, v0.2D, v13.2D -trn1 v11.2D, v17.2D, v19.2D -trn1 v22.2D, v0.2D, v13.2D -sqrdmulh v13.4S, v14.4S, v7.4S -mul v14.4S, v14.4S,v6.4S -mla v14.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v7.4S -mul v21.4S, v21.4S,v6.4S -mla v21.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v21.4s -add v22.4s, v22.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v10.4S -mul v22.4S, v22.4S,v15.4S -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v14.4S, v16.4S -mul v14.4S, v14.4S,v2.4S -mla v14.4S, v22.4S, v31.s[0] -sub v22.4s, v13.4s, v14.4s -add v13.4s, v13.4s, v14.4s -str q11, [x0, #896] -str q21, [x0, #912] -str q13, [x0, #928] -str q22, [x0, #944] -ldr q16, [x17, #+2048] -ldr q2, [x17, #+2064] -ldr q10, [x17, #+2080] -ldr q15, [x17, #+2096] -ldr q7, [x17, #+2112] -ldr q6, [x17, #+2128] -ldr q5, [x17, #+2144] -ldr q4, [x17, #+2160] -ldr q22, [x0, #992] -ldr q13, [x0, #1008] -ldr q21, [x0, #960] -ldr q11, [x0, #976] -sqrdmulh v14.4S, v22.4S, v2.s[0] -mul v22.4S, v22.4S,v16.s[0] -mla v22.4S, v14.4S, v31.s[0] -sub v14.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v13.4S, v2.s[0] -mul v13.4S, v13.4S,v16.s[0] -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v11.4S, v2.s[1] -mul v11.4S, v11.4S,v16.s[1] -mla v11.4S, v13.4S, v31.s[0] -sub v13.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v2.s[2] -mul v22.4S, v22.4S,v16.s[2] -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v14.4s, v22.4s -add v14.4s, v14.4s, v22.4s -trn1 v22.4S, v21.4S, v13.4S -trn2 v0.4S, v21.4S, v13.4S -trn1 v19.4S, v14.4S, v11.4S -trn2 v17.4S, v14.4S, v11.4S -trn2 v14.2D, v22.2D, v19.2D -trn2 v11.2D, v0.2D, v17.2D -trn1 v21.2D, v22.2D, v19.2D -trn1 v13.2D, v0.2D, v17.2D -sqrdmulh v17.4S, v14.4S, v15.4S -mul v14.4S, v14.4S,v10.4S -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v15.4S -mul v11.4S, v11.4S,v10.4S -mla v11.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v11.4s -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v6.4S -mul v13.4S, v13.4S,v7.4S -mla v13.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v14.4S, v4.4S -mul v14.4S, v14.4S,v5.4S -mla v14.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -str q21, [x0, #960] -str q11, [x0, #976] -str q17, [x0, #992] -str q13, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2392 -// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_1.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_1.s deleted file mode 100644 index 02d26b2..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_1.s +++ /dev/null @@ -1,2422 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_3_z4_1 -.global _ntt_u32_full_neon_asm_var_4_4_3_z4_1 -ntt_u32_full_neon_asm_var_4_4_3_z4_1: -_ntt_u32_full_neon_asm_var_4_4_3_z4_1: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #800] -ldr q21, [x0, #864] -ldr q20, [x0, #928] -ldr q19, [x0, #992] -ldr q18, [x0, #288] -ldr q17, [x0, #352] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -sqrdmulh v2.4S, v22.4S, v29.s[0] -ldr q1, [x0, #544] -mul v22.4S, v22.4S,v30.s[0] -ldr q0, [x0, #608] -sqrdmulh v15.4S, v21.4S, v29.s[0] -ldr q14, [x0, #672] -mul v21.4S, v21.4S,v30.s[0] -ldr q13, [x0, #736] -mla v22.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q12, [x0, #32] -sub v11.4s, v18.4s, v22.4s -mla v21.4S, v15.4S, v31.s[0] -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q15, [x0, #96] -sub v10.4s, v17.4s, v21.4s -mla v20.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v1.4S, v29.s[0] -ldr q2, [x0, #160] -mul v1.4S, v1.4S,v30.s[0] -sub v9.4s, v16.4s, v20.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v0.4S, v29.s[0] -ldr q22, [x0, #224] -mul v0.4S, v0.4S,v30.s[0] -sub v8.4s, v3.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v12.4s, v1.4s -mla v0.4S, v20.4S, v31.s[0] -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -sub v20.4s, v15.4s, v0.4s -mla v14.4S, v19.4S, v31.s[0] -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v19.4s, v2.4s, v14.4s -mla v13.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v1.4s, v22.4s, v13.4s -mla v16.4S, v0.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v0.4s, v2.4s, v16.4s -mla v3.4S, v14.4S, v31.s[0] -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v14.4s, v22.4s, v3.4s -mla v18.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v13.4s, v12.4s, v18.4s -mla v17.4S, v16.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v8.4S, v29.s[2] -mul v8.4S, v8.4S,v30.s[2] -sub v16.4s, v15.4s, v17.4s -mla v9.4S, v3.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v3.4s, v19.4s, v9.4s -mla v8.4S, v18.4S, v31.s[0] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v8.4s -mla v11.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v8.4s -sqrdmulh v8.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v17.4s, v21.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v27.s[0] -mul v22.4S, v22.4S,v28.s[0] -sub v9.4s, v20.4s, v10.4s -mla v2.4S, v8.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v8.4s, v12.4s, v2.4s -mla v22.4S, v11.4S, v31.s[0] -add v12.4s, v12.4s, v2.4s -sqrdmulh v2.4S, v14.4S, v27.s[1] -mul v14.4S, v14.4S,v28.s[1] -sub v11.4s, v15.4s, v22.4s -mla v0.4S, v10.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v27.s[2] -mul v19.4S, v19.4S,v28.s[2] -sub v10.4s, v13.4s, v0.4s -mla v14.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v2.4s, v16.4s, v14.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v27.s[3] -mul v3.4S, v3.4S,v28.s[3] -sub v22.4s, v21.4s, v19.4s -mla v1.4S, v0.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v0.4s, v20.4s, v1.4s -mla v3.4S, v14.4S, v31.s[0] -add v20.4s, v20.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v25.s[0] -mul v15.4S, v15.4S,v26.s[0] -sub v14.4s, v17.4s, v3.4s -mla v18.4S, v19.4S, v31.s[0] -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v11.4S, v25.s[1] -mul v11.4S, v11.4S,v26.s[1] -sub v19.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v1.4s, v12.4s, v15.4s -mla v11.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v25.s[3] -mul v2.4S, v2.4S,v26.s[3] -sub v3.4s, v8.4s, v11.4s -mla v16.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v11.4s -str q12, [x0, #32] -sqrdmulh v12.4S, v20.4S, v23.s[0] -str q1, [x0, #96] -mul v20.4S, v20.4S,v24.s[0] -ldr q1, [x0, #816] -sub v11.4s, v13.4s, v16.4s -ldr q18, [x0, #880] -mla v2.4S, v15.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -str q8, [x0, #160] -sqrdmulh v8.4S, v0.4S, v23.s[1] -str q3, [x0, #224] -mul v0.4S, v0.4S,v24.s[1] -ldr q3, [x0, #944] -sub v16.4s, v10.4s, v2.4s -ldr q15, [x0, #1008] -mla v20.4S, v12.4S, v31.s[0] -add v10.4s, v10.4s, v2.4s -str q13, [x0, #288] -sqrdmulh v13.4S, v9.4S, v23.s[2] -str q11, [x0, #352] -mul v9.4S, v9.4S,v24.s[2] -ldr q11, [x0, #304] -sub v2.4s, v21.4s, v20.4s -ldr q12, [x0, #368] -mla v0.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v20.4s -str q10, [x0, #416] -sqrdmulh v10.4S, v19.4S, v23.s[3] -str q16, [x0, #480] -mul v19.4S, v19.4S,v24.s[3] -ldr q16, [x0, #432] -sub v20.4s, v22.4s, v0.4s -ldr q8, [x0, #496] -mla v9.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -str q21, [x0, #544] -sqrdmulh v21.4S, v1.4S, v29.s[0] -str q2, [x0, #608] -ldr q2, [x0, #560] -mul v1.4S, v1.4S,v30.s[0] -ldr q0, [x0, #624] -sub v13.4s, v17.4s, v9.4s -mla v19.4S, v10.4S, v31.s[0] -add v17.4s, v17.4s, v9.4s -str q22, [x0, #672] -sqrdmulh v22.4S, v18.4S, v29.s[0] -str q20, [x0, #736] -ldr q20, [x0, #688] -mul v18.4S, v18.4S,v30.s[0] -ldr q9, [x0, #752] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -str q17, [x0, #800] -sqrdmulh v17.4S, v3.4S, v29.s[0] -str q13, [x0, #864] -mul v3.4S, v3.4S,v30.s[0] -ldr q13, [x0, #48] -sub v19.4s, v11.4s, v1.4s -mla v18.4S, v22.4S, v31.s[0] -add v11.4s, v11.4s, v1.4s -str q14, [x0, #928] -sqrdmulh v14.4S, v15.4S, v29.s[0] -str q10, [x0, #992] -mul v15.4S, v15.4S,v30.s[0] -ldr q10, [x0, #112] -sub v1.4s, v12.4s, v18.4s -mla v3.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v29.s[0] -ldr q17, [x0, #176] -mul v2.4S, v2.4S,v30.s[0] -sub v22.4s, v16.4s, v3.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v0.4S, v29.s[0] -ldr q14, [x0, #240] -mul v0.4S, v0.4S,v30.s[0] -sub v21.4s, v8.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -sub v18.4s, v13.4s, v2.4s -mla v0.4S, v3.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v9.4S, v29.s[0] -mul v9.4S, v9.4S,v30.s[0] -sub v3.4s, v10.4s, v0.4s -mla v20.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v15.4s, v17.4s, v20.4s -mla v9.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sub v2.4s, v14.4s, v9.4s -mla v16.4S, v0.4S, v31.s[0] -add v14.4s, v14.4s, v9.4s -sqrdmulh v9.4S, v11.4S, v29.s[1] -mul v11.4S, v11.4S,v30.s[1] -sub v0.4s, v17.4s, v16.4s -mla v8.4S, v20.4S, v31.s[0] -add v17.4s, v17.4s, v16.4s -sqrdmulh v16.4S, v12.4S, v29.s[1] -mul v12.4S, v12.4S,v30.s[1] -sub v20.4s, v14.4s, v8.4s -mla v11.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v8.4s -sqrdmulh v8.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v9.4s, v13.4s, v11.4s -mla v12.4S, v16.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v16.4s, v10.4s, v12.4s -mla v22.4S, v8.4S, v31.s[0] -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v8.4s, v15.4s, v22.4s -mla v21.4S, v11.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -sub v11.4s, v2.4s, v21.4s -mla v19.4S, v12.4S, v31.s[0] -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[0] -mul v17.4S, v17.4S,v28.s[0] -sub v12.4s, v18.4s, v19.4s -mla v1.4S, v22.4S, v31.s[0] -add v18.4s, v18.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v22.4s, v3.4s, v1.4s -mla v17.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v21.4s, v13.4s, v17.4s -mla v14.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v20.4S, v27.s[1] -mul v20.4S, v20.4S,v28.s[1] -sub v19.4s, v10.4s, v14.4s -mla v0.4S, v1.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v27.s[2] -mul v15.4S, v15.4S,v28.s[2] -sub v1.4s, v9.4s, v0.4s -mla v20.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v0.4s -sqrdmulh v0.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v17.4s, v16.4s, v20.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v14.4s, v18.4s, v15.4s -mla v2.4S, v0.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v27.s[3] -mul v11.4S, v11.4S,v28.s[3] -sub v0.4s, v3.4s, v2.4s -mla v8.4S, v20.4S, v31.s[0] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v10.4S, v25.s[0] -mul v10.4S, v10.4S,v26.s[0] -sub v20.4s, v12.4s, v8.4s -mla v11.4S, v15.4S, v31.s[0] -add v12.4s, v12.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v15.4s, v22.4s, v11.4s -mla v10.4S, v2.4S, v31.s[0] -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v2.4s, v13.4s, v10.4s -mla v19.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v10.4s -sqrdmulh v10.4S, v17.4S, v25.s[3] -mul v17.4S, v17.4S,v26.s[3] -sub v8.4s, v21.4s, v19.4s -mla v16.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -str q13, [x0, #48] -sqrdmulh v13.4S, v3.4S, v23.s[0] -str q2, [x0, #112] -mul v3.4S, v3.4S,v24.s[0] -ldr q2, [x0, #768] -sub v19.4s, v9.4s, v16.4s -ldr q11, [x0, #832] -mla v17.4S, v10.4S, v31.s[0] -add v9.4s, v9.4s, v16.4s -str q21, [x0, #176] -sqrdmulh v21.4S, v0.4S, v23.s[1] -str q8, [x0, #240] -mul v0.4S, v0.4S,v24.s[1] -ldr q8, [x0, #896] -sub v16.4s, v1.4s, v17.4s -ldr q10, [x0, #960] -mla v3.4S, v13.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -str q9, [x0, #304] -sqrdmulh v9.4S, v22.4S, v23.s[2] -str q19, [x0, #368] -mul v22.4S, v22.4S,v24.s[2] -ldr q19, [x0, #256] -sub v17.4s, v18.4s, v3.4s -ldr q13, [x0, #320] -mla v0.4S, v21.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -str q1, [x0, #432] -sqrdmulh v1.4S, v15.4S, v23.s[3] -str q16, [x0, #496] -mul v15.4S, v15.4S,v24.s[3] -ldr q16, [x0, #384] -sub v3.4s, v14.4s, v0.4s -ldr q21, [x0, #448] -mla v22.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -str q18, [x0, #560] -sqrdmulh v18.4S, v2.4S, v29.s[0] -str q17, [x0, #624] -ldr q17, [x0, #512] -mul v2.4S, v2.4S,v30.s[0] -ldr q0, [x0, #576] -sub v9.4s, v12.4s, v22.4s -mla v15.4S, v1.4S, v31.s[0] -add v12.4s, v12.4s, v22.4s -str q14, [x0, #688] -sqrdmulh v14.4S, v11.4S, v29.s[0] -str q3, [x0, #752] -ldr q3, [x0, #640] -mul v11.4S, v11.4S,v30.s[0] -ldr q22, [x0, #704] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -str q12, [x0, #816] -sqrdmulh v12.4S, v8.4S, v29.s[0] -str q9, [x0, #880] -mul v8.4S, v8.4S,v30.s[0] -ldr q9, [x0, #0] -sub v15.4s, v19.4s, v2.4s -mla v11.4S, v14.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -str q20, [x0, #944] -sqrdmulh v20.4S, v10.4S, v29.s[0] -str q1, [x0, #1008] -mul v10.4S, v10.4S,v30.s[0] -ldr q1, [x0, #64] -sub v2.4s, v13.4s, v11.4s -mla v8.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v29.s[0] -ldr q12, [x0, #128] -mul v17.4S, v17.4S,v30.s[0] -sub v14.4s, v16.4s, v8.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v0.4S, v29.s[0] -ldr q20, [x0, #192] -mul v0.4S, v0.4S,v30.s[0] -sub v18.4s, v21.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -sub v11.4s, v9.4s, v17.4s -mla v0.4S, v8.4S, v31.s[0] -add v9.4s, v9.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v8.4s, v1.4s, v0.4s -mla v3.4S, v10.4S, v31.s[0] -add v1.4s, v1.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v10.4s, v12.4s, v3.4s -mla v22.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v17.4s, v20.4s, v22.4s -mla v16.4S, v0.4S, v31.s[0] -add v20.4s, v20.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -sub v0.4s, v12.4s, v16.4s -mla v21.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v3.4s, v20.4s, v21.4s -mla v19.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v22.4s, v9.4s, v19.4s -mla v13.4S, v16.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v29.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v16.4s, v1.4s, v13.4s -mla v14.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v13.4s -sqrdmulh v13.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v21.4s, v10.4s, v14.4s -mla v18.4S, v19.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v19.4s, v17.4s, v18.4s -mla v15.4S, v13.4S, v31.s[0] -add v17.4s, v17.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v13.4s, v11.4s, v15.4s -mla v2.4S, v14.4S, v31.s[0] -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v27.s[0] -mul v20.4S, v20.4S,v28.s[0] -sub v14.4s, v8.4s, v2.4s -mla v12.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v2.4s -sqrdmulh v2.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v18.4s, v9.4s, v12.4s -mla v20.4S, v15.4S, v31.s[0] -add v9.4s, v9.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -sub v15.4s, v1.4s, v20.4s -mla v0.4S, v2.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -sub v2.4s, v22.4s, v0.4s -mla v3.4S, v12.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v12.4s, v16.4s, v3.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v11.4s, v10.4s -mla v17.4S, v0.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -sqrdmulh v10.4S, v19.4S, v27.s[3] -mul v19.4S, v19.4S,v28.s[3] -sub v0.4s, v8.4s, v17.4s -mla v21.4S, v3.4S, v31.s[0] -add v8.4s, v8.4s, v17.4s -sqrdmulh v17.4S, v1.4S, v25.s[0] -mul v1.4S, v1.4S,v26.s[0] -sub v3.4s, v13.4s, v21.4s -mla v19.4S, v10.4S, v31.s[0] -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v15.4S, v25.s[1] -mul v15.4S, v15.4S,v26.s[1] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v17.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v17.4s, v9.4s, v1.4s -mla v15.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -sub v21.4s, v18.4s, v15.4s -mla v16.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -str q9, [x0, #0] -sqrdmulh v9.4S, v8.4S, v23.s[0] -str q17, [x0, #64] -mul v8.4S, v8.4S,v24.s[0] -ldr q17, [x0, #784] -sub v15.4s, v22.4s, v16.4s -ldr q19, [x0, #848] -mla v12.4S, v1.4S, v31.s[0] -add v22.4s, v22.4s, v16.4s -str q18, [x0, #128] -sqrdmulh v18.4S, v0.4S, v23.s[1] -str q21, [x0, #192] -mul v0.4S, v0.4S,v24.s[1] -ldr q21, [x0, #912] -sub v16.4s, v2.4s, v12.4s -ldr q1, [x0, #976] -mla v8.4S, v9.4S, v31.s[0] -add v2.4s, v2.4s, v12.4s -str q22, [x0, #256] -sqrdmulh v22.4S, v14.4S, v23.s[2] -str q15, [x0, #320] -mul v14.4S, v14.4S,v24.s[2] -ldr q15, [x0, #272] -sub v12.4s, v11.4s, v8.4s -ldr q9, [x0, #336] -mla v0.4S, v18.4S, v31.s[0] -add v11.4s, v11.4s, v8.4s -str q2, [x0, #384] -sqrdmulh v2.4S, v10.4S, v23.s[3] -str q16, [x0, #448] -mul v10.4S, v10.4S,v24.s[3] -ldr q16, [x0, #400] -sub v8.4s, v20.4s, v0.4s -ldr q18, [x0, #464] -mla v14.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v0.4s -str q11, [x0, #512] -sqrdmulh v11.4S, v17.4S, v29.s[0] -str q12, [x0, #576] -ldr q12, [x0, #528] -mul v17.4S, v17.4S,v30.s[0] -ldr q0, [x0, #592] -sub v22.4s, v13.4s, v14.4s -mla v10.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v14.4s -str q20, [x0, #640] -sqrdmulh v20.4S, v19.4S, v29.s[0] -str q8, [x0, #704] -ldr q8, [x0, #656] -mul v19.4S, v19.4S,v30.s[0] -ldr q14, [x0, #720] -sub v2.4s, v3.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v3.4s, v3.4s, v10.4s -str q13, [x0, #768] -sqrdmulh v13.4S, v21.4S, v29.s[0] -str q22, [x0, #832] -mul v21.4S, v21.4S,v30.s[0] -ldr q22, [x0, #16] -sub v10.4s, v15.4s, v17.4s -mla v19.4S, v20.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -str q3, [x0, #896] -sqrdmulh v3.4S, v1.4S, v29.s[0] -str q2, [x0, #960] -mul v1.4S, v1.4S,v30.s[0] -ldr q2, [x0, #80] -sub v17.4s, v9.4s, v19.4s -mla v21.4S, v13.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v12.4S, v29.s[0] -ldr q13, [x0, #144] -mul v12.4S, v12.4S,v30.s[0] -sub v20.4s, v16.4s, v21.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v0.4S, v29.s[0] -ldr q3, [x0, #208] -mul v0.4S, v0.4S,v30.s[0] -sub v11.4s, v18.4s, v1.4s -mla v12.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v1.4s -sqrdmulh v1.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v19.4s, v22.4s, v12.4s -mla v0.4S, v21.4S, v31.s[0] -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v2.4s, v0.4s -mla v8.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v1.4s, v13.4s, v8.4s -mla v14.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v12.4s, v3.4s, v14.4s -mla v16.4S, v0.4S, v31.s[0] -add v3.4s, v3.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -sub v0.4s, v13.4s, v16.4s -mla v18.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -sqrdmulh v16.4S, v9.4S, v29.s[1] -mul v9.4S, v9.4S,v30.s[1] -sub v8.4s, v3.4s, v18.4s -mla v15.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v14.4s, v22.4s, v15.4s -mla v9.4S, v16.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v16.4s, v2.4s, v9.4s -mla v20.4S, v18.4S, v31.s[0] -add v2.4s, v2.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v20.4s -mla v11.4S, v15.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v15.4s, v12.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v12.4s, v12.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v27.s[0] -mul v13.4S, v13.4S,v28.s[0] -sub v9.4s, v19.4s, v10.4s -mla v17.4S, v20.4S, v31.s[0] -add v19.4s, v19.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v27.s[0] -mul v3.4S, v3.4S,v28.s[0] -sub v20.4s, v21.4s, v17.4s -mla v13.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v11.4s, v22.4s, v13.4s -mla v3.4S, v10.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -sub v10.4s, v2.4s, v3.4s -mla v0.4S, v17.4S, v31.s[0] -add v2.4s, v2.4s, v3.4s -sqrdmulh v3.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v17.4s, v14.4s, v0.4s -mla v8.4S, v13.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[2] -mul v12.4S, v12.4S,v28.s[2] -sub v13.4s, v16.4s, v8.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v3.4s, v19.4s, v1.4s -mla v12.4S, v0.4S, v31.s[0] -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v0.4s, v21.4s, v12.4s -mla v18.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v10.4S, v25.s[1] -mul v10.4S, v10.4S,v26.s[1] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v12.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v12.4s, v22.4s, v2.4s -mla v10.4S, v18.4S, v31.s[0] -add v22.4s, v22.4s, v2.4s -sqrdmulh v2.4S, v13.4S, v25.s[3] -mul v13.4S, v13.4S,v26.s[3] -sub v18.4s, v11.4s, v10.4s -mla v16.4S, v15.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -str q22, [x0, #16] -sqrdmulh v22.4S, v21.4S, v23.s[0] -str q12, [x0, #80] -mul v21.4S, v21.4S,v24.s[0] -sub v12.4s, v14.4s, v16.4s -mla v13.4S, v2.4S, v31.s[0] -add v14.4s, v14.4s, v16.4s -str q11, [x0, #144] -sqrdmulh v11.4S, v0.4S, v23.s[1] -str q18, [x0, #208] -mul v0.4S, v0.4S,v24.s[1] -sub v18.4s, v17.4s, v13.4s -mla v21.4S, v22.4S, v31.s[0] -add v17.4s, v17.4s, v13.4s -str q14, [x0, #272] -sqrdmulh v14.4S, v20.4S, v23.s[2] -str q12, [x0, #336] -mul v20.4S, v20.4S,v24.s[2] -sub v12.4s, v19.4s, v21.4s -mla v0.4S, v11.4S, v31.s[0] -add v19.4s, v19.4s, v21.4s -str q17, [x0, #400] -sqrdmulh v17.4S, v1.4S, v23.s[3] -str q18, [x0, #464] -mul v1.4S, v1.4S,v24.s[3] -sub v18.4s, v3.4s, v0.4s -mla v20.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v0.4s -str q19, [x0, #528] -str q12, [x0, #592] -sub v12.4s, v9.4s, v20.4s -mla v1.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v20.4s -str q3, [x0, #656] -str q18, [x0, #720] -sub v18.4s, v8.4s, v1.4s -add v8.4s, v8.4s, v1.4s -str q9, [x0, #784] -str q12, [x0, #848] -str q8, [x0, #912] -str q18, [x0, #976] -ldr q4, [x0, #32] -ldr q5, [x0, #48] -ldr q6, [x0, #0] -ldr q7, [x0, #16] -ldr q15, [x0, #96] -ldr q10, [x0, #112] -ldr q2, [x0, #64] -ldr q16, [x0, #80] -ldr q22, [x0, #160] -ldr q13, [x0, #176] -ldr q11, [x0, #128] -ldr q21, [x0, #144] -ldr q14, [x0, #224] -ldr q0, [x0, #240] -ldr q19, [x0, #192] -ldr q17, [x0, #208] -ldr q20, [x17, #+128] -ldr q3, [x17, #+144] -ldr q1, [x17, #+256] -ldr q9, [x17, #+272] -ldr q12, [x17, #+384] -ldr q8, [x17, #+400] -ldr q18, [x17, #+512] -ldr q30, [x17, #+528] -sqrdmulh v29.4S, v4.4S, v3.s[0] -mul v4.4S, v4.4S,v20.s[0] -sqrdmulh v28.4S, v5.4S, v3.s[0] -mul v5.4S, v5.4S,v20.s[0] -mla v4.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v15.4S, v9.s[0] -mul v15.4S, v15.4S,v1.s[0] -mla v5.4S, v28.4S, v31.s[0] -sub v28.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -sqrdmulh v4.4S, v10.4S, v9.s[0] -mul v10.4S, v10.4S,v1.s[0] -mla v15.4S, v29.4S, v31.s[0] -sub v29.4s, v7.4s, v5.4s -add v7.4s, v7.4s, v5.4s -sqrdmulh v5.4S, v7.4S, v3.s[1] -mul v7.4S, v7.4S,v20.s[1] -mla v10.4S, v4.4S, v31.s[0] -sub v4.4s, v2.4s, v15.4s -add v2.4s, v2.4s, v15.4s -sqrdmulh v15.4S, v29.4S, v3.s[2] -mul v29.4S, v29.4S,v20.s[2] -mla v7.4S, v5.4S, v31.s[0] -sub v5.4s, v16.4s, v10.4s -add v16.4s, v16.4s, v10.4s -sqrdmulh v10.4S, v16.4S, v9.s[1] -mul v16.4S, v16.4S,v1.s[1] -mla v29.4S, v15.4S, v31.s[0] -sub v15.4s, v6.4s, v7.4s -add v6.4s, v6.4s, v7.4s -sqrdmulh v3.4S, v5.4S, v9.s[2] -mul v5.4S, v5.4S,v1.s[2] -mla v16.4S, v10.4S, v31.s[0] -sub v10.4s, v28.4s, v29.4s -add v28.4s, v28.4s, v29.4s -sqrdmulh v29.4S, v22.4S, v8.s[0] -mul v22.4S, v22.4S,v12.s[0] -mla v5.4S, v3.4S, v31.s[0] -sub v3.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sqrdmulh v9.4S, v13.4S, v8.s[0] -mul v13.4S, v13.4S,v12.s[0] -mla v22.4S, v29.4S, v31.s[0] -sub v29.4s, v4.4s, v5.4s -add v4.4s, v4.4s, v5.4s -sqrdmulh v5.4S, v14.4S, v30.s[0] -mul v14.4S, v14.4S,v18.s[0] -mla v13.4S, v9.4S, v31.s[0] -sub v9.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v0.4S, v30.s[0] -mul v0.4S, v0.4S,v18.s[0] -mla v14.4S, v5.4S, v31.s[0] -sub v5.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v21.4S, v8.s[1] -mul v21.4S, v21.4S,v12.s[1] -mla v0.4S, v22.4S, v31.s[0] -sub v22.4s, v19.4s, v14.4s -add v19.4s, v19.4s, v14.4s -sqrdmulh v14.4S, v5.4S, v8.s[2] -mul v5.4S, v5.4S,v12.s[2] -mla v21.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v0.4s -add v17.4s, v17.4s, v0.4s -sqrdmulh v0.4S, v17.4S, v30.s[1] -mul v17.4S, v17.4S,v18.s[1] -mla v5.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v21.4s -add v11.4s, v11.4s, v21.4s -sqrdmulh v8.4S, v13.4S, v30.s[2] -mul v13.4S, v13.4S,v18.s[2] -mla v17.4S, v0.4S, v31.s[0] -sub v0.4s, v9.4s, v5.4s -add v9.4s, v9.4s, v5.4s -mla v13.4S, v8.4S, v31.s[0] -sub v8.4s, v19.4s, v17.4s -add v19.4s, v19.4s, v17.4s -sub v30.4s, v22.4s, v13.4s -add v22.4s, v22.4s, v13.4s -trn1 v13.4S, v6.4S, v15.4S -trn2 v18.4S, v6.4S, v15.4S -trn1 v17.4S, v28.4S, v10.4S -trn2 v5.4S, v28.4S, v10.4S -trn2 v28.2D, v13.2D, v17.2D -trn2 v10.2D, v18.2D, v5.2D -trn1 v6.2D, v13.2D, v17.2D -trn1 v15.2D, v18.2D, v5.2D -trn1 v5.4S, v2.4S, v3.4S -trn2 v18.4S, v2.4S, v3.4S -trn1 v17.4S, v4.4S, v29.4S -trn2 v13.4S, v4.4S, v29.4S -trn2 v4.2D, v5.2D, v17.2D -trn2 v29.2D, v18.2D, v13.2D -trn1 v2.2D, v5.2D, v17.2D -trn1 v3.2D, v18.2D, v13.2D -trn1 v13.4S, v11.4S, v14.4S -trn2 v18.4S, v11.4S, v14.4S -trn1 v17.4S, v9.4S, v0.4S -trn2 v5.4S, v9.4S, v0.4S -trn2 v9.2D, v13.2D, v17.2D -trn2 v0.2D, v18.2D, v5.2D -trn1 v11.2D, v13.2D, v17.2D -trn1 v14.2D, v18.2D, v5.2D -trn1 v5.4S, v19.4S, v8.4S -trn2 v18.4S, v19.4S, v8.4S -trn1 v17.4S, v22.4S, v30.4S -trn2 v13.4S, v22.4S, v30.4S -trn2 v22.2D, v5.2D, v17.2D -trn2 v30.2D, v18.2D, v13.2D -trn1 v19.2D, v5.2D, v17.2D -trn1 v8.2D, v18.2D, v13.2D -ldr q13, [x17, #+160] -ldr q18, [x17, #+176] -sqrdmulh v17.4S, v28.4S, v18.4S -mul v28.4S, v28.4S,v13.4S -sqrdmulh v5.4S, v10.4S, v18.4S -mul v10.4S, v10.4S,v13.4S -mla v28.4S, v17.4S, v31.s[0] -ldr q17, [x17, #+288] -ldr q18, [x17, #+304] -sqrdmulh v13.4S, v4.4S, v18.4S -mul v4.4S, v4.4S,v17.4S -mla v10.4S, v5.4S, v31.s[0] -sub v5.4s, v6.4s, v28.4s -add v6.4s, v6.4s, v28.4s -sqrdmulh v28.4S, v29.4S, v18.4S -mul v29.4S, v29.4S,v17.4S -mla v4.4S, v13.4S, v31.s[0] -sub v13.4s, v15.4s, v10.4s -add v15.4s, v15.4s, v10.4s -ldr q10, [x17, #+192] -ldr q18, [x17, #+208] -sqrdmulh v17.4S, v15.4S, v18.4S -mul v15.4S, v15.4S,v10.4S -mla v29.4S, v28.4S, v31.s[0] -sub v28.4s, v2.4s, v4.4s -add v2.4s, v2.4s, v4.4s -ldr q4, [x17, #+224] -ldr q18, [x17, #+240] -sqrdmulh v10.4S, v13.4S, v18.4S -mul v13.4S, v13.4S,v4.4S -mla v15.4S, v17.4S, v31.s[0] -sub v17.4s, v3.4s, v29.4s -add v3.4s, v3.4s, v29.4s -ldr q29, [x17, #+320] -ldr q18, [x17, #+336] -sqrdmulh v4.4S, v3.4S, v18.4S -mul v3.4S, v3.4S,v29.4S -mla v13.4S, v10.4S, v31.s[0] -sub v10.4s, v6.4s, v15.4s -add v6.4s, v6.4s, v15.4s -ldr q15, [x17, #+352] -ldr q18, [x17, #+368] -sqrdmulh v29.4S, v17.4S, v18.4S -mul v17.4S, v17.4S,v15.4S -mla v3.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v13.4s -add v5.4s, v5.4s, v13.4s -mla v17.4S, v29.4S, v31.s[0] -sub v29.4s, v2.4s, v3.4s -add v2.4s, v2.4s, v3.4s -sub v3.4s, v28.4s, v17.4s -add v28.4s, v28.4s, v17.4s -str q6, [x0, #0] -str q10, [x0, #16] -str q5, [x0, #32] -str q4, [x0, #48] -str q2, [x0, #64] -str q29, [x0, #80] -str q28, [x0, #96] -str q3, [x0, #112] -ldr q3, [x17, #+416] -ldr q28, [x17, #+432] -sqrdmulh v29.4S, v9.4S, v28.4S -mul v9.4S, v9.4S,v3.4S -sqrdmulh v2.4S, v0.4S, v28.4S -mul v0.4S, v0.4S,v3.4S -mla v9.4S, v29.4S, v31.s[0] -ldr q29, [x17, #+544] -ldr q28, [x17, #+560] -sqrdmulh v3.4S, v22.4S, v28.4S -mul v22.4S, v22.4S,v29.4S -mla v0.4S, v2.4S, v31.s[0] -sub v2.4s, v11.4s, v9.4s -add v11.4s, v11.4s, v9.4s -sqrdmulh v9.4S, v30.4S, v28.4S -mul v30.4S, v30.4S,v29.4S -mla v22.4S, v3.4S, v31.s[0] -sub v3.4s, v14.4s, v0.4s -add v14.4s, v14.4s, v0.4s -ldr q0, [x17, #+448] -ldr q28, [x17, #+464] -sqrdmulh v29.4S, v14.4S, v28.4S -mul v14.4S, v14.4S,v0.4S -mla v30.4S, v9.4S, v31.s[0] -sub v9.4s, v19.4s, v22.4s -add v19.4s, v19.4s, v22.4s -ldr q22, [x17, #+480] -ldr q28, [x17, #+496] -sqrdmulh v0.4S, v3.4S, v28.4S -mul v3.4S, v3.4S,v22.4S -mla v14.4S, v29.4S, v31.s[0] -sub v29.4s, v8.4s, v30.4s -add v8.4s, v8.4s, v30.4s -ldr q30, [x17, #+576] -ldr q28, [x17, #+592] -sqrdmulh v22.4S, v8.4S, v28.4S -mul v8.4S, v8.4S,v30.4S -mla v3.4S, v0.4S, v31.s[0] -sub v0.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -ldr q14, [x17, #+608] -ldr q28, [x17, #+624] -sqrdmulh v30.4S, v29.4S, v28.4S -mul v29.4S, v29.4S,v14.4S -mla v8.4S, v22.4S, v31.s[0] -sub v22.4s, v2.4s, v3.4s -add v2.4s, v2.4s, v3.4s -mla v29.4S, v30.4S, v31.s[0] -sub v30.4s, v19.4s, v8.4s -add v19.4s, v19.4s, v8.4s -sub v8.4s, v9.4s, v29.4s -add v9.4s, v9.4s, v29.4s -str q11, [x0, #128] -str q0, [x0, #144] -str q2, [x0, #160] -str q22, [x0, #176] -str q19, [x0, #192] -str q30, [x0, #208] -str q9, [x0, #224] -str q8, [x0, #240] -ldr q8, [x0, #288] -ldr q9, [x0, #304] -ldr q30, [x0, #256] -ldr q19, [x0, #272] -ldr q22, [x0, #352] -ldr q2, [x0, #368] -ldr q0, [x0, #320] -ldr q11, [x0, #336] -ldr q29, [x0, #416] -ldr q3, [x0, #432] -ldr q28, [x0, #384] -ldr q14, [x0, #400] -ldr q4, [x0, #480] -ldr q5, [x0, #496] -ldr q10, [x0, #448] -ldr q6, [x0, #464] -ldr q17, [x17, #+640] -ldr q13, [x17, #+656] -ldr q18, [x17, #+768] -ldr q15, [x17, #+784] -ldr q12, [x17, #+896] -ldr q21, [x17, #+912] -ldr q1, [x17, #+1024] -ldr q16, [x17, #+1040] -sqrdmulh v20.4S, v8.4S, v13.s[0] -mul v8.4S, v8.4S,v17.s[0] -sqrdmulh v7.4S, v9.4S, v13.s[0] -mul v9.4S, v9.4S,v17.s[0] -mla v8.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v22.4S, v15.s[0] -mul v22.4S, v22.4S,v18.s[0] -mla v9.4S, v7.4S, v31.s[0] -sub v7.4s, v30.4s, v8.4s -add v30.4s, v30.4s, v8.4s -sqrdmulh v8.4S, v2.4S, v15.s[0] -mul v2.4S, v2.4S,v18.s[0] -mla v22.4S, v20.4S, v31.s[0] -sub v20.4s, v19.4s, v9.4s -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v19.4S, v13.s[1] -mul v19.4S, v19.4S,v17.s[1] -mla v2.4S, v8.4S, v31.s[0] -sub v8.4s, v0.4s, v22.4s -add v0.4s, v0.4s, v22.4s -sqrdmulh v22.4S, v20.4S, v13.s[2] -mul v20.4S, v20.4S,v17.s[2] -mla v19.4S, v9.4S, v31.s[0] -sub v9.4s, v11.4s, v2.4s -add v11.4s, v11.4s, v2.4s -sqrdmulh v2.4S, v11.4S, v15.s[1] -mul v11.4S, v11.4S,v18.s[1] -mla v20.4S, v22.4S, v31.s[0] -sub v22.4s, v30.4s, v19.4s -add v30.4s, v30.4s, v19.4s -sqrdmulh v13.4S, v9.4S, v15.s[2] -mul v9.4S, v9.4S,v18.s[2] -mla v11.4S, v2.4S, v31.s[0] -sub v2.4s, v7.4s, v20.4s -add v7.4s, v7.4s, v20.4s -sqrdmulh v20.4S, v29.4S, v21.s[0] -mul v29.4S, v29.4S,v12.s[0] -mla v9.4S, v13.4S, v31.s[0] -sub v13.4s, v0.4s, v11.4s -add v0.4s, v0.4s, v11.4s -sqrdmulh v15.4S, v3.4S, v21.s[0] -mul v3.4S, v3.4S,v12.s[0] -mla v29.4S, v20.4S, v31.s[0] -sub v20.4s, v8.4s, v9.4s -add v8.4s, v8.4s, v9.4s -sqrdmulh v9.4S, v4.4S, v16.s[0] -mul v4.4S, v4.4S,v1.s[0] -mla v3.4S, v15.4S, v31.s[0] -sub v15.4s, v28.4s, v29.4s -add v28.4s, v28.4s, v29.4s -sqrdmulh v29.4S, v5.4S, v16.s[0] -mul v5.4S, v5.4S,v1.s[0] -mla v4.4S, v9.4S, v31.s[0] -sub v9.4s, v14.4s, v3.4s -add v14.4s, v14.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v21.s[1] -mul v14.4S, v14.4S,v12.s[1] -mla v5.4S, v29.4S, v31.s[0] -sub v29.4s, v10.4s, v4.4s -add v10.4s, v10.4s, v4.4s -sqrdmulh v4.4S, v9.4S, v21.s[2] -mul v9.4S, v9.4S,v12.s[2] -mla v14.4S, v3.4S, v31.s[0] -sub v3.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v6.4S, v16.s[1] -mul v6.4S, v6.4S,v1.s[1] -mla v9.4S, v4.4S, v31.s[0] -sub v4.4s, v28.4s, v14.4s -add v28.4s, v28.4s, v14.4s -sqrdmulh v21.4S, v3.4S, v16.s[2] -mul v3.4S, v3.4S,v1.s[2] -mla v6.4S, v5.4S, v31.s[0] -sub v5.4s, v15.4s, v9.4s -add v15.4s, v15.4s, v9.4s -mla v3.4S, v21.4S, v31.s[0] -sub v21.4s, v10.4s, v6.4s -add v10.4s, v10.4s, v6.4s -sub v16.4s, v29.4s, v3.4s -add v29.4s, v29.4s, v3.4s -trn1 v3.4S, v30.4S, v22.4S -trn2 v1.4S, v30.4S, v22.4S -trn1 v6.4S, v7.4S, v2.4S -trn2 v9.4S, v7.4S, v2.4S -trn2 v7.2D, v3.2D, v6.2D -trn2 v2.2D, v1.2D, v9.2D -trn1 v30.2D, v3.2D, v6.2D -trn1 v22.2D, v1.2D, v9.2D -trn1 v9.4S, v0.4S, v13.4S -trn2 v1.4S, v0.4S, v13.4S -trn1 v6.4S, v8.4S, v20.4S -trn2 v3.4S, v8.4S, v20.4S -trn2 v8.2D, v9.2D, v6.2D -trn2 v20.2D, v1.2D, v3.2D -trn1 v0.2D, v9.2D, v6.2D -trn1 v13.2D, v1.2D, v3.2D -trn1 v3.4S, v28.4S, v4.4S -trn2 v1.4S, v28.4S, v4.4S -trn1 v6.4S, v15.4S, v5.4S -trn2 v9.4S, v15.4S, v5.4S -trn2 v15.2D, v3.2D, v6.2D -trn2 v5.2D, v1.2D, v9.2D -trn1 v28.2D, v3.2D, v6.2D -trn1 v4.2D, v1.2D, v9.2D -trn1 v9.4S, v10.4S, v21.4S -trn2 v1.4S, v10.4S, v21.4S -trn1 v6.4S, v29.4S, v16.4S -trn2 v3.4S, v29.4S, v16.4S -trn2 v29.2D, v9.2D, v6.2D -trn2 v16.2D, v1.2D, v3.2D -trn1 v10.2D, v9.2D, v6.2D -trn1 v21.2D, v1.2D, v3.2D -ldr q3, [x17, #+672] -ldr q1, [x17, #+688] -sqrdmulh v6.4S, v7.4S, v1.4S -mul v7.4S, v7.4S,v3.4S -sqrdmulh v9.4S, v2.4S, v1.4S -mul v2.4S, v2.4S,v3.4S -mla v7.4S, v6.4S, v31.s[0] -ldr q6, [x17, #+800] -ldr q1, [x17, #+816] -sqrdmulh v3.4S, v8.4S, v1.4S -mul v8.4S, v8.4S,v6.4S -mla v2.4S, v9.4S, v31.s[0] -sub v9.4s, v30.4s, v7.4s -add v30.4s, v30.4s, v7.4s -sqrdmulh v7.4S, v20.4S, v1.4S -mul v20.4S, v20.4S,v6.4S -mla v8.4S, v3.4S, v31.s[0] -sub v3.4s, v22.4s, v2.4s -add v22.4s, v22.4s, v2.4s -ldr q2, [x17, #+704] -ldr q1, [x17, #+720] -sqrdmulh v6.4S, v22.4S, v1.4S -mul v22.4S, v22.4S,v2.4S -mla v20.4S, v7.4S, v31.s[0] -sub v7.4s, v0.4s, v8.4s -add v0.4s, v0.4s, v8.4s -ldr q8, [x17, #+736] -ldr q1, [x17, #+752] -sqrdmulh v2.4S, v3.4S, v1.4S -mul v3.4S, v3.4S,v8.4S -mla v22.4S, v6.4S, v31.s[0] -sub v6.4s, v13.4s, v20.4s -add v13.4s, v13.4s, v20.4s -ldr q20, [x17, #+832] -ldr q1, [x17, #+848] -sqrdmulh v8.4S, v13.4S, v1.4S -mul v13.4S, v13.4S,v20.4S -mla v3.4S, v2.4S, v31.s[0] -sub v2.4s, v30.4s, v22.4s -add v30.4s, v30.4s, v22.4s -ldr q22, [x17, #+864] -ldr q1, [x17, #+880] -sqrdmulh v20.4S, v6.4S, v1.4S -mul v6.4S, v6.4S,v22.4S -mla v13.4S, v8.4S, v31.s[0] -sub v8.4s, v9.4s, v3.4s -add v9.4s, v9.4s, v3.4s -mla v6.4S, v20.4S, v31.s[0] -sub v20.4s, v0.4s, v13.4s -add v0.4s, v0.4s, v13.4s -sub v13.4s, v7.4s, v6.4s -add v7.4s, v7.4s, v6.4s -str q30, [x0, #256] -str q2, [x0, #272] -str q9, [x0, #288] -str q8, [x0, #304] -str q0, [x0, #320] -str q20, [x0, #336] -str q7, [x0, #352] -str q13, [x0, #368] -ldr q13, [x17, #+928] -ldr q7, [x17, #+944] -sqrdmulh v20.4S, v15.4S, v7.4S -mul v15.4S, v15.4S,v13.4S -sqrdmulh v0.4S, v5.4S, v7.4S -mul v5.4S, v5.4S,v13.4S -mla v15.4S, v20.4S, v31.s[0] -ldr q20, [x17, #+1056] -ldr q7, [x17, #+1072] -sqrdmulh v13.4S, v29.4S, v7.4S -mul v29.4S, v29.4S,v20.4S -mla v5.4S, v0.4S, v31.s[0] -sub v0.4s, v28.4s, v15.4s -add v28.4s, v28.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v7.4S -mul v16.4S, v16.4S,v20.4S -mla v29.4S, v13.4S, v31.s[0] -sub v13.4s, v4.4s, v5.4s -add v4.4s, v4.4s, v5.4s -ldr q5, [x17, #+960] -ldr q7, [x17, #+976] -sqrdmulh v20.4S, v4.4S, v7.4S -mul v4.4S, v4.4S,v5.4S -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v10.4s, v29.4s -add v10.4s, v10.4s, v29.4s -ldr q29, [x17, #+992] -ldr q7, [x17, #+1008] -sqrdmulh v5.4S, v13.4S, v7.4S -mul v13.4S, v13.4S,v29.4S -mla v4.4S, v20.4S, v31.s[0] -sub v20.4s, v21.4s, v16.4s -add v21.4s, v21.4s, v16.4s -ldr q16, [x17, #+1088] -ldr q7, [x17, #+1104] -sqrdmulh v29.4S, v21.4S, v7.4S -mul v21.4S, v21.4S,v16.4S -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v28.4s, v4.4s -add v28.4s, v28.4s, v4.4s -ldr q4, [x17, #+1120] -ldr q7, [x17, #+1136] -sqrdmulh v16.4S, v20.4S, v7.4S -mul v20.4S, v20.4S,v4.4S -mla v21.4S, v29.4S, v31.s[0] -sub v29.4s, v0.4s, v13.4s -add v0.4s, v0.4s, v13.4s -mla v20.4S, v16.4S, v31.s[0] -sub v16.4s, v10.4s, v21.4s -add v10.4s, v10.4s, v21.4s -sub v21.4s, v15.4s, v20.4s -add v15.4s, v15.4s, v20.4s -str q28, [x0, #384] -str q5, [x0, #400] -str q0, [x0, #416] -str q29, [x0, #432] -str q10, [x0, #448] -str q16, [x0, #464] -str q15, [x0, #480] -str q21, [x0, #496] -ldr q21, [x0, #544] -ldr q15, [x0, #560] -ldr q16, [x0, #512] -ldr q10, [x0, #528] -ldr q29, [x0, #608] -ldr q0, [x0, #624] -ldr q5, [x0, #576] -ldr q28, [x0, #592] -ldr q20, [x0, #672] -ldr q13, [x0, #688] -ldr q7, [x0, #640] -ldr q4, [x0, #656] -ldr q8, [x0, #736] -ldr q9, [x0, #752] -ldr q2, [x0, #704] -ldr q30, [x0, #720] -ldr q6, [x17, #+1152] -ldr q3, [x17, #+1168] -ldr q1, [x17, #+1280] -ldr q22, [x17, #+1296] -ldr q12, [x17, #+1408] -ldr q14, [x17, #+1424] -ldr q18, [x17, #+1536] -ldr q11, [x17, #+1552] -sqrdmulh v17.4S, v21.4S, v3.s[0] -mul v21.4S, v21.4S,v6.s[0] -sqrdmulh v19.4S, v15.4S, v3.s[0] -mul v15.4S, v15.4S,v6.s[0] -mla v21.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v29.4S, v22.s[0] -mul v29.4S, v29.4S,v1.s[0] -mla v15.4S, v19.4S, v31.s[0] -sub v19.4s, v16.4s, v21.4s -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v0.4S, v22.s[0] -mul v0.4S, v0.4S,v1.s[0] -mla v29.4S, v17.4S, v31.s[0] -sub v17.4s, v10.4s, v15.4s -add v10.4s, v10.4s, v15.4s -sqrdmulh v15.4S, v10.4S, v3.s[1] -mul v10.4S, v10.4S,v6.s[1] -mla v0.4S, v21.4S, v31.s[0] -sub v21.4s, v5.4s, v29.4s -add v5.4s, v5.4s, v29.4s -sqrdmulh v29.4S, v17.4S, v3.s[2] -mul v17.4S, v17.4S,v6.s[2] -mla v10.4S, v15.4S, v31.s[0] -sub v15.4s, v28.4s, v0.4s -add v28.4s, v28.4s, v0.4s -sqrdmulh v0.4S, v28.4S, v22.s[1] -mul v28.4S, v28.4S,v1.s[1] -mla v17.4S, v29.4S, v31.s[0] -sub v29.4s, v16.4s, v10.4s -add v16.4s, v16.4s, v10.4s -sqrdmulh v3.4S, v15.4S, v22.s[2] -mul v15.4S, v15.4S,v1.s[2] -mla v28.4S, v0.4S, v31.s[0] -sub v0.4s, v19.4s, v17.4s -add v19.4s, v19.4s, v17.4s -sqrdmulh v17.4S, v20.4S, v14.s[0] -mul v20.4S, v20.4S,v12.s[0] -mla v15.4S, v3.4S, v31.s[0] -sub v3.4s, v5.4s, v28.4s -add v5.4s, v5.4s, v28.4s -sqrdmulh v22.4S, v13.4S, v14.s[0] -mul v13.4S, v13.4S,v12.s[0] -mla v20.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -sqrdmulh v15.4S, v8.4S, v11.s[0] -mul v8.4S, v8.4S,v18.s[0] -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v7.4s, v20.4s -add v7.4s, v7.4s, v20.4s -sqrdmulh v20.4S, v9.4S, v11.s[0] -mul v9.4S, v9.4S,v18.s[0] -mla v8.4S, v15.4S, v31.s[0] -sub v15.4s, v4.4s, v13.4s -add v4.4s, v4.4s, v13.4s -sqrdmulh v13.4S, v4.4S, v14.s[1] -mul v4.4S, v4.4S,v12.s[1] -mla v9.4S, v20.4S, v31.s[0] -sub v20.4s, v2.4s, v8.4s -add v2.4s, v2.4s, v8.4s -sqrdmulh v8.4S, v15.4S, v14.s[2] -mul v15.4S, v15.4S,v12.s[2] -mla v4.4S, v13.4S, v31.s[0] -sub v13.4s, v30.4s, v9.4s -add v30.4s, v30.4s, v9.4s -sqrdmulh v9.4S, v30.4S, v11.s[1] -mul v30.4S, v30.4S,v18.s[1] -mla v15.4S, v8.4S, v31.s[0] -sub v8.4s, v7.4s, v4.4s -add v7.4s, v7.4s, v4.4s -sqrdmulh v14.4S, v13.4S, v11.s[2] -mul v13.4S, v13.4S,v18.s[2] -mla v30.4S, v9.4S, v31.s[0] -sub v9.4s, v22.4s, v15.4s -add v22.4s, v22.4s, v15.4s -mla v13.4S, v14.4S, v31.s[0] -sub v14.4s, v2.4s, v30.4s -add v2.4s, v2.4s, v30.4s -sub v11.4s, v20.4s, v13.4s -add v20.4s, v20.4s, v13.4s -trn1 v13.4S, v16.4S, v29.4S -trn2 v18.4S, v16.4S, v29.4S -trn1 v30.4S, v19.4S, v0.4S -trn2 v15.4S, v19.4S, v0.4S -trn2 v19.2D, v13.2D, v30.2D -trn2 v0.2D, v18.2D, v15.2D -trn1 v16.2D, v13.2D, v30.2D -trn1 v29.2D, v18.2D, v15.2D -trn1 v15.4S, v5.4S, v3.4S -trn2 v18.4S, v5.4S, v3.4S -trn1 v30.4S, v21.4S, v17.4S -trn2 v13.4S, v21.4S, v17.4S -trn2 v21.2D, v15.2D, v30.2D -trn2 v17.2D, v18.2D, v13.2D -trn1 v5.2D, v15.2D, v30.2D -trn1 v3.2D, v18.2D, v13.2D -trn1 v13.4S, v7.4S, v8.4S -trn2 v18.4S, v7.4S, v8.4S -trn1 v30.4S, v22.4S, v9.4S -trn2 v15.4S, v22.4S, v9.4S -trn2 v22.2D, v13.2D, v30.2D -trn2 v9.2D, v18.2D, v15.2D -trn1 v7.2D, v13.2D, v30.2D -trn1 v8.2D, v18.2D, v15.2D -trn1 v15.4S, v2.4S, v14.4S -trn2 v18.4S, v2.4S, v14.4S -trn1 v30.4S, v20.4S, v11.4S -trn2 v13.4S, v20.4S, v11.4S -trn2 v20.2D, v15.2D, v30.2D -trn2 v11.2D, v18.2D, v13.2D -trn1 v2.2D, v15.2D, v30.2D -trn1 v14.2D, v18.2D, v13.2D -ldr q13, [x17, #+1184] -ldr q18, [x17, #+1200] -sqrdmulh v30.4S, v19.4S, v18.4S -mul v19.4S, v19.4S,v13.4S -sqrdmulh v15.4S, v0.4S, v18.4S -mul v0.4S, v0.4S,v13.4S -mla v19.4S, v30.4S, v31.s[0] -ldr q30, [x17, #+1312] -ldr q18, [x17, #+1328] -sqrdmulh v13.4S, v21.4S, v18.4S -mul v21.4S, v21.4S,v30.4S -mla v0.4S, v15.4S, v31.s[0] -sub v15.4s, v16.4s, v19.4s -add v16.4s, v16.4s, v19.4s -sqrdmulh v19.4S, v17.4S, v18.4S -mul v17.4S, v17.4S,v30.4S -mla v21.4S, v13.4S, v31.s[0] -sub v13.4s, v29.4s, v0.4s -add v29.4s, v29.4s, v0.4s -ldr q0, [x17, #+1216] -ldr q18, [x17, #+1232] -sqrdmulh v30.4S, v29.4S, v18.4S -mul v29.4S, v29.4S,v0.4S -mla v17.4S, v19.4S, v31.s[0] -sub v19.4s, v5.4s, v21.4s -add v5.4s, v5.4s, v21.4s -ldr q21, [x17, #+1248] -ldr q18, [x17, #+1264] -sqrdmulh v0.4S, v13.4S, v18.4S -mul v13.4S, v13.4S,v21.4S -mla v29.4S, v30.4S, v31.s[0] -sub v30.4s, v3.4s, v17.4s -add v3.4s, v3.4s, v17.4s -ldr q17, [x17, #+1344] -ldr q18, [x17, #+1360] -sqrdmulh v21.4S, v3.4S, v18.4S -mul v3.4S, v3.4S,v17.4S -mla v13.4S, v0.4S, v31.s[0] -sub v0.4s, v16.4s, v29.4s -add v16.4s, v16.4s, v29.4s -ldr q29, [x17, #+1376] -ldr q18, [x17, #+1392] -sqrdmulh v17.4S, v30.4S, v18.4S -mul v30.4S, v30.4S,v29.4S -mla v3.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v13.4s -add v15.4s, v15.4s, v13.4s -mla v30.4S, v17.4S, v31.s[0] -sub v17.4s, v5.4s, v3.4s -add v5.4s, v5.4s, v3.4s -sub v3.4s, v19.4s, v30.4s -add v19.4s, v19.4s, v30.4s -str q16, [x0, #512] -str q0, [x0, #528] -str q15, [x0, #544] -str q21, [x0, #560] -str q5, [x0, #576] -str q17, [x0, #592] -str q19, [x0, #608] -str q3, [x0, #624] -ldr q3, [x17, #+1440] -ldr q19, [x17, #+1456] -sqrdmulh v17.4S, v22.4S, v19.4S -mul v22.4S, v22.4S,v3.4S -sqrdmulh v5.4S, v9.4S, v19.4S -mul v9.4S, v9.4S,v3.4S -mla v22.4S, v17.4S, v31.s[0] -ldr q17, [x17, #+1568] -ldr q19, [x17, #+1584] -sqrdmulh v3.4S, v20.4S, v19.4S -mul v20.4S, v20.4S,v17.4S -mla v9.4S, v5.4S, v31.s[0] -sub v5.4s, v7.4s, v22.4s -add v7.4s, v7.4s, v22.4s -sqrdmulh v22.4S, v11.4S, v19.4S -mul v11.4S, v11.4S,v17.4S -mla v20.4S, v3.4S, v31.s[0] -sub v3.4s, v8.4s, v9.4s -add v8.4s, v8.4s, v9.4s -ldr q9, [x17, #+1472] -ldr q19, [x17, #+1488] -sqrdmulh v17.4S, v8.4S, v19.4S -mul v8.4S, v8.4S,v9.4S -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v2.4s, v20.4s -add v2.4s, v2.4s, v20.4s -ldr q20, [x17, #+1504] -ldr q19, [x17, #+1520] -sqrdmulh v9.4S, v3.4S, v19.4S -mul v3.4S, v3.4S,v20.4S -mla v8.4S, v17.4S, v31.s[0] -sub v17.4s, v14.4s, v11.4s -add v14.4s, v14.4s, v11.4s -ldr q11, [x17, #+1600] -ldr q19, [x17, #+1616] -sqrdmulh v20.4S, v14.4S, v19.4S -mul v14.4S, v14.4S,v11.4S -mla v3.4S, v9.4S, v31.s[0] -sub v9.4s, v7.4s, v8.4s -add v7.4s, v7.4s, v8.4s -ldr q8, [x17, #+1632] -ldr q19, [x17, #+1648] -sqrdmulh v11.4S, v17.4S, v19.4S -mul v17.4S, v17.4S,v8.4S -mla v14.4S, v20.4S, v31.s[0] -sub v20.4s, v5.4s, v3.4s -add v5.4s, v5.4s, v3.4s -mla v17.4S, v11.4S, v31.s[0] -sub v11.4s, v2.4s, v14.4s -add v2.4s, v2.4s, v14.4s -sub v14.4s, v22.4s, v17.4s -add v22.4s, v22.4s, v17.4s -str q7, [x0, #640] -str q9, [x0, #656] -str q5, [x0, #672] -str q20, [x0, #688] -str q2, [x0, #704] -str q11, [x0, #720] -str q22, [x0, #736] -str q14, [x0, #752] -ldr q14, [x0, #800] -ldr q22, [x0, #816] -ldr q11, [x0, #768] -ldr q2, [x0, #784] -ldr q20, [x0, #864] -ldr q5, [x0, #880] -ldr q9, [x0, #832] -ldr q7, [x0, #848] -ldr q17, [x0, #928] -ldr q3, [x0, #944] -ldr q19, [x0, #896] -ldr q8, [x0, #912] -ldr q21, [x0, #992] -ldr q15, [x0, #1008] -ldr q0, [x0, #960] -ldr q16, [x0, #976] -ldr q30, [x17, #+1664] -ldr q13, [x17, #+1680] -ldr q18, [x17, #+1792] -ldr q29, [x17, #+1808] -ldr q12, [x17, #+1920] -ldr q4, [x17, #+1936] -ldr q1, [x17, #+2048] -ldr q28, [x17, #+2064] -sqrdmulh v6.4S, v14.4S, v13.s[0] -mul v14.4S, v14.4S,v30.s[0] -sqrdmulh v10.4S, v22.4S, v13.s[0] -mul v22.4S, v22.4S,v30.s[0] -mla v14.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v18.s[0] -mla v22.4S, v10.4S, v31.s[0] -sub v10.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -sqrdmulh v14.4S, v5.4S, v29.s[0] -mul v5.4S, v5.4S,v18.s[0] -mla v20.4S, v6.4S, v31.s[0] -sub v6.4s, v2.4s, v22.4s -add v2.4s, v2.4s, v22.4s -sqrdmulh v22.4S, v2.4S, v13.s[1] -mul v2.4S, v2.4S,v30.s[1] -mla v5.4S, v14.4S, v31.s[0] -sub v14.4s, v9.4s, v20.4s -add v9.4s, v9.4s, v20.4s -sqrdmulh v20.4S, v6.4S, v13.s[2] -mul v6.4S, v6.4S,v30.s[2] -mla v2.4S, v22.4S, v31.s[0] -sub v22.4s, v7.4s, v5.4s -add v7.4s, v7.4s, v5.4s -sqrdmulh v5.4S, v7.4S, v29.s[1] -mul v7.4S, v7.4S,v18.s[1] -mla v6.4S, v20.4S, v31.s[0] -sub v20.4s, v11.4s, v2.4s -add v11.4s, v11.4s, v2.4s -sqrdmulh v13.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v18.s[2] -mla v7.4S, v5.4S, v31.s[0] -sub v5.4s, v10.4s, v6.4s -add v10.4s, v10.4s, v6.4s -sqrdmulh v6.4S, v17.4S, v4.s[0] -mul v17.4S, v17.4S,v12.s[0] -mla v22.4S, v13.4S, v31.s[0] -sub v13.4s, v9.4s, v7.4s -add v9.4s, v9.4s, v7.4s -sqrdmulh v29.4S, v3.4S, v4.s[0] -mul v3.4S, v3.4S,v12.s[0] -mla v17.4S, v6.4S, v31.s[0] -sub v6.4s, v14.4s, v22.4s -add v14.4s, v14.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v28.s[0] -mul v21.4S, v21.4S,v1.s[0] -mla v3.4S, v29.4S, v31.s[0] -sub v29.4s, v19.4s, v17.4s -add v19.4s, v19.4s, v17.4s -sqrdmulh v17.4S, v15.4S, v28.s[0] -mul v15.4S, v15.4S,v1.s[0] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v8.4s, v3.4s -add v8.4s, v8.4s, v3.4s -sqrdmulh v3.4S, v8.4S, v4.s[1] -mul v8.4S, v8.4S,v12.s[1] -mla v15.4S, v17.4S, v31.s[0] -sub v17.4s, v0.4s, v21.4s -add v0.4s, v0.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v4.s[2] -mul v22.4S, v22.4S,v12.s[2] -mla v8.4S, v3.4S, v31.s[0] -sub v3.4s, v16.4s, v15.4s -add v16.4s, v16.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v28.s[1] -mul v16.4S, v16.4S,v1.s[1] -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v19.4s, v8.4s -add v19.4s, v19.4s, v8.4s -sqrdmulh v4.4S, v3.4S, v28.s[2] -mul v3.4S, v3.4S,v1.s[2] -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v29.4s, v22.4s -add v29.4s, v29.4s, v22.4s -mla v3.4S, v4.4S, v31.s[0] -sub v4.4s, v0.4s, v16.4s -add v0.4s, v0.4s, v16.4s -sub v28.4s, v17.4s, v3.4s -add v17.4s, v17.4s, v3.4s -trn1 v3.4S, v11.4S, v20.4S -trn2 v1.4S, v11.4S, v20.4S -trn1 v16.4S, v10.4S, v5.4S -trn2 v22.4S, v10.4S, v5.4S -trn2 v10.2D, v3.2D, v16.2D -trn2 v5.2D, v1.2D, v22.2D -trn1 v11.2D, v3.2D, v16.2D -trn1 v20.2D, v1.2D, v22.2D -trn1 v22.4S, v9.4S, v13.4S -trn2 v1.4S, v9.4S, v13.4S -trn1 v16.4S, v14.4S, v6.4S -trn2 v3.4S, v14.4S, v6.4S -trn2 v14.2D, v22.2D, v16.2D -trn2 v6.2D, v1.2D, v3.2D -trn1 v9.2D, v22.2D, v16.2D -trn1 v13.2D, v1.2D, v3.2D -trn1 v3.4S, v19.4S, v21.4S -trn2 v1.4S, v19.4S, v21.4S -trn1 v16.4S, v29.4S, v15.4S -trn2 v22.4S, v29.4S, v15.4S -trn2 v29.2D, v3.2D, v16.2D -trn2 v15.2D, v1.2D, v22.2D -trn1 v19.2D, v3.2D, v16.2D -trn1 v21.2D, v1.2D, v22.2D -trn1 v22.4S, v0.4S, v4.4S -trn2 v1.4S, v0.4S, v4.4S -trn1 v16.4S, v17.4S, v28.4S -trn2 v3.4S, v17.4S, v28.4S -trn2 v17.2D, v22.2D, v16.2D -trn2 v28.2D, v1.2D, v3.2D -trn1 v0.2D, v22.2D, v16.2D -trn1 v4.2D, v1.2D, v3.2D -ldr q3, [x17, #+1696] -ldr q1, [x17, #+1712] -sqrdmulh v16.4S, v10.4S, v1.4S -mul v10.4S, v10.4S,v3.4S -sqrdmulh v22.4S, v5.4S, v1.4S -mul v5.4S, v5.4S,v3.4S -mla v10.4S, v16.4S, v31.s[0] -ldr q16, [x17, #+1824] -ldr q1, [x17, #+1840] -sqrdmulh v3.4S, v14.4S, v1.4S -mul v14.4S, v14.4S,v16.4S -mla v5.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v10.4s -add v11.4s, v11.4s, v10.4s -sqrdmulh v10.4S, v6.4S, v1.4S -mul v6.4S, v6.4S,v16.4S -mla v14.4S, v3.4S, v31.s[0] -sub v3.4s, v20.4s, v5.4s -add v20.4s, v20.4s, v5.4s -ldr q5, [x17, #+1728] -ldr q1, [x17, #+1744] -sqrdmulh v16.4S, v20.4S, v1.4S -mul v20.4S, v20.4S,v5.4S -mla v6.4S, v10.4S, v31.s[0] -sub v10.4s, v9.4s, v14.4s -add v9.4s, v9.4s, v14.4s -ldr q14, [x17, #+1760] -ldr q1, [x17, #+1776] -sqrdmulh v5.4S, v3.4S, v1.4S -mul v3.4S, v3.4S,v14.4S -mla v20.4S, v16.4S, v31.s[0] -sub v16.4s, v13.4s, v6.4s -add v13.4s, v13.4s, v6.4s -ldr q6, [x17, #+1856] -ldr q1, [x17, #+1872] -sqrdmulh v14.4S, v13.4S, v1.4S -mul v13.4S, v13.4S,v6.4S -mla v3.4S, v5.4S, v31.s[0] -sub v5.4s, v11.4s, v20.4s -add v11.4s, v11.4s, v20.4s -ldr q20, [x17, #+1888] -ldr q1, [x17, #+1904] -sqrdmulh v6.4S, v16.4S, v1.4S -mul v16.4S, v16.4S,v20.4S -mla v13.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v3.4s -add v22.4s, v22.4s, v3.4s -mla v16.4S, v6.4S, v31.s[0] -sub v6.4s, v9.4s, v13.4s -add v9.4s, v9.4s, v13.4s -sub v13.4s, v10.4s, v16.4s -add v10.4s, v10.4s, v16.4s -str q11, [x0, #768] -str q5, [x0, #784] -str q22, [x0, #800] -str q14, [x0, #816] -str q9, [x0, #832] -str q6, [x0, #848] -str q10, [x0, #864] -str q13, [x0, #880] -ldr q13, [x17, #+1952] -ldr q10, [x17, #+1968] -sqrdmulh v6.4S, v29.4S, v10.4S -mul v29.4S, v29.4S,v13.4S -sqrdmulh v9.4S, v15.4S, v10.4S -mul v15.4S, v15.4S,v13.4S -mla v29.4S, v6.4S, v31.s[0] -ldr q6, [x17, #+2080] -ldr q10, [x17, #+2096] -sqrdmulh v13.4S, v17.4S, v10.4S -mul v17.4S, v17.4S,v6.4S -mla v15.4S, v9.4S, v31.s[0] -sub v9.4s, v19.4s, v29.4s -add v19.4s, v19.4s, v29.4s -sqrdmulh v29.4S, v28.4S, v10.4S -mul v28.4S, v28.4S,v6.4S -mla v17.4S, v13.4S, v31.s[0] -sub v13.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -ldr q15, [x17, #+1984] -ldr q10, [x17, #+2000] -sqrdmulh v6.4S, v21.4S, v10.4S -mul v21.4S, v21.4S,v15.4S -mla v28.4S, v29.4S, v31.s[0] -sub v29.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -ldr q17, [x17, #+2016] -ldr q10, [x17, #+2032] -sqrdmulh v15.4S, v13.4S, v10.4S -mul v13.4S, v13.4S,v17.4S -mla v21.4S, v6.4S, v31.s[0] -sub v6.4s, v4.4s, v28.4s -add v4.4s, v4.4s, v28.4s -ldr q28, [x17, #+2112] -ldr q10, [x17, #+2128] -sqrdmulh v17.4S, v4.4S, v10.4S -mul v4.4S, v4.4S,v28.4S -mla v13.4S, v15.4S, v31.s[0] -sub v15.4s, v19.4s, v21.4s -add v19.4s, v19.4s, v21.4s -ldr q21, [x17, #+2144] -ldr q10, [x17, #+2160] -sqrdmulh v28.4S, v6.4S, v10.4S -mul v6.4S, v6.4S,v21.4S -mla v4.4S, v17.4S, v31.s[0] -sub v17.4s, v9.4s, v13.4s -add v9.4s, v9.4s, v13.4s -mla v6.4S, v28.4S, v31.s[0] -sub v28.4s, v0.4s, v4.4s -add v0.4s, v0.4s, v4.4s -sub v4.4s, v29.4s, v6.4s -add v29.4s, v29.4s, v6.4s -str q19, [x0, #896] -str q15, [x0, #912] -str q9, [x0, #928] -str q17, [x0, #944] -str q0, [x0, #960] -str q28, [x0, #976] -str q29, [x0, #992] -str q4, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2392 -// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_2.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_2.s deleted file mode 100644 index 302342b..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_2.s +++ /dev/null @@ -1,2422 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_3_z4_2 -.global _ntt_u32_full_neon_asm_var_4_4_3_z4_2 -ntt_u32_full_neon_asm_var_4_4_3_z4_2: -_ntt_u32_full_neon_asm_var_4_4_3_z4_2: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #800] -ldr q21, [x0, #864] -ldr q20, [x0, #928] -ldr q19, [x0, #992] -ldr q18, [x0, #288] -ldr q17, [x0, #352] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -sqrdmulh v2.4S, v22.4S, v29.s[0] -ldr q1, [x0, #544] -mul v22.4S, v22.4S,v30.s[0] -ldr q0, [x0, #608] -sqrdmulh v15.4S, v21.4S, v29.s[0] -ldr q14, [x0, #672] -mul v21.4S, v21.4S,v30.s[0] -ldr q13, [x0, #736] -mla v22.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q12, [x0, #32] -sub v11.4s, v18.4s, v22.4s -mla v21.4S, v15.4S, v31.s[0] -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q15, [x0, #96] -sub v10.4s, v17.4s, v21.4s -mla v20.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v1.4S, v29.s[0] -ldr q2, [x0, #160] -mul v1.4S, v1.4S,v30.s[0] -sub v9.4s, v16.4s, v20.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v0.4S, v29.s[0] -ldr q22, [x0, #224] -mul v0.4S, v0.4S,v30.s[0] -sub v8.4s, v3.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v12.4s, v1.4s -mla v0.4S, v20.4S, v31.s[0] -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -sub v20.4s, v15.4s, v0.4s -mla v14.4S, v19.4S, v31.s[0] -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v19.4s, v2.4s, v14.4s -mla v13.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v1.4s, v22.4s, v13.4s -mla v16.4S, v0.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v0.4s, v2.4s, v16.4s -mla v3.4S, v14.4S, v31.s[0] -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v14.4s, v22.4s, v3.4s -mla v18.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v13.4s, v12.4s, v18.4s -mla v17.4S, v16.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v8.4S, v29.s[2] -mul v8.4S, v8.4S,v30.s[2] -sub v16.4s, v15.4s, v17.4s -mla v9.4S, v3.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v3.4s, v19.4s, v9.4s -mla v8.4S, v18.4S, v31.s[0] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v8.4s -mla v11.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v8.4s -sqrdmulh v8.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v17.4s, v21.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v27.s[0] -mul v22.4S, v22.4S,v28.s[0] -sub v9.4s, v20.4s, v10.4s -mla v2.4S, v8.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v8.4s, v12.4s, v2.4s -mla v22.4S, v11.4S, v31.s[0] -add v12.4s, v12.4s, v2.4s -sqrdmulh v2.4S, v14.4S, v27.s[1] -mul v14.4S, v14.4S,v28.s[1] -sub v11.4s, v15.4s, v22.4s -mla v0.4S, v10.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v27.s[2] -mul v19.4S, v19.4S,v28.s[2] -sub v10.4s, v13.4s, v0.4s -mla v14.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v2.4s, v16.4s, v14.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v27.s[3] -mul v3.4S, v3.4S,v28.s[3] -sub v22.4s, v21.4s, v19.4s -mla v1.4S, v0.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v0.4s, v20.4s, v1.4s -mla v3.4S, v14.4S, v31.s[0] -add v20.4s, v20.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v25.s[0] -mul v15.4S, v15.4S,v26.s[0] -sub v14.4s, v17.4s, v3.4s -mla v18.4S, v19.4S, v31.s[0] -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v11.4S, v25.s[1] -mul v11.4S, v11.4S,v26.s[1] -sub v19.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v1.4s, v12.4s, v15.4s -mla v11.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v25.s[3] -mul v2.4S, v2.4S,v26.s[3] -sub v3.4s, v8.4s, v11.4s -mla v16.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v11.4s -str q12, [x0, #32] -sqrdmulh v12.4S, v20.4S, v23.s[0] -str q1, [x0, #96] -mul v20.4S, v20.4S,v24.s[0] -ldr q1, [x0, #816] -sub v11.4s, v13.4s, v16.4s -ldr q18, [x0, #880] -mla v2.4S, v15.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -str q8, [x0, #160] -sqrdmulh v8.4S, v0.4S, v23.s[1] -str q3, [x0, #224] -mul v0.4S, v0.4S,v24.s[1] -ldr q3, [x0, #944] -sub v16.4s, v10.4s, v2.4s -ldr q15, [x0, #1008] -mla v20.4S, v12.4S, v31.s[0] -add v10.4s, v10.4s, v2.4s -str q13, [x0, #288] -sqrdmulh v13.4S, v9.4S, v23.s[2] -str q11, [x0, #352] -mul v9.4S, v9.4S,v24.s[2] -ldr q11, [x0, #304] -sub v2.4s, v21.4s, v20.4s -ldr q12, [x0, #368] -mla v0.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v20.4s -str q10, [x0, #416] -sqrdmulh v10.4S, v19.4S, v23.s[3] -str q16, [x0, #480] -mul v19.4S, v19.4S,v24.s[3] -ldr q16, [x0, #432] -sub v20.4s, v22.4s, v0.4s -ldr q8, [x0, #496] -mla v9.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -str q21, [x0, #544] -sqrdmulh v21.4S, v1.4S, v29.s[0] -str q2, [x0, #608] -ldr q2, [x0, #560] -mul v1.4S, v1.4S,v30.s[0] -ldr q0, [x0, #624] -sub v13.4s, v17.4s, v9.4s -mla v19.4S, v10.4S, v31.s[0] -add v17.4s, v17.4s, v9.4s -str q22, [x0, #672] -sqrdmulh v22.4S, v18.4S, v29.s[0] -str q20, [x0, #736] -ldr q20, [x0, #688] -mul v18.4S, v18.4S,v30.s[0] -ldr q9, [x0, #752] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -str q17, [x0, #800] -sqrdmulh v17.4S, v3.4S, v29.s[0] -str q13, [x0, #864] -mul v3.4S, v3.4S,v30.s[0] -ldr q13, [x0, #48] -sub v19.4s, v11.4s, v1.4s -mla v18.4S, v22.4S, v31.s[0] -add v11.4s, v11.4s, v1.4s -str q14, [x0, #928] -sqrdmulh v14.4S, v15.4S, v29.s[0] -str q10, [x0, #992] -mul v15.4S, v15.4S,v30.s[0] -ldr q10, [x0, #112] -sub v1.4s, v12.4s, v18.4s -mla v3.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v29.s[0] -ldr q17, [x0, #176] -mul v2.4S, v2.4S,v30.s[0] -sub v22.4s, v16.4s, v3.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v0.4S, v29.s[0] -ldr q14, [x0, #240] -mul v0.4S, v0.4S,v30.s[0] -sub v21.4s, v8.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -sub v18.4s, v13.4s, v2.4s -mla v0.4S, v3.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v9.4S, v29.s[0] -mul v9.4S, v9.4S,v30.s[0] -sub v3.4s, v10.4s, v0.4s -mla v20.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v15.4s, v17.4s, v20.4s -mla v9.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sub v2.4s, v14.4s, v9.4s -mla v16.4S, v0.4S, v31.s[0] -add v14.4s, v14.4s, v9.4s -sqrdmulh v9.4S, v11.4S, v29.s[1] -mul v11.4S, v11.4S,v30.s[1] -sub v0.4s, v17.4s, v16.4s -mla v8.4S, v20.4S, v31.s[0] -add v17.4s, v17.4s, v16.4s -sqrdmulh v16.4S, v12.4S, v29.s[1] -mul v12.4S, v12.4S,v30.s[1] -sub v20.4s, v14.4s, v8.4s -mla v11.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v8.4s -sqrdmulh v8.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v9.4s, v13.4s, v11.4s -mla v12.4S, v16.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v16.4s, v10.4s, v12.4s -mla v22.4S, v8.4S, v31.s[0] -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v8.4s, v15.4s, v22.4s -mla v21.4S, v11.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -sub v11.4s, v2.4s, v21.4s -mla v19.4S, v12.4S, v31.s[0] -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[0] -mul v17.4S, v17.4S,v28.s[0] -sub v12.4s, v18.4s, v19.4s -mla v1.4S, v22.4S, v31.s[0] -add v18.4s, v18.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v22.4s, v3.4s, v1.4s -mla v17.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v21.4s, v13.4s, v17.4s -mla v14.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v20.4S, v27.s[1] -mul v20.4S, v20.4S,v28.s[1] -sub v19.4s, v10.4s, v14.4s -mla v0.4S, v1.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v27.s[2] -mul v15.4S, v15.4S,v28.s[2] -sub v1.4s, v9.4s, v0.4s -mla v20.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v0.4s -sqrdmulh v0.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v17.4s, v16.4s, v20.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v14.4s, v18.4s, v15.4s -mla v2.4S, v0.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v27.s[3] -mul v11.4S, v11.4S,v28.s[3] -sub v0.4s, v3.4s, v2.4s -mla v8.4S, v20.4S, v31.s[0] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v10.4S, v25.s[0] -mul v10.4S, v10.4S,v26.s[0] -sub v20.4s, v12.4s, v8.4s -mla v11.4S, v15.4S, v31.s[0] -add v12.4s, v12.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v15.4s, v22.4s, v11.4s -mla v10.4S, v2.4S, v31.s[0] -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v2.4s, v13.4s, v10.4s -mla v19.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v10.4s -sqrdmulh v10.4S, v17.4S, v25.s[3] -mul v17.4S, v17.4S,v26.s[3] -sub v8.4s, v21.4s, v19.4s -mla v16.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -str q13, [x0, #48] -sqrdmulh v13.4S, v3.4S, v23.s[0] -str q2, [x0, #112] -mul v3.4S, v3.4S,v24.s[0] -ldr q2, [x0, #768] -sub v19.4s, v9.4s, v16.4s -ldr q11, [x0, #832] -mla v17.4S, v10.4S, v31.s[0] -add v9.4s, v9.4s, v16.4s -str q21, [x0, #176] -sqrdmulh v21.4S, v0.4S, v23.s[1] -str q8, [x0, #240] -mul v0.4S, v0.4S,v24.s[1] -ldr q8, [x0, #896] -sub v16.4s, v1.4s, v17.4s -ldr q10, [x0, #960] -mla v3.4S, v13.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -str q9, [x0, #304] -sqrdmulh v9.4S, v22.4S, v23.s[2] -str q19, [x0, #368] -mul v22.4S, v22.4S,v24.s[2] -ldr q19, [x0, #256] -sub v17.4s, v18.4s, v3.4s -ldr q13, [x0, #320] -mla v0.4S, v21.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -str q1, [x0, #432] -sqrdmulh v1.4S, v15.4S, v23.s[3] -str q16, [x0, #496] -mul v15.4S, v15.4S,v24.s[3] -ldr q16, [x0, #384] -sub v3.4s, v14.4s, v0.4s -ldr q21, [x0, #448] -mla v22.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -str q18, [x0, #560] -sqrdmulh v18.4S, v2.4S, v29.s[0] -str q17, [x0, #624] -ldr q17, [x0, #512] -mul v2.4S, v2.4S,v30.s[0] -ldr q0, [x0, #576] -sub v9.4s, v12.4s, v22.4s -mla v15.4S, v1.4S, v31.s[0] -add v12.4s, v12.4s, v22.4s -str q14, [x0, #688] -sqrdmulh v14.4S, v11.4S, v29.s[0] -str q3, [x0, #752] -ldr q3, [x0, #640] -mul v11.4S, v11.4S,v30.s[0] -ldr q22, [x0, #704] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -str q12, [x0, #816] -sqrdmulh v12.4S, v8.4S, v29.s[0] -str q9, [x0, #880] -mul v8.4S, v8.4S,v30.s[0] -ldr q9, [x0, #0] -sub v15.4s, v19.4s, v2.4s -mla v11.4S, v14.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -str q20, [x0, #944] -sqrdmulh v20.4S, v10.4S, v29.s[0] -str q1, [x0, #1008] -mul v10.4S, v10.4S,v30.s[0] -ldr q1, [x0, #64] -sub v2.4s, v13.4s, v11.4s -mla v8.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v29.s[0] -ldr q12, [x0, #128] -mul v17.4S, v17.4S,v30.s[0] -sub v14.4s, v16.4s, v8.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v0.4S, v29.s[0] -ldr q20, [x0, #192] -mul v0.4S, v0.4S,v30.s[0] -sub v18.4s, v21.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -sub v11.4s, v9.4s, v17.4s -mla v0.4S, v8.4S, v31.s[0] -add v9.4s, v9.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v8.4s, v1.4s, v0.4s -mla v3.4S, v10.4S, v31.s[0] -add v1.4s, v1.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v10.4s, v12.4s, v3.4s -mla v22.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v17.4s, v20.4s, v22.4s -mla v16.4S, v0.4S, v31.s[0] -add v20.4s, v20.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -sub v0.4s, v12.4s, v16.4s -mla v21.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v3.4s, v20.4s, v21.4s -mla v19.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v22.4s, v9.4s, v19.4s -mla v13.4S, v16.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v29.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v16.4s, v1.4s, v13.4s -mla v14.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v13.4s -sqrdmulh v13.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v21.4s, v10.4s, v14.4s -mla v18.4S, v19.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v19.4s, v17.4s, v18.4s -mla v15.4S, v13.4S, v31.s[0] -add v17.4s, v17.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v13.4s, v11.4s, v15.4s -mla v2.4S, v14.4S, v31.s[0] -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v27.s[0] -mul v20.4S, v20.4S,v28.s[0] -sub v14.4s, v8.4s, v2.4s -mla v12.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v2.4s -sqrdmulh v2.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v18.4s, v9.4s, v12.4s -mla v20.4S, v15.4S, v31.s[0] -add v9.4s, v9.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -sub v15.4s, v1.4s, v20.4s -mla v0.4S, v2.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -sub v2.4s, v22.4s, v0.4s -mla v3.4S, v12.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v12.4s, v16.4s, v3.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v11.4s, v10.4s -mla v17.4S, v0.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -sqrdmulh v10.4S, v19.4S, v27.s[3] -mul v19.4S, v19.4S,v28.s[3] -sub v0.4s, v8.4s, v17.4s -mla v21.4S, v3.4S, v31.s[0] -add v8.4s, v8.4s, v17.4s -sqrdmulh v17.4S, v1.4S, v25.s[0] -mul v1.4S, v1.4S,v26.s[0] -sub v3.4s, v13.4s, v21.4s -mla v19.4S, v10.4S, v31.s[0] -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v15.4S, v25.s[1] -mul v15.4S, v15.4S,v26.s[1] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v17.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v17.4s, v9.4s, v1.4s -mla v15.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -sub v21.4s, v18.4s, v15.4s -mla v16.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -str q9, [x0, #0] -sqrdmulh v9.4S, v8.4S, v23.s[0] -str q17, [x0, #64] -mul v8.4S, v8.4S,v24.s[0] -ldr q17, [x0, #784] -sub v15.4s, v22.4s, v16.4s -ldr q19, [x0, #848] -mla v12.4S, v1.4S, v31.s[0] -add v22.4s, v22.4s, v16.4s -str q18, [x0, #128] -sqrdmulh v18.4S, v0.4S, v23.s[1] -str q21, [x0, #192] -mul v0.4S, v0.4S,v24.s[1] -ldr q21, [x0, #912] -sub v16.4s, v2.4s, v12.4s -ldr q1, [x0, #976] -mla v8.4S, v9.4S, v31.s[0] -add v2.4s, v2.4s, v12.4s -str q22, [x0, #256] -sqrdmulh v22.4S, v14.4S, v23.s[2] -str q15, [x0, #320] -mul v14.4S, v14.4S,v24.s[2] -ldr q15, [x0, #272] -sub v12.4s, v11.4s, v8.4s -ldr q9, [x0, #336] -mla v0.4S, v18.4S, v31.s[0] -add v11.4s, v11.4s, v8.4s -str q2, [x0, #384] -sqrdmulh v2.4S, v10.4S, v23.s[3] -str q16, [x0, #448] -mul v10.4S, v10.4S,v24.s[3] -ldr q16, [x0, #400] -sub v8.4s, v20.4s, v0.4s -ldr q18, [x0, #464] -mla v14.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v0.4s -str q11, [x0, #512] -sqrdmulh v11.4S, v17.4S, v29.s[0] -str q12, [x0, #576] -ldr q12, [x0, #528] -mul v17.4S, v17.4S,v30.s[0] -ldr q0, [x0, #592] -sub v22.4s, v13.4s, v14.4s -mla v10.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v14.4s -str q20, [x0, #640] -sqrdmulh v20.4S, v19.4S, v29.s[0] -str q8, [x0, #704] -ldr q8, [x0, #656] -mul v19.4S, v19.4S,v30.s[0] -ldr q14, [x0, #720] -sub v2.4s, v3.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v3.4s, v3.4s, v10.4s -str q13, [x0, #768] -sqrdmulh v13.4S, v21.4S, v29.s[0] -str q22, [x0, #832] -mul v21.4S, v21.4S,v30.s[0] -ldr q22, [x0, #16] -sub v10.4s, v15.4s, v17.4s -mla v19.4S, v20.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -str q3, [x0, #896] -sqrdmulh v3.4S, v1.4S, v29.s[0] -str q2, [x0, #960] -mul v1.4S, v1.4S,v30.s[0] -ldr q2, [x0, #80] -sub v17.4s, v9.4s, v19.4s -mla v21.4S, v13.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v12.4S, v29.s[0] -ldr q13, [x0, #144] -mul v12.4S, v12.4S,v30.s[0] -sub v20.4s, v16.4s, v21.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v0.4S, v29.s[0] -ldr q3, [x0, #208] -mul v0.4S, v0.4S,v30.s[0] -sub v11.4s, v18.4s, v1.4s -mla v12.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v1.4s -sqrdmulh v1.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v19.4s, v22.4s, v12.4s -mla v0.4S, v21.4S, v31.s[0] -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v2.4s, v0.4s -mla v8.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v1.4s, v13.4s, v8.4s -mla v14.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v12.4s, v3.4s, v14.4s -mla v16.4S, v0.4S, v31.s[0] -add v3.4s, v3.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -sub v0.4s, v13.4s, v16.4s -mla v18.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -sqrdmulh v16.4S, v9.4S, v29.s[1] -mul v9.4S, v9.4S,v30.s[1] -sub v8.4s, v3.4s, v18.4s -mla v15.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v14.4s, v22.4s, v15.4s -mla v9.4S, v16.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v16.4s, v2.4s, v9.4s -mla v20.4S, v18.4S, v31.s[0] -add v2.4s, v2.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v20.4s -mla v11.4S, v15.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v15.4s, v12.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v12.4s, v12.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v27.s[0] -mul v13.4S, v13.4S,v28.s[0] -sub v9.4s, v19.4s, v10.4s -mla v17.4S, v20.4S, v31.s[0] -add v19.4s, v19.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v27.s[0] -mul v3.4S, v3.4S,v28.s[0] -sub v20.4s, v21.4s, v17.4s -mla v13.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v11.4s, v22.4s, v13.4s -mla v3.4S, v10.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -sub v10.4s, v2.4s, v3.4s -mla v0.4S, v17.4S, v31.s[0] -add v2.4s, v2.4s, v3.4s -sqrdmulh v3.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v17.4s, v14.4s, v0.4s -mla v8.4S, v13.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[2] -mul v12.4S, v12.4S,v28.s[2] -sub v13.4s, v16.4s, v8.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v3.4s, v19.4s, v1.4s -mla v12.4S, v0.4S, v31.s[0] -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v0.4s, v21.4s, v12.4s -mla v18.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v10.4S, v25.s[1] -mul v10.4S, v10.4S,v26.s[1] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v12.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v12.4s, v22.4s, v2.4s -mla v10.4S, v18.4S, v31.s[0] -add v22.4s, v22.4s, v2.4s -sqrdmulh v2.4S, v13.4S, v25.s[3] -mul v13.4S, v13.4S,v26.s[3] -sub v18.4s, v11.4s, v10.4s -mla v16.4S, v15.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -str q22, [x0, #16] -sqrdmulh v22.4S, v21.4S, v23.s[0] -str q12, [x0, #80] -mul v21.4S, v21.4S,v24.s[0] -sub v12.4s, v14.4s, v16.4s -mla v13.4S, v2.4S, v31.s[0] -add v14.4s, v14.4s, v16.4s -str q11, [x0, #144] -sqrdmulh v11.4S, v0.4S, v23.s[1] -str q18, [x0, #208] -mul v0.4S, v0.4S,v24.s[1] -sub v18.4s, v17.4s, v13.4s -mla v21.4S, v22.4S, v31.s[0] -add v17.4s, v17.4s, v13.4s -str q14, [x0, #272] -sqrdmulh v14.4S, v20.4S, v23.s[2] -str q12, [x0, #336] -mul v20.4S, v20.4S,v24.s[2] -sub v12.4s, v19.4s, v21.4s -mla v0.4S, v11.4S, v31.s[0] -add v19.4s, v19.4s, v21.4s -str q17, [x0, #400] -sqrdmulh v17.4S, v1.4S, v23.s[3] -str q18, [x0, #464] -mul v1.4S, v1.4S,v24.s[3] -sub v18.4s, v3.4s, v0.4s -mla v20.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v0.4s -str q19, [x0, #528] -str q12, [x0, #592] -sub v12.4s, v9.4s, v20.4s -mla v1.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v20.4s -str q3, [x0, #656] -str q18, [x0, #720] -sub v18.4s, v8.4s, v1.4s -add v8.4s, v8.4s, v1.4s -str q9, [x0, #784] -str q12, [x0, #848] -str q8, [x0, #912] -str q18, [x0, #976] -ldr q4, [x0, #32] -ldr q5, [x0, #48] -ldr q6, [x0, #0] -ldr q7, [x0, #16] -ldr q15, [x0, #96] -ldr q10, [x0, #112] -ldr q2, [x0, #64] -ldr q16, [x0, #80] -ldr q22, [x0, #160] -ldr q13, [x0, #176] -ldr q11, [x0, #128] -ldr q21, [x0, #144] -ldr q14, [x0, #224] -ldr q0, [x0, #240] -ldr q19, [x0, #192] -ldr q17, [x0, #208] -ldr q20, [x17, #+128] -ldr q3, [x17, #+144] -ldr q1, [x17, #+256] -ldr q9, [x17, #+272] -ldr q12, [x17, #+384] -ldr q8, [x17, #+400] -ldr q18, [x17, #+512] -ldr q30, [x17, #+528] -sqrdmulh v29.4S, v4.4S, v3.s[0] -mul v4.4S, v4.4S,v20.s[0] -sqrdmulh v28.4S, v5.4S, v3.s[0] -mul v5.4S, v5.4S,v20.s[0] -mla v4.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v15.4S, v9.s[0] -mul v15.4S, v15.4S,v1.s[0] -mla v5.4S, v28.4S, v31.s[0] -sub v28.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -sqrdmulh v4.4S, v10.4S, v9.s[0] -mul v10.4S, v10.4S,v1.s[0] -mla v15.4S, v29.4S, v31.s[0] -sub v29.4s, v7.4s, v5.4s -add v7.4s, v7.4s, v5.4s -sqrdmulh v5.4S, v7.4S, v3.s[1] -mul v7.4S, v7.4S,v20.s[1] -mla v10.4S, v4.4S, v31.s[0] -sub v4.4s, v2.4s, v15.4s -add v2.4s, v2.4s, v15.4s -sqrdmulh v15.4S, v29.4S, v3.s[2] -mul v29.4S, v29.4S,v20.s[2] -mla v7.4S, v5.4S, v31.s[0] -sub v5.4s, v16.4s, v10.4s -add v16.4s, v16.4s, v10.4s -sqrdmulh v10.4S, v16.4S, v9.s[1] -mul v16.4S, v16.4S,v1.s[1] -mla v29.4S, v15.4S, v31.s[0] -sub v15.4s, v6.4s, v7.4s -add v6.4s, v6.4s, v7.4s -sqrdmulh v3.4S, v5.4S, v9.s[2] -mul v5.4S, v5.4S,v1.s[2] -mla v16.4S, v10.4S, v31.s[0] -sub v10.4s, v28.4s, v29.4s -add v28.4s, v28.4s, v29.4s -sqrdmulh v29.4S, v22.4S, v8.s[0] -mul v22.4S, v22.4S,v12.s[0] -trn1 v20.4S, v6.4S, v15.4S -trn2 v7.4S, v6.4S, v15.4S -mla v5.4S, v3.4S, v31.s[0] -sub v3.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sqrdmulh v9.4S, v13.4S, v8.s[0] -mul v13.4S, v13.4S,v12.s[0] -trn1 v1.4S, v28.4S, v10.4S -trn2 v16.4S, v28.4S, v10.4S -mla v22.4S, v29.4S, v31.s[0] -sub v29.4s, v4.4s, v5.4s -add v4.4s, v4.4s, v5.4s -sqrdmulh v5.4S, v14.4S, v30.s[0] -mul v14.4S, v14.4S,v18.s[0] -trn2 v28.2D, v20.2D, v1.2D -trn2 v10.2D, v7.2D, v16.2D -mla v13.4S, v9.4S, v31.s[0] -sub v9.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v0.4S, v30.s[0] -mul v0.4S, v0.4S,v18.s[0] -trn1 v6.2D, v20.2D, v1.2D -trn1 v15.2D, v7.2D, v16.2D -mla v14.4S, v5.4S, v31.s[0] -sub v5.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v21.4S, v8.s[1] -mul v21.4S, v21.4S,v12.s[1] -trn1 v16.4S, v2.4S, v3.4S -trn2 v7.4S, v2.4S, v3.4S -mla v0.4S, v22.4S, v31.s[0] -sub v22.4s, v19.4s, v14.4s -add v19.4s, v19.4s, v14.4s -sqrdmulh v14.4S, v5.4S, v8.s[2] -mul v5.4S, v5.4S,v12.s[2] -trn1 v1.4S, v4.4S, v29.4S -trn2 v20.4S, v4.4S, v29.4S -mla v21.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v0.4s -add v17.4s, v17.4s, v0.4s -sqrdmulh v0.4S, v17.4S, v30.s[1] -mul v17.4S, v17.4S,v18.s[1] -trn2 v4.2D, v16.2D, v1.2D -trn2 v29.2D, v7.2D, v20.2D -mla v5.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v21.4s -add v11.4s, v11.4s, v21.4s -sqrdmulh v8.4S, v13.4S, v30.s[2] -mul v13.4S, v13.4S,v18.s[2] -trn1 v2.2D, v16.2D, v1.2D -trn1 v3.2D, v7.2D, v20.2D -mla v17.4S, v0.4S, v31.s[0] -sub v0.4s, v9.4s, v5.4s -add v9.4s, v9.4s, v5.4s -mla v13.4S, v8.4S, v31.s[0] -sub v8.4s, v19.4s, v17.4s -add v19.4s, v19.4s, v17.4s -sub v30.4s, v22.4s, v13.4s -add v22.4s, v22.4s, v13.4s -ldr q13, [x17, #+160] -ldr q18, [x17, #+176] -sqrdmulh v17.4S, v28.4S, v18.4S -mul v28.4S, v28.4S,v13.4S -trn1 v5.4S, v11.4S, v14.4S -trn2 v20.4S, v11.4S, v14.4S -sqrdmulh v7.4S, v10.4S, v18.4S -mul v10.4S, v10.4S,v13.4S -trn1 v1.4S, v9.4S, v0.4S -trn2 v16.4S, v9.4S, v0.4S -mla v28.4S, v17.4S, v31.s[0] -ldr q17, [x17, #+288] -ldr q18, [x17, #+304] -sqrdmulh v13.4S, v4.4S, v18.4S -mul v4.4S, v4.4S,v17.4S -trn2 v9.2D, v5.2D, v1.2D -trn2 v0.2D, v20.2D, v16.2D -mla v10.4S, v7.4S, v31.s[0] -sub v7.4s, v6.4s, v28.4s -add v6.4s, v6.4s, v28.4s -sqrdmulh v28.4S, v29.4S, v18.4S -mul v29.4S, v29.4S,v17.4S -trn1 v11.2D, v5.2D, v1.2D -trn1 v14.2D, v20.2D, v16.2D -mla v4.4S, v13.4S, v31.s[0] -sub v13.4s, v15.4s, v10.4s -add v15.4s, v15.4s, v10.4s -ldr q10, [x17, #+192] -ldr q16, [x17, #+208] -sqrdmulh v20.4S, v15.4S, v16.4S -mul v15.4S, v15.4S,v10.4S -trn1 v16.4S, v19.4S, v8.4S -trn2 v10.4S, v19.4S, v8.4S -mla v29.4S, v28.4S, v31.s[0] -sub v28.4s, v2.4s, v4.4s -add v2.4s, v2.4s, v4.4s -ldr q4, [x17, #+224] -ldr q1, [x17, #+240] -sqrdmulh v5.4S, v13.4S, v1.4S -mul v13.4S, v13.4S,v4.4S -trn1 v1.4S, v22.4S, v30.4S -trn2 v4.4S, v22.4S, v30.4S -mla v15.4S, v20.4S, v31.s[0] -sub v20.4s, v3.4s, v29.4s -add v3.4s, v3.4s, v29.4s -ldr q29, [x17, #+320] -ldr q18, [x17, #+336] -sqrdmulh v17.4S, v3.4S, v18.4S -mul v3.4S, v3.4S,v29.4S -trn2 v22.2D, v16.2D, v1.2D -trn2 v30.2D, v10.2D, v4.2D -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v6.4s, v15.4s -add v6.4s, v6.4s, v15.4s -ldr q15, [x17, #+352] -ldr q18, [x17, #+368] -sqrdmulh v29.4S, v20.4S, v18.4S -mul v20.4S, v20.4S,v15.4S -trn1 v19.2D, v16.2D, v1.2D -trn1 v8.2D, v10.2D, v4.2D -mla v3.4S, v17.4S, v31.s[0] -sub v17.4s, v7.4s, v13.4s -add v7.4s, v7.4s, v13.4s -mla v20.4S, v29.4S, v31.s[0] -sub v29.4s, v2.4s, v3.4s -add v2.4s, v2.4s, v3.4s -sub v3.4s, v28.4s, v20.4s -add v28.4s, v28.4s, v20.4s -str q6, [x0, #0] -str q5, [x0, #16] -str q7, [x0, #32] -str q17, [x0, #48] -str q2, [x0, #64] -str q29, [x0, #80] -str q28, [x0, #96] -str q3, [x0, #112] -ldr q3, [x17, #+416] -ldr q28, [x17, #+432] -sqrdmulh v29.4S, v9.4S, v28.4S -mul v9.4S, v9.4S,v3.4S -sqrdmulh v2.4S, v0.4S, v28.4S -mul v0.4S, v0.4S,v3.4S -mla v9.4S, v29.4S, v31.s[0] -ldr q29, [x17, #+544] -ldr q28, [x17, #+560] -sqrdmulh v3.4S, v22.4S, v28.4S -mul v22.4S, v22.4S,v29.4S -mla v0.4S, v2.4S, v31.s[0] -sub v2.4s, v11.4s, v9.4s -add v11.4s, v11.4s, v9.4s -sqrdmulh v9.4S, v30.4S, v28.4S -mul v30.4S, v30.4S,v29.4S -mla v22.4S, v3.4S, v31.s[0] -sub v3.4s, v14.4s, v0.4s -add v14.4s, v14.4s, v0.4s -ldr q0, [x17, #+448] -ldr q28, [x17, #+464] -sqrdmulh v29.4S, v14.4S, v28.4S -mul v14.4S, v14.4S,v0.4S -mla v30.4S, v9.4S, v31.s[0] -sub v9.4s, v19.4s, v22.4s -add v19.4s, v19.4s, v22.4s -ldr q22, [x17, #+480] -ldr q28, [x17, #+496] -sqrdmulh v0.4S, v3.4S, v28.4S -mul v3.4S, v3.4S,v22.4S -mla v14.4S, v29.4S, v31.s[0] -sub v29.4s, v8.4s, v30.4s -add v8.4s, v8.4s, v30.4s -ldr q30, [x17, #+576] -ldr q28, [x17, #+592] -sqrdmulh v22.4S, v8.4S, v28.4S -mul v8.4S, v8.4S,v30.4S -mla v3.4S, v0.4S, v31.s[0] -sub v0.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -ldr q14, [x17, #+608] -ldr q28, [x17, #+624] -sqrdmulh v30.4S, v29.4S, v28.4S -mul v29.4S, v29.4S,v14.4S -mla v8.4S, v22.4S, v31.s[0] -sub v22.4s, v2.4s, v3.4s -add v2.4s, v2.4s, v3.4s -mla v29.4S, v30.4S, v31.s[0] -sub v30.4s, v19.4s, v8.4s -add v19.4s, v19.4s, v8.4s -sub v8.4s, v9.4s, v29.4s -add v9.4s, v9.4s, v29.4s -str q11, [x0, #128] -str q0, [x0, #144] -str q2, [x0, #160] -str q22, [x0, #176] -str q19, [x0, #192] -str q30, [x0, #208] -str q9, [x0, #224] -str q8, [x0, #240] -ldr q8, [x0, #288] -ldr q9, [x0, #304] -ldr q30, [x0, #256] -ldr q19, [x0, #272] -ldr q22, [x0, #352] -ldr q2, [x0, #368] -ldr q0, [x0, #320] -ldr q11, [x0, #336] -ldr q29, [x0, #416] -ldr q3, [x0, #432] -ldr q28, [x0, #384] -ldr q14, [x0, #400] -ldr q17, [x0, #480] -ldr q7, [x0, #496] -ldr q5, [x0, #448] -ldr q6, [x0, #464] -ldr q20, [x17, #+640] -ldr q13, [x17, #+656] -ldr q4, [x17, #+768] -ldr q10, [x17, #+784] -ldr q1, [x17, #+896] -ldr q16, [x17, #+912] -ldr q18, [x17, #+1024] -ldr q15, [x17, #+1040] -sqrdmulh v12.4S, v8.4S, v13.s[0] -mul v8.4S, v8.4S,v20.s[0] -sqrdmulh v21.4S, v9.4S, v13.s[0] -mul v9.4S, v9.4S,v20.s[0] -mla v8.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v22.4S, v10.s[0] -mul v22.4S, v22.4S,v4.s[0] -mla v9.4S, v21.4S, v31.s[0] -sub v21.4s, v30.4s, v8.4s -add v30.4s, v30.4s, v8.4s -sqrdmulh v8.4S, v2.4S, v10.s[0] -mul v2.4S, v2.4S,v4.s[0] -mla v22.4S, v12.4S, v31.s[0] -sub v12.4s, v19.4s, v9.4s -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v19.4S, v13.s[1] -mul v19.4S, v19.4S,v20.s[1] -mla v2.4S, v8.4S, v31.s[0] -sub v8.4s, v0.4s, v22.4s -add v0.4s, v0.4s, v22.4s -sqrdmulh v22.4S, v12.4S, v13.s[2] -mul v12.4S, v12.4S,v20.s[2] -mla v19.4S, v9.4S, v31.s[0] -sub v9.4s, v11.4s, v2.4s -add v11.4s, v11.4s, v2.4s -sqrdmulh v2.4S, v11.4S, v10.s[1] -mul v11.4S, v11.4S,v4.s[1] -mla v12.4S, v22.4S, v31.s[0] -sub v22.4s, v30.4s, v19.4s -add v30.4s, v30.4s, v19.4s -sqrdmulh v13.4S, v9.4S, v10.s[2] -mul v9.4S, v9.4S,v4.s[2] -mla v11.4S, v2.4S, v31.s[0] -sub v2.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v29.4S, v16.s[0] -mul v29.4S, v29.4S,v1.s[0] -trn1 v20.4S, v30.4S, v22.4S -trn2 v19.4S, v30.4S, v22.4S -mla v9.4S, v13.4S, v31.s[0] -sub v13.4s, v0.4s, v11.4s -add v0.4s, v0.4s, v11.4s -sqrdmulh v10.4S, v3.4S, v16.s[0] -mul v3.4S, v3.4S,v1.s[0] -trn1 v4.4S, v21.4S, v2.4S -trn2 v11.4S, v21.4S, v2.4S -mla v29.4S, v12.4S, v31.s[0] -sub v12.4s, v8.4s, v9.4s -add v8.4s, v8.4s, v9.4s -sqrdmulh v9.4S, v17.4S, v15.s[0] -mul v17.4S, v17.4S,v18.s[0] -trn2 v21.2D, v20.2D, v4.2D -trn2 v2.2D, v19.2D, v11.2D -mla v3.4S, v10.4S, v31.s[0] -sub v10.4s, v28.4s, v29.4s -add v28.4s, v28.4s, v29.4s -sqrdmulh v29.4S, v7.4S, v15.s[0] -mul v7.4S, v7.4S,v18.s[0] -trn1 v30.2D, v20.2D, v4.2D -trn1 v22.2D, v19.2D, v11.2D -mla v17.4S, v9.4S, v31.s[0] -sub v9.4s, v14.4s, v3.4s -add v14.4s, v14.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v16.s[1] -mul v14.4S, v14.4S,v1.s[1] -trn1 v11.4S, v0.4S, v13.4S -trn2 v19.4S, v0.4S, v13.4S -mla v7.4S, v29.4S, v31.s[0] -sub v29.4s, v5.4s, v17.4s -add v5.4s, v5.4s, v17.4s -sqrdmulh v17.4S, v9.4S, v16.s[2] -mul v9.4S, v9.4S,v1.s[2] -trn1 v4.4S, v8.4S, v12.4S -trn2 v20.4S, v8.4S, v12.4S -mla v14.4S, v3.4S, v31.s[0] -sub v3.4s, v6.4s, v7.4s -add v6.4s, v6.4s, v7.4s -sqrdmulh v7.4S, v6.4S, v15.s[1] -mul v6.4S, v6.4S,v18.s[1] -trn2 v8.2D, v11.2D, v4.2D -trn2 v12.2D, v19.2D, v20.2D -mla v9.4S, v17.4S, v31.s[0] -sub v17.4s, v28.4s, v14.4s -add v28.4s, v28.4s, v14.4s -sqrdmulh v16.4S, v3.4S, v15.s[2] -mul v3.4S, v3.4S,v18.s[2] -trn1 v0.2D, v11.2D, v4.2D -trn1 v13.2D, v19.2D, v20.2D -mla v6.4S, v7.4S, v31.s[0] -sub v7.4s, v10.4s, v9.4s -add v10.4s, v10.4s, v9.4s -mla v3.4S, v16.4S, v31.s[0] -sub v16.4s, v5.4s, v6.4s -add v5.4s, v5.4s, v6.4s -sub v15.4s, v29.4s, v3.4s -add v29.4s, v29.4s, v3.4s -ldr q3, [x17, #+672] -ldr q18, [x17, #+688] -sqrdmulh v6.4S, v21.4S, v18.4S -mul v21.4S, v21.4S,v3.4S -trn1 v9.4S, v28.4S, v17.4S -trn2 v20.4S, v28.4S, v17.4S -sqrdmulh v19.4S, v2.4S, v18.4S -mul v2.4S, v2.4S,v3.4S -trn1 v4.4S, v10.4S, v7.4S -trn2 v11.4S, v10.4S, v7.4S -mla v21.4S, v6.4S, v31.s[0] -ldr q6, [x17, #+800] -ldr q18, [x17, #+816] -sqrdmulh v3.4S, v8.4S, v18.4S -mul v8.4S, v8.4S,v6.4S -trn2 v10.2D, v9.2D, v4.2D -trn2 v7.2D, v20.2D, v11.2D -mla v2.4S, v19.4S, v31.s[0] -sub v19.4s, v30.4s, v21.4s -add v30.4s, v30.4s, v21.4s -sqrdmulh v21.4S, v12.4S, v18.4S -mul v12.4S, v12.4S,v6.4S -trn1 v28.2D, v9.2D, v4.2D -trn1 v17.2D, v20.2D, v11.2D -mla v8.4S, v3.4S, v31.s[0] -sub v3.4s, v22.4s, v2.4s -add v22.4s, v22.4s, v2.4s -ldr q2, [x17, #+704] -ldr q11, [x17, #+720] -sqrdmulh v20.4S, v22.4S, v11.4S -mul v22.4S, v22.4S,v2.4S -trn1 v11.4S, v5.4S, v16.4S -trn2 v2.4S, v5.4S, v16.4S -mla v12.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v8.4s -add v0.4s, v0.4s, v8.4s -ldr q8, [x17, #+736] -ldr q4, [x17, #+752] -sqrdmulh v9.4S, v3.4S, v4.4S -mul v3.4S, v3.4S,v8.4S -trn1 v4.4S, v29.4S, v15.4S -trn2 v8.4S, v29.4S, v15.4S -mla v22.4S, v20.4S, v31.s[0] -sub v20.4s, v13.4s, v12.4s -add v13.4s, v13.4s, v12.4s -ldr q12, [x17, #+832] -ldr q18, [x17, #+848] -sqrdmulh v6.4S, v13.4S, v18.4S -mul v13.4S, v13.4S,v12.4S -trn2 v29.2D, v11.2D, v4.2D -trn2 v15.2D, v2.2D, v8.2D -mla v3.4S, v9.4S, v31.s[0] -sub v9.4s, v30.4s, v22.4s -add v30.4s, v30.4s, v22.4s -ldr q22, [x17, #+864] -ldr q18, [x17, #+880] -sqrdmulh v12.4S, v20.4S, v18.4S -mul v20.4S, v20.4S,v22.4S -trn1 v5.2D, v11.2D, v4.2D -trn1 v16.2D, v2.2D, v8.2D -mla v13.4S, v6.4S, v31.s[0] -sub v6.4s, v19.4s, v3.4s -add v19.4s, v19.4s, v3.4s -mla v20.4S, v12.4S, v31.s[0] -sub v12.4s, v0.4s, v13.4s -add v0.4s, v0.4s, v13.4s -sub v13.4s, v21.4s, v20.4s -add v21.4s, v21.4s, v20.4s -str q30, [x0, #256] -str q9, [x0, #272] -str q19, [x0, #288] -str q6, [x0, #304] -str q0, [x0, #320] -str q12, [x0, #336] -str q21, [x0, #352] -str q13, [x0, #368] -ldr q13, [x17, #+928] -ldr q21, [x17, #+944] -sqrdmulh v12.4S, v10.4S, v21.4S -mul v10.4S, v10.4S,v13.4S -sqrdmulh v0.4S, v7.4S, v21.4S -mul v7.4S, v7.4S,v13.4S -mla v10.4S, v12.4S, v31.s[0] -ldr q12, [x17, #+1056] -ldr q21, [x17, #+1072] -sqrdmulh v13.4S, v29.4S, v21.4S -mul v29.4S, v29.4S,v12.4S -mla v7.4S, v0.4S, v31.s[0] -sub v0.4s, v28.4s, v10.4s -add v28.4s, v28.4s, v10.4s -sqrdmulh v10.4S, v15.4S, v21.4S -mul v15.4S, v15.4S,v12.4S -mla v29.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v7.4s -add v17.4s, v17.4s, v7.4s -ldr q7, [x17, #+960] -ldr q21, [x17, #+976] -sqrdmulh v12.4S, v17.4S, v21.4S -mul v17.4S, v17.4S,v7.4S -mla v15.4S, v10.4S, v31.s[0] -sub v10.4s, v5.4s, v29.4s -add v5.4s, v5.4s, v29.4s -ldr q29, [x17, #+992] -ldr q21, [x17, #+1008] -sqrdmulh v7.4S, v13.4S, v21.4S -mul v13.4S, v13.4S,v29.4S -mla v17.4S, v12.4S, v31.s[0] -sub v12.4s, v16.4s, v15.4s -add v16.4s, v16.4s, v15.4s -ldr q15, [x17, #+1088] -ldr q21, [x17, #+1104] -sqrdmulh v29.4S, v16.4S, v21.4S -mul v16.4S, v16.4S,v15.4S -mla v13.4S, v7.4S, v31.s[0] -sub v7.4s, v28.4s, v17.4s -add v28.4s, v28.4s, v17.4s -ldr q17, [x17, #+1120] -ldr q21, [x17, #+1136] -sqrdmulh v15.4S, v12.4S, v21.4S -mul v12.4S, v12.4S,v17.4S -mla v16.4S, v29.4S, v31.s[0] -sub v29.4s, v0.4s, v13.4s -add v0.4s, v0.4s, v13.4s -mla v12.4S, v15.4S, v31.s[0] -sub v15.4s, v5.4s, v16.4s -add v5.4s, v5.4s, v16.4s -sub v16.4s, v10.4s, v12.4s -add v10.4s, v10.4s, v12.4s -str q28, [x0, #384] -str q7, [x0, #400] -str q0, [x0, #416] -str q29, [x0, #432] -str q5, [x0, #448] -str q15, [x0, #464] -str q10, [x0, #480] -str q16, [x0, #496] -ldr q16, [x0, #544] -ldr q10, [x0, #560] -ldr q15, [x0, #512] -ldr q5, [x0, #528] -ldr q29, [x0, #608] -ldr q0, [x0, #624] -ldr q7, [x0, #576] -ldr q28, [x0, #592] -ldr q12, [x0, #672] -ldr q13, [x0, #688] -ldr q21, [x0, #640] -ldr q17, [x0, #656] -ldr q6, [x0, #736] -ldr q19, [x0, #752] -ldr q9, [x0, #704] -ldr q30, [x0, #720] -ldr q20, [x17, #+1152] -ldr q3, [x17, #+1168] -ldr q8, [x17, #+1280] -ldr q2, [x17, #+1296] -ldr q4, [x17, #+1408] -ldr q11, [x17, #+1424] -ldr q18, [x17, #+1536] -ldr q22, [x17, #+1552] -sqrdmulh v1.4S, v16.4S, v3.s[0] -mul v16.4S, v16.4S,v20.s[0] -sqrdmulh v14.4S, v10.4S, v3.s[0] -mul v10.4S, v10.4S,v20.s[0] -mla v16.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v29.4S, v2.s[0] -mul v29.4S, v29.4S,v8.s[0] -mla v10.4S, v14.4S, v31.s[0] -sub v14.4s, v15.4s, v16.4s -add v15.4s, v15.4s, v16.4s -sqrdmulh v16.4S, v0.4S, v2.s[0] -mul v0.4S, v0.4S,v8.s[0] -mla v29.4S, v1.4S, v31.s[0] -sub v1.4s, v5.4s, v10.4s -add v5.4s, v5.4s, v10.4s -sqrdmulh v10.4S, v5.4S, v3.s[1] -mul v5.4S, v5.4S,v20.s[1] -mla v0.4S, v16.4S, v31.s[0] -sub v16.4s, v7.4s, v29.4s -add v7.4s, v7.4s, v29.4s -sqrdmulh v29.4S, v1.4S, v3.s[2] -mul v1.4S, v1.4S,v20.s[2] -mla v5.4S, v10.4S, v31.s[0] -sub v10.4s, v28.4s, v0.4s -add v28.4s, v28.4s, v0.4s -sqrdmulh v0.4S, v28.4S, v2.s[1] -mul v28.4S, v28.4S,v8.s[1] -mla v1.4S, v29.4S, v31.s[0] -sub v29.4s, v15.4s, v5.4s -add v15.4s, v15.4s, v5.4s -sqrdmulh v3.4S, v10.4S, v2.s[2] -mul v10.4S, v10.4S,v8.s[2] -mla v28.4S, v0.4S, v31.s[0] -sub v0.4s, v14.4s, v1.4s -add v14.4s, v14.4s, v1.4s -sqrdmulh v1.4S, v12.4S, v11.s[0] -mul v12.4S, v12.4S,v4.s[0] -trn1 v20.4S, v15.4S, v29.4S -trn2 v5.4S, v15.4S, v29.4S -mla v10.4S, v3.4S, v31.s[0] -sub v3.4s, v7.4s, v28.4s -add v7.4s, v7.4s, v28.4s -sqrdmulh v2.4S, v13.4S, v11.s[0] -mul v13.4S, v13.4S,v4.s[0] -trn1 v8.4S, v14.4S, v0.4S -trn2 v28.4S, v14.4S, v0.4S -mla v12.4S, v1.4S, v31.s[0] -sub v1.4s, v16.4s, v10.4s -add v16.4s, v16.4s, v10.4s -sqrdmulh v10.4S, v6.4S, v22.s[0] -mul v6.4S, v6.4S,v18.s[0] -trn2 v14.2D, v20.2D, v8.2D -trn2 v0.2D, v5.2D, v28.2D -mla v13.4S, v2.4S, v31.s[0] -sub v2.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v18.s[0] -trn1 v15.2D, v20.2D, v8.2D -trn1 v29.2D, v5.2D, v28.2D -mla v6.4S, v10.4S, v31.s[0] -sub v10.4s, v17.4s, v13.4s -add v17.4s, v17.4s, v13.4s -sqrdmulh v13.4S, v17.4S, v11.s[1] -mul v17.4S, v17.4S,v4.s[1] -trn1 v28.4S, v7.4S, v3.4S -trn2 v5.4S, v7.4S, v3.4S -mla v19.4S, v12.4S, v31.s[0] -sub v12.4s, v9.4s, v6.4s -add v9.4s, v9.4s, v6.4s -sqrdmulh v6.4S, v10.4S, v11.s[2] -mul v10.4S, v10.4S,v4.s[2] -trn1 v8.4S, v16.4S, v1.4S -trn2 v20.4S, v16.4S, v1.4S -mla v17.4S, v13.4S, v31.s[0] -sub v13.4s, v30.4s, v19.4s -add v30.4s, v30.4s, v19.4s -sqrdmulh v19.4S, v30.4S, v22.s[1] -mul v30.4S, v30.4S,v18.s[1] -trn2 v16.2D, v28.2D, v8.2D -trn2 v1.2D, v5.2D, v20.2D -mla v10.4S, v6.4S, v31.s[0] -sub v6.4s, v21.4s, v17.4s -add v21.4s, v21.4s, v17.4s -sqrdmulh v11.4S, v13.4S, v22.s[2] -mul v13.4S, v13.4S,v18.s[2] -trn1 v7.2D, v28.2D, v8.2D -trn1 v3.2D, v5.2D, v20.2D -mla v30.4S, v19.4S, v31.s[0] -sub v19.4s, v2.4s, v10.4s -add v2.4s, v2.4s, v10.4s -mla v13.4S, v11.4S, v31.s[0] -sub v11.4s, v9.4s, v30.4s -add v9.4s, v9.4s, v30.4s -sub v22.4s, v12.4s, v13.4s -add v12.4s, v12.4s, v13.4s -ldr q13, [x17, #+1184] -ldr q18, [x17, #+1200] -sqrdmulh v30.4S, v14.4S, v18.4S -mul v14.4S, v14.4S,v13.4S -trn1 v10.4S, v21.4S, v6.4S -trn2 v20.4S, v21.4S, v6.4S -sqrdmulh v5.4S, v0.4S, v18.4S -mul v0.4S, v0.4S,v13.4S -trn1 v8.4S, v2.4S, v19.4S -trn2 v28.4S, v2.4S, v19.4S -mla v14.4S, v30.4S, v31.s[0] -ldr q30, [x17, #+1312] -ldr q18, [x17, #+1328] -sqrdmulh v13.4S, v16.4S, v18.4S -mul v16.4S, v16.4S,v30.4S -trn2 v2.2D, v10.2D, v8.2D -trn2 v19.2D, v20.2D, v28.2D -mla v0.4S, v5.4S, v31.s[0] -sub v5.4s, v15.4s, v14.4s -add v15.4s, v15.4s, v14.4s -sqrdmulh v14.4S, v1.4S, v18.4S -mul v1.4S, v1.4S,v30.4S -trn1 v21.2D, v10.2D, v8.2D -trn1 v6.2D, v20.2D, v28.2D -mla v16.4S, v13.4S, v31.s[0] -sub v13.4s, v29.4s, v0.4s -add v29.4s, v29.4s, v0.4s -ldr q0, [x17, #+1216] -ldr q28, [x17, #+1232] -sqrdmulh v20.4S, v29.4S, v28.4S -mul v29.4S, v29.4S,v0.4S -trn1 v28.4S, v9.4S, v11.4S -trn2 v0.4S, v9.4S, v11.4S -mla v1.4S, v14.4S, v31.s[0] -sub v14.4s, v7.4s, v16.4s -add v7.4s, v7.4s, v16.4s -ldr q16, [x17, #+1248] -ldr q8, [x17, #+1264] -sqrdmulh v10.4S, v13.4S, v8.4S -mul v13.4S, v13.4S,v16.4S -trn1 v8.4S, v12.4S, v22.4S -trn2 v16.4S, v12.4S, v22.4S -mla v29.4S, v20.4S, v31.s[0] -sub v20.4s, v3.4s, v1.4s -add v3.4s, v3.4s, v1.4s -ldr q1, [x17, #+1344] -ldr q18, [x17, #+1360] -sqrdmulh v30.4S, v3.4S, v18.4S -mul v3.4S, v3.4S,v1.4S -trn2 v12.2D, v28.2D, v8.2D -trn2 v22.2D, v0.2D, v16.2D -mla v13.4S, v10.4S, v31.s[0] -sub v10.4s, v15.4s, v29.4s -add v15.4s, v15.4s, v29.4s -ldr q29, [x17, #+1376] -ldr q18, [x17, #+1392] -sqrdmulh v1.4S, v20.4S, v18.4S -mul v20.4S, v20.4S,v29.4S -trn1 v9.2D, v28.2D, v8.2D -trn1 v11.2D, v0.2D, v16.2D -mla v3.4S, v30.4S, v31.s[0] -sub v30.4s, v5.4s, v13.4s -add v5.4s, v5.4s, v13.4s -mla v20.4S, v1.4S, v31.s[0] -sub v1.4s, v7.4s, v3.4s -add v7.4s, v7.4s, v3.4s -sub v3.4s, v14.4s, v20.4s -add v14.4s, v14.4s, v20.4s -str q15, [x0, #512] -str q10, [x0, #528] -str q5, [x0, #544] -str q30, [x0, #560] -str q7, [x0, #576] -str q1, [x0, #592] -str q14, [x0, #608] -str q3, [x0, #624] -ldr q3, [x17, #+1440] -ldr q14, [x17, #+1456] -sqrdmulh v1.4S, v2.4S, v14.4S -mul v2.4S, v2.4S,v3.4S -sqrdmulh v7.4S, v19.4S, v14.4S -mul v19.4S, v19.4S,v3.4S -mla v2.4S, v1.4S, v31.s[0] -ldr q1, [x17, #+1568] -ldr q14, [x17, #+1584] -sqrdmulh v3.4S, v12.4S, v14.4S -mul v12.4S, v12.4S,v1.4S -mla v19.4S, v7.4S, v31.s[0] -sub v7.4s, v21.4s, v2.4s -add v21.4s, v21.4s, v2.4s -sqrdmulh v2.4S, v22.4S, v14.4S -mul v22.4S, v22.4S,v1.4S -mla v12.4S, v3.4S, v31.s[0] -sub v3.4s, v6.4s, v19.4s -add v6.4s, v6.4s, v19.4s -ldr q19, [x17, #+1472] -ldr q14, [x17, #+1488] -sqrdmulh v1.4S, v6.4S, v14.4S -mul v6.4S, v6.4S,v19.4S -mla v22.4S, v2.4S, v31.s[0] -sub v2.4s, v9.4s, v12.4s -add v9.4s, v9.4s, v12.4s -ldr q12, [x17, #+1504] -ldr q14, [x17, #+1520] -sqrdmulh v19.4S, v3.4S, v14.4S -mul v3.4S, v3.4S,v12.4S -mla v6.4S, v1.4S, v31.s[0] -sub v1.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -ldr q22, [x17, #+1600] -ldr q14, [x17, #+1616] -sqrdmulh v12.4S, v11.4S, v14.4S -mul v11.4S, v11.4S,v22.4S -mla v3.4S, v19.4S, v31.s[0] -sub v19.4s, v21.4s, v6.4s -add v21.4s, v21.4s, v6.4s -ldr q6, [x17, #+1632] -ldr q14, [x17, #+1648] -sqrdmulh v22.4S, v1.4S, v14.4S -mul v1.4S, v1.4S,v6.4S -mla v11.4S, v12.4S, v31.s[0] -sub v12.4s, v7.4s, v3.4s -add v7.4s, v7.4s, v3.4s -mla v1.4S, v22.4S, v31.s[0] -sub v22.4s, v9.4s, v11.4s -add v9.4s, v9.4s, v11.4s -sub v11.4s, v2.4s, v1.4s -add v2.4s, v2.4s, v1.4s -str q21, [x0, #640] -str q19, [x0, #656] -str q7, [x0, #672] -str q12, [x0, #688] -str q9, [x0, #704] -str q22, [x0, #720] -str q2, [x0, #736] -str q11, [x0, #752] -ldr q11, [x0, #800] -ldr q2, [x0, #816] -ldr q22, [x0, #768] -ldr q9, [x0, #784] -ldr q12, [x0, #864] -ldr q7, [x0, #880] -ldr q19, [x0, #832] -ldr q21, [x0, #848] -ldr q1, [x0, #928] -ldr q3, [x0, #944] -ldr q14, [x0, #896] -ldr q6, [x0, #912] -ldr q30, [x0, #992] -ldr q5, [x0, #1008] -ldr q10, [x0, #960] -ldr q15, [x0, #976] -ldr q20, [x17, #+1664] -ldr q13, [x17, #+1680] -ldr q16, [x17, #+1792] -ldr q0, [x17, #+1808] -ldr q8, [x17, #+1920] -ldr q28, [x17, #+1936] -ldr q18, [x17, #+2048] -ldr q29, [x17, #+2064] -sqrdmulh v4.4S, v11.4S, v13.s[0] -mul v11.4S, v11.4S,v20.s[0] -sqrdmulh v17.4S, v2.4S, v13.s[0] -mul v2.4S, v2.4S,v20.s[0] -mla v11.4S, v4.4S, v31.s[0] -sqrdmulh v4.4S, v12.4S, v0.s[0] -mul v12.4S, v12.4S,v16.s[0] -mla v2.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v11.4s -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v7.4S, v0.s[0] -mul v7.4S, v7.4S,v16.s[0] -mla v12.4S, v4.4S, v31.s[0] -sub v4.4s, v9.4s, v2.4s -add v9.4s, v9.4s, v2.4s -sqrdmulh v2.4S, v9.4S, v13.s[1] -mul v9.4S, v9.4S,v20.s[1] -mla v7.4S, v11.4S, v31.s[0] -sub v11.4s, v19.4s, v12.4s -add v19.4s, v19.4s, v12.4s -sqrdmulh v12.4S, v4.4S, v13.s[2] -mul v4.4S, v4.4S,v20.s[2] -mla v9.4S, v2.4S, v31.s[0] -sub v2.4s, v21.4s, v7.4s -add v21.4s, v21.4s, v7.4s -sqrdmulh v7.4S, v21.4S, v0.s[1] -mul v21.4S, v21.4S,v16.s[1] -mla v4.4S, v12.4S, v31.s[0] -sub v12.4s, v22.4s, v9.4s -add v22.4s, v22.4s, v9.4s -sqrdmulh v13.4S, v2.4S, v0.s[2] -mul v2.4S, v2.4S,v16.s[2] -mla v21.4S, v7.4S, v31.s[0] -sub v7.4s, v17.4s, v4.4s -add v17.4s, v17.4s, v4.4s -sqrdmulh v4.4S, v1.4S, v28.s[0] -mul v1.4S, v1.4S,v8.s[0] -trn1 v20.4S, v22.4S, v12.4S -trn2 v9.4S, v22.4S, v12.4S -mla v2.4S, v13.4S, v31.s[0] -sub v13.4s, v19.4s, v21.4s -add v19.4s, v19.4s, v21.4s -sqrdmulh v0.4S, v3.4S, v28.s[0] -mul v3.4S, v3.4S,v8.s[0] -trn1 v16.4S, v17.4S, v7.4S -trn2 v21.4S, v17.4S, v7.4S -mla v1.4S, v4.4S, v31.s[0] -sub v4.4s, v11.4s, v2.4s -add v11.4s, v11.4s, v2.4s -sqrdmulh v2.4S, v30.4S, v29.s[0] -mul v30.4S, v30.4S,v18.s[0] -trn2 v17.2D, v20.2D, v16.2D -trn2 v7.2D, v9.2D, v21.2D -mla v3.4S, v0.4S, v31.s[0] -sub v0.4s, v14.4s, v1.4s -add v14.4s, v14.4s, v1.4s -sqrdmulh v1.4S, v5.4S, v29.s[0] -mul v5.4S, v5.4S,v18.s[0] -trn1 v22.2D, v20.2D, v16.2D -trn1 v12.2D, v9.2D, v21.2D -mla v30.4S, v2.4S, v31.s[0] -sub v2.4s, v6.4s, v3.4s -add v6.4s, v6.4s, v3.4s -sqrdmulh v3.4S, v6.4S, v28.s[1] -mul v6.4S, v6.4S,v8.s[1] -trn1 v21.4S, v19.4S, v13.4S -trn2 v9.4S, v19.4S, v13.4S -mla v5.4S, v1.4S, v31.s[0] -sub v1.4s, v10.4s, v30.4s -add v10.4s, v10.4s, v30.4s -sqrdmulh v30.4S, v2.4S, v28.s[2] -mul v2.4S, v2.4S,v8.s[2] -trn1 v16.4S, v11.4S, v4.4S -trn2 v20.4S, v11.4S, v4.4S -mla v6.4S, v3.4S, v31.s[0] -sub v3.4s, v15.4s, v5.4s -add v15.4s, v15.4s, v5.4s -sqrdmulh v5.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v18.s[1] -trn2 v11.2D, v21.2D, v16.2D -trn2 v4.2D, v9.2D, v20.2D -mla v2.4S, v30.4S, v31.s[0] -sub v30.4s, v14.4s, v6.4s -add v14.4s, v14.4s, v6.4s -sqrdmulh v28.4S, v3.4S, v29.s[2] -mul v3.4S, v3.4S,v18.s[2] -trn1 v19.2D, v21.2D, v16.2D -trn1 v13.2D, v9.2D, v20.2D -mla v15.4S, v5.4S, v31.s[0] -sub v5.4s, v0.4s, v2.4s -add v0.4s, v0.4s, v2.4s -mla v3.4S, v28.4S, v31.s[0] -sub v28.4s, v10.4s, v15.4s -add v10.4s, v10.4s, v15.4s -sub v29.4s, v1.4s, v3.4s -add v1.4s, v1.4s, v3.4s -ldr q3, [x17, #+1696] -ldr q18, [x17, #+1712] -sqrdmulh v15.4S, v17.4S, v18.4S -mul v17.4S, v17.4S,v3.4S -trn1 v2.4S, v14.4S, v30.4S -trn2 v20.4S, v14.4S, v30.4S -sqrdmulh v9.4S, v7.4S, v18.4S -mul v7.4S, v7.4S,v3.4S -trn1 v16.4S, v0.4S, v5.4S -trn2 v21.4S, v0.4S, v5.4S -mla v17.4S, v15.4S, v31.s[0] -ldr q15, [x17, #+1824] -ldr q18, [x17, #+1840] -sqrdmulh v3.4S, v11.4S, v18.4S -mul v11.4S, v11.4S,v15.4S -trn2 v0.2D, v2.2D, v16.2D -trn2 v5.2D, v20.2D, v21.2D -mla v7.4S, v9.4S, v31.s[0] -sub v9.4s, v22.4s, v17.4s -add v22.4s, v22.4s, v17.4s -sqrdmulh v17.4S, v4.4S, v18.4S -mul v4.4S, v4.4S,v15.4S -trn1 v14.2D, v2.2D, v16.2D -trn1 v30.2D, v20.2D, v21.2D -mla v11.4S, v3.4S, v31.s[0] -sub v3.4s, v12.4s, v7.4s -add v12.4s, v12.4s, v7.4s -ldr q7, [x17, #+1728] -ldr q21, [x17, #+1744] -sqrdmulh v20.4S, v12.4S, v21.4S -mul v12.4S, v12.4S,v7.4S -trn1 v21.4S, v10.4S, v28.4S -trn2 v7.4S, v10.4S, v28.4S -mla v4.4S, v17.4S, v31.s[0] -sub v17.4s, v19.4s, v11.4s -add v19.4s, v19.4s, v11.4s -ldr q11, [x17, #+1760] -ldr q16, [x17, #+1776] -sqrdmulh v2.4S, v3.4S, v16.4S -mul v3.4S, v3.4S,v11.4S -trn1 v16.4S, v1.4S, v29.4S -trn2 v11.4S, v1.4S, v29.4S -mla v12.4S, v20.4S, v31.s[0] -sub v20.4s, v13.4s, v4.4s -add v13.4s, v13.4s, v4.4s -ldr q4, [x17, #+1856] -ldr q18, [x17, #+1872] -sqrdmulh v15.4S, v13.4S, v18.4S -mul v13.4S, v13.4S,v4.4S -trn2 v1.2D, v21.2D, v16.2D -trn2 v29.2D, v7.2D, v11.2D -mla v3.4S, v2.4S, v31.s[0] -sub v2.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -ldr q12, [x17, #+1888] -ldr q18, [x17, #+1904] -sqrdmulh v4.4S, v20.4S, v18.4S -mul v20.4S, v20.4S,v12.4S -trn1 v10.2D, v21.2D, v16.2D -trn1 v28.2D, v7.2D, v11.2D -mla v13.4S, v15.4S, v31.s[0] -sub v15.4s, v9.4s, v3.4s -add v9.4s, v9.4s, v3.4s -mla v20.4S, v4.4S, v31.s[0] -sub v4.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -sub v13.4s, v17.4s, v20.4s -add v17.4s, v17.4s, v20.4s -str q22, [x0, #768] -str q2, [x0, #784] -str q9, [x0, #800] -str q15, [x0, #816] -str q19, [x0, #832] -str q4, [x0, #848] -str q17, [x0, #864] -str q13, [x0, #880] -ldr q13, [x17, #+1952] -ldr q17, [x17, #+1968] -sqrdmulh v4.4S, v0.4S, v17.4S -mul v0.4S, v0.4S,v13.4S -sqrdmulh v19.4S, v5.4S, v17.4S -mul v5.4S, v5.4S,v13.4S -mla v0.4S, v4.4S, v31.s[0] -ldr q4, [x17, #+2080] -ldr q17, [x17, #+2096] -sqrdmulh v13.4S, v1.4S, v17.4S -mul v1.4S, v1.4S,v4.4S -mla v5.4S, v19.4S, v31.s[0] -sub v19.4s, v14.4s, v0.4s -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v29.4S, v17.4S -mul v29.4S, v29.4S,v4.4S -mla v1.4S, v13.4S, v31.s[0] -sub v13.4s, v30.4s, v5.4s -add v30.4s, v30.4s, v5.4s -ldr q5, [x17, #+1984] -ldr q17, [x17, #+2000] -sqrdmulh v4.4S, v30.4S, v17.4S -mul v30.4S, v30.4S,v5.4S -mla v29.4S, v0.4S, v31.s[0] -sub v0.4s, v10.4s, v1.4s -add v10.4s, v10.4s, v1.4s -ldr q1, [x17, #+2016] -ldr q17, [x17, #+2032] -sqrdmulh v5.4S, v13.4S, v17.4S -mul v13.4S, v13.4S,v1.4S -mla v30.4S, v4.4S, v31.s[0] -sub v4.4s, v28.4s, v29.4s -add v28.4s, v28.4s, v29.4s -ldr q29, [x17, #+2112] -ldr q17, [x17, #+2128] -sqrdmulh v1.4S, v28.4S, v17.4S -mul v28.4S, v28.4S,v29.4S -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v14.4s, v30.4s -add v14.4s, v14.4s, v30.4s -ldr q30, [x17, #+2144] -ldr q17, [x17, #+2160] -sqrdmulh v29.4S, v4.4S, v17.4S -mul v4.4S, v4.4S,v30.4S -mla v28.4S, v1.4S, v31.s[0] -sub v1.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -mla v4.4S, v29.4S, v31.s[0] -sub v29.4s, v10.4s, v28.4s -add v10.4s, v10.4s, v28.4s -sub v28.4s, v0.4s, v4.4s -add v0.4s, v0.4s, v4.4s -str q14, [x0, #896] -str q5, [x0, #912] -str q19, [x0, #928] -str q1, [x0, #944] -str q10, [x0, #960] -str q29, [x0, #976] -str q0, [x0, #992] -str q28, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2392 -// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_3.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_3.s deleted file mode 100644 index 6810a7b..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_3.s +++ /dev/null @@ -1,2422 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_3_z4_3 -.global _ntt_u32_full_neon_asm_var_4_4_3_z4_3 -ntt_u32_full_neon_asm_var_4_4_3_z4_3: -_ntt_u32_full_neon_asm_var_4_4_3_z4_3: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #800] -ldr q21, [x0, #864] -ldr q20, [x0, #928] -ldr q19, [x0, #992] -ldr q18, [x0, #288] -ldr q17, [x0, #352] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -sqrdmulh v2.4S, v22.4S, v29.s[0] -ldr q1, [x0, #544] -mul v22.4S, v22.4S,v30.s[0] -ldr q0, [x0, #608] -sqrdmulh v15.4S, v21.4S, v29.s[0] -ldr q14, [x0, #672] -mul v21.4S, v21.4S,v30.s[0] -ldr q13, [x0, #736] -mla v22.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q12, [x0, #32] -sub v11.4s, v18.4s, v22.4s -mla v21.4S, v15.4S, v31.s[0] -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q15, [x0, #96] -sub v10.4s, v17.4s, v21.4s -mla v20.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v1.4S, v29.s[0] -ldr q2, [x0, #160] -mul v1.4S, v1.4S,v30.s[0] -sub v9.4s, v16.4s, v20.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v0.4S, v29.s[0] -ldr q22, [x0, #224] -mul v0.4S, v0.4S,v30.s[0] -sub v8.4s, v3.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v12.4s, v1.4s -mla v0.4S, v20.4S, v31.s[0] -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -sub v20.4s, v15.4s, v0.4s -mla v14.4S, v19.4S, v31.s[0] -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v19.4s, v2.4s, v14.4s -mla v13.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v1.4s, v22.4s, v13.4s -mla v16.4S, v0.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v0.4s, v2.4s, v16.4s -mla v3.4S, v14.4S, v31.s[0] -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v14.4s, v22.4s, v3.4s -mla v18.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v13.4s, v12.4s, v18.4s -mla v17.4S, v16.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v8.4S, v29.s[2] -mul v8.4S, v8.4S,v30.s[2] -sub v16.4s, v15.4s, v17.4s -mla v9.4S, v3.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v3.4s, v19.4s, v9.4s -mla v8.4S, v18.4S, v31.s[0] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v8.4s -mla v11.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v8.4s -sqrdmulh v8.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v17.4s, v21.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v27.s[0] -mul v22.4S, v22.4S,v28.s[0] -sub v9.4s, v20.4s, v10.4s -mla v2.4S, v8.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v8.4s, v12.4s, v2.4s -mla v22.4S, v11.4S, v31.s[0] -add v12.4s, v12.4s, v2.4s -sqrdmulh v2.4S, v14.4S, v27.s[1] -mul v14.4S, v14.4S,v28.s[1] -sub v11.4s, v15.4s, v22.4s -mla v0.4S, v10.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v27.s[2] -mul v19.4S, v19.4S,v28.s[2] -sub v10.4s, v13.4s, v0.4s -mla v14.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v2.4s, v16.4s, v14.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v27.s[3] -mul v3.4S, v3.4S,v28.s[3] -sub v22.4s, v21.4s, v19.4s -mla v1.4S, v0.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v0.4s, v20.4s, v1.4s -mla v3.4S, v14.4S, v31.s[0] -add v20.4s, v20.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v25.s[0] -mul v15.4S, v15.4S,v26.s[0] -sub v14.4s, v17.4s, v3.4s -mla v18.4S, v19.4S, v31.s[0] -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v11.4S, v25.s[1] -mul v11.4S, v11.4S,v26.s[1] -sub v19.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v1.4s, v12.4s, v15.4s -mla v11.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v25.s[3] -mul v2.4S, v2.4S,v26.s[3] -sub v3.4s, v8.4s, v11.4s -mla v16.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v11.4s -str q12, [x0, #32] -sqrdmulh v12.4S, v20.4S, v23.s[0] -str q1, [x0, #96] -mul v20.4S, v20.4S,v24.s[0] -ldr q1, [x0, #816] -sub v11.4s, v13.4s, v16.4s -ldr q18, [x0, #880] -mla v2.4S, v15.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -str q8, [x0, #160] -sqrdmulh v8.4S, v0.4S, v23.s[1] -str q3, [x0, #224] -mul v0.4S, v0.4S,v24.s[1] -ldr q3, [x0, #944] -sub v16.4s, v10.4s, v2.4s -ldr q15, [x0, #1008] -mla v20.4S, v12.4S, v31.s[0] -add v10.4s, v10.4s, v2.4s -str q13, [x0, #288] -sqrdmulh v13.4S, v9.4S, v23.s[2] -str q11, [x0, #352] -mul v9.4S, v9.4S,v24.s[2] -ldr q11, [x0, #304] -sub v2.4s, v21.4s, v20.4s -ldr q12, [x0, #368] -mla v0.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v20.4s -str q10, [x0, #416] -sqrdmulh v10.4S, v19.4S, v23.s[3] -str q16, [x0, #480] -mul v19.4S, v19.4S,v24.s[3] -ldr q16, [x0, #432] -sub v20.4s, v22.4s, v0.4s -ldr q8, [x0, #496] -mla v9.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -str q21, [x0, #544] -sqrdmulh v21.4S, v1.4S, v29.s[0] -str q2, [x0, #608] -ldr q2, [x0, #560] -mul v1.4S, v1.4S,v30.s[0] -ldr q0, [x0, #624] -sub v13.4s, v17.4s, v9.4s -mla v19.4S, v10.4S, v31.s[0] -add v17.4s, v17.4s, v9.4s -str q22, [x0, #672] -sqrdmulh v22.4S, v18.4S, v29.s[0] -str q20, [x0, #736] -ldr q20, [x0, #688] -mul v18.4S, v18.4S,v30.s[0] -ldr q9, [x0, #752] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -str q17, [x0, #800] -sqrdmulh v17.4S, v3.4S, v29.s[0] -str q13, [x0, #864] -mul v3.4S, v3.4S,v30.s[0] -ldr q13, [x0, #48] -sub v19.4s, v11.4s, v1.4s -mla v18.4S, v22.4S, v31.s[0] -add v11.4s, v11.4s, v1.4s -str q14, [x0, #928] -sqrdmulh v14.4S, v15.4S, v29.s[0] -str q10, [x0, #992] -mul v15.4S, v15.4S,v30.s[0] -ldr q10, [x0, #112] -sub v1.4s, v12.4s, v18.4s -mla v3.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v29.s[0] -ldr q17, [x0, #176] -mul v2.4S, v2.4S,v30.s[0] -sub v22.4s, v16.4s, v3.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v0.4S, v29.s[0] -ldr q14, [x0, #240] -mul v0.4S, v0.4S,v30.s[0] -sub v21.4s, v8.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -sub v18.4s, v13.4s, v2.4s -mla v0.4S, v3.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v9.4S, v29.s[0] -mul v9.4S, v9.4S,v30.s[0] -sub v3.4s, v10.4s, v0.4s -mla v20.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v15.4s, v17.4s, v20.4s -mla v9.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sub v2.4s, v14.4s, v9.4s -mla v16.4S, v0.4S, v31.s[0] -add v14.4s, v14.4s, v9.4s -sqrdmulh v9.4S, v11.4S, v29.s[1] -mul v11.4S, v11.4S,v30.s[1] -sub v0.4s, v17.4s, v16.4s -mla v8.4S, v20.4S, v31.s[0] -add v17.4s, v17.4s, v16.4s -sqrdmulh v16.4S, v12.4S, v29.s[1] -mul v12.4S, v12.4S,v30.s[1] -sub v20.4s, v14.4s, v8.4s -mla v11.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v8.4s -sqrdmulh v8.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v9.4s, v13.4s, v11.4s -mla v12.4S, v16.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v16.4s, v10.4s, v12.4s -mla v22.4S, v8.4S, v31.s[0] -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v8.4s, v15.4s, v22.4s -mla v21.4S, v11.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -sub v11.4s, v2.4s, v21.4s -mla v19.4S, v12.4S, v31.s[0] -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[0] -mul v17.4S, v17.4S,v28.s[0] -sub v12.4s, v18.4s, v19.4s -mla v1.4S, v22.4S, v31.s[0] -add v18.4s, v18.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v22.4s, v3.4s, v1.4s -mla v17.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v21.4s, v13.4s, v17.4s -mla v14.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v20.4S, v27.s[1] -mul v20.4S, v20.4S,v28.s[1] -sub v19.4s, v10.4s, v14.4s -mla v0.4S, v1.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v27.s[2] -mul v15.4S, v15.4S,v28.s[2] -sub v1.4s, v9.4s, v0.4s -mla v20.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v0.4s -sqrdmulh v0.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v17.4s, v16.4s, v20.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v14.4s, v18.4s, v15.4s -mla v2.4S, v0.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v27.s[3] -mul v11.4S, v11.4S,v28.s[3] -sub v0.4s, v3.4s, v2.4s -mla v8.4S, v20.4S, v31.s[0] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v10.4S, v25.s[0] -mul v10.4S, v10.4S,v26.s[0] -sub v20.4s, v12.4s, v8.4s -mla v11.4S, v15.4S, v31.s[0] -add v12.4s, v12.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v15.4s, v22.4s, v11.4s -mla v10.4S, v2.4S, v31.s[0] -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v2.4s, v13.4s, v10.4s -mla v19.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v10.4s -sqrdmulh v10.4S, v17.4S, v25.s[3] -mul v17.4S, v17.4S,v26.s[3] -sub v8.4s, v21.4s, v19.4s -mla v16.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -str q13, [x0, #48] -sqrdmulh v13.4S, v3.4S, v23.s[0] -str q2, [x0, #112] -mul v3.4S, v3.4S,v24.s[0] -ldr q2, [x0, #768] -sub v19.4s, v9.4s, v16.4s -ldr q11, [x0, #832] -mla v17.4S, v10.4S, v31.s[0] -add v9.4s, v9.4s, v16.4s -str q21, [x0, #176] -sqrdmulh v21.4S, v0.4S, v23.s[1] -str q8, [x0, #240] -mul v0.4S, v0.4S,v24.s[1] -ldr q8, [x0, #896] -sub v16.4s, v1.4s, v17.4s -ldr q10, [x0, #960] -mla v3.4S, v13.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -str q9, [x0, #304] -sqrdmulh v9.4S, v22.4S, v23.s[2] -str q19, [x0, #368] -mul v22.4S, v22.4S,v24.s[2] -ldr q19, [x0, #256] -sub v17.4s, v18.4s, v3.4s -ldr q13, [x0, #320] -mla v0.4S, v21.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -str q1, [x0, #432] -sqrdmulh v1.4S, v15.4S, v23.s[3] -str q16, [x0, #496] -mul v15.4S, v15.4S,v24.s[3] -ldr q16, [x0, #384] -sub v3.4s, v14.4s, v0.4s -ldr q21, [x0, #448] -mla v22.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -str q18, [x0, #560] -sqrdmulh v18.4S, v2.4S, v29.s[0] -str q17, [x0, #624] -ldr q17, [x0, #512] -mul v2.4S, v2.4S,v30.s[0] -ldr q0, [x0, #576] -sub v9.4s, v12.4s, v22.4s -mla v15.4S, v1.4S, v31.s[0] -add v12.4s, v12.4s, v22.4s -str q14, [x0, #688] -sqrdmulh v14.4S, v11.4S, v29.s[0] -str q3, [x0, #752] -ldr q3, [x0, #640] -mul v11.4S, v11.4S,v30.s[0] -ldr q22, [x0, #704] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -str q12, [x0, #816] -sqrdmulh v12.4S, v8.4S, v29.s[0] -str q9, [x0, #880] -mul v8.4S, v8.4S,v30.s[0] -ldr q9, [x0, #0] -sub v15.4s, v19.4s, v2.4s -mla v11.4S, v14.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -str q20, [x0, #944] -sqrdmulh v20.4S, v10.4S, v29.s[0] -str q1, [x0, #1008] -mul v10.4S, v10.4S,v30.s[0] -ldr q1, [x0, #64] -sub v2.4s, v13.4s, v11.4s -mla v8.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v29.s[0] -ldr q12, [x0, #128] -mul v17.4S, v17.4S,v30.s[0] -sub v14.4s, v16.4s, v8.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v0.4S, v29.s[0] -ldr q20, [x0, #192] -mul v0.4S, v0.4S,v30.s[0] -sub v18.4s, v21.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -sub v11.4s, v9.4s, v17.4s -mla v0.4S, v8.4S, v31.s[0] -add v9.4s, v9.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v8.4s, v1.4s, v0.4s -mla v3.4S, v10.4S, v31.s[0] -add v1.4s, v1.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v10.4s, v12.4s, v3.4s -mla v22.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v17.4s, v20.4s, v22.4s -mla v16.4S, v0.4S, v31.s[0] -add v20.4s, v20.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -sub v0.4s, v12.4s, v16.4s -mla v21.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v3.4s, v20.4s, v21.4s -mla v19.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v22.4s, v9.4s, v19.4s -mla v13.4S, v16.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v29.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v16.4s, v1.4s, v13.4s -mla v14.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v13.4s -sqrdmulh v13.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v21.4s, v10.4s, v14.4s -mla v18.4S, v19.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v19.4s, v17.4s, v18.4s -mla v15.4S, v13.4S, v31.s[0] -add v17.4s, v17.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v13.4s, v11.4s, v15.4s -mla v2.4S, v14.4S, v31.s[0] -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v27.s[0] -mul v20.4S, v20.4S,v28.s[0] -sub v14.4s, v8.4s, v2.4s -mla v12.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v2.4s -sqrdmulh v2.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v18.4s, v9.4s, v12.4s -mla v20.4S, v15.4S, v31.s[0] -add v9.4s, v9.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -sub v15.4s, v1.4s, v20.4s -mla v0.4S, v2.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -sub v2.4s, v22.4s, v0.4s -mla v3.4S, v12.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v12.4s, v16.4s, v3.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v11.4s, v10.4s -mla v17.4S, v0.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -sqrdmulh v10.4S, v19.4S, v27.s[3] -mul v19.4S, v19.4S,v28.s[3] -sub v0.4s, v8.4s, v17.4s -mla v21.4S, v3.4S, v31.s[0] -add v8.4s, v8.4s, v17.4s -sqrdmulh v17.4S, v1.4S, v25.s[0] -mul v1.4S, v1.4S,v26.s[0] -sub v3.4s, v13.4s, v21.4s -mla v19.4S, v10.4S, v31.s[0] -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v15.4S, v25.s[1] -mul v15.4S, v15.4S,v26.s[1] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v17.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v17.4s, v9.4s, v1.4s -mla v15.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -sub v21.4s, v18.4s, v15.4s -mla v16.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -str q9, [x0, #0] -sqrdmulh v9.4S, v8.4S, v23.s[0] -str q17, [x0, #64] -mul v8.4S, v8.4S,v24.s[0] -ldr q17, [x0, #784] -sub v15.4s, v22.4s, v16.4s -ldr q19, [x0, #848] -mla v12.4S, v1.4S, v31.s[0] -add v22.4s, v22.4s, v16.4s -str q18, [x0, #128] -sqrdmulh v18.4S, v0.4S, v23.s[1] -str q21, [x0, #192] -mul v0.4S, v0.4S,v24.s[1] -ldr q21, [x0, #912] -sub v16.4s, v2.4s, v12.4s -ldr q1, [x0, #976] -mla v8.4S, v9.4S, v31.s[0] -add v2.4s, v2.4s, v12.4s -str q22, [x0, #256] -sqrdmulh v22.4S, v14.4S, v23.s[2] -str q15, [x0, #320] -mul v14.4S, v14.4S,v24.s[2] -ldr q15, [x0, #272] -sub v12.4s, v11.4s, v8.4s -ldr q9, [x0, #336] -mla v0.4S, v18.4S, v31.s[0] -add v11.4s, v11.4s, v8.4s -str q2, [x0, #384] -sqrdmulh v2.4S, v10.4S, v23.s[3] -str q16, [x0, #448] -mul v10.4S, v10.4S,v24.s[3] -ldr q16, [x0, #400] -sub v8.4s, v20.4s, v0.4s -ldr q18, [x0, #464] -mla v14.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v0.4s -str q11, [x0, #512] -sqrdmulh v11.4S, v17.4S, v29.s[0] -str q12, [x0, #576] -ldr q12, [x0, #528] -mul v17.4S, v17.4S,v30.s[0] -ldr q0, [x0, #592] -sub v22.4s, v13.4s, v14.4s -mla v10.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v14.4s -str q20, [x0, #640] -sqrdmulh v20.4S, v19.4S, v29.s[0] -str q8, [x0, #704] -ldr q8, [x0, #656] -mul v19.4S, v19.4S,v30.s[0] -ldr q14, [x0, #720] -sub v2.4s, v3.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v3.4s, v3.4s, v10.4s -str q13, [x0, #768] -sqrdmulh v13.4S, v21.4S, v29.s[0] -str q22, [x0, #832] -mul v21.4S, v21.4S,v30.s[0] -ldr q22, [x0, #16] -sub v10.4s, v15.4s, v17.4s -mla v19.4S, v20.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -str q3, [x0, #896] -sqrdmulh v3.4S, v1.4S, v29.s[0] -str q2, [x0, #960] -mul v1.4S, v1.4S,v30.s[0] -ldr q2, [x0, #80] -sub v17.4s, v9.4s, v19.4s -mla v21.4S, v13.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v12.4S, v29.s[0] -ldr q13, [x0, #144] -mul v12.4S, v12.4S,v30.s[0] -sub v20.4s, v16.4s, v21.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v0.4S, v29.s[0] -ldr q3, [x0, #208] -mul v0.4S, v0.4S,v30.s[0] -sub v11.4s, v18.4s, v1.4s -mla v12.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v1.4s -sqrdmulh v1.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v19.4s, v22.4s, v12.4s -mla v0.4S, v21.4S, v31.s[0] -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v2.4s, v0.4s -mla v8.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v1.4s, v13.4s, v8.4s -mla v14.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v12.4s, v3.4s, v14.4s -mla v16.4S, v0.4S, v31.s[0] -add v3.4s, v3.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -sub v0.4s, v13.4s, v16.4s -mla v18.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -sqrdmulh v16.4S, v9.4S, v29.s[1] -mul v9.4S, v9.4S,v30.s[1] -sub v8.4s, v3.4s, v18.4s -mla v15.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v14.4s, v22.4s, v15.4s -mla v9.4S, v16.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v16.4s, v2.4s, v9.4s -mla v20.4S, v18.4S, v31.s[0] -add v2.4s, v2.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v20.4s -mla v11.4S, v15.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v15.4s, v12.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v12.4s, v12.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v27.s[0] -mul v13.4S, v13.4S,v28.s[0] -sub v9.4s, v19.4s, v10.4s -mla v17.4S, v20.4S, v31.s[0] -add v19.4s, v19.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v27.s[0] -mul v3.4S, v3.4S,v28.s[0] -sub v20.4s, v21.4s, v17.4s -mla v13.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v11.4s, v22.4s, v13.4s -mla v3.4S, v10.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -sub v10.4s, v2.4s, v3.4s -mla v0.4S, v17.4S, v31.s[0] -add v2.4s, v2.4s, v3.4s -sqrdmulh v3.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v17.4s, v14.4s, v0.4s -mla v8.4S, v13.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[2] -mul v12.4S, v12.4S,v28.s[2] -sub v13.4s, v16.4s, v8.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v3.4s, v19.4s, v1.4s -mla v12.4S, v0.4S, v31.s[0] -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v0.4s, v21.4s, v12.4s -mla v18.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v10.4S, v25.s[1] -mul v10.4S, v10.4S,v26.s[1] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v12.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v12.4s, v22.4s, v2.4s -mla v10.4S, v18.4S, v31.s[0] -add v22.4s, v22.4s, v2.4s -sqrdmulh v2.4S, v13.4S, v25.s[3] -mul v13.4S, v13.4S,v26.s[3] -sub v18.4s, v11.4s, v10.4s -mla v16.4S, v15.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -str q22, [x0, #16] -sqrdmulh v22.4S, v21.4S, v23.s[0] -str q12, [x0, #80] -mul v21.4S, v21.4S,v24.s[0] -sub v12.4s, v14.4s, v16.4s -mla v13.4S, v2.4S, v31.s[0] -add v14.4s, v14.4s, v16.4s -str q11, [x0, #144] -sqrdmulh v11.4S, v0.4S, v23.s[1] -str q18, [x0, #208] -mul v0.4S, v0.4S,v24.s[1] -sub v18.4s, v17.4s, v13.4s -mla v21.4S, v22.4S, v31.s[0] -add v17.4s, v17.4s, v13.4s -str q14, [x0, #272] -sqrdmulh v14.4S, v20.4S, v23.s[2] -str q12, [x0, #336] -mul v20.4S, v20.4S,v24.s[2] -sub v12.4s, v19.4s, v21.4s -mla v0.4S, v11.4S, v31.s[0] -add v19.4s, v19.4s, v21.4s -str q17, [x0, #400] -sqrdmulh v17.4S, v1.4S, v23.s[3] -str q18, [x0, #464] -mul v1.4S, v1.4S,v24.s[3] -sub v18.4s, v3.4s, v0.4s -mla v20.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v0.4s -str q19, [x0, #528] -str q12, [x0, #592] -sub v12.4s, v9.4s, v20.4s -mla v1.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v20.4s -str q3, [x0, #656] -str q18, [x0, #720] -sub v18.4s, v8.4s, v1.4s -add v8.4s, v8.4s, v1.4s -str q9, [x0, #784] -str q12, [x0, #848] -str q8, [x0, #912] -str q18, [x0, #976] -ldr q4, [x0, #32] -ldr q5, [x0, #48] -ldr q6, [x17, #+128] -ldr q7, [x17, #+144] -ldr q15, [x17, #+256] -ldr q10, [x0, #96] -ldr q2, [x17, #+272] -ldr q16, [x0, #112] -sqrdmulh v22.4S, v4.4S, v7.s[0] -mul v4.4S, v4.4S,v6.s[0] -sqrdmulh v13.4S, v5.4S, v7.s[0] -mul v5.4S, v5.4S,v6.s[0] -mla v4.4S, v22.4S, v31.s[0] -ldr q22, [x0, #0] -sqrdmulh v11.4S, v10.4S, v2.s[0] -ldr q21, [x0, #16] -mul v10.4S, v10.4S,v15.s[0] -mla v5.4S, v13.4S, v31.s[0] -sub v13.4s, v22.4s, v4.4s -add v22.4s, v22.4s, v4.4s -sqrdmulh v4.4S, v16.4S, v2.s[0] -ldr q14, [x0, #160] -mul v16.4S, v16.4S,v15.s[0] -ldr q0, [x0, #176] -mla v10.4S, v11.4S, v31.s[0] -ldr q11, [x0, #64] -sub v19.4s, v21.4s, v5.4s -add v21.4s, v21.4s, v5.4s -sqrdmulh v5.4S, v21.4S, v7.s[1] -ldr q17, [x0, #128] -mul v21.4S, v21.4S,v6.s[1] -ldr q20, [x0, #144] -mla v16.4S, v4.4S, v31.s[0] -ldr q4, [x0, #80] -sub v3.4s, v11.4s, v10.4s -add v11.4s, v11.4s, v10.4s -sqrdmulh v10.4S, v19.4S, v7.s[2] -ldr q1, [x17, #+384] -mul v19.4S, v19.4S,v6.s[2] -ldr q9, [x17, #+400] -mla v21.4S, v5.4S, v31.s[0] -sub v5.4s, v4.4s, v16.4s -add v4.4s, v4.4s, v16.4s -sqrdmulh v16.4S, v4.4S, v2.s[1] -ldr q12, [x0, #224] -mul v4.4S, v4.4S,v15.s[1] -ldr q8, [x0, #240] -mla v19.4S, v10.4S, v31.s[0] -sub v10.4s, v22.4s, v21.4s -add v22.4s, v22.4s, v21.4s -sqrdmulh v7.4S, v5.4S, v2.s[2] -ldr q6, [x0, #192] -mul v5.4S, v5.4S,v15.s[2] -ldr q21, [x0, #208] -mla v4.4S, v16.4S, v31.s[0] -sub v16.4s, v13.4s, v19.4s -add v13.4s, v13.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v9.s[0] -ldr q18, [x17, #+512] -mul v14.4S, v14.4S,v1.s[0] -ldr q30, [x17, #+528] -trn1 v29.4S, v22.4S, v10.4S -trn2 v28.4S, v22.4S, v10.4S -mla v5.4S, v7.4S, v31.s[0] -sub v7.4s, v11.4s, v4.4s -add v11.4s, v11.4s, v4.4s -sqrdmulh v2.4S, v0.4S, v9.s[0] -mul v0.4S, v0.4S,v1.s[0] -trn1 v15.4S, v13.4S, v16.4S -trn2 v4.4S, v13.4S, v16.4S -mla v14.4S, v19.4S, v31.s[0] -sub v19.4s, v3.4s, v5.4s -add v3.4s, v3.4s, v5.4s -sqrdmulh v5.4S, v12.4S, v30.s[0] -mul v12.4S, v12.4S,v18.s[0] -trn2 v13.2D, v29.2D, v15.2D -trn2 v16.2D, v28.2D, v4.2D -mla v0.4S, v2.4S, v31.s[0] -sub v2.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -sqrdmulh v14.4S, v8.4S, v30.s[0] -mul v8.4S, v8.4S,v18.s[0] -trn1 v22.2D, v29.2D, v15.2D -trn1 v10.2D, v28.2D, v4.2D -mla v12.4S, v5.4S, v31.s[0] -sub v5.4s, v20.4s, v0.4s -add v20.4s, v20.4s, v0.4s -sqrdmulh v0.4S, v20.4S, v9.s[1] -mul v20.4S, v20.4S,v1.s[1] -trn1 v4.4S, v11.4S, v7.4S -trn2 v28.4S, v11.4S, v7.4S -mla v8.4S, v14.4S, v31.s[0] -sub v14.4s, v6.4s, v12.4s -add v6.4s, v6.4s, v12.4s -sqrdmulh v12.4S, v5.4S, v9.s[2] -mul v5.4S, v5.4S,v1.s[2] -trn1 v15.4S, v3.4S, v19.4S -trn2 v29.4S, v3.4S, v19.4S -mla v20.4S, v0.4S, v31.s[0] -sub v0.4s, v21.4s, v8.4s -add v21.4s, v21.4s, v8.4s -sqrdmulh v8.4S, v21.4S, v30.s[1] -mul v21.4S, v21.4S,v18.s[1] -trn2 v3.2D, v4.2D, v15.2D -trn2 v19.2D, v28.2D, v29.2D -mla v5.4S, v12.4S, v31.s[0] -sub v12.4s, v17.4s, v20.4s -add v17.4s, v17.4s, v20.4s -sqrdmulh v9.4S, v0.4S, v30.s[2] -mul v0.4S, v0.4S,v18.s[2] -trn1 v11.2D, v4.2D, v15.2D -trn1 v7.2D, v28.2D, v29.2D -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v2.4s, v5.4s -add v2.4s, v2.4s, v5.4s -mla v0.4S, v9.4S, v31.s[0] -sub v9.4s, v6.4s, v21.4s -add v6.4s, v6.4s, v21.4s -sub v30.4s, v14.4s, v0.4s -add v14.4s, v14.4s, v0.4s -ldr q0, [x17, #+160] -ldr q18, [x17, #+176] -sqrdmulh v21.4S, v13.4S, v18.4S -mul v13.4S, v13.4S,v0.4S -trn1 v5.4S, v17.4S, v12.4S -trn2 v29.4S, v17.4S, v12.4S -sqrdmulh v28.4S, v16.4S, v18.4S -mul v16.4S, v16.4S,v0.4S -trn1 v18.4S, v2.4S, v8.4S -trn2 v0.4S, v2.4S, v8.4S -mla v13.4S, v21.4S, v31.s[0] -ldr q21, [x17, #+288] -ldr q15, [x17, #+304] -sqrdmulh v4.4S, v3.4S, v15.4S -mul v3.4S, v3.4S,v21.4S -trn2 v2.2D, v5.2D, v18.2D -trn2 v8.2D, v29.2D, v0.2D -mla v16.4S, v28.4S, v31.s[0] -sub v28.4s, v22.4s, v13.4s -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v19.4S, v15.4S -mul v19.4S, v19.4S,v21.4S -trn1 v17.2D, v5.2D, v18.2D -trn1 v12.2D, v29.2D, v0.2D -mla v3.4S, v4.4S, v31.s[0] -sub v4.4s, v10.4s, v16.4s -add v10.4s, v10.4s, v16.4s -ldr q16, [x17, #+192] -ldr q0, [x17, #+208] -sqrdmulh v29.4S, v10.4S, v0.4S -mul v10.4S, v10.4S,v16.4S -trn1 v0.4S, v6.4S, v9.4S -trn2 v16.4S, v6.4S, v9.4S -mla v19.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v3.4s -add v11.4s, v11.4s, v3.4s -ldr q3, [x17, #+224] -ldr q18, [x17, #+240] -sqrdmulh v5.4S, v4.4S, v18.4S -mul v4.4S, v4.4S,v3.4S -trn1 v18.4S, v14.4S, v30.4S -trn2 v3.4S, v14.4S, v30.4S -mla v10.4S, v29.4S, v31.s[0] -sub v29.4s, v7.4s, v19.4s -add v7.4s, v7.4s, v19.4s -ldr q19, [x17, #+320] -ldr q15, [x17, #+336] -sqrdmulh v21.4S, v7.4S, v15.4S -mul v7.4S, v7.4S,v19.4S -trn2 v14.2D, v0.2D, v18.2D -trn2 v30.2D, v16.2D, v3.2D -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v22.4s, v10.4s -add v22.4s, v22.4s, v10.4s -ldr q10, [x17, #+352] -ldr q15, [x17, #+368] -sqrdmulh v19.4S, v29.4S, v15.4S -mul v29.4S, v29.4S,v10.4S -trn1 v6.2D, v0.2D, v18.2D -trn1 v9.2D, v16.2D, v3.2D -mla v7.4S, v21.4S, v31.s[0] -sub v21.4s, v28.4s, v4.4s -add v28.4s, v28.4s, v4.4s -mla v29.4S, v19.4S, v31.s[0] -sub v19.4s, v11.4s, v7.4s -add v11.4s, v11.4s, v7.4s -sub v7.4s, v13.4s, v29.4s -add v13.4s, v13.4s, v29.4s -ldr q29, [x17, #+416] -ldr q4, [x17, #+432] -sqrdmulh v3.4S, v2.4S, v4.4S -mul v2.4S, v2.4S,v29.4S -str q22, [x0, #0] -sqrdmulh v22.4S, v8.4S, v4.4S -str q5, [x0, #16] -mul v8.4S, v8.4S,v29.4S -str q28, [x0, #32] -mla v2.4S, v3.4S, v31.s[0] -ldr q3, [x17, #+544] -ldr q28, [x17, #+560] -sqrdmulh v4.4S, v14.4S, v28.4S -str q11, [x0, #64] -mul v14.4S, v14.4S,v3.4S -str q21, [x0, #48] -mla v8.4S, v22.4S, v31.s[0] -str q19, [x0, #80] -sub v19.4s, v17.4s, v2.4s -add v17.4s, v17.4s, v2.4s -sqrdmulh v2.4S, v30.4S, v28.4S -mul v30.4S, v30.4S,v3.4S -str q13, [x0, #96] -mla v14.4S, v4.4S, v31.s[0] -sub v4.4s, v12.4s, v8.4s -add v12.4s, v12.4s, v8.4s -ldr q8, [x17, #+448] -ldr q13, [x17, #+464] -sqrdmulh v28.4S, v12.4S, v13.4S -mul v12.4S, v12.4S,v8.4S -str q7, [x0, #112] -mla v30.4S, v2.4S, v31.s[0] -sub v2.4s, v6.4s, v14.4s -add v6.4s, v6.4s, v14.4s -ldr q14, [x17, #+480] -ldr q7, [x17, #+496] -sqrdmulh v13.4S, v4.4S, v7.4S -mul v4.4S, v4.4S,v14.4S -mla v12.4S, v28.4S, v31.s[0] -sub v28.4s, v9.4s, v30.4s -add v9.4s, v9.4s, v30.4s -ldr q30, [x17, #+576] -ldr q7, [x17, #+592] -sqrdmulh v14.4S, v9.4S, v7.4S -mul v9.4S, v9.4S,v30.4S -mla v4.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v12.4s -add v17.4s, v17.4s, v12.4s -ldr q12, [x17, #+608] -ldr q7, [x17, #+624] -sqrdmulh v30.4S, v28.4S, v7.4S -mul v28.4S, v28.4S,v12.4S -ldr q7, [x0, #288] -mla v9.4S, v14.4S, v31.s[0] -ldr q14, [x0, #304] -sub v12.4s, v19.4s, v4.4s -ldr q8, [x17, #+640] -add v19.4s, v19.4s, v4.4s -ldr q4, [x17, #+656] -mla v28.4S, v30.4S, v31.s[0] -ldr q30, [x17, #+768] -sub v3.4s, v6.4s, v9.4s -ldr q22, [x0, #352] -add v6.4s, v6.4s, v9.4s -ldr q9, [x17, #+784] -sub v21.4s, v2.4s, v28.4s -ldr q11, [x0, #368] -add v2.4s, v2.4s, v28.4s -sqrdmulh v28.4S, v7.4S, v4.s[0] -mul v7.4S, v7.4S,v8.s[0] -sqrdmulh v29.4S, v14.4S, v4.s[0] -mul v14.4S, v14.4S,v8.s[0] -str q17, [x0, #128] -str q13, [x0, #144] -str q19, [x0, #160] -str q12, [x0, #176] -mla v7.4S, v28.4S, v31.s[0] -ldr q28, [x0, #256] -sqrdmulh v12.4S, v22.4S, v9.s[0] -ldr q19, [x0, #272] -mul v22.4S, v22.4S,v30.s[0] -str q6, [x0, #192] -str q3, [x0, #208] -str q2, [x0, #224] -str q21, [x0, #240] -mla v14.4S, v29.4S, v31.s[0] -sub v29.4s, v28.4s, v7.4s -add v28.4s, v28.4s, v7.4s -sqrdmulh v7.4S, v11.4S, v9.s[0] -ldr q21, [x0, #416] -mul v11.4S, v11.4S,v30.s[0] -ldr q2, [x0, #432] -mla v22.4S, v12.4S, v31.s[0] -ldr q12, [x0, #320] -sub v3.4s, v19.4s, v14.4s -add v19.4s, v19.4s, v14.4s -sqrdmulh v14.4S, v19.4S, v4.s[1] -ldr q6, [x0, #384] -mul v19.4S, v19.4S,v8.s[1] -ldr q13, [x0, #400] -mla v11.4S, v7.4S, v31.s[0] -ldr q7, [x0, #336] -sub v17.4s, v12.4s, v22.4s -add v12.4s, v12.4s, v22.4s -sqrdmulh v22.4S, v3.4S, v4.s[2] -ldr q5, [x17, #+896] -mul v3.4S, v3.4S,v8.s[2] -ldr q16, [x17, #+912] -mla v19.4S, v14.4S, v31.s[0] -sub v14.4s, v7.4s, v11.4s -add v7.4s, v7.4s, v11.4s -sqrdmulh v11.4S, v7.4S, v9.s[1] -ldr q18, [x0, #480] -mul v7.4S, v7.4S,v30.s[1] -ldr q0, [x0, #496] -mla v3.4S, v22.4S, v31.s[0] -sub v22.4s, v28.4s, v19.4s -add v28.4s, v28.4s, v19.4s -sqrdmulh v4.4S, v14.4S, v9.s[2] -ldr q8, [x0, #448] -mul v14.4S, v14.4S,v30.s[2] -ldr q19, [x0, #464] -mla v7.4S, v11.4S, v31.s[0] -sub v11.4s, v29.4s, v3.4s -add v29.4s, v29.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v16.s[0] -ldr q15, [x17, #+1024] -mul v21.4S, v21.4S,v5.s[0] -ldr q10, [x17, #+1040] -trn1 v1.4S, v28.4S, v22.4S -trn2 v20.4S, v28.4S, v22.4S -mla v14.4S, v4.4S, v31.s[0] -sub v4.4s, v12.4s, v7.4s -add v12.4s, v12.4s, v7.4s -sqrdmulh v9.4S, v2.4S, v16.s[0] -mul v2.4S, v2.4S,v5.s[0] -trn1 v30.4S, v29.4S, v11.4S -trn2 v7.4S, v29.4S, v11.4S -mla v21.4S, v3.4S, v31.s[0] -sub v3.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -sqrdmulh v14.4S, v18.4S, v10.s[0] -mul v18.4S, v18.4S,v15.s[0] -trn2 v29.2D, v1.2D, v30.2D -trn2 v11.2D, v20.2D, v7.2D -mla v2.4S, v9.4S, v31.s[0] -sub v9.4s, v6.4s, v21.4s -add v6.4s, v6.4s, v21.4s -sqrdmulh v21.4S, v0.4S, v10.s[0] -mul v0.4S, v0.4S,v15.s[0] -trn1 v28.2D, v1.2D, v30.2D -trn1 v22.2D, v20.2D, v7.2D -mla v18.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v2.4s -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v13.4S, v16.s[1] -mul v13.4S, v13.4S,v5.s[1] -trn1 v7.4S, v12.4S, v4.4S -trn2 v20.4S, v12.4S, v4.4S -mla v0.4S, v21.4S, v31.s[0] -sub v21.4s, v8.4s, v18.4s -add v8.4s, v8.4s, v18.4s -sqrdmulh v18.4S, v14.4S, v16.s[2] -mul v14.4S, v14.4S,v5.s[2] -trn1 v30.4S, v17.4S, v3.4S -trn2 v1.4S, v17.4S, v3.4S -mla v13.4S, v2.4S, v31.s[0] -sub v2.4s, v19.4s, v0.4s -add v19.4s, v19.4s, v0.4s -sqrdmulh v0.4S, v19.4S, v10.s[1] -mul v19.4S, v19.4S,v15.s[1] -trn2 v17.2D, v7.2D, v30.2D -trn2 v3.2D, v20.2D, v1.2D -mla v14.4S, v18.4S, v31.s[0] -sub v18.4s, v6.4s, v13.4s -add v6.4s, v6.4s, v13.4s -sqrdmulh v16.4S, v2.4S, v10.s[2] -mul v2.4S, v2.4S,v15.s[2] -trn1 v12.2D, v7.2D, v30.2D -trn1 v4.2D, v20.2D, v1.2D -mla v19.4S, v0.4S, v31.s[0] -sub v0.4s, v9.4s, v14.4s -add v9.4s, v9.4s, v14.4s -mla v2.4S, v16.4S, v31.s[0] -sub v16.4s, v8.4s, v19.4s -add v8.4s, v8.4s, v19.4s -sub v10.4s, v21.4s, v2.4s -add v21.4s, v21.4s, v2.4s -ldr q2, [x17, #+672] -ldr q15, [x17, #+688] -sqrdmulh v19.4S, v29.4S, v15.4S -mul v29.4S, v29.4S,v2.4S -trn1 v14.4S, v6.4S, v18.4S -trn2 v1.4S, v6.4S, v18.4S -sqrdmulh v20.4S, v11.4S, v15.4S -mul v11.4S, v11.4S,v2.4S -trn1 v15.4S, v9.4S, v0.4S -trn2 v2.4S, v9.4S, v0.4S -mla v29.4S, v19.4S, v31.s[0] -ldr q19, [x17, #+800] -ldr q30, [x17, #+816] -sqrdmulh v7.4S, v17.4S, v30.4S -mul v17.4S, v17.4S,v19.4S -trn2 v9.2D, v14.2D, v15.2D -trn2 v0.2D, v1.2D, v2.2D -mla v11.4S, v20.4S, v31.s[0] -sub v20.4s, v28.4s, v29.4s -add v28.4s, v28.4s, v29.4s -sqrdmulh v29.4S, v3.4S, v30.4S -mul v3.4S, v3.4S,v19.4S -trn1 v6.2D, v14.2D, v15.2D -trn1 v18.2D, v1.2D, v2.2D -mla v17.4S, v7.4S, v31.s[0] -sub v7.4s, v22.4s, v11.4s -add v22.4s, v22.4s, v11.4s -ldr q11, [x17, #+704] -ldr q2, [x17, #+720] -sqrdmulh v1.4S, v22.4S, v2.4S -mul v22.4S, v22.4S,v11.4S -trn1 v2.4S, v8.4S, v16.4S -trn2 v11.4S, v8.4S, v16.4S -mla v3.4S, v29.4S, v31.s[0] -sub v29.4s, v12.4s, v17.4s -add v12.4s, v12.4s, v17.4s -ldr q17, [x17, #+736] -ldr q15, [x17, #+752] -sqrdmulh v14.4S, v7.4S, v15.4S -mul v7.4S, v7.4S,v17.4S -trn1 v15.4S, v21.4S, v10.4S -trn2 v17.4S, v21.4S, v10.4S -mla v22.4S, v1.4S, v31.s[0] -sub v1.4s, v4.4s, v3.4s -add v4.4s, v4.4s, v3.4s -ldr q3, [x17, #+832] -ldr q30, [x17, #+848] -sqrdmulh v19.4S, v4.4S, v30.4S -mul v4.4S, v4.4S,v3.4S -trn2 v21.2D, v2.2D, v15.2D -trn2 v10.2D, v11.2D, v17.2D -mla v7.4S, v14.4S, v31.s[0] -sub v14.4s, v28.4s, v22.4s -add v28.4s, v28.4s, v22.4s -ldr q22, [x17, #+864] -ldr q30, [x17, #+880] -sqrdmulh v3.4S, v1.4S, v30.4S -mul v1.4S, v1.4S,v22.4S -trn1 v8.2D, v2.2D, v15.2D -trn1 v16.2D, v11.2D, v17.2D -mla v4.4S, v19.4S, v31.s[0] -sub v19.4s, v20.4s, v7.4s -add v20.4s, v20.4s, v7.4s -mla v1.4S, v3.4S, v31.s[0] -sub v3.4s, v12.4s, v4.4s -add v12.4s, v12.4s, v4.4s -sub v4.4s, v29.4s, v1.4s -add v29.4s, v29.4s, v1.4s -ldr q1, [x17, #+928] -ldr q7, [x17, #+944] -sqrdmulh v17.4S, v9.4S, v7.4S -mul v9.4S, v9.4S,v1.4S -str q28, [x0, #256] -sqrdmulh v28.4S, v0.4S, v7.4S -str q14, [x0, #272] -mul v0.4S, v0.4S,v1.4S -str q20, [x0, #288] -mla v9.4S, v17.4S, v31.s[0] -ldr q17, [x17, #+1056] -ldr q20, [x17, #+1072] -sqrdmulh v7.4S, v21.4S, v20.4S -str q12, [x0, #320] -mul v21.4S, v21.4S,v17.4S -str q19, [x0, #304] -mla v0.4S, v28.4S, v31.s[0] -str q3, [x0, #336] -sub v3.4s, v6.4s, v9.4s -add v6.4s, v6.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v20.4S -mul v10.4S, v10.4S,v17.4S -str q29, [x0, #352] -mla v21.4S, v7.4S, v31.s[0] -sub v7.4s, v18.4s, v0.4s -add v18.4s, v18.4s, v0.4s -ldr q0, [x17, #+960] -ldr q29, [x17, #+976] -sqrdmulh v20.4S, v18.4S, v29.4S -mul v18.4S, v18.4S,v0.4S -str q4, [x0, #368] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v8.4s, v21.4s -add v8.4s, v8.4s, v21.4s -ldr q21, [x17, #+992] -ldr q4, [x17, #+1008] -sqrdmulh v29.4S, v7.4S, v4.4S -mul v7.4S, v7.4S,v21.4S -mla v18.4S, v20.4S, v31.s[0] -sub v20.4s, v16.4s, v10.4s -add v16.4s, v16.4s, v10.4s -ldr q10, [x17, #+1088] -ldr q4, [x17, #+1104] -sqrdmulh v21.4S, v16.4S, v4.4S -mul v16.4S, v16.4S,v10.4S -mla v7.4S, v29.4S, v31.s[0] -sub v29.4s, v6.4s, v18.4s -add v6.4s, v6.4s, v18.4s -ldr q18, [x17, #+1120] -ldr q4, [x17, #+1136] -sqrdmulh v10.4S, v20.4S, v4.4S -mul v20.4S, v20.4S,v18.4S -ldr q4, [x0, #544] -mla v16.4S, v21.4S, v31.s[0] -ldr q21, [x0, #560] -sub v18.4s, v3.4s, v7.4s -ldr q0, [x17, #+1152] -add v3.4s, v3.4s, v7.4s -ldr q7, [x17, #+1168] -mla v20.4S, v10.4S, v31.s[0] -ldr q10, [x17, #+1280] -sub v17.4s, v8.4s, v16.4s -ldr q28, [x0, #608] -add v8.4s, v8.4s, v16.4s -ldr q16, [x17, #+1296] -sub v19.4s, v9.4s, v20.4s -ldr q12, [x0, #624] -add v9.4s, v9.4s, v20.4s -sqrdmulh v20.4S, v4.4S, v7.s[0] -mul v4.4S, v4.4S,v0.s[0] -sqrdmulh v1.4S, v21.4S, v7.s[0] -mul v21.4S, v21.4S,v0.s[0] -str q6, [x0, #384] -str q29, [x0, #400] -str q3, [x0, #416] -str q18, [x0, #432] -mla v4.4S, v20.4S, v31.s[0] -ldr q20, [x0, #512] -sqrdmulh v18.4S, v28.4S, v16.s[0] -ldr q3, [x0, #528] -mul v28.4S, v28.4S,v10.s[0] -str q8, [x0, #448] -str q17, [x0, #464] -str q9, [x0, #480] -str q19, [x0, #496] -mla v21.4S, v1.4S, v31.s[0] -sub v1.4s, v20.4s, v4.4s -add v20.4s, v20.4s, v4.4s -sqrdmulh v4.4S, v12.4S, v16.s[0] -ldr q19, [x0, #672] -mul v12.4S, v12.4S,v10.s[0] -ldr q9, [x0, #688] -mla v28.4S, v18.4S, v31.s[0] -ldr q18, [x0, #576] -sub v17.4s, v3.4s, v21.4s -add v3.4s, v3.4s, v21.4s -sqrdmulh v21.4S, v3.4S, v7.s[1] -ldr q8, [x0, #640] -mul v3.4S, v3.4S,v0.s[1] -ldr q29, [x0, #656] -mla v12.4S, v4.4S, v31.s[0] -ldr q4, [x0, #592] -sub v6.4s, v18.4s, v28.4s -add v18.4s, v18.4s, v28.4s -sqrdmulh v28.4S, v17.4S, v7.s[2] -ldr q14, [x17, #+1408] -mul v17.4S, v17.4S,v0.s[2] -ldr q11, [x17, #+1424] -mla v3.4S, v21.4S, v31.s[0] -sub v21.4s, v4.4s, v12.4s -add v4.4s, v4.4s, v12.4s -sqrdmulh v12.4S, v4.4S, v16.s[1] -ldr q15, [x0, #736] -mul v4.4S, v4.4S,v10.s[1] -ldr q2, [x0, #752] -mla v17.4S, v28.4S, v31.s[0] -sub v28.4s, v20.4s, v3.4s -add v20.4s, v20.4s, v3.4s -sqrdmulh v7.4S, v21.4S, v16.s[2] -ldr q0, [x0, #704] -mul v21.4S, v21.4S,v10.s[2] -ldr q3, [x0, #720] -mla v4.4S, v12.4S, v31.s[0] -sub v12.4s, v1.4s, v17.4s -add v1.4s, v1.4s, v17.4s -sqrdmulh v17.4S, v19.4S, v11.s[0] -ldr q30, [x17, #+1536] -mul v19.4S, v19.4S,v14.s[0] -ldr q22, [x17, #+1552] -trn1 v5.4S, v20.4S, v28.4S -trn2 v13.4S, v20.4S, v28.4S -mla v21.4S, v7.4S, v31.s[0] -sub v7.4s, v18.4s, v4.4s -add v18.4s, v18.4s, v4.4s -sqrdmulh v16.4S, v9.4S, v11.s[0] -mul v9.4S, v9.4S,v14.s[0] -trn1 v10.4S, v1.4S, v12.4S -trn2 v4.4S, v1.4S, v12.4S -mla v19.4S, v17.4S, v31.s[0] -sub v17.4s, v6.4s, v21.4s -add v6.4s, v6.4s, v21.4s -sqrdmulh v21.4S, v15.4S, v22.s[0] -mul v15.4S, v15.4S,v30.s[0] -trn2 v1.2D, v5.2D, v10.2D -trn2 v12.2D, v13.2D, v4.2D -mla v9.4S, v16.4S, v31.s[0] -sub v16.4s, v8.4s, v19.4s -add v8.4s, v8.4s, v19.4s -sqrdmulh v19.4S, v2.4S, v22.s[0] -mul v2.4S, v2.4S,v30.s[0] -trn1 v20.2D, v5.2D, v10.2D -trn1 v28.2D, v13.2D, v4.2D -mla v15.4S, v21.4S, v31.s[0] -sub v21.4s, v29.4s, v9.4s -add v29.4s, v29.4s, v9.4s -sqrdmulh v9.4S, v29.4S, v11.s[1] -mul v29.4S, v29.4S,v14.s[1] -trn1 v4.4S, v18.4S, v7.4S -trn2 v13.4S, v18.4S, v7.4S -mla v2.4S, v19.4S, v31.s[0] -sub v19.4s, v0.4s, v15.4s -add v0.4s, v0.4s, v15.4s -sqrdmulh v15.4S, v21.4S, v11.s[2] -mul v21.4S, v21.4S,v14.s[2] -trn1 v10.4S, v6.4S, v17.4S -trn2 v5.4S, v6.4S, v17.4S -mla v29.4S, v9.4S, v31.s[0] -sub v9.4s, v3.4s, v2.4s -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v3.4S, v22.s[1] -mul v3.4S, v3.4S,v30.s[1] -trn2 v6.2D, v4.2D, v10.2D -trn2 v17.2D, v13.2D, v5.2D -mla v21.4S, v15.4S, v31.s[0] -sub v15.4s, v8.4s, v29.4s -add v8.4s, v8.4s, v29.4s -sqrdmulh v11.4S, v9.4S, v22.s[2] -mul v9.4S, v9.4S,v30.s[2] -trn1 v18.2D, v4.2D, v10.2D -trn1 v7.2D, v13.2D, v5.2D -mla v3.4S, v2.4S, v31.s[0] -sub v2.4s, v16.4s, v21.4s -add v16.4s, v16.4s, v21.4s -mla v9.4S, v11.4S, v31.s[0] -sub v11.4s, v0.4s, v3.4s -add v0.4s, v0.4s, v3.4s -sub v22.4s, v19.4s, v9.4s -add v19.4s, v19.4s, v9.4s -ldr q9, [x17, #+1184] -ldr q30, [x17, #+1200] -sqrdmulh v3.4S, v1.4S, v30.4S -mul v1.4S, v1.4S,v9.4S -trn1 v21.4S, v8.4S, v15.4S -trn2 v5.4S, v8.4S, v15.4S -sqrdmulh v13.4S, v12.4S, v30.4S -mul v12.4S, v12.4S,v9.4S -trn1 v30.4S, v16.4S, v2.4S -trn2 v9.4S, v16.4S, v2.4S -mla v1.4S, v3.4S, v31.s[0] -ldr q3, [x17, #+1312] -ldr q10, [x17, #+1328] -sqrdmulh v4.4S, v6.4S, v10.4S -mul v6.4S, v6.4S,v3.4S -trn2 v16.2D, v21.2D, v30.2D -trn2 v2.2D, v5.2D, v9.2D -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v20.4s, v1.4s -add v20.4s, v20.4s, v1.4s -sqrdmulh v1.4S, v17.4S, v10.4S -mul v17.4S, v17.4S,v3.4S -trn1 v8.2D, v21.2D, v30.2D -trn1 v15.2D, v5.2D, v9.2D -mla v6.4S, v4.4S, v31.s[0] -sub v4.4s, v28.4s, v12.4s -add v28.4s, v28.4s, v12.4s -ldr q12, [x17, #+1216] -ldr q9, [x17, #+1232] -sqrdmulh v5.4S, v28.4S, v9.4S -mul v28.4S, v28.4S,v12.4S -trn1 v9.4S, v0.4S, v11.4S -trn2 v12.4S, v0.4S, v11.4S -mla v17.4S, v1.4S, v31.s[0] -sub v1.4s, v18.4s, v6.4s -add v18.4s, v18.4s, v6.4s -ldr q6, [x17, #+1248] -ldr q30, [x17, #+1264] -sqrdmulh v21.4S, v4.4S, v30.4S -mul v4.4S, v4.4S,v6.4S -trn1 v30.4S, v19.4S, v22.4S -trn2 v6.4S, v19.4S, v22.4S -mla v28.4S, v5.4S, v31.s[0] -sub v5.4s, v7.4s, v17.4s -add v7.4s, v7.4s, v17.4s -ldr q17, [x17, #+1344] -ldr q10, [x17, #+1360] -sqrdmulh v3.4S, v7.4S, v10.4S -mul v7.4S, v7.4S,v17.4S -trn2 v19.2D, v9.2D, v30.2D -trn2 v22.2D, v12.2D, v6.2D -mla v4.4S, v21.4S, v31.s[0] -sub v21.4s, v20.4s, v28.4s -add v20.4s, v20.4s, v28.4s -ldr q28, [x17, #+1376] -ldr q10, [x17, #+1392] -sqrdmulh v17.4S, v5.4S, v10.4S -mul v5.4S, v5.4S,v28.4S -trn1 v0.2D, v9.2D, v30.2D -trn1 v11.2D, v12.2D, v6.2D -mla v7.4S, v3.4S, v31.s[0] -sub v3.4s, v13.4s, v4.4s -add v13.4s, v13.4s, v4.4s -mla v5.4S, v17.4S, v31.s[0] -sub v17.4s, v18.4s, v7.4s -add v18.4s, v18.4s, v7.4s -sub v7.4s, v1.4s, v5.4s -add v1.4s, v1.4s, v5.4s -ldr q5, [x17, #+1440] -ldr q4, [x17, #+1456] -sqrdmulh v6.4S, v16.4S, v4.4S -mul v16.4S, v16.4S,v5.4S -str q20, [x0, #512] -sqrdmulh v20.4S, v2.4S, v4.4S -str q21, [x0, #528] -mul v2.4S, v2.4S,v5.4S -str q13, [x0, #544] -mla v16.4S, v6.4S, v31.s[0] -ldr q6, [x17, #+1568] -ldr q13, [x17, #+1584] -sqrdmulh v4.4S, v19.4S, v13.4S -str q18, [x0, #576] -mul v19.4S, v19.4S,v6.4S -str q3, [x0, #560] -mla v2.4S, v20.4S, v31.s[0] -str q17, [x0, #592] -sub v17.4s, v8.4s, v16.4s -add v8.4s, v8.4s, v16.4s -sqrdmulh v16.4S, v22.4S, v13.4S -mul v22.4S, v22.4S,v6.4S -str q1, [x0, #608] -mla v19.4S, v4.4S, v31.s[0] -sub v4.4s, v15.4s, v2.4s -add v15.4s, v15.4s, v2.4s -ldr q2, [x17, #+1472] -ldr q1, [x17, #+1488] -sqrdmulh v13.4S, v15.4S, v1.4S -mul v15.4S, v15.4S,v2.4S -str q7, [x0, #624] -mla v22.4S, v16.4S, v31.s[0] -sub v16.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -ldr q19, [x17, #+1504] -ldr q7, [x17, #+1520] -sqrdmulh v1.4S, v4.4S, v7.4S -mul v4.4S, v4.4S,v19.4S -mla v15.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -ldr q22, [x17, #+1600] -ldr q7, [x17, #+1616] -sqrdmulh v19.4S, v11.4S, v7.4S -mul v11.4S, v11.4S,v22.4S -mla v4.4S, v1.4S, v31.s[0] -sub v1.4s, v8.4s, v15.4s -add v8.4s, v8.4s, v15.4s -ldr q15, [x17, #+1632] -ldr q7, [x17, #+1648] -sqrdmulh v22.4S, v13.4S, v7.4S -mul v13.4S, v13.4S,v15.4S -ldr q7, [x0, #800] -mla v11.4S, v19.4S, v31.s[0] -ldr q19, [x0, #816] -sub v15.4s, v17.4s, v4.4s -ldr q2, [x17, #+1664] -add v17.4s, v17.4s, v4.4s -ldr q4, [x17, #+1680] -mla v13.4S, v22.4S, v31.s[0] -ldr q22, [x17, #+1792] -sub v6.4s, v0.4s, v11.4s -ldr q20, [x0, #864] -add v0.4s, v0.4s, v11.4s -ldr q11, [x17, #+1808] -sub v3.4s, v16.4s, v13.4s -ldr q18, [x0, #880] -add v16.4s, v16.4s, v13.4s -sqrdmulh v13.4S, v7.4S, v4.s[0] -mul v7.4S, v7.4S,v2.s[0] -sqrdmulh v5.4S, v19.4S, v4.s[0] -mul v19.4S, v19.4S,v2.s[0] -str q8, [x0, #640] -str q1, [x0, #656] -str q17, [x0, #672] -str q15, [x0, #688] -mla v7.4S, v13.4S, v31.s[0] -ldr q13, [x0, #768] -sqrdmulh v15.4S, v20.4S, v11.s[0] -ldr q17, [x0, #784] -mul v20.4S, v20.4S,v22.s[0] -str q0, [x0, #704] -str q6, [x0, #720] -str q16, [x0, #736] -str q3, [x0, #752] -mla v19.4S, v5.4S, v31.s[0] -sub v5.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -sqrdmulh v7.4S, v18.4S, v11.s[0] -ldr q3, [x0, #928] -mul v18.4S, v18.4S,v22.s[0] -ldr q16, [x0, #944] -mla v20.4S, v15.4S, v31.s[0] -ldr q15, [x0, #832] -sub v6.4s, v17.4s, v19.4s -add v17.4s, v17.4s, v19.4s -sqrdmulh v19.4S, v17.4S, v4.s[1] -ldr q0, [x0, #896] -mul v17.4S, v17.4S,v2.s[1] -ldr q1, [x0, #912] -mla v18.4S, v7.4S, v31.s[0] -ldr q7, [x0, #848] -sub v8.4s, v15.4s, v20.4s -add v15.4s, v15.4s, v20.4s -sqrdmulh v20.4S, v6.4S, v4.s[2] -ldr q21, [x17, #+1920] -mul v6.4S, v6.4S,v2.s[2] -ldr q12, [x17, #+1936] -mla v17.4S, v19.4S, v31.s[0] -sub v19.4s, v7.4s, v18.4s -add v7.4s, v7.4s, v18.4s -sqrdmulh v18.4S, v7.4S, v11.s[1] -ldr q30, [x0, #992] -mul v7.4S, v7.4S,v22.s[1] -ldr q9, [x0, #1008] -mla v6.4S, v20.4S, v31.s[0] -sub v20.4s, v13.4s, v17.4s -add v13.4s, v13.4s, v17.4s -sqrdmulh v4.4S, v19.4S, v11.s[2] -ldr q2, [x0, #960] -mul v19.4S, v19.4S,v22.s[2] -ldr q17, [x0, #976] -mla v7.4S, v18.4S, v31.s[0] -sub v18.4s, v5.4s, v6.4s -add v5.4s, v5.4s, v6.4s -sqrdmulh v6.4S, v3.4S, v12.s[0] -ldr q10, [x17, #+2048] -mul v3.4S, v3.4S,v21.s[0] -ldr q28, [x17, #+2064] -trn1 v14.4S, v13.4S, v20.4S -trn2 v29.4S, v13.4S, v20.4S -mla v19.4S, v4.4S, v31.s[0] -sub v4.4s, v15.4s, v7.4s -add v15.4s, v15.4s, v7.4s -sqrdmulh v11.4S, v16.4S, v12.s[0] -mul v16.4S, v16.4S,v21.s[0] -trn1 v22.4S, v5.4S, v18.4S -trn2 v7.4S, v5.4S, v18.4S -mla v3.4S, v6.4S, v31.s[0] -sub v6.4s, v8.4s, v19.4s -add v8.4s, v8.4s, v19.4s -sqrdmulh v19.4S, v30.4S, v28.s[0] -mul v30.4S, v30.4S,v10.s[0] -trn2 v5.2D, v14.2D, v22.2D -trn2 v18.2D, v29.2D, v7.2D -mla v16.4S, v11.4S, v31.s[0] -sub v11.4s, v0.4s, v3.4s -add v0.4s, v0.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v28.s[0] -mul v9.4S, v9.4S,v10.s[0] -trn1 v13.2D, v14.2D, v22.2D -trn1 v20.2D, v29.2D, v7.2D -mla v30.4S, v19.4S, v31.s[0] -sub v19.4s, v1.4s, v16.4s -add v1.4s, v1.4s, v16.4s -sqrdmulh v16.4S, v1.4S, v12.s[1] -mul v1.4S, v1.4S,v21.s[1] -trn1 v7.4S, v15.4S, v4.4S -trn2 v29.4S, v15.4S, v4.4S -mla v9.4S, v3.4S, v31.s[0] -sub v3.4s, v2.4s, v30.4s -add v2.4s, v2.4s, v30.4s -sqrdmulh v30.4S, v19.4S, v12.s[2] -mul v19.4S, v19.4S,v21.s[2] -trn1 v22.4S, v8.4S, v6.4S -trn2 v14.4S, v8.4S, v6.4S -mla v1.4S, v16.4S, v31.s[0] -sub v16.4s, v17.4s, v9.4s -add v17.4s, v17.4s, v9.4s -sqrdmulh v9.4S, v17.4S, v28.s[1] -mul v17.4S, v17.4S,v10.s[1] -trn2 v8.2D, v7.2D, v22.2D -trn2 v6.2D, v29.2D, v14.2D -mla v19.4S, v30.4S, v31.s[0] -sub v30.4s, v0.4s, v1.4s -add v0.4s, v0.4s, v1.4s -sqrdmulh v12.4S, v16.4S, v28.s[2] -mul v16.4S, v16.4S,v10.s[2] -trn1 v15.2D, v7.2D, v22.2D -trn1 v4.2D, v29.2D, v14.2D -mla v17.4S, v9.4S, v31.s[0] -sub v9.4s, v11.4s, v19.4s -add v11.4s, v11.4s, v19.4s -mla v16.4S, v12.4S, v31.s[0] -sub v12.4s, v2.4s, v17.4s -add v2.4s, v2.4s, v17.4s -sub v28.4s, v3.4s, v16.4s -add v3.4s, v3.4s, v16.4s -ldr q16, [x17, #+1696] -ldr q10, [x17, #+1712] -sqrdmulh v17.4S, v5.4S, v10.4S -mul v5.4S, v5.4S,v16.4S -trn1 v19.4S, v0.4S, v30.4S -trn2 v14.4S, v0.4S, v30.4S -sqrdmulh v29.4S, v18.4S, v10.4S -mul v18.4S, v18.4S,v16.4S -trn1 v10.4S, v11.4S, v9.4S -trn2 v16.4S, v11.4S, v9.4S -mla v5.4S, v17.4S, v31.s[0] -ldr q17, [x17, #+1824] -ldr q22, [x17, #+1840] -sqrdmulh v7.4S, v8.4S, v22.4S -mul v8.4S, v8.4S,v17.4S -trn2 v11.2D, v19.2D, v10.2D -trn2 v9.2D, v14.2D, v16.2D -mla v18.4S, v29.4S, v31.s[0] -sub v29.4s, v13.4s, v5.4s -add v13.4s, v13.4s, v5.4s -sqrdmulh v5.4S, v6.4S, v22.4S -mul v6.4S, v6.4S,v17.4S -trn1 v0.2D, v19.2D, v10.2D -trn1 v30.2D, v14.2D, v16.2D -mla v8.4S, v7.4S, v31.s[0] -sub v7.4s, v20.4s, v18.4s -add v20.4s, v20.4s, v18.4s -ldr q18, [x17, #+1728] -ldr q16, [x17, #+1744] -sqrdmulh v14.4S, v20.4S, v16.4S -mul v20.4S, v20.4S,v18.4S -trn1 v16.4S, v2.4S, v12.4S -trn2 v18.4S, v2.4S, v12.4S -mla v6.4S, v5.4S, v31.s[0] -sub v5.4s, v15.4s, v8.4s -add v15.4s, v15.4s, v8.4s -ldr q8, [x17, #+1760] -ldr q10, [x17, #+1776] -sqrdmulh v19.4S, v7.4S, v10.4S -mul v7.4S, v7.4S,v8.4S -trn1 v10.4S, v3.4S, v28.4S -trn2 v8.4S, v3.4S, v28.4S -mla v20.4S, v14.4S, v31.s[0] -sub v14.4s, v4.4s, v6.4s -add v4.4s, v4.4s, v6.4s -ldr q6, [x17, #+1856] -ldr q22, [x17, #+1872] -sqrdmulh v17.4S, v4.4S, v22.4S -mul v4.4S, v4.4S,v6.4S -trn2 v3.2D, v16.2D, v10.2D -trn2 v28.2D, v18.2D, v8.2D -mla v7.4S, v19.4S, v31.s[0] -sub v19.4s, v13.4s, v20.4s -add v13.4s, v13.4s, v20.4s -ldr q20, [x17, #+1888] -ldr q22, [x17, #+1904] -sqrdmulh v6.4S, v14.4S, v22.4S -mul v14.4S, v14.4S,v20.4S -trn1 v2.2D, v16.2D, v10.2D -trn1 v12.2D, v18.2D, v8.2D -mla v4.4S, v17.4S, v31.s[0] -sub v17.4s, v29.4s, v7.4s -add v29.4s, v29.4s, v7.4s -mla v14.4S, v6.4S, v31.s[0] -sub v6.4s, v15.4s, v4.4s -add v15.4s, v15.4s, v4.4s -sub v4.4s, v5.4s, v14.4s -add v5.4s, v5.4s, v14.4s -ldr q14, [x17, #+1952] -ldr q7, [x17, #+1968] -sqrdmulh v8.4S, v11.4S, v7.4S -mul v11.4S, v11.4S,v14.4S -str q13, [x0, #768] -sqrdmulh v13.4S, v9.4S, v7.4S -str q19, [x0, #784] -mul v9.4S, v9.4S,v14.4S -str q29, [x0, #800] -mla v11.4S, v8.4S, v31.s[0] -ldr q8, [x17, #+2080] -ldr q29, [x17, #+2096] -sqrdmulh v7.4S, v3.4S, v29.4S -str q15, [x0, #832] -mul v3.4S, v3.4S,v8.4S -str q17, [x0, #816] -mla v9.4S, v13.4S, v31.s[0] -str q6, [x0, #848] -sub v6.4s, v0.4s, v11.4s -add v0.4s, v0.4s, v11.4s -sqrdmulh v11.4S, v28.4S, v29.4S -mul v28.4S, v28.4S,v8.4S -str q5, [x0, #864] -mla v3.4S, v7.4S, v31.s[0] -sub v7.4s, v30.4s, v9.4s -add v30.4s, v30.4s, v9.4s -ldr q9, [x17, #+1984] -ldr q5, [x17, #+2000] -sqrdmulh v29.4S, v30.4S, v5.4S -mul v30.4S, v30.4S,v9.4S -str q4, [x0, #880] -mla v28.4S, v11.4S, v31.s[0] -sub v11.4s, v2.4s, v3.4s -add v2.4s, v2.4s, v3.4s -ldr q3, [x17, #+2016] -ldr q4, [x17, #+2032] -sqrdmulh v5.4S, v7.4S, v4.4S -mul v7.4S, v7.4S,v3.4S -mla v30.4S, v29.4S, v31.s[0] -sub v29.4s, v12.4s, v28.4s -add v12.4s, v12.4s, v28.4s -ldr q28, [x17, #+2112] -ldr q4, [x17, #+2128] -sqrdmulh v3.4S, v12.4S, v4.4S -mul v12.4S, v12.4S,v28.4S -mla v7.4S, v5.4S, v31.s[0] -sub v5.4s, v0.4s, v30.4s -add v0.4s, v0.4s, v30.4s -ldr q30, [x17, #+2144] -ldr q4, [x17, #+2160] -sqrdmulh v28.4S, v29.4S, v4.4S -mul v29.4S, v29.4S,v30.4S -mla v12.4S, v3.4S, v31.s[0] -sub v3.4s, v6.4s, v7.4s -add v6.4s, v6.4s, v7.4s -mla v29.4S, v28.4S, v31.s[0] -sub v28.4s, v2.4s, v12.4s -add v2.4s, v2.4s, v12.4s -sub v12.4s, v11.4s, v29.4s -add v11.4s, v11.4s, v29.4s -str q0, [x0, #896] -str q5, [x0, #912] -str q6, [x0, #928] -str q3, [x0, #944] -str q2, [x0, #960] -str q28, [x0, #976] -str q11, [x0, #992] -str q12, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2392 -// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_4.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_4.s deleted file mode 100644 index 39f413d..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_3_z4_4.s +++ /dev/null @@ -1,2422 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_3_z4_4 -.global _ntt_u32_full_neon_asm_var_4_4_3_z4_4 -ntt_u32_full_neon_asm_var_4_4_3_z4_4: -_ntt_u32_full_neon_asm_var_4_4_3_z4_4: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #800] -ldr q21, [x0, #864] -ldr q20, [x0, #928] -ldr q19, [x0, #992] -ldr q18, [x0, #288] -ldr q17, [x0, #352] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -sqrdmulh v2.4S, v22.4S, v29.s[0] -ldr q1, [x0, #544] -mul v22.4S, v22.4S,v30.s[0] -ldr q0, [x0, #608] -sqrdmulh v15.4S, v21.4S, v29.s[0] -ldr q14, [x0, #672] -mul v21.4S, v21.4S,v30.s[0] -ldr q13, [x0, #736] -mla v22.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q12, [x0, #32] -sub v11.4s, v18.4s, v22.4s -mla v21.4S, v15.4S, v31.s[0] -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q15, [x0, #96] -sub v10.4s, v17.4s, v21.4s -mla v20.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v1.4S, v29.s[0] -ldr q2, [x0, #160] -mul v1.4S, v1.4S,v30.s[0] -sub v9.4s, v16.4s, v20.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v0.4S, v29.s[0] -ldr q22, [x0, #224] -mul v0.4S, v0.4S,v30.s[0] -sub v8.4s, v3.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v12.4s, v1.4s -mla v0.4S, v20.4S, v31.s[0] -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -sub v20.4s, v15.4s, v0.4s -mla v14.4S, v19.4S, v31.s[0] -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v19.4s, v2.4s, v14.4s -mla v13.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v1.4s, v22.4s, v13.4s -mla v16.4S, v0.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v0.4s, v2.4s, v16.4s -mla v3.4S, v14.4S, v31.s[0] -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v14.4s, v22.4s, v3.4s -mla v18.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v13.4s, v12.4s, v18.4s -mla v17.4S, v16.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v8.4S, v29.s[2] -mul v8.4S, v8.4S,v30.s[2] -sub v16.4s, v15.4s, v17.4s -mla v9.4S, v3.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v3.4s, v19.4s, v9.4s -mla v8.4S, v18.4S, v31.s[0] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v8.4s -mla v11.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v8.4s -sqrdmulh v8.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v17.4s, v21.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v27.s[0] -mul v22.4S, v22.4S,v28.s[0] -sub v9.4s, v20.4s, v10.4s -mla v2.4S, v8.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v8.4s, v12.4s, v2.4s -mla v22.4S, v11.4S, v31.s[0] -add v12.4s, v12.4s, v2.4s -sqrdmulh v2.4S, v14.4S, v27.s[1] -mul v14.4S, v14.4S,v28.s[1] -sub v11.4s, v15.4s, v22.4s -mla v0.4S, v10.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v27.s[2] -mul v19.4S, v19.4S,v28.s[2] -sub v10.4s, v13.4s, v0.4s -mla v14.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v2.4s, v16.4s, v14.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v27.s[3] -mul v3.4S, v3.4S,v28.s[3] -sub v22.4s, v21.4s, v19.4s -mla v1.4S, v0.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v0.4s, v20.4s, v1.4s -mla v3.4S, v14.4S, v31.s[0] -add v20.4s, v20.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v25.s[0] -mul v15.4S, v15.4S,v26.s[0] -sub v14.4s, v17.4s, v3.4s -mla v18.4S, v19.4S, v31.s[0] -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v11.4S, v25.s[1] -mul v11.4S, v11.4S,v26.s[1] -sub v19.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v1.4s, v12.4s, v15.4s -mla v11.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v25.s[3] -mul v2.4S, v2.4S,v26.s[3] -sub v3.4s, v8.4s, v11.4s -mla v16.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v11.4s -str q12, [x0, #32] -sqrdmulh v12.4S, v20.4S, v23.s[0] -str q1, [x0, #96] -mul v20.4S, v20.4S,v24.s[0] -ldr q1, [x0, #816] -sub v11.4s, v13.4s, v16.4s -ldr q18, [x0, #880] -mla v2.4S, v15.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -str q8, [x0, #160] -sqrdmulh v8.4S, v0.4S, v23.s[1] -str q3, [x0, #224] -mul v0.4S, v0.4S,v24.s[1] -ldr q3, [x0, #944] -sub v16.4s, v10.4s, v2.4s -ldr q15, [x0, #1008] -mla v20.4S, v12.4S, v31.s[0] -add v10.4s, v10.4s, v2.4s -str q13, [x0, #288] -sqrdmulh v13.4S, v9.4S, v23.s[2] -str q11, [x0, #352] -mul v9.4S, v9.4S,v24.s[2] -ldr q11, [x0, #304] -sub v2.4s, v21.4s, v20.4s -ldr q12, [x0, #368] -mla v0.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v20.4s -str q10, [x0, #416] -sqrdmulh v10.4S, v19.4S, v23.s[3] -str q16, [x0, #480] -mul v19.4S, v19.4S,v24.s[3] -ldr q16, [x0, #432] -sub v20.4s, v22.4s, v0.4s -ldr q8, [x0, #496] -mla v9.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -str q21, [x0, #544] -sqrdmulh v21.4S, v1.4S, v29.s[0] -str q2, [x0, #608] -ldr q2, [x0, #560] -mul v1.4S, v1.4S,v30.s[0] -ldr q0, [x0, #624] -sub v13.4s, v17.4s, v9.4s -mla v19.4S, v10.4S, v31.s[0] -add v17.4s, v17.4s, v9.4s -str q22, [x0, #672] -sqrdmulh v22.4S, v18.4S, v29.s[0] -str q20, [x0, #736] -ldr q20, [x0, #688] -mul v18.4S, v18.4S,v30.s[0] -ldr q9, [x0, #752] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -str q17, [x0, #800] -sqrdmulh v17.4S, v3.4S, v29.s[0] -str q13, [x0, #864] -mul v3.4S, v3.4S,v30.s[0] -ldr q13, [x0, #48] -sub v19.4s, v11.4s, v1.4s -mla v18.4S, v22.4S, v31.s[0] -add v11.4s, v11.4s, v1.4s -str q14, [x0, #928] -sqrdmulh v14.4S, v15.4S, v29.s[0] -str q10, [x0, #992] -mul v15.4S, v15.4S,v30.s[0] -ldr q10, [x0, #112] -sub v1.4s, v12.4s, v18.4s -mla v3.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v29.s[0] -ldr q17, [x0, #176] -mul v2.4S, v2.4S,v30.s[0] -sub v22.4s, v16.4s, v3.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v0.4S, v29.s[0] -ldr q14, [x0, #240] -mul v0.4S, v0.4S,v30.s[0] -sub v21.4s, v8.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -sub v18.4s, v13.4s, v2.4s -mla v0.4S, v3.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v9.4S, v29.s[0] -mul v9.4S, v9.4S,v30.s[0] -sub v3.4s, v10.4s, v0.4s -mla v20.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v15.4s, v17.4s, v20.4s -mla v9.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sub v2.4s, v14.4s, v9.4s -mla v16.4S, v0.4S, v31.s[0] -add v14.4s, v14.4s, v9.4s -sqrdmulh v9.4S, v11.4S, v29.s[1] -mul v11.4S, v11.4S,v30.s[1] -sub v0.4s, v17.4s, v16.4s -mla v8.4S, v20.4S, v31.s[0] -add v17.4s, v17.4s, v16.4s -sqrdmulh v16.4S, v12.4S, v29.s[1] -mul v12.4S, v12.4S,v30.s[1] -sub v20.4s, v14.4s, v8.4s -mla v11.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v8.4s -sqrdmulh v8.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v9.4s, v13.4s, v11.4s -mla v12.4S, v16.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v16.4s, v10.4s, v12.4s -mla v22.4S, v8.4S, v31.s[0] -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v8.4s, v15.4s, v22.4s -mla v21.4S, v11.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -sub v11.4s, v2.4s, v21.4s -mla v19.4S, v12.4S, v31.s[0] -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[0] -mul v17.4S, v17.4S,v28.s[0] -sub v12.4s, v18.4s, v19.4s -mla v1.4S, v22.4S, v31.s[0] -add v18.4s, v18.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v22.4s, v3.4s, v1.4s -mla v17.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v21.4s, v13.4s, v17.4s -mla v14.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v20.4S, v27.s[1] -mul v20.4S, v20.4S,v28.s[1] -sub v19.4s, v10.4s, v14.4s -mla v0.4S, v1.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v27.s[2] -mul v15.4S, v15.4S,v28.s[2] -sub v1.4s, v9.4s, v0.4s -mla v20.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v0.4s -sqrdmulh v0.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v17.4s, v16.4s, v20.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v14.4s, v18.4s, v15.4s -mla v2.4S, v0.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v27.s[3] -mul v11.4S, v11.4S,v28.s[3] -sub v0.4s, v3.4s, v2.4s -mla v8.4S, v20.4S, v31.s[0] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v10.4S, v25.s[0] -mul v10.4S, v10.4S,v26.s[0] -sub v20.4s, v12.4s, v8.4s -mla v11.4S, v15.4S, v31.s[0] -add v12.4s, v12.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v15.4s, v22.4s, v11.4s -mla v10.4S, v2.4S, v31.s[0] -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v2.4s, v13.4s, v10.4s -mla v19.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v10.4s -sqrdmulh v10.4S, v17.4S, v25.s[3] -mul v17.4S, v17.4S,v26.s[3] -sub v8.4s, v21.4s, v19.4s -mla v16.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -str q13, [x0, #48] -sqrdmulh v13.4S, v3.4S, v23.s[0] -str q2, [x0, #112] -mul v3.4S, v3.4S,v24.s[0] -ldr q2, [x0, #768] -sub v19.4s, v9.4s, v16.4s -ldr q11, [x0, #832] -mla v17.4S, v10.4S, v31.s[0] -add v9.4s, v9.4s, v16.4s -str q21, [x0, #176] -sqrdmulh v21.4S, v0.4S, v23.s[1] -str q8, [x0, #240] -mul v0.4S, v0.4S,v24.s[1] -ldr q8, [x0, #896] -sub v16.4s, v1.4s, v17.4s -ldr q10, [x0, #960] -mla v3.4S, v13.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -str q9, [x0, #304] -sqrdmulh v9.4S, v22.4S, v23.s[2] -str q19, [x0, #368] -mul v22.4S, v22.4S,v24.s[2] -ldr q19, [x0, #256] -sub v17.4s, v18.4s, v3.4s -ldr q13, [x0, #320] -mla v0.4S, v21.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -str q1, [x0, #432] -sqrdmulh v1.4S, v15.4S, v23.s[3] -str q16, [x0, #496] -mul v15.4S, v15.4S,v24.s[3] -ldr q16, [x0, #384] -sub v3.4s, v14.4s, v0.4s -ldr q21, [x0, #448] -mla v22.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -str q18, [x0, #560] -sqrdmulh v18.4S, v2.4S, v29.s[0] -str q17, [x0, #624] -ldr q17, [x0, #512] -mul v2.4S, v2.4S,v30.s[0] -ldr q0, [x0, #576] -sub v9.4s, v12.4s, v22.4s -mla v15.4S, v1.4S, v31.s[0] -add v12.4s, v12.4s, v22.4s -str q14, [x0, #688] -sqrdmulh v14.4S, v11.4S, v29.s[0] -str q3, [x0, #752] -ldr q3, [x0, #640] -mul v11.4S, v11.4S,v30.s[0] -ldr q22, [x0, #704] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -str q12, [x0, #816] -sqrdmulh v12.4S, v8.4S, v29.s[0] -str q9, [x0, #880] -mul v8.4S, v8.4S,v30.s[0] -ldr q9, [x0, #0] -sub v15.4s, v19.4s, v2.4s -mla v11.4S, v14.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -str q20, [x0, #944] -sqrdmulh v20.4S, v10.4S, v29.s[0] -str q1, [x0, #1008] -mul v10.4S, v10.4S,v30.s[0] -ldr q1, [x0, #64] -sub v2.4s, v13.4s, v11.4s -mla v8.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v29.s[0] -ldr q12, [x0, #128] -mul v17.4S, v17.4S,v30.s[0] -sub v14.4s, v16.4s, v8.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v0.4S, v29.s[0] -ldr q20, [x0, #192] -mul v0.4S, v0.4S,v30.s[0] -sub v18.4s, v21.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -sub v11.4s, v9.4s, v17.4s -mla v0.4S, v8.4S, v31.s[0] -add v9.4s, v9.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v8.4s, v1.4s, v0.4s -mla v3.4S, v10.4S, v31.s[0] -add v1.4s, v1.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v10.4s, v12.4s, v3.4s -mla v22.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v17.4s, v20.4s, v22.4s -mla v16.4S, v0.4S, v31.s[0] -add v20.4s, v20.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -sub v0.4s, v12.4s, v16.4s -mla v21.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v3.4s, v20.4s, v21.4s -mla v19.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v22.4s, v9.4s, v19.4s -mla v13.4S, v16.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v29.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v16.4s, v1.4s, v13.4s -mla v14.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v13.4s -sqrdmulh v13.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v21.4s, v10.4s, v14.4s -mla v18.4S, v19.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v19.4s, v17.4s, v18.4s -mla v15.4S, v13.4S, v31.s[0] -add v17.4s, v17.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v13.4s, v11.4s, v15.4s -mla v2.4S, v14.4S, v31.s[0] -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v27.s[0] -mul v20.4S, v20.4S,v28.s[0] -sub v14.4s, v8.4s, v2.4s -mla v12.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v2.4s -sqrdmulh v2.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v18.4s, v9.4s, v12.4s -mla v20.4S, v15.4S, v31.s[0] -add v9.4s, v9.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -sub v15.4s, v1.4s, v20.4s -mla v0.4S, v2.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -sub v2.4s, v22.4s, v0.4s -mla v3.4S, v12.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v12.4s, v16.4s, v3.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v11.4s, v10.4s -mla v17.4S, v0.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -sqrdmulh v10.4S, v19.4S, v27.s[3] -mul v19.4S, v19.4S,v28.s[3] -sub v0.4s, v8.4s, v17.4s -mla v21.4S, v3.4S, v31.s[0] -add v8.4s, v8.4s, v17.4s -sqrdmulh v17.4S, v1.4S, v25.s[0] -mul v1.4S, v1.4S,v26.s[0] -sub v3.4s, v13.4s, v21.4s -mla v19.4S, v10.4S, v31.s[0] -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v15.4S, v25.s[1] -mul v15.4S, v15.4S,v26.s[1] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v17.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v17.4s, v9.4s, v1.4s -mla v15.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -sub v21.4s, v18.4s, v15.4s -mla v16.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -str q9, [x0, #0] -sqrdmulh v9.4S, v8.4S, v23.s[0] -str q17, [x0, #64] -mul v8.4S, v8.4S,v24.s[0] -ldr q17, [x0, #784] -sub v15.4s, v22.4s, v16.4s -ldr q19, [x0, #848] -mla v12.4S, v1.4S, v31.s[0] -add v22.4s, v22.4s, v16.4s -str q18, [x0, #128] -sqrdmulh v18.4S, v0.4S, v23.s[1] -str q21, [x0, #192] -mul v0.4S, v0.4S,v24.s[1] -ldr q21, [x0, #912] -sub v16.4s, v2.4s, v12.4s -ldr q1, [x0, #976] -mla v8.4S, v9.4S, v31.s[0] -add v2.4s, v2.4s, v12.4s -str q22, [x0, #256] -sqrdmulh v22.4S, v14.4S, v23.s[2] -str q15, [x0, #320] -mul v14.4S, v14.4S,v24.s[2] -ldr q15, [x0, #272] -sub v12.4s, v11.4s, v8.4s -ldr q9, [x0, #336] -mla v0.4S, v18.4S, v31.s[0] -add v11.4s, v11.4s, v8.4s -str q2, [x0, #384] -sqrdmulh v2.4S, v10.4S, v23.s[3] -str q16, [x0, #448] -mul v10.4S, v10.4S,v24.s[3] -ldr q16, [x0, #400] -sub v8.4s, v20.4s, v0.4s -ldr q18, [x0, #464] -mla v14.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v0.4s -str q11, [x0, #512] -sqrdmulh v11.4S, v17.4S, v29.s[0] -str q12, [x0, #576] -ldr q12, [x0, #528] -mul v17.4S, v17.4S,v30.s[0] -ldr q0, [x0, #592] -sub v22.4s, v13.4s, v14.4s -mla v10.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v14.4s -str q20, [x0, #640] -sqrdmulh v20.4S, v19.4S, v29.s[0] -str q8, [x0, #704] -ldr q8, [x0, #656] -mul v19.4S, v19.4S,v30.s[0] -ldr q14, [x0, #720] -sub v2.4s, v3.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v3.4s, v3.4s, v10.4s -str q13, [x0, #768] -sqrdmulh v13.4S, v21.4S, v29.s[0] -str q22, [x0, #832] -mul v21.4S, v21.4S,v30.s[0] -ldr q22, [x0, #16] -sub v10.4s, v15.4s, v17.4s -mla v19.4S, v20.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -str q3, [x0, #896] -sqrdmulh v3.4S, v1.4S, v29.s[0] -str q2, [x0, #960] -mul v1.4S, v1.4S,v30.s[0] -ldr q2, [x0, #80] -sub v17.4s, v9.4s, v19.4s -mla v21.4S, v13.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v12.4S, v29.s[0] -ldr q13, [x0, #144] -mul v12.4S, v12.4S,v30.s[0] -sub v20.4s, v16.4s, v21.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v0.4S, v29.s[0] -ldr q3, [x0, #208] -mul v0.4S, v0.4S,v30.s[0] -sub v11.4s, v18.4s, v1.4s -mla v12.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v1.4s -sqrdmulh v1.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v19.4s, v22.4s, v12.4s -mla v0.4S, v21.4S, v31.s[0] -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v2.4s, v0.4s -mla v8.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v1.4s, v13.4s, v8.4s -mla v14.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v12.4s, v3.4s, v14.4s -mla v16.4S, v0.4S, v31.s[0] -add v3.4s, v3.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -sub v0.4s, v13.4s, v16.4s -mla v18.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -sqrdmulh v16.4S, v9.4S, v29.s[1] -mul v9.4S, v9.4S,v30.s[1] -sub v8.4s, v3.4s, v18.4s -mla v15.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v14.4s, v22.4s, v15.4s -mla v9.4S, v16.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v16.4s, v2.4s, v9.4s -mla v20.4S, v18.4S, v31.s[0] -add v2.4s, v2.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v20.4s -mla v11.4S, v15.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v15.4s, v12.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v12.4s, v12.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v27.s[0] -mul v13.4S, v13.4S,v28.s[0] -sub v9.4s, v19.4s, v10.4s -mla v17.4S, v20.4S, v31.s[0] -add v19.4s, v19.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v27.s[0] -mul v3.4S, v3.4S,v28.s[0] -sub v20.4s, v21.4s, v17.4s -mla v13.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v11.4s, v22.4s, v13.4s -mla v3.4S, v10.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -sub v10.4s, v2.4s, v3.4s -mla v0.4S, v17.4S, v31.s[0] -add v2.4s, v2.4s, v3.4s -sqrdmulh v3.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v17.4s, v14.4s, v0.4s -mla v8.4S, v13.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[2] -mul v12.4S, v12.4S,v28.s[2] -sub v13.4s, v16.4s, v8.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v3.4s, v19.4s, v1.4s -mla v12.4S, v0.4S, v31.s[0] -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v0.4s, v21.4s, v12.4s -mla v18.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v10.4S, v25.s[1] -mul v10.4S, v10.4S,v26.s[1] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v12.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v12.4s, v22.4s, v2.4s -mla v10.4S, v18.4S, v31.s[0] -add v22.4s, v22.4s, v2.4s -sqrdmulh v2.4S, v13.4S, v25.s[3] -mul v13.4S, v13.4S,v26.s[3] -sub v18.4s, v11.4s, v10.4s -mla v16.4S, v15.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -str q22, [x0, #16] -sqrdmulh v22.4S, v21.4S, v23.s[0] -str q12, [x0, #80] -mul v21.4S, v21.4S,v24.s[0] -sub v12.4s, v14.4s, v16.4s -mla v13.4S, v2.4S, v31.s[0] -add v14.4s, v14.4s, v16.4s -str q11, [x0, #144] -sqrdmulh v11.4S, v0.4S, v23.s[1] -str q18, [x0, #208] -mul v0.4S, v0.4S,v24.s[1] -sub v18.4s, v17.4s, v13.4s -mla v21.4S, v22.4S, v31.s[0] -add v17.4s, v17.4s, v13.4s -str q14, [x0, #272] -sqrdmulh v14.4S, v20.4S, v23.s[2] -str q12, [x0, #336] -mul v20.4S, v20.4S,v24.s[2] -sub v12.4s, v19.4s, v21.4s -mla v0.4S, v11.4S, v31.s[0] -add v19.4s, v19.4s, v21.4s -str q17, [x0, #400] -sqrdmulh v17.4S, v1.4S, v23.s[3] -str q18, [x0, #464] -mul v1.4S, v1.4S,v24.s[3] -sub v18.4s, v3.4s, v0.4s -mla v20.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v0.4s -str q19, [x0, #528] -str q12, [x0, #592] -sub v12.4s, v9.4s, v20.4s -mla v1.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v20.4s -str q3, [x0, #656] -str q18, [x0, #720] -sub v18.4s, v8.4s, v1.4s -add v8.4s, v8.4s, v1.4s -str q9, [x0, #784] -str q12, [x0, #848] -str q8, [x0, #912] -str q18, [x0, #976] -ldr q4, [x0, #32] -ldr q5, [x0, #48] -ldr q6, [x17, #+128] -ldr q7, [x17, #+144] -ldr q15, [x17, #+256] -ldr q10, [x0, #96] -ldr q2, [x17, #+272] -ldr q16, [x0, #112] -sqrdmulh v22.4S, v4.4S, v7.s[0] -mul v4.4S, v4.4S,v6.s[0] -sqrdmulh v13.4S, v5.4S, v7.s[0] -mul v5.4S, v5.4S,v6.s[0] -mla v4.4S, v22.4S, v31.s[0] -ldr q22, [x0, #0] -sqrdmulh v11.4S, v10.4S, v2.s[0] -ldr q21, [x0, #16] -mul v10.4S, v10.4S,v15.s[0] -mla v5.4S, v13.4S, v31.s[0] -sub v13.4s, v22.4s, v4.4s -add v22.4s, v22.4s, v4.4s -sqrdmulh v4.4S, v16.4S, v2.s[0] -ldr q14, [x0, #160] -mul v16.4S, v16.4S,v15.s[0] -ldr q0, [x0, #176] -mla v10.4S, v11.4S, v31.s[0] -ldr q11, [x0, #64] -sub v19.4s, v21.4s, v5.4s -add v21.4s, v21.4s, v5.4s -sqrdmulh v5.4S, v21.4S, v7.s[1] -ldr q17, [x0, #128] -mul v21.4S, v21.4S,v6.s[1] -ldr q20, [x0, #144] -mla v16.4S, v4.4S, v31.s[0] -ldr q4, [x0, #80] -sub v3.4s, v11.4s, v10.4s -add v11.4s, v11.4s, v10.4s -sqrdmulh v10.4S, v19.4S, v7.s[2] -ldr q1, [x17, #+384] -mul v19.4S, v19.4S,v6.s[2] -ldr q9, [x17, #+400] -mla v21.4S, v5.4S, v31.s[0] -sub v5.4s, v4.4s, v16.4s -add v4.4s, v4.4s, v16.4s -sqrdmulh v16.4S, v4.4S, v2.s[1] -ldr q12, [x0, #224] -mul v4.4S, v4.4S,v15.s[1] -ldr q8, [x0, #240] -mla v19.4S, v10.4S, v31.s[0] -sub v10.4s, v22.4s, v21.4s -add v22.4s, v22.4s, v21.4s -sqrdmulh v7.4S, v5.4S, v2.s[2] -ldr q6, [x0, #192] -mul v5.4S, v5.4S,v15.s[2] -ldr q21, [x0, #208] -mla v4.4S, v16.4S, v31.s[0] -sub v16.4s, v13.4s, v19.4s -add v13.4s, v13.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v9.s[0] -ldr q18, [x17, #+512] -mul v14.4S, v14.4S,v1.s[0] -ldr q30, [x17, #+528] -trn1 v29.4S, v22.4S, v10.4S -trn2 v28.4S, v22.4S, v10.4S -mla v5.4S, v7.4S, v31.s[0] -sub v7.4s, v11.4s, v4.4s -add v11.4s, v11.4s, v4.4s -sqrdmulh v2.4S, v0.4S, v9.s[0] -mul v0.4S, v0.4S,v1.s[0] -trn1 v15.4S, v13.4S, v16.4S -trn2 v4.4S, v13.4S, v16.4S -mla v14.4S, v19.4S, v31.s[0] -sub v19.4s, v3.4s, v5.4s -add v3.4s, v3.4s, v5.4s -sqrdmulh v5.4S, v12.4S, v30.s[0] -mul v12.4S, v12.4S,v18.s[0] -trn2 v13.2D, v29.2D, v15.2D -trn2 v16.2D, v28.2D, v4.2D -mla v0.4S, v2.4S, v31.s[0] -sub v2.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -sqrdmulh v14.4S, v8.4S, v30.s[0] -mul v8.4S, v8.4S,v18.s[0] -trn1 v22.2D, v29.2D, v15.2D -trn1 v10.2D, v28.2D, v4.2D -mla v12.4S, v5.4S, v31.s[0] -sub v5.4s, v20.4s, v0.4s -add v20.4s, v20.4s, v0.4s -sqrdmulh v0.4S, v20.4S, v9.s[1] -mul v20.4S, v20.4S,v1.s[1] -trn1 v4.4S, v11.4S, v7.4S -trn2 v28.4S, v11.4S, v7.4S -mla v8.4S, v14.4S, v31.s[0] -sub v14.4s, v6.4s, v12.4s -add v6.4s, v6.4s, v12.4s -sqrdmulh v12.4S, v5.4S, v9.s[2] -mul v5.4S, v5.4S,v1.s[2] -trn1 v15.4S, v3.4S, v19.4S -trn2 v29.4S, v3.4S, v19.4S -ldr q27, [x17, #+160] -ldr q26, [x17, #+176] -mla v20.4S, v0.4S, v31.s[0] -sub v0.4s, v21.4s, v8.4s -add v21.4s, v21.4s, v8.4s -sqrdmulh v8.4S, v21.4S, v30.s[1] -mul v21.4S, v21.4S,v18.s[1] -trn2 v3.2D, v4.2D, v15.2D -trn2 v19.2D, v28.2D, v29.2D -mla v5.4S, v12.4S, v31.s[0] -sub v12.4s, v17.4s, v20.4s -add v17.4s, v17.4s, v20.4s -sqrdmulh v9.4S, v0.4S, v30.s[2] -mul v0.4S, v0.4S,v18.s[2] -trn1 v11.2D, v4.2D, v15.2D -trn1 v7.2D, v28.2D, v29.2D -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v2.4s, v5.4s -add v2.4s, v2.4s, v5.4s -sqrdmulh v5.4S, v13.4S, v26.4S -mul v13.4S, v13.4S,v27.4S -mla v0.4S, v9.4S, v31.s[0] -sub v9.4s, v6.4s, v21.4s -add v6.4s, v6.4s, v21.4s -ldr q30, [x17, #+288] -ldr q18, [x17, #+304] -sqrdmulh v21.4S, v16.4S, v26.4S -mul v16.4S, v16.4S,v27.4S -trn1 v29.4S, v17.4S, v12.4S -trn2 v28.4S, v17.4S, v12.4S -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v14.4s, v0.4s -add v14.4s, v14.4s, v0.4s -ldr q26, [x17, #+192] -ldr q27, [x17, #+208] -trn1 v0.4S, v2.4S, v8.4S -trn2 v15.4S, v2.4S, v8.4S -sqrdmulh v4.4S, v3.4S, v18.4S -mul v3.4S, v3.4S,v30.4S -trn2 v2.2D, v29.2D, v0.2D -trn2 v8.2D, v28.2D, v15.2D -ldr q1, [x17, #+224] -ldr q20, [x17, #+240] -mla v16.4S, v21.4S, v31.s[0] -sub v21.4s, v22.4s, v13.4s -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v19.4S, v18.4S -mul v19.4S, v19.4S,v30.4S -trn1 v17.2D, v29.2D, v0.2D -trn1 v12.2D, v28.2D, v15.2D -ldr q15, [x17, #+320] -ldr q28, [x17, #+336] -mla v3.4S, v4.4S, v31.s[0] -sub v4.4s, v10.4s, v16.4s -add v10.4s, v10.4s, v16.4s -sqrdmulh v16.4S, v10.4S, v27.4S -mul v10.4S, v10.4S,v26.4S -trn1 v27.4S, v6.4S, v9.4S -trn2 v26.4S, v6.4S, v9.4S -ldr q0, [x17, #+352] -ldr q29, [x17, #+368] -mla v19.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v3.4s -add v11.4s, v11.4s, v3.4s -sqrdmulh v3.4S, v4.4S, v20.4S -mul v4.4S, v4.4S,v1.4S -trn1 v20.4S, v14.4S, v5.4S -trn2 v1.4S, v14.4S, v5.4S -mla v10.4S, v16.4S, v31.s[0] -sub v16.4s, v7.4s, v19.4s -add v7.4s, v7.4s, v19.4s -sqrdmulh v19.4S, v7.4S, v28.4S -mul v7.4S, v7.4S,v15.4S -ldr q18, [x17, #+416] -ldr q30, [x17, #+432] -trn2 v14.2D, v27.2D, v20.2D -trn2 v5.2D, v26.2D, v1.2D -mla v4.4S, v3.4S, v31.s[0] -sub v3.4s, v22.4s, v10.4s -add v22.4s, v22.4s, v10.4s -sqrdmulh v10.4S, v16.4S, v29.4S -mul v16.4S, v16.4S,v0.4S -trn1 v6.2D, v27.2D, v20.2D -trn1 v9.2D, v26.2D, v1.2D -mla v7.4S, v19.4S, v31.s[0] -sub v19.4s, v21.4s, v4.4s -add v21.4s, v21.4s, v4.4s -sqrdmulh v4.4S, v2.4S, v30.4S -ldr q1, [x17, #+544] -ldr q26, [x17, #+560] -mul v2.4S, v2.4S,v18.4S -str q22, [x0, #0] -str q3, [x0, #16] -mla v16.4S, v10.4S, v31.s[0] -sub v10.4s, v11.4s, v7.4s -add v11.4s, v11.4s, v7.4s -sqrdmulh v7.4S, v8.4S, v30.4S -mul v8.4S, v8.4S,v18.4S -str q21, [x0, #32] -mla v2.4S, v4.4S, v31.s[0] -sub v4.4s, v13.4s, v16.4s -add v13.4s, v13.4s, v16.4s -ldr q16, [x17, #+448] -ldr q21, [x17, #+464] -sqrdmulh v30.4S, v14.4S, v26.4S -str q11, [x0, #64] -mul v14.4S, v14.4S,v1.4S -str q19, [x0, #48] -mla v8.4S, v7.4S, v31.s[0] -str q10, [x0, #80] -sub v10.4s, v17.4s, v2.4s -add v17.4s, v17.4s, v2.4s -ldr q2, [x17, #+480] -ldr q7, [x17, #+496] -sqrdmulh v19.4S, v5.4S, v26.4S -mul v5.4S, v5.4S,v1.4S -str q13, [x0, #96] -mla v14.4S, v30.4S, v31.s[0] -sub v30.4s, v12.4s, v8.4s -add v12.4s, v12.4s, v8.4s -ldr q8, [x17, #+576] -ldr q13, [x17, #+592] -sqrdmulh v26.4S, v12.4S, v21.4S -mul v12.4S, v12.4S,v16.4S -ldr q21, [x17, #+608] -ldr q16, [x17, #+624] -str q4, [x0, #112] -mla v5.4S, v19.4S, v31.s[0] -sub v19.4s, v6.4s, v14.4s -add v6.4s, v6.4s, v14.4s -sqrdmulh v14.4S, v30.4S, v7.4S -mul v30.4S, v30.4S,v2.4S -mla v12.4S, v26.4S, v31.s[0] -sub v26.4s, v9.4s, v5.4s -add v9.4s, v9.4s, v5.4s -sqrdmulh v5.4S, v9.4S, v13.4S -mul v9.4S, v9.4S,v8.4S -ldr q13, [x0, #288] -mla v30.4S, v14.4S, v31.s[0] -ldr q14, [x0, #304] -sub v8.4s, v17.4s, v12.4s -add v17.4s, v17.4s, v12.4s -sqrdmulh v12.4S, v26.4S, v16.4S -ldr q7, [x17, #+640] -mul v26.4S, v26.4S,v21.4S -ldr q16, [x17, #+656] -mla v9.4S, v5.4S, v31.s[0] -ldr q5, [x17, #+768] -sub v21.4s, v10.4s, v30.4s -ldr q2, [x0, #352] -add v10.4s, v10.4s, v30.4s -ldr q30, [x17, #+784] -ldr q4, [x0, #368] -sqrdmulh v1.4S, v13.4S, v16.s[0] -mul v13.4S, v13.4S,v7.s[0] -mla v26.4S, v12.4S, v31.s[0] -sub v12.4s, v6.4s, v9.4s -add v6.4s, v6.4s, v9.4s -sqrdmulh v9.4S, v14.4S, v16.s[0] -mul v14.4S, v14.4S,v7.s[0] -str q17, [x0, #128] -str q8, [x0, #144] -str q10, [x0, #160] -str q21, [x0, #176] -mla v13.4S, v1.4S, v31.s[0] -sub v1.4s, v19.4s, v26.4s -add v19.4s, v19.4s, v26.4s -ldr q26, [x0, #256] -sqrdmulh v21.4S, v2.4S, v30.s[0] -ldr q10, [x0, #272] -mul v2.4S, v2.4S,v5.s[0] -str q6, [x0, #192] -str q12, [x0, #208] -str q19, [x0, #224] -str q1, [x0, #240] -mla v14.4S, v9.4S, v31.s[0] -sub v9.4s, v26.4s, v13.4s -add v26.4s, v26.4s, v13.4s -sqrdmulh v13.4S, v4.4S, v30.s[0] -ldr q1, [x0, #416] -mul v4.4S, v4.4S,v5.s[0] -ldr q19, [x0, #432] -mla v2.4S, v21.4S, v31.s[0] -ldr q21, [x0, #320] -sub v12.4s, v10.4s, v14.4s -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v10.4S, v16.s[1] -ldr q6, [x0, #384] -mul v10.4S, v10.4S,v7.s[1] -ldr q8, [x0, #400] -mla v4.4S, v13.4S, v31.s[0] -ldr q13, [x0, #336] -sub v17.4s, v21.4s, v2.4s -add v21.4s, v21.4s, v2.4s -sqrdmulh v2.4S, v12.4S, v16.s[2] -ldr q11, [x17, #+896] -mul v12.4S, v12.4S,v7.s[2] -ldr q18, [x17, #+912] -mla v10.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v4.4s -add v13.4s, v13.4s, v4.4s -sqrdmulh v4.4S, v13.4S, v30.s[1] -ldr q3, [x0, #480] -mul v13.4S, v13.4S,v5.s[1] -ldr q22, [x0, #496] -mla v12.4S, v2.4S, v31.s[0] -sub v2.4s, v26.4s, v10.4s -add v26.4s, v26.4s, v10.4s -sqrdmulh v16.4S, v14.4S, v30.s[2] -ldr q7, [x0, #448] -mul v14.4S, v14.4S,v5.s[2] -ldr q10, [x0, #464] -mla v13.4S, v4.4S, v31.s[0] -sub v4.4s, v9.4s, v12.4s -add v9.4s, v9.4s, v12.4s -sqrdmulh v12.4S, v1.4S, v18.s[0] -ldr q20, [x17, #+1024] -mul v1.4S, v1.4S,v11.s[0] -ldr q27, [x17, #+1040] -trn1 v29.4S, v26.4S, v2.4S -trn2 v0.4S, v26.4S, v2.4S -mla v14.4S, v16.4S, v31.s[0] -sub v16.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v30.4S, v19.4S, v18.s[0] -mul v19.4S, v19.4S,v11.s[0] -trn1 v5.4S, v9.4S, v4.4S -trn2 v13.4S, v9.4S, v4.4S -mla v1.4S, v12.4S, v31.s[0] -sub v12.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v27.s[0] -mul v3.4S, v3.4S,v20.s[0] -trn2 v9.2D, v29.2D, v5.2D -trn2 v4.2D, v0.2D, v13.2D -mla v19.4S, v30.4S, v31.s[0] -sub v30.4s, v6.4s, v1.4s -add v6.4s, v6.4s, v1.4s -sqrdmulh v1.4S, v22.4S, v27.s[0] -mul v22.4S, v22.4S,v20.s[0] -trn1 v26.2D, v29.2D, v5.2D -trn1 v2.2D, v0.2D, v13.2D -mla v3.4S, v14.4S, v31.s[0] -sub v14.4s, v8.4s, v19.4s -add v8.4s, v8.4s, v19.4s -sqrdmulh v19.4S, v8.4S, v18.s[1] -mul v8.4S, v8.4S,v11.s[1] -trn1 v13.4S, v21.4S, v16.4S -trn2 v0.4S, v21.4S, v16.4S -mla v22.4S, v1.4S, v31.s[0] -sub v1.4s, v7.4s, v3.4s -add v7.4s, v7.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v18.s[2] -mul v14.4S, v14.4S,v11.s[2] -trn1 v5.4S, v17.4S, v12.4S -trn2 v29.4S, v17.4S, v12.4S -ldr q28, [x17, #+672] -ldr q15, [x17, #+688] -mla v8.4S, v19.4S, v31.s[0] -sub v19.4s, v10.4s, v22.4s -add v10.4s, v10.4s, v22.4s -sqrdmulh v22.4S, v10.4S, v27.s[1] -mul v10.4S, v10.4S,v20.s[1] -trn2 v17.2D, v13.2D, v5.2D -trn2 v12.2D, v0.2D, v29.2D -mla v14.4S, v3.4S, v31.s[0] -sub v3.4s, v6.4s, v8.4s -add v6.4s, v6.4s, v8.4s -sqrdmulh v18.4S, v19.4S, v27.s[2] -mul v19.4S, v19.4S,v20.s[2] -trn1 v21.2D, v13.2D, v5.2D -trn1 v16.2D, v0.2D, v29.2D -mla v10.4S, v22.4S, v31.s[0] -sub v22.4s, v30.4s, v14.4s -add v30.4s, v30.4s, v14.4s -sqrdmulh v14.4S, v9.4S, v15.4S -mul v9.4S, v9.4S,v28.4S -mla v19.4S, v18.4S, v31.s[0] -sub v18.4s, v7.4s, v10.4s -add v7.4s, v7.4s, v10.4s -ldr q27, [x17, #+800] -ldr q20, [x17, #+816] -sqrdmulh v10.4S, v4.4S, v15.4S -mul v4.4S, v4.4S,v28.4S -trn1 v29.4S, v6.4S, v3.4S -trn2 v0.4S, v6.4S, v3.4S -mla v9.4S, v14.4S, v31.s[0] -sub v14.4s, v1.4s, v19.4s -add v1.4s, v1.4s, v19.4s -ldr q15, [x17, #+704] -ldr q28, [x17, #+720] -trn1 v19.4S, v30.4S, v22.4S -trn2 v5.4S, v30.4S, v22.4S -sqrdmulh v13.4S, v17.4S, v20.4S -mul v17.4S, v17.4S,v27.4S -trn2 v30.2D, v29.2D, v19.2D -trn2 v22.2D, v0.2D, v5.2D -ldr q11, [x17, #+736] -ldr q8, [x17, #+752] -mla v4.4S, v10.4S, v31.s[0] -sub v10.4s, v26.4s, v9.4s -add v26.4s, v26.4s, v9.4s -sqrdmulh v9.4S, v12.4S, v20.4S -mul v12.4S, v12.4S,v27.4S -trn1 v6.2D, v29.2D, v19.2D -trn1 v3.2D, v0.2D, v5.2D -ldr q5, [x17, #+832] -ldr q0, [x17, #+848] -mla v17.4S, v13.4S, v31.s[0] -sub v13.4s, v2.4s, v4.4s -add v2.4s, v2.4s, v4.4s -sqrdmulh v4.4S, v2.4S, v28.4S -mul v2.4S, v2.4S,v15.4S -trn1 v28.4S, v7.4S, v18.4S -trn2 v15.4S, v7.4S, v18.4S -ldr q19, [x17, #+864] -ldr q29, [x17, #+880] -mla v12.4S, v9.4S, v31.s[0] -sub v9.4s, v21.4s, v17.4s -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v13.4S, v8.4S -mul v13.4S, v13.4S,v11.4S -trn1 v8.4S, v1.4S, v14.4S -trn2 v11.4S, v1.4S, v14.4S -mla v2.4S, v4.4S, v31.s[0] -sub v4.4s, v16.4s, v12.4s -add v16.4s, v16.4s, v12.4s -sqrdmulh v12.4S, v16.4S, v0.4S -mul v16.4S, v16.4S,v5.4S -ldr q20, [x17, #+928] -ldr q27, [x17, #+944] -trn2 v1.2D, v28.2D, v8.2D -trn2 v14.2D, v15.2D, v11.2D -mla v13.4S, v17.4S, v31.s[0] -sub v17.4s, v26.4s, v2.4s -add v26.4s, v26.4s, v2.4s -sqrdmulh v2.4S, v4.4S, v29.4S -mul v4.4S, v4.4S,v19.4S -trn1 v7.2D, v28.2D, v8.2D -trn1 v18.2D, v15.2D, v11.2D -mla v16.4S, v12.4S, v31.s[0] -sub v12.4s, v10.4s, v13.4s -add v10.4s, v10.4s, v13.4s -sqrdmulh v13.4S, v30.4S, v27.4S -ldr q11, [x17, #+1056] -ldr q15, [x17, #+1072] -mul v30.4S, v30.4S,v20.4S -str q26, [x0, #256] -str q17, [x0, #272] -mla v4.4S, v2.4S, v31.s[0] -sub v2.4s, v21.4s, v16.4s -add v21.4s, v21.4s, v16.4s -sqrdmulh v16.4S, v22.4S, v27.4S -mul v22.4S, v22.4S,v20.4S -str q10, [x0, #288] -mla v30.4S, v13.4S, v31.s[0] -sub v13.4s, v9.4s, v4.4s -add v9.4s, v9.4s, v4.4s -ldr q4, [x17, #+960] -ldr q10, [x17, #+976] -sqrdmulh v27.4S, v1.4S, v15.4S -str q21, [x0, #320] -mul v1.4S, v1.4S,v11.4S -str q12, [x0, #304] -mla v22.4S, v16.4S, v31.s[0] -str q2, [x0, #336] -sub v2.4s, v6.4s, v30.4s -add v6.4s, v6.4s, v30.4s -ldr q30, [x17, #+992] -ldr q16, [x17, #+1008] -sqrdmulh v12.4S, v14.4S, v15.4S -mul v14.4S, v14.4S,v11.4S -str q9, [x0, #352] -mla v1.4S, v27.4S, v31.s[0] -sub v27.4s, v3.4s, v22.4s -add v3.4s, v3.4s, v22.4s -ldr q22, [x17, #+1088] -ldr q9, [x17, #+1104] -sqrdmulh v15.4S, v3.4S, v10.4S -mul v3.4S, v3.4S,v4.4S -ldr q10, [x17, #+1120] -ldr q4, [x17, #+1136] -str q13, [x0, #368] -mla v14.4S, v12.4S, v31.s[0] -sub v12.4s, v7.4s, v1.4s -add v7.4s, v7.4s, v1.4s -sqrdmulh v1.4S, v27.4S, v16.4S -mul v27.4S, v27.4S,v30.4S -mla v3.4S, v15.4S, v31.s[0] -sub v15.4s, v18.4s, v14.4s -add v18.4s, v18.4s, v14.4s -sqrdmulh v14.4S, v18.4S, v9.4S -mul v18.4S, v18.4S,v22.4S -ldr q9, [x0, #544] -mla v27.4S, v1.4S, v31.s[0] -ldr q1, [x0, #560] -sub v22.4s, v6.4s, v3.4s -add v6.4s, v6.4s, v3.4s -sqrdmulh v3.4S, v15.4S, v4.4S -ldr q16, [x17, #+1152] -mul v15.4S, v15.4S,v10.4S -ldr q4, [x17, #+1168] -mla v18.4S, v14.4S, v31.s[0] -ldr q14, [x17, #+1280] -sub v10.4s, v2.4s, v27.4s -ldr q30, [x0, #608] -add v2.4s, v2.4s, v27.4s -ldr q27, [x17, #+1296] -ldr q13, [x0, #624] -sqrdmulh v11.4S, v9.4S, v4.s[0] -mul v9.4S, v9.4S,v16.s[0] -mla v15.4S, v3.4S, v31.s[0] -sub v3.4s, v7.4s, v18.4s -add v7.4s, v7.4s, v18.4s -sqrdmulh v18.4S, v1.4S, v4.s[0] -mul v1.4S, v1.4S,v16.s[0] -str q6, [x0, #384] -str q22, [x0, #400] -str q2, [x0, #416] -str q10, [x0, #432] -mla v9.4S, v11.4S, v31.s[0] -sub v11.4s, v12.4s, v15.4s -add v12.4s, v12.4s, v15.4s -ldr q15, [x0, #512] -sqrdmulh v10.4S, v30.4S, v27.s[0] -ldr q2, [x0, #528] -mul v30.4S, v30.4S,v14.s[0] -str q7, [x0, #448] -str q3, [x0, #464] -str q12, [x0, #480] -str q11, [x0, #496] -mla v1.4S, v18.4S, v31.s[0] -sub v18.4s, v15.4s, v9.4s -add v15.4s, v15.4s, v9.4s -sqrdmulh v9.4S, v13.4S, v27.s[0] -ldr q11, [x0, #672] -mul v13.4S, v13.4S,v14.s[0] -ldr q12, [x0, #688] -mla v30.4S, v10.4S, v31.s[0] -ldr q10, [x0, #576] -sub v3.4s, v2.4s, v1.4s -add v2.4s, v2.4s, v1.4s -sqrdmulh v1.4S, v2.4S, v4.s[1] -ldr q7, [x0, #640] -mul v2.4S, v2.4S,v16.s[1] -ldr q22, [x0, #656] -mla v13.4S, v9.4S, v31.s[0] -ldr q9, [x0, #592] -sub v6.4s, v10.4s, v30.4s -add v10.4s, v10.4s, v30.4s -sqrdmulh v30.4S, v3.4S, v4.s[2] -ldr q21, [x17, #+1408] -mul v3.4S, v3.4S,v16.s[2] -ldr q20, [x17, #+1424] -mla v2.4S, v1.4S, v31.s[0] -sub v1.4s, v9.4s, v13.4s -add v9.4s, v9.4s, v13.4s -sqrdmulh v13.4S, v9.4S, v27.s[1] -ldr q17, [x0, #736] -mul v9.4S, v9.4S,v14.s[1] -ldr q26, [x0, #752] -mla v3.4S, v30.4S, v31.s[0] -sub v30.4s, v15.4s, v2.4s -add v15.4s, v15.4s, v2.4s -sqrdmulh v4.4S, v1.4S, v27.s[2] -ldr q16, [x0, #704] -mul v1.4S, v1.4S,v14.s[2] -ldr q2, [x0, #720] -mla v9.4S, v13.4S, v31.s[0] -sub v13.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v11.4S, v20.s[0] -ldr q8, [x17, #+1536] -mul v11.4S, v11.4S,v21.s[0] -ldr q28, [x17, #+1552] -trn1 v29.4S, v15.4S, v30.4S -trn2 v19.4S, v15.4S, v30.4S -mla v1.4S, v4.4S, v31.s[0] -sub v4.4s, v10.4s, v9.4s -add v10.4s, v10.4s, v9.4s -sqrdmulh v27.4S, v12.4S, v20.s[0] -mul v12.4S, v12.4S,v21.s[0] -trn1 v14.4S, v18.4S, v13.4S -trn2 v9.4S, v18.4S, v13.4S -mla v11.4S, v3.4S, v31.s[0] -sub v3.4s, v6.4s, v1.4s -add v6.4s, v6.4s, v1.4s -sqrdmulh v1.4S, v17.4S, v28.s[0] -mul v17.4S, v17.4S,v8.s[0] -trn2 v18.2D, v29.2D, v14.2D -trn2 v13.2D, v19.2D, v9.2D -mla v12.4S, v27.4S, v31.s[0] -sub v27.4s, v7.4s, v11.4s -add v7.4s, v7.4s, v11.4s -sqrdmulh v11.4S, v26.4S, v28.s[0] -mul v26.4S, v26.4S,v8.s[0] -trn1 v15.2D, v29.2D, v14.2D -trn1 v30.2D, v19.2D, v9.2D -mla v17.4S, v1.4S, v31.s[0] -sub v1.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v22.4S, v20.s[1] -mul v22.4S, v22.4S,v21.s[1] -trn1 v9.4S, v10.4S, v4.4S -trn2 v19.4S, v10.4S, v4.4S -mla v26.4S, v11.4S, v31.s[0] -sub v11.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -sqrdmulh v17.4S, v1.4S, v20.s[2] -mul v1.4S, v1.4S,v21.s[2] -trn1 v14.4S, v6.4S, v3.4S -trn2 v29.4S, v6.4S, v3.4S -ldr q0, [x17, #+1184] -ldr q5, [x17, #+1200] -mla v22.4S, v12.4S, v31.s[0] -sub v12.4s, v2.4s, v26.4s -add v2.4s, v2.4s, v26.4s -sqrdmulh v26.4S, v2.4S, v28.s[1] -mul v2.4S, v2.4S,v8.s[1] -trn2 v6.2D, v9.2D, v14.2D -trn2 v3.2D, v19.2D, v29.2D -mla v1.4S, v17.4S, v31.s[0] -sub v17.4s, v7.4s, v22.4s -add v7.4s, v7.4s, v22.4s -sqrdmulh v20.4S, v12.4S, v28.s[2] -mul v12.4S, v12.4S,v8.s[2] -trn1 v10.2D, v9.2D, v14.2D -trn1 v4.2D, v19.2D, v29.2D -mla v2.4S, v26.4S, v31.s[0] -sub v26.4s, v27.4s, v1.4s -add v27.4s, v27.4s, v1.4s -sqrdmulh v1.4S, v18.4S, v5.4S -mul v18.4S, v18.4S,v0.4S -mla v12.4S, v20.4S, v31.s[0] -sub v20.4s, v16.4s, v2.4s -add v16.4s, v16.4s, v2.4s -ldr q28, [x17, #+1312] -ldr q8, [x17, #+1328] -sqrdmulh v2.4S, v13.4S, v5.4S -mul v13.4S, v13.4S,v0.4S -trn1 v29.4S, v7.4S, v17.4S -trn2 v19.4S, v7.4S, v17.4S -mla v18.4S, v1.4S, v31.s[0] -sub v1.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -ldr q5, [x17, #+1216] -ldr q0, [x17, #+1232] -trn1 v12.4S, v27.4S, v26.4S -trn2 v14.4S, v27.4S, v26.4S -sqrdmulh v9.4S, v6.4S, v8.4S -mul v6.4S, v6.4S,v28.4S -trn2 v27.2D, v29.2D, v12.2D -trn2 v26.2D, v19.2D, v14.2D -ldr q21, [x17, #+1248] -ldr q22, [x17, #+1264] -mla v13.4S, v2.4S, v31.s[0] -sub v2.4s, v15.4s, v18.4s -add v15.4s, v15.4s, v18.4s -sqrdmulh v18.4S, v3.4S, v8.4S -mul v3.4S, v3.4S,v28.4S -trn1 v7.2D, v29.2D, v12.2D -trn1 v17.2D, v19.2D, v14.2D -ldr q14, [x17, #+1344] -ldr q19, [x17, #+1360] -mla v6.4S, v9.4S, v31.s[0] -sub v9.4s, v30.4s, v13.4s -add v30.4s, v30.4s, v13.4s -sqrdmulh v13.4S, v30.4S, v0.4S -mul v30.4S, v30.4S,v5.4S -trn1 v0.4S, v16.4S, v20.4S -trn2 v5.4S, v16.4S, v20.4S -ldr q12, [x17, #+1376] -ldr q29, [x17, #+1392] -mla v3.4S, v18.4S, v31.s[0] -sub v18.4s, v10.4s, v6.4s -add v10.4s, v10.4s, v6.4s -sqrdmulh v6.4S, v9.4S, v22.4S -mul v9.4S, v9.4S,v21.4S -trn1 v22.4S, v11.4S, v1.4S -trn2 v21.4S, v11.4S, v1.4S -mla v30.4S, v13.4S, v31.s[0] -sub v13.4s, v4.4s, v3.4s -add v4.4s, v4.4s, v3.4s -sqrdmulh v3.4S, v4.4S, v19.4S -mul v4.4S, v4.4S,v14.4S -ldr q8, [x17, #+1440] -ldr q28, [x17, #+1456] -trn2 v11.2D, v0.2D, v22.2D -trn2 v1.2D, v5.2D, v21.2D -mla v9.4S, v6.4S, v31.s[0] -sub v6.4s, v15.4s, v30.4s -add v15.4s, v15.4s, v30.4s -sqrdmulh v30.4S, v13.4S, v29.4S -mul v13.4S, v13.4S,v12.4S -trn1 v16.2D, v0.2D, v22.2D -trn1 v20.2D, v5.2D, v21.2D -mla v4.4S, v3.4S, v31.s[0] -sub v3.4s, v2.4s, v9.4s -add v2.4s, v2.4s, v9.4s -sqrdmulh v9.4S, v27.4S, v28.4S -ldr q21, [x17, #+1568] -ldr q5, [x17, #+1584] -mul v27.4S, v27.4S,v8.4S -str q15, [x0, #512] -str q6, [x0, #528] -mla v13.4S, v30.4S, v31.s[0] -sub v30.4s, v10.4s, v4.4s -add v10.4s, v10.4s, v4.4s -sqrdmulh v4.4S, v26.4S, v28.4S -mul v26.4S, v26.4S,v8.4S -str q2, [x0, #544] -mla v27.4S, v9.4S, v31.s[0] -sub v9.4s, v18.4s, v13.4s -add v18.4s, v18.4s, v13.4s -ldr q13, [x17, #+1472] -ldr q2, [x17, #+1488] -sqrdmulh v28.4S, v11.4S, v5.4S -str q10, [x0, #576] -mul v11.4S, v11.4S,v21.4S -str q3, [x0, #560] -mla v26.4S, v4.4S, v31.s[0] -str q30, [x0, #592] -sub v30.4s, v7.4s, v27.4s -add v7.4s, v7.4s, v27.4s -ldr q27, [x17, #+1504] -ldr q4, [x17, #+1520] -sqrdmulh v3.4S, v1.4S, v5.4S -mul v1.4S, v1.4S,v21.4S -str q18, [x0, #608] -mla v11.4S, v28.4S, v31.s[0] -sub v28.4s, v17.4s, v26.4s -add v17.4s, v17.4s, v26.4s -ldr q26, [x17, #+1600] -ldr q18, [x17, #+1616] -sqrdmulh v5.4S, v17.4S, v2.4S -mul v17.4S, v17.4S,v13.4S -ldr q2, [x17, #+1632] -ldr q13, [x17, #+1648] -str q9, [x0, #624] -mla v1.4S, v3.4S, v31.s[0] -sub v3.4s, v16.4s, v11.4s -add v16.4s, v16.4s, v11.4s -sqrdmulh v11.4S, v28.4S, v4.4S -mul v28.4S, v28.4S,v27.4S -mla v17.4S, v5.4S, v31.s[0] -sub v5.4s, v20.4s, v1.4s -add v20.4s, v20.4s, v1.4s -sqrdmulh v1.4S, v20.4S, v18.4S -mul v20.4S, v20.4S,v26.4S -ldr q18, [x0, #800] -mla v28.4S, v11.4S, v31.s[0] -ldr q11, [x0, #816] -sub v26.4s, v7.4s, v17.4s -add v7.4s, v7.4s, v17.4s -sqrdmulh v17.4S, v5.4S, v13.4S -ldr q4, [x17, #+1664] -mul v5.4S, v5.4S,v2.4S -ldr q13, [x17, #+1680] -mla v20.4S, v1.4S, v31.s[0] -ldr q1, [x17, #+1792] -sub v2.4s, v30.4s, v28.4s -ldr q27, [x0, #864] -add v30.4s, v30.4s, v28.4s -ldr q28, [x17, #+1808] -ldr q9, [x0, #880] -sqrdmulh v21.4S, v18.4S, v13.s[0] -mul v18.4S, v18.4S,v4.s[0] -mla v5.4S, v17.4S, v31.s[0] -sub v17.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v11.4S, v13.s[0] -mul v11.4S, v11.4S,v4.s[0] -str q7, [x0, #640] -str q26, [x0, #656] -str q30, [x0, #672] -str q2, [x0, #688] -mla v18.4S, v21.4S, v31.s[0] -sub v21.4s, v3.4s, v5.4s -add v3.4s, v3.4s, v5.4s -ldr q5, [x0, #768] -sqrdmulh v2.4S, v27.4S, v28.s[0] -ldr q30, [x0, #784] -mul v27.4S, v27.4S,v1.s[0] -str q16, [x0, #704] -str q17, [x0, #720] -str q3, [x0, #736] -str q21, [x0, #752] -mla v11.4S, v20.4S, v31.s[0] -sub v20.4s, v5.4s, v18.4s -add v5.4s, v5.4s, v18.4s -sqrdmulh v18.4S, v9.4S, v28.s[0] -ldr q21, [x0, #928] -mul v9.4S, v9.4S,v1.s[0] -ldr q3, [x0, #944] -mla v27.4S, v2.4S, v31.s[0] -ldr q2, [x0, #832] -sub v17.4s, v30.4s, v11.4s -add v30.4s, v30.4s, v11.4s -sqrdmulh v11.4S, v30.4S, v13.s[1] -ldr q16, [x0, #896] -mul v30.4S, v30.4S,v4.s[1] -ldr q26, [x0, #912] -mla v9.4S, v18.4S, v31.s[0] -ldr q18, [x0, #848] -sub v7.4s, v2.4s, v27.4s -add v2.4s, v2.4s, v27.4s -sqrdmulh v27.4S, v17.4S, v13.s[2] -ldr q10, [x17, #+1920] -mul v17.4S, v17.4S,v4.s[2] -ldr q8, [x17, #+1936] -mla v30.4S, v11.4S, v31.s[0] -sub v11.4s, v18.4s, v9.4s -add v18.4s, v18.4s, v9.4s -sqrdmulh v9.4S, v18.4S, v28.s[1] -ldr q6, [x0, #992] -mul v18.4S, v18.4S,v1.s[1] -ldr q15, [x0, #1008] -mla v17.4S, v27.4S, v31.s[0] -sub v27.4s, v5.4s, v30.4s -add v5.4s, v5.4s, v30.4s -sqrdmulh v13.4S, v11.4S, v28.s[2] -ldr q4, [x0, #960] -mul v11.4S, v11.4S,v1.s[2] -ldr q30, [x0, #976] -mla v18.4S, v9.4S, v31.s[0] -sub v9.4s, v20.4s, v17.4s -add v20.4s, v20.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v8.s[0] -ldr q22, [x17, #+2048] -mul v21.4S, v21.4S,v10.s[0] -ldr q0, [x17, #+2064] -trn1 v29.4S, v5.4S, v27.4S -trn2 v12.4S, v5.4S, v27.4S -mla v11.4S, v13.4S, v31.s[0] -sub v13.4s, v2.4s, v18.4s -add v2.4s, v2.4s, v18.4s -sqrdmulh v28.4S, v3.4S, v8.s[0] -mul v3.4S, v3.4S,v10.s[0] -trn1 v1.4S, v20.4S, v9.4S -trn2 v18.4S, v20.4S, v9.4S -mla v21.4S, v17.4S, v31.s[0] -sub v17.4s, v7.4s, v11.4s -add v7.4s, v7.4s, v11.4s -sqrdmulh v11.4S, v6.4S, v0.s[0] -mul v6.4S, v6.4S,v22.s[0] -trn2 v20.2D, v29.2D, v1.2D -trn2 v9.2D, v12.2D, v18.2D -mla v3.4S, v28.4S, v31.s[0] -sub v28.4s, v16.4s, v21.4s -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v15.4S, v0.s[0] -mul v15.4S, v15.4S,v22.s[0] -trn1 v5.2D, v29.2D, v1.2D -trn1 v27.2D, v12.2D, v18.2D -mla v6.4S, v11.4S, v31.s[0] -sub v11.4s, v26.4s, v3.4s -add v26.4s, v26.4s, v3.4s -sqrdmulh v3.4S, v26.4S, v8.s[1] -mul v26.4S, v26.4S,v10.s[1] -trn1 v18.4S, v2.4S, v13.4S -trn2 v12.4S, v2.4S, v13.4S -mla v15.4S, v21.4S, v31.s[0] -sub v21.4s, v4.4s, v6.4s -add v4.4s, v4.4s, v6.4s -sqrdmulh v6.4S, v11.4S, v8.s[2] -mul v11.4S, v11.4S,v10.s[2] -trn1 v1.4S, v7.4S, v17.4S -trn2 v29.4S, v7.4S, v17.4S -ldr q19, [x17, #+1696] -ldr q14, [x17, #+1712] -mla v26.4S, v3.4S, v31.s[0] -sub v3.4s, v30.4s, v15.4s -add v30.4s, v30.4s, v15.4s -sqrdmulh v15.4S, v30.4S, v0.s[1] -mul v30.4S, v30.4S,v22.s[1] -trn2 v7.2D, v18.2D, v1.2D -trn2 v17.2D, v12.2D, v29.2D -mla v11.4S, v6.4S, v31.s[0] -sub v6.4s, v16.4s, v26.4s -add v16.4s, v16.4s, v26.4s -sqrdmulh v8.4S, v3.4S, v0.s[2] -mul v3.4S, v3.4S,v22.s[2] -trn1 v2.2D, v18.2D, v1.2D -trn1 v13.2D, v12.2D, v29.2D -mla v30.4S, v15.4S, v31.s[0] -sub v15.4s, v28.4s, v11.4s -add v28.4s, v28.4s, v11.4s -sqrdmulh v11.4S, v20.4S, v14.4S -mul v20.4S, v20.4S,v19.4S -mla v3.4S, v8.4S, v31.s[0] -sub v8.4s, v4.4s, v30.4s -add v4.4s, v4.4s, v30.4s -ldr q0, [x17, #+1824] -ldr q22, [x17, #+1840] -sqrdmulh v30.4S, v9.4S, v14.4S -mul v9.4S, v9.4S,v19.4S -trn1 v29.4S, v16.4S, v6.4S -trn2 v12.4S, v16.4S, v6.4S -mla v20.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v3.4s -add v21.4s, v21.4s, v3.4s -ldr q14, [x17, #+1728] -ldr q19, [x17, #+1744] -trn1 v3.4S, v28.4S, v15.4S -trn2 v1.4S, v28.4S, v15.4S -sqrdmulh v18.4S, v7.4S, v22.4S -mul v7.4S, v7.4S,v0.4S -trn2 v28.2D, v29.2D, v3.2D -trn2 v15.2D, v12.2D, v1.2D -ldr q10, [x17, #+1760] -ldr q26, [x17, #+1776] -mla v9.4S, v30.4S, v31.s[0] -sub v30.4s, v5.4s, v20.4s -add v5.4s, v5.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v22.4S -mul v17.4S, v17.4S,v0.4S -trn1 v16.2D, v29.2D, v3.2D -trn1 v6.2D, v12.2D, v1.2D -ldr q1, [x17, #+1856] -ldr q12, [x17, #+1872] -mla v7.4S, v18.4S, v31.s[0] -sub v18.4s, v27.4s, v9.4s -add v27.4s, v27.4s, v9.4s -sqrdmulh v9.4S, v27.4S, v19.4S -mul v27.4S, v27.4S,v14.4S -trn1 v19.4S, v4.4S, v8.4S -trn2 v14.4S, v4.4S, v8.4S -ldr q3, [x17, #+1888] -ldr q29, [x17, #+1904] -mla v17.4S, v20.4S, v31.s[0] -sub v20.4s, v2.4s, v7.4s -add v2.4s, v2.4s, v7.4s -sqrdmulh v7.4S, v18.4S, v26.4S -mul v18.4S, v18.4S,v10.4S -trn1 v26.4S, v21.4S, v11.4S -trn2 v10.4S, v21.4S, v11.4S -mla v27.4S, v9.4S, v31.s[0] -sub v9.4s, v13.4s, v17.4s -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v13.4S, v12.4S -mul v13.4S, v13.4S,v1.4S -ldr q22, [x17, #+1952] -ldr q0, [x17, #+1968] -trn2 v21.2D, v19.2D, v26.2D -trn2 v11.2D, v14.2D, v10.2D -mla v18.4S, v7.4S, v31.s[0] -sub v7.4s, v5.4s, v27.4s -add v5.4s, v5.4s, v27.4s -sqrdmulh v27.4S, v9.4S, v29.4S -mul v9.4S, v9.4S,v3.4S -trn1 v4.2D, v19.2D, v26.2D -trn1 v8.2D, v14.2D, v10.2D -mla v13.4S, v17.4S, v31.s[0] -sub v17.4s, v30.4s, v18.4s -add v30.4s, v30.4s, v18.4s -sqrdmulh v18.4S, v28.4S, v0.4S -ldr q10, [x17, #+2080] -ldr q14, [x17, #+2096] -mul v28.4S, v28.4S,v22.4S -str q5, [x0, #768] -str q7, [x0, #784] -mla v9.4S, v27.4S, v31.s[0] -sub v27.4s, v2.4s, v13.4s -add v2.4s, v2.4s, v13.4s -sqrdmulh v13.4S, v15.4S, v0.4S -mul v15.4S, v15.4S,v22.4S -str q30, [x0, #800] -mla v28.4S, v18.4S, v31.s[0] -sub v18.4s, v20.4s, v9.4s -add v20.4s, v20.4s, v9.4s -ldr q9, [x17, #+1984] -ldr q30, [x17, #+2000] -sqrdmulh v0.4S, v21.4S, v14.4S -str q2, [x0, #832] -mul v21.4S, v21.4S,v10.4S -str q17, [x0, #816] -mla v15.4S, v13.4S, v31.s[0] -str q27, [x0, #848] -sub v27.4s, v16.4s, v28.4s -add v16.4s, v16.4s, v28.4s -ldr q28, [x17, #+2016] -ldr q13, [x17, #+2032] -sqrdmulh v17.4S, v11.4S, v14.4S -mul v11.4S, v11.4S,v10.4S -str q20, [x0, #864] -mla v21.4S, v0.4S, v31.s[0] -sub v0.4s, v6.4s, v15.4s -add v6.4s, v6.4s, v15.4s -ldr q15, [x17, #+2112] -ldr q20, [x17, #+2128] -sqrdmulh v14.4S, v6.4S, v30.4S -mul v6.4S, v6.4S,v9.4S -ldr q30, [x17, #+2144] -ldr q9, [x17, #+2160] -str q18, [x0, #880] -mla v11.4S, v17.4S, v31.s[0] -sub v17.4s, v4.4s, v21.4s -add v4.4s, v4.4s, v21.4s -sqrdmulh v21.4S, v0.4S, v13.4S -mul v0.4S, v0.4S,v28.4S -mla v6.4S, v14.4S, v31.s[0] -sub v14.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -sqrdmulh v11.4S, v8.4S, v20.4S -mul v8.4S, v8.4S,v15.4S -mla v0.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v6.4s -add v16.4s, v16.4s, v6.4s -sqrdmulh v6.4S, v14.4S, v9.4S -mul v14.4S, v14.4S,v30.4S -mla v8.4S, v11.4S, v31.s[0] -sub v11.4s, v27.4s, v0.4s -add v27.4s, v27.4s, v0.4s -mla v14.4S, v6.4S, v31.s[0] -sub v6.4s, v4.4s, v8.4s -add v4.4s, v4.4s, v8.4s -str q16, [x0, #896] -str q21, [x0, #912] -str q27, [x0, #928] -str q11, [x0, #944] -sub v11.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -str q4, [x0, #960] -str q6, [x0, #976] -str q17, [x0, #992] -str q11, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2392 -// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_4_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_4_0.s deleted file mode 100644 index 807dda1..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_4_0.s +++ /dev/null @@ -1,2422 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_4_0 -.global _ntt_u32_full_neon_asm_var_4_4_4_0 -ntt_u32_full_neon_asm_var_4_4_4_0: -_ntt_u32_full_neon_asm_var_4_4_4_0: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x0, #800] -ldr q29, [x0, #864] -ldr q28, [x0, #928] -ldr q27, [x0, #992] -ldr q26, [x0, #288] -ldr q25, [x0, #352] -ldr q24, [x0, #416] -ldr q23, [x0, #480] -ldr q22, [x0, #544] -ldr q21, [x0, #608] -ldr q20, [x0, #672] -ldr q19, [x0, #736] -ldr q18, [x0, #32] -ldr q17, [x0, #96] -ldr q16, [x0, #160] -ldr q3, [x0, #224] -ldr q2, [x17, #+0] -ldr q1, [x17, #+16] -ldr q0, [x17, #+32] -ldr q15, [x17, #+48] -ldr q14, [x17, #+64] -ldr q13, [x17, #+80] -ldr q12, [x17, #+96] -ldr q11, [x17, #+112] -sqrdmulh v10.4S, v30.4S, v1.s[0] -sqrdmulh v9.4S, v29.4S, v1.s[0] -sqrdmulh v8.4S, v28.4S, v1.s[0] -sqrdmulh v7.4S, v27.4S, v1.s[0] -mul v30.4S, v30.4S,v2.s[0] -mul v29.4S, v29.4S,v2.s[0] -mul v28.4S, v28.4S,v2.s[0] -mul v27.4S, v27.4S,v2.s[0] -mla v30.4S, v10.4S, v31.s[0] -mla v29.4S, v9.4S, v31.s[0] -mla v28.4S, v8.4S, v31.s[0] -mla v27.4S, v7.4S, v31.s[0] -sub v7.4s, v26.4s, v30.4s -sub v8.4s, v25.4s, v29.4s -sub v9.4s, v24.4s, v28.4s -sub v10.4s, v23.4s, v27.4s -add v26.4s, v26.4s, v30.4s -add v25.4s, v25.4s, v29.4s -add v24.4s, v24.4s, v28.4s -add v23.4s, v23.4s, v27.4s -sqrdmulh v27.4S, v22.4S, v1.s[0] -sqrdmulh v28.4S, v21.4S, v1.s[0] -sqrdmulh v29.4S, v20.4S, v1.s[0] -sqrdmulh v30.4S, v19.4S, v1.s[0] -mul v22.4S, v22.4S,v2.s[0] -mul v21.4S, v21.4S,v2.s[0] -mul v20.4S, v20.4S,v2.s[0] -mul v19.4S, v19.4S,v2.s[0] -mla v22.4S, v27.4S, v31.s[0] -mla v21.4S, v28.4S, v31.s[0] -mla v20.4S, v29.4S, v31.s[0] -mla v19.4S, v30.4S, v31.s[0] -sub v30.4s, v18.4s, v22.4s -sub v29.4s, v17.4s, v21.4s -sub v28.4s, v16.4s, v20.4s -sub v27.4s, v3.4s, v19.4s -add v18.4s, v18.4s, v22.4s -add v17.4s, v17.4s, v21.4s -add v16.4s, v16.4s, v20.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v24.4S, v1.s[1] -sqrdmulh v20.4S, v23.4S, v1.s[1] -sqrdmulh v21.4S, v26.4S, v1.s[1] -sqrdmulh v22.4S, v25.4S, v1.s[1] -mul v24.4S, v24.4S,v2.s[1] -mul v23.4S, v23.4S,v2.s[1] -mul v26.4S, v26.4S,v2.s[1] -mul v25.4S, v25.4S,v2.s[1] -mla v24.4S, v19.4S, v31.s[0] -mla v23.4S, v20.4S, v31.s[0] -mla v26.4S, v21.4S, v31.s[0] -mla v25.4S, v22.4S, v31.s[0] -sub v22.4s, v16.4s, v24.4s -sub v21.4s, v3.4s, v23.4s -sub v20.4s, v18.4s, v26.4s -sub v19.4s, v17.4s, v25.4s -add v16.4s, v16.4s, v24.4s -add v3.4s, v3.4s, v23.4s -add v18.4s, v18.4s, v26.4s -add v17.4s, v17.4s, v25.4s -sqrdmulh v25.4S, v9.4S, v1.s[2] -sqrdmulh v26.4S, v10.4S, v1.s[2] -sqrdmulh v23.4S, v7.4S, v1.s[2] -sqrdmulh v24.4S, v8.4S, v1.s[2] -mul v9.4S, v9.4S,v2.s[2] -mul v10.4S, v10.4S,v2.s[2] -mul v7.4S, v7.4S,v2.s[2] -mul v8.4S, v8.4S,v2.s[2] -mla v9.4S, v25.4S, v31.s[0] -mla v10.4S, v26.4S, v31.s[0] -mla v7.4S, v23.4S, v31.s[0] -mla v8.4S, v24.4S, v31.s[0] -sub v24.4s, v28.4s, v9.4s -sub v23.4s, v27.4s, v10.4s -sub v26.4s, v30.4s, v7.4s -sub v25.4s, v29.4s, v8.4s -add v28.4s, v28.4s, v9.4s -add v27.4s, v27.4s, v10.4s -add v30.4s, v30.4s, v7.4s -add v29.4s, v29.4s, v8.4s -sqrdmulh v8.4S, v16.4S, v15.s[0] -sqrdmulh v7.4S, v3.4S, v15.s[0] -sqrdmulh v10.4S, v22.4S, v15.s[1] -sqrdmulh v9.4S, v21.4S, v15.s[1] -mul v16.4S, v16.4S,v0.s[0] -mul v3.4S, v3.4S,v0.s[0] -mul v22.4S, v22.4S,v0.s[1] -mul v21.4S, v21.4S,v0.s[1] -mla v16.4S, v8.4S, v31.s[0] -mla v3.4S, v7.4S, v31.s[0] -mla v22.4S, v10.4S, v31.s[0] -mla v21.4S, v9.4S, v31.s[0] -sub v9.4s, v18.4s, v16.4s -sub v10.4s, v17.4s, v3.4s -sub v7.4s, v20.4s, v22.4s -sub v8.4s, v19.4s, v21.4s -add v18.4s, v18.4s, v16.4s -add v17.4s, v17.4s, v3.4s -add v20.4s, v20.4s, v22.4s -add v19.4s, v19.4s, v21.4s -sqrdmulh v21.4S, v28.4S, v15.s[2] -sqrdmulh v22.4S, v27.4S, v15.s[2] -sqrdmulh v3.4S, v24.4S, v15.s[3] -sqrdmulh v16.4S, v23.4S, v15.s[3] -mul v28.4S, v28.4S,v0.s[2] -mul v27.4S, v27.4S,v0.s[2] -mul v24.4S, v24.4S,v0.s[3] -mul v23.4S, v23.4S,v0.s[3] -mla v28.4S, v21.4S, v31.s[0] -mla v27.4S, v22.4S, v31.s[0] -mla v24.4S, v3.4S, v31.s[0] -mla v23.4S, v16.4S, v31.s[0] -sub v16.4s, v30.4s, v28.4s -sub v3.4s, v29.4s, v27.4s -sub v22.4s, v26.4s, v24.4s -sub v21.4s, v25.4s, v23.4s -add v30.4s, v30.4s, v28.4s -add v29.4s, v29.4s, v27.4s -add v26.4s, v26.4s, v24.4s -add v25.4s, v25.4s, v23.4s -sqrdmulh v23.4S, v17.4S, v13.s[0] -sqrdmulh v24.4S, v10.4S, v13.s[1] -sqrdmulh v27.4S, v19.4S, v13.s[2] -sqrdmulh v28.4S, v8.4S, v13.s[3] -mul v17.4S, v17.4S,v14.s[0] -mul v10.4S, v10.4S,v14.s[1] -mul v19.4S, v19.4S,v14.s[2] -mul v8.4S, v8.4S,v14.s[3] -mla v17.4S, v23.4S, v31.s[0] -mla v10.4S, v24.4S, v31.s[0] -mla v19.4S, v27.4S, v31.s[0] -mla v8.4S, v28.4S, v31.s[0] -sub v28.4s, v18.4s, v17.4s -sub v27.4s, v9.4s, v10.4s -sub v24.4s, v20.4s, v19.4s -sub v23.4s, v7.4s, v8.4s -add v18.4s, v18.4s, v17.4s -add v9.4s, v9.4s, v10.4s -add v20.4s, v20.4s, v19.4s -add v7.4s, v7.4s, v8.4s -sqrdmulh v8.4S, v29.4S, v11.s[0] -sqrdmulh v19.4S, v3.4S, v11.s[1] -sqrdmulh v10.4S, v25.4S, v11.s[2] -sqrdmulh v17.4S, v21.4S, v11.s[3] -mul v29.4S, v29.4S,v12.s[0] -mul v3.4S, v3.4S,v12.s[1] -mul v25.4S, v25.4S,v12.s[2] -mul v21.4S, v21.4S,v12.s[3] -mla v29.4S, v8.4S, v31.s[0] -mla v3.4S, v19.4S, v31.s[0] -mla v25.4S, v10.4S, v31.s[0] -mla v21.4S, v17.4S, v31.s[0] -sub v17.4s, v30.4s, v29.4s -sub v10.4s, v16.4s, v3.4s -sub v19.4s, v26.4s, v25.4s -sub v8.4s, v22.4s, v21.4s -add v30.4s, v30.4s, v29.4s -add v16.4s, v16.4s, v3.4s -add v26.4s, v26.4s, v25.4s -add v22.4s, v22.4s, v21.4s -str q18, [x0, #32] -str q28, [x0, #96] -str q9, [x0, #160] -str q27, [x0, #224] -str q20, [x0, #288] -str q24, [x0, #352] -str q7, [x0, #416] -str q23, [x0, #480] -str q30, [x0, #544] -str q17, [x0, #608] -str q16, [x0, #672] -str q10, [x0, #736] -str q26, [x0, #800] -str q19, [x0, #864] -str q22, [x0, #928] -str q8, [x0, #992] -ldr q8, [x0, #816] -ldr q22, [x0, #880] -ldr q19, [x0, #944] -ldr q26, [x0, #1008] -ldr q10, [x0, #304] -ldr q16, [x0, #368] -ldr q17, [x0, #432] -ldr q30, [x0, #496] -ldr q23, [x0, #560] -ldr q7, [x0, #624] -ldr q24, [x0, #688] -ldr q20, [x0, #752] -ldr q27, [x0, #48] -ldr q9, [x0, #112] -ldr q28, [x0, #176] -ldr q18, [x0, #240] -sqrdmulh v21.4S, v8.4S, v1.s[0] -sqrdmulh v25.4S, v22.4S, v1.s[0] -sqrdmulh v3.4S, v19.4S, v1.s[0] -sqrdmulh v29.4S, v26.4S, v1.s[0] -mul v8.4S, v8.4S,v2.s[0] -mul v22.4S, v22.4S,v2.s[0] -mul v19.4S, v19.4S,v2.s[0] -mul v26.4S, v26.4S,v2.s[0] -mla v8.4S, v21.4S, v31.s[0] -mla v22.4S, v25.4S, v31.s[0] -mla v19.4S, v3.4S, v31.s[0] -mla v26.4S, v29.4S, v31.s[0] -sub v29.4s, v10.4s, v8.4s -sub v3.4s, v16.4s, v22.4s -sub v25.4s, v17.4s, v19.4s -sub v21.4s, v30.4s, v26.4s -add v10.4s, v10.4s, v8.4s -add v16.4s, v16.4s, v22.4s -add v17.4s, v17.4s, v19.4s -add v30.4s, v30.4s, v26.4s -sqrdmulh v26.4S, v23.4S, v1.s[0] -sqrdmulh v19.4S, v7.4S, v1.s[0] -sqrdmulh v22.4S, v24.4S, v1.s[0] -sqrdmulh v8.4S, v20.4S, v1.s[0] -mul v23.4S, v23.4S,v2.s[0] -mul v7.4S, v7.4S,v2.s[0] -mul v24.4S, v24.4S,v2.s[0] -mul v20.4S, v20.4S,v2.s[0] -mla v23.4S, v26.4S, v31.s[0] -mla v7.4S, v19.4S, v31.s[0] -mla v24.4S, v22.4S, v31.s[0] -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v27.4s, v23.4s -sub v22.4s, v9.4s, v7.4s -sub v19.4s, v28.4s, v24.4s -sub v26.4s, v18.4s, v20.4s -add v27.4s, v27.4s, v23.4s -add v9.4s, v9.4s, v7.4s -add v28.4s, v28.4s, v24.4s -add v18.4s, v18.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v1.s[1] -sqrdmulh v24.4S, v30.4S, v1.s[1] -sqrdmulh v7.4S, v10.4S, v1.s[1] -sqrdmulh v23.4S, v16.4S, v1.s[1] -mul v17.4S, v17.4S,v2.s[1] -mul v30.4S, v30.4S,v2.s[1] -mul v10.4S, v10.4S,v2.s[1] -mul v16.4S, v16.4S,v2.s[1] -mla v17.4S, v20.4S, v31.s[0] -mla v30.4S, v24.4S, v31.s[0] -mla v10.4S, v7.4S, v31.s[0] -mla v16.4S, v23.4S, v31.s[0] -sub v23.4s, v28.4s, v17.4s -sub v7.4s, v18.4s, v30.4s -sub v24.4s, v27.4s, v10.4s -sub v20.4s, v9.4s, v16.4s -add v28.4s, v28.4s, v17.4s -add v18.4s, v18.4s, v30.4s -add v27.4s, v27.4s, v10.4s -add v9.4s, v9.4s, v16.4s -sqrdmulh v16.4S, v25.4S, v1.s[2] -sqrdmulh v10.4S, v21.4S, v1.s[2] -sqrdmulh v30.4S, v29.4S, v1.s[2] -sqrdmulh v17.4S, v3.4S, v1.s[2] -mul v25.4S, v25.4S,v2.s[2] -mul v21.4S, v21.4S,v2.s[2] -mul v29.4S, v29.4S,v2.s[2] -mul v3.4S, v3.4S,v2.s[2] -mla v25.4S, v16.4S, v31.s[0] -mla v21.4S, v10.4S, v31.s[0] -mla v29.4S, v30.4S, v31.s[0] -mla v3.4S, v17.4S, v31.s[0] -sub v17.4s, v19.4s, v25.4s -sub v30.4s, v26.4s, v21.4s -sub v10.4s, v8.4s, v29.4s -sub v16.4s, v22.4s, v3.4s -add v19.4s, v19.4s, v25.4s -add v26.4s, v26.4s, v21.4s -add v8.4s, v8.4s, v29.4s -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v28.4S, v15.s[0] -sqrdmulh v29.4S, v18.4S, v15.s[0] -sqrdmulh v21.4S, v23.4S, v15.s[1] -sqrdmulh v25.4S, v7.4S, v15.s[1] -mul v28.4S, v28.4S,v0.s[0] -mul v18.4S, v18.4S,v0.s[0] -mul v23.4S, v23.4S,v0.s[1] -mul v7.4S, v7.4S,v0.s[1] -mla v28.4S, v3.4S, v31.s[0] -mla v18.4S, v29.4S, v31.s[0] -mla v23.4S, v21.4S, v31.s[0] -mla v7.4S, v25.4S, v31.s[0] -sub v25.4s, v27.4s, v28.4s -sub v21.4s, v9.4s, v18.4s -sub v29.4s, v24.4s, v23.4s -sub v3.4s, v20.4s, v7.4s -add v27.4s, v27.4s, v28.4s -add v9.4s, v9.4s, v18.4s -add v24.4s, v24.4s, v23.4s -add v20.4s, v20.4s, v7.4s -sqrdmulh v7.4S, v19.4S, v15.s[2] -sqrdmulh v23.4S, v26.4S, v15.s[2] -sqrdmulh v18.4S, v17.4S, v15.s[3] -sqrdmulh v28.4S, v30.4S, v15.s[3] -mul v19.4S, v19.4S,v0.s[2] -mul v26.4S, v26.4S,v0.s[2] -mul v17.4S, v17.4S,v0.s[3] -mul v30.4S, v30.4S,v0.s[3] -mla v19.4S, v7.4S, v31.s[0] -mla v26.4S, v23.4S, v31.s[0] -mla v17.4S, v18.4S, v31.s[0] -mla v30.4S, v28.4S, v31.s[0] -sub v28.4s, v8.4s, v19.4s -sub v18.4s, v22.4s, v26.4s -sub v23.4s, v10.4s, v17.4s -sub v7.4s, v16.4s, v30.4s -add v8.4s, v8.4s, v19.4s -add v22.4s, v22.4s, v26.4s -add v10.4s, v10.4s, v17.4s -add v16.4s, v16.4s, v30.4s -sqrdmulh v30.4S, v9.4S, v13.s[0] -sqrdmulh v17.4S, v21.4S, v13.s[1] -sqrdmulh v26.4S, v20.4S, v13.s[2] -sqrdmulh v19.4S, v3.4S, v13.s[3] -mul v9.4S, v9.4S,v14.s[0] -mul v21.4S, v21.4S,v14.s[1] -mul v20.4S, v20.4S,v14.s[2] -mul v3.4S, v3.4S,v14.s[3] -mla v9.4S, v30.4S, v31.s[0] -mla v21.4S, v17.4S, v31.s[0] -mla v20.4S, v26.4S, v31.s[0] -mla v3.4S, v19.4S, v31.s[0] -sub v19.4s, v27.4s, v9.4s -sub v26.4s, v25.4s, v21.4s -sub v17.4s, v24.4s, v20.4s -sub v30.4s, v29.4s, v3.4s -add v27.4s, v27.4s, v9.4s -add v25.4s, v25.4s, v21.4s -add v24.4s, v24.4s, v20.4s -add v29.4s, v29.4s, v3.4s -sqrdmulh v3.4S, v22.4S, v11.s[0] -sqrdmulh v20.4S, v18.4S, v11.s[1] -sqrdmulh v21.4S, v16.4S, v11.s[2] -sqrdmulh v9.4S, v7.4S, v11.s[3] -mul v22.4S, v22.4S,v12.s[0] -mul v18.4S, v18.4S,v12.s[1] -mul v16.4S, v16.4S,v12.s[2] -mul v7.4S, v7.4S,v12.s[3] -mla v22.4S, v3.4S, v31.s[0] -mla v18.4S, v20.4S, v31.s[0] -mla v16.4S, v21.4S, v31.s[0] -mla v7.4S, v9.4S, v31.s[0] -sub v9.4s, v8.4s, v22.4s -sub v21.4s, v28.4s, v18.4s -sub v20.4s, v10.4s, v16.4s -sub v3.4s, v23.4s, v7.4s -add v8.4s, v8.4s, v22.4s -add v28.4s, v28.4s, v18.4s -add v10.4s, v10.4s, v16.4s -add v23.4s, v23.4s, v7.4s -str q27, [x0, #48] -str q19, [x0, #112] -str q25, [x0, #176] -str q26, [x0, #240] -str q24, [x0, #304] -str q17, [x0, #368] -str q29, [x0, #432] -str q30, [x0, #496] -str q8, [x0, #560] -str q9, [x0, #624] -str q28, [x0, #688] -str q21, [x0, #752] -str q10, [x0, #816] -str q20, [x0, #880] -str q23, [x0, #944] -str q3, [x0, #1008] -ldr q3, [x0, #768] -ldr q23, [x0, #832] -ldr q20, [x0, #896] -ldr q10, [x0, #960] -ldr q21, [x0, #256] -ldr q28, [x0, #320] -ldr q9, [x0, #384] -ldr q8, [x0, #448] -ldr q30, [x0, #512] -ldr q29, [x0, #576] -ldr q17, [x0, #640] -ldr q24, [x0, #704] -ldr q26, [x0, #0] -ldr q25, [x0, #64] -ldr q19, [x0, #128] -ldr q27, [x0, #192] -sqrdmulh v7.4S, v3.4S, v1.s[0] -sqrdmulh v16.4S, v23.4S, v1.s[0] -sqrdmulh v18.4S, v20.4S, v1.s[0] -sqrdmulh v22.4S, v10.4S, v1.s[0] -mul v3.4S, v3.4S,v2.s[0] -mul v23.4S, v23.4S,v2.s[0] -mul v20.4S, v20.4S,v2.s[0] -mul v10.4S, v10.4S,v2.s[0] -mla v3.4S, v7.4S, v31.s[0] -mla v23.4S, v16.4S, v31.s[0] -mla v20.4S, v18.4S, v31.s[0] -mla v10.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v3.4s -sub v18.4s, v28.4s, v23.4s -sub v16.4s, v9.4s, v20.4s -sub v7.4s, v8.4s, v10.4s -add v21.4s, v21.4s, v3.4s -add v28.4s, v28.4s, v23.4s -add v9.4s, v9.4s, v20.4s -add v8.4s, v8.4s, v10.4s -sqrdmulh v10.4S, v30.4S, v1.s[0] -sqrdmulh v20.4S, v29.4S, v1.s[0] -sqrdmulh v23.4S, v17.4S, v1.s[0] -sqrdmulh v3.4S, v24.4S, v1.s[0] -mul v30.4S, v30.4S,v2.s[0] -mul v29.4S, v29.4S,v2.s[0] -mul v17.4S, v17.4S,v2.s[0] -mul v24.4S, v24.4S,v2.s[0] -mla v30.4S, v10.4S, v31.s[0] -mla v29.4S, v20.4S, v31.s[0] -mla v17.4S, v23.4S, v31.s[0] -mla v24.4S, v3.4S, v31.s[0] -sub v3.4s, v26.4s, v30.4s -sub v23.4s, v25.4s, v29.4s -sub v20.4s, v19.4s, v17.4s -sub v10.4s, v27.4s, v24.4s -add v26.4s, v26.4s, v30.4s -add v25.4s, v25.4s, v29.4s -add v19.4s, v19.4s, v17.4s -add v27.4s, v27.4s, v24.4s -sqrdmulh v24.4S, v9.4S, v1.s[1] -sqrdmulh v17.4S, v8.4S, v1.s[1] -sqrdmulh v29.4S, v21.4S, v1.s[1] -sqrdmulh v30.4S, v28.4S, v1.s[1] -mul v9.4S, v9.4S,v2.s[1] -mul v8.4S, v8.4S,v2.s[1] -mul v21.4S, v21.4S,v2.s[1] -mul v28.4S, v28.4S,v2.s[1] -mla v9.4S, v24.4S, v31.s[0] -mla v8.4S, v17.4S, v31.s[0] -mla v21.4S, v29.4S, v31.s[0] -mla v28.4S, v30.4S, v31.s[0] -sub v30.4s, v19.4s, v9.4s -sub v29.4s, v27.4s, v8.4s -sub v17.4s, v26.4s, v21.4s -sub v24.4s, v25.4s, v28.4s -add v19.4s, v19.4s, v9.4s -add v27.4s, v27.4s, v8.4s -add v26.4s, v26.4s, v21.4s -add v25.4s, v25.4s, v28.4s -sqrdmulh v28.4S, v16.4S, v1.s[2] -sqrdmulh v21.4S, v7.4S, v1.s[2] -sqrdmulh v8.4S, v22.4S, v1.s[2] -sqrdmulh v9.4S, v18.4S, v1.s[2] -mul v16.4S, v16.4S,v2.s[2] -mul v7.4S, v7.4S,v2.s[2] -mul v22.4S, v22.4S,v2.s[2] -mul v18.4S, v18.4S,v2.s[2] -mla v16.4S, v28.4S, v31.s[0] -mla v7.4S, v21.4S, v31.s[0] -mla v22.4S, v8.4S, v31.s[0] -mla v18.4S, v9.4S, v31.s[0] -sub v9.4s, v20.4s, v16.4s -sub v8.4s, v10.4s, v7.4s -sub v21.4s, v3.4s, v22.4s -sub v28.4s, v23.4s, v18.4s -add v20.4s, v20.4s, v16.4s -add v10.4s, v10.4s, v7.4s -add v3.4s, v3.4s, v22.4s -add v23.4s, v23.4s, v18.4s -sqrdmulh v18.4S, v19.4S, v15.s[0] -sqrdmulh v22.4S, v27.4S, v15.s[0] -sqrdmulh v7.4S, v30.4S, v15.s[1] -sqrdmulh v16.4S, v29.4S, v15.s[1] -mul v19.4S, v19.4S,v0.s[0] -mul v27.4S, v27.4S,v0.s[0] -mul v30.4S, v30.4S,v0.s[1] -mul v29.4S, v29.4S,v0.s[1] -mla v19.4S, v18.4S, v31.s[0] -mla v27.4S, v22.4S, v31.s[0] -mla v30.4S, v7.4S, v31.s[0] -mla v29.4S, v16.4S, v31.s[0] -sub v16.4s, v26.4s, v19.4s -sub v7.4s, v25.4s, v27.4s -sub v22.4s, v17.4s, v30.4s -sub v18.4s, v24.4s, v29.4s -add v26.4s, v26.4s, v19.4s -add v25.4s, v25.4s, v27.4s -add v17.4s, v17.4s, v30.4s -add v24.4s, v24.4s, v29.4s -sqrdmulh v29.4S, v20.4S, v15.s[2] -sqrdmulh v30.4S, v10.4S, v15.s[2] -sqrdmulh v27.4S, v9.4S, v15.s[3] -sqrdmulh v19.4S, v8.4S, v15.s[3] -mul v20.4S, v20.4S,v0.s[2] -mul v10.4S, v10.4S,v0.s[2] -mul v9.4S, v9.4S,v0.s[3] -mul v8.4S, v8.4S,v0.s[3] -mla v20.4S, v29.4S, v31.s[0] -mla v10.4S, v30.4S, v31.s[0] -mla v9.4S, v27.4S, v31.s[0] -mla v8.4S, v19.4S, v31.s[0] -sub v19.4s, v3.4s, v20.4s -sub v27.4s, v23.4s, v10.4s -sub v30.4s, v21.4s, v9.4s -sub v29.4s, v28.4s, v8.4s -add v3.4s, v3.4s, v20.4s -add v23.4s, v23.4s, v10.4s -add v21.4s, v21.4s, v9.4s -add v28.4s, v28.4s, v8.4s -sqrdmulh v8.4S, v25.4S, v13.s[0] -sqrdmulh v9.4S, v7.4S, v13.s[1] -sqrdmulh v10.4S, v24.4S, v13.s[2] -sqrdmulh v20.4S, v18.4S, v13.s[3] -mul v25.4S, v25.4S,v14.s[0] -mul v7.4S, v7.4S,v14.s[1] -mul v24.4S, v24.4S,v14.s[2] -mul v18.4S, v18.4S,v14.s[3] -mla v25.4S, v8.4S, v31.s[0] -mla v7.4S, v9.4S, v31.s[0] -mla v24.4S, v10.4S, v31.s[0] -mla v18.4S, v20.4S, v31.s[0] -sub v20.4s, v26.4s, v25.4s -sub v10.4s, v16.4s, v7.4s -sub v9.4s, v17.4s, v24.4s -sub v8.4s, v22.4s, v18.4s -add v26.4s, v26.4s, v25.4s -add v16.4s, v16.4s, v7.4s -add v17.4s, v17.4s, v24.4s -add v22.4s, v22.4s, v18.4s -sqrdmulh v18.4S, v23.4S, v11.s[0] -sqrdmulh v24.4S, v27.4S, v11.s[1] -sqrdmulh v7.4S, v28.4S, v11.s[2] -sqrdmulh v25.4S, v29.4S, v11.s[3] -mul v23.4S, v23.4S,v12.s[0] -mul v27.4S, v27.4S,v12.s[1] -mul v28.4S, v28.4S,v12.s[2] -mul v29.4S, v29.4S,v12.s[3] -mla v23.4S, v18.4S, v31.s[0] -mla v27.4S, v24.4S, v31.s[0] -mla v28.4S, v7.4S, v31.s[0] -mla v29.4S, v25.4S, v31.s[0] -sub v25.4s, v3.4s, v23.4s -sub v7.4s, v19.4s, v27.4s -sub v24.4s, v21.4s, v28.4s -sub v18.4s, v30.4s, v29.4s -add v3.4s, v3.4s, v23.4s -add v19.4s, v19.4s, v27.4s -add v21.4s, v21.4s, v28.4s -add v30.4s, v30.4s, v29.4s -str q26, [x0, #0] -str q20, [x0, #64] -str q16, [x0, #128] -str q10, [x0, #192] -str q17, [x0, #256] -str q9, [x0, #320] -str q22, [x0, #384] -str q8, [x0, #448] -str q3, [x0, #512] -str q25, [x0, #576] -str q19, [x0, #640] -str q7, [x0, #704] -str q21, [x0, #768] -str q24, [x0, #832] -str q30, [x0, #896] -str q18, [x0, #960] -ldr q18, [x0, #784] -ldr q30, [x0, #848] -ldr q24, [x0, #912] -ldr q21, [x0, #976] -ldr q7, [x0, #272] -ldr q19, [x0, #336] -ldr q25, [x0, #400] -ldr q3, [x0, #464] -ldr q8, [x0, #528] -ldr q22, [x0, #592] -ldr q9, [x0, #656] -ldr q17, [x0, #720] -ldr q10, [x0, #16] -ldr q16, [x0, #80] -ldr q20, [x0, #144] -ldr q26, [x0, #208] -sqrdmulh v29.4S, v18.4S, v1.s[0] -sqrdmulh v28.4S, v30.4S, v1.s[0] -sqrdmulh v27.4S, v24.4S, v1.s[0] -sqrdmulh v23.4S, v21.4S, v1.s[0] -mul v18.4S, v18.4S,v2.s[0] -mul v30.4S, v30.4S,v2.s[0] -mul v24.4S, v24.4S,v2.s[0] -mul v21.4S, v21.4S,v2.s[0] -mla v18.4S, v29.4S, v31.s[0] -mla v30.4S, v28.4S, v31.s[0] -mla v24.4S, v27.4S, v31.s[0] -mla v21.4S, v23.4S, v31.s[0] -sub v23.4s, v7.4s, v18.4s -sub v27.4s, v19.4s, v30.4s -sub v28.4s, v25.4s, v24.4s -sub v29.4s, v3.4s, v21.4s -add v7.4s, v7.4s, v18.4s -add v19.4s, v19.4s, v30.4s -add v25.4s, v25.4s, v24.4s -add v3.4s, v3.4s, v21.4s -sqrdmulh v21.4S, v8.4S, v1.s[0] -sqrdmulh v24.4S, v22.4S, v1.s[0] -sqrdmulh v30.4S, v9.4S, v1.s[0] -sqrdmulh v18.4S, v17.4S, v1.s[0] -mul v8.4S, v8.4S,v2.s[0] -mul v22.4S, v22.4S,v2.s[0] -mul v9.4S, v9.4S,v2.s[0] -mul v17.4S, v17.4S,v2.s[0] -mla v8.4S, v21.4S, v31.s[0] -mla v22.4S, v24.4S, v31.s[0] -mla v9.4S, v30.4S, v31.s[0] -mla v17.4S, v18.4S, v31.s[0] -sub v18.4s, v10.4s, v8.4s -sub v30.4s, v16.4s, v22.4s -sub v24.4s, v20.4s, v9.4s -sub v21.4s, v26.4s, v17.4s -add v10.4s, v10.4s, v8.4s -add v16.4s, v16.4s, v22.4s -add v20.4s, v20.4s, v9.4s -add v26.4s, v26.4s, v17.4s -sqrdmulh v17.4S, v25.4S, v1.s[1] -sqrdmulh v9.4S, v3.4S, v1.s[1] -sqrdmulh v22.4S, v7.4S, v1.s[1] -sqrdmulh v8.4S, v19.4S, v1.s[1] -mul v25.4S, v25.4S,v2.s[1] -mul v3.4S, v3.4S,v2.s[1] -mul v7.4S, v7.4S,v2.s[1] -mul v19.4S, v19.4S,v2.s[1] -mla v25.4S, v17.4S, v31.s[0] -mla v3.4S, v9.4S, v31.s[0] -mla v7.4S, v22.4S, v31.s[0] -mla v19.4S, v8.4S, v31.s[0] -sub v8.4s, v20.4s, v25.4s -sub v22.4s, v26.4s, v3.4s -sub v9.4s, v10.4s, v7.4s -sub v17.4s, v16.4s, v19.4s -add v20.4s, v20.4s, v25.4s -add v26.4s, v26.4s, v3.4s -add v10.4s, v10.4s, v7.4s -add v16.4s, v16.4s, v19.4s -sqrdmulh v19.4S, v28.4S, v1.s[2] -sqrdmulh v7.4S, v29.4S, v1.s[2] -sqrdmulh v3.4S, v23.4S, v1.s[2] -sqrdmulh v25.4S, v27.4S, v1.s[2] -mul v28.4S, v28.4S,v2.s[2] -mul v29.4S, v29.4S,v2.s[2] -mul v23.4S, v23.4S,v2.s[2] -mul v27.4S, v27.4S,v2.s[2] -mla v28.4S, v19.4S, v31.s[0] -mla v29.4S, v7.4S, v31.s[0] -mla v23.4S, v3.4S, v31.s[0] -mla v27.4S, v25.4S, v31.s[0] -sub v25.4s, v24.4s, v28.4s -sub v3.4s, v21.4s, v29.4s -sub v7.4s, v18.4s, v23.4s -sub v19.4s, v30.4s, v27.4s -add v24.4s, v24.4s, v28.4s -add v21.4s, v21.4s, v29.4s -add v18.4s, v18.4s, v23.4s -add v30.4s, v30.4s, v27.4s -sqrdmulh v27.4S, v20.4S, v15.s[0] -sqrdmulh v23.4S, v26.4S, v15.s[0] -sqrdmulh v29.4S, v8.4S, v15.s[1] -sqrdmulh v28.4S, v22.4S, v15.s[1] -mul v20.4S, v20.4S,v0.s[0] -mul v26.4S, v26.4S,v0.s[0] -mul v8.4S, v8.4S,v0.s[1] -mul v22.4S, v22.4S,v0.s[1] -mla v20.4S, v27.4S, v31.s[0] -mla v26.4S, v23.4S, v31.s[0] -mla v8.4S, v29.4S, v31.s[0] -mla v22.4S, v28.4S, v31.s[0] -sub v28.4s, v10.4s, v20.4s -sub v29.4s, v16.4s, v26.4s -sub v23.4s, v9.4s, v8.4s -sub v27.4s, v17.4s, v22.4s -add v10.4s, v10.4s, v20.4s -add v16.4s, v16.4s, v26.4s -add v9.4s, v9.4s, v8.4s -add v17.4s, v17.4s, v22.4s -sqrdmulh v22.4S, v24.4S, v15.s[2] -sqrdmulh v8.4S, v21.4S, v15.s[2] -sqrdmulh v26.4S, v25.4S, v15.s[3] -sqrdmulh v20.4S, v3.4S, v15.s[3] -mul v24.4S, v24.4S,v0.s[2] -mul v21.4S, v21.4S,v0.s[2] -mul v25.4S, v25.4S,v0.s[3] -mul v3.4S, v3.4S,v0.s[3] -mla v24.4S, v22.4S, v31.s[0] -mla v21.4S, v8.4S, v31.s[0] -mla v25.4S, v26.4S, v31.s[0] -mla v3.4S, v20.4S, v31.s[0] -sub v20.4s, v18.4s, v24.4s -sub v26.4s, v30.4s, v21.4s -sub v8.4s, v7.4s, v25.4s -sub v22.4s, v19.4s, v3.4s -add v18.4s, v18.4s, v24.4s -add v30.4s, v30.4s, v21.4s -add v7.4s, v7.4s, v25.4s -add v19.4s, v19.4s, v3.4s -sqrdmulh v3.4S, v16.4S, v13.s[0] -sqrdmulh v25.4S, v29.4S, v13.s[1] -sqrdmulh v21.4S, v17.4S, v13.s[2] -sqrdmulh v24.4S, v27.4S, v13.s[3] -mul v16.4S, v16.4S,v14.s[0] -mul v29.4S, v29.4S,v14.s[1] -mul v17.4S, v17.4S,v14.s[2] -mul v27.4S, v27.4S,v14.s[3] -mla v16.4S, v3.4S, v31.s[0] -mla v29.4S, v25.4S, v31.s[0] -mla v17.4S, v21.4S, v31.s[0] -mla v27.4S, v24.4S, v31.s[0] -sub v24.4s, v10.4s, v16.4s -sub v21.4s, v28.4s, v29.4s -sub v25.4s, v9.4s, v17.4s -sub v3.4s, v23.4s, v27.4s -add v10.4s, v10.4s, v16.4s -add v28.4s, v28.4s, v29.4s -add v9.4s, v9.4s, v17.4s -add v23.4s, v23.4s, v27.4s -sqrdmulh v27.4S, v30.4S, v11.s[0] -sqrdmulh v17.4S, v26.4S, v11.s[1] -sqrdmulh v29.4S, v19.4S, v11.s[2] -sqrdmulh v16.4S, v22.4S, v11.s[3] -mul v30.4S, v30.4S,v12.s[0] -mul v26.4S, v26.4S,v12.s[1] -mul v19.4S, v19.4S,v12.s[2] -mul v22.4S, v22.4S,v12.s[3] -mla v30.4S, v27.4S, v31.s[0] -mla v26.4S, v17.4S, v31.s[0] -mla v19.4S, v29.4S, v31.s[0] -mla v22.4S, v16.4S, v31.s[0] -sub v16.4s, v18.4s, v30.4s -sub v29.4s, v20.4s, v26.4s -sub v17.4s, v7.4s, v19.4s -sub v27.4s, v8.4s, v22.4s -add v18.4s, v18.4s, v30.4s -add v20.4s, v20.4s, v26.4s -add v7.4s, v7.4s, v19.4s -add v8.4s, v8.4s, v22.4s -str q10, [x0, #16] -str q24, [x0, #80] -str q28, [x0, #144] -str q21, [x0, #208] -str q9, [x0, #272] -str q25, [x0, #336] -str q23, [x0, #400] -str q3, [x0, #464] -str q18, [x0, #528] -str q16, [x0, #592] -str q20, [x0, #656] -str q29, [x0, #720] -str q7, [x0, #784] -str q17, [x0, #848] -str q8, [x0, #912] -str q27, [x0, #976] -ldr q4, [x17, #+128] -ldr q5, [x17, #+144] -ldr q6, [x17, #+160] -ldr q30, [x17, #+176] -ldr q26, [x17, #+192] -ldr q19, [x17, #+208] -ldr q22, [x17, #+224] -ldr q10, [x17, #+240] -ldr q24, [x0, #32] -ldr q28, [x0, #48] -ldr q21, [x0, #0] -ldr q9, [x0, #16] -sqrdmulh v25.4S, v24.4S, v5.s[0] -mul v24.4S, v24.4S,v4.s[0] -mla v24.4S, v25.4S, v31.s[0] -sub v25.4s, v21.4s, v24.4s -add v21.4s, v21.4s, v24.4s -sqrdmulh v24.4S, v28.4S, v5.s[0] -mul v28.4S, v28.4S,v4.s[0] -mla v28.4S, v24.4S, v31.s[0] -sub v24.4s, v9.4s, v28.4s -add v9.4s, v9.4s, v28.4s -sqrdmulh v28.4S, v9.4S, v5.s[1] -mul v9.4S, v9.4S,v4.s[1] -mla v9.4S, v28.4S, v31.s[0] -sub v28.4s, v21.4s, v9.4s -add v21.4s, v21.4s, v9.4s -sqrdmulh v9.4S, v24.4S, v5.s[2] -mul v24.4S, v24.4S,v4.s[2] -mla v24.4S, v9.4S, v31.s[0] -sub v9.4s, v25.4s, v24.4s -add v25.4s, v25.4s, v24.4s -trn1 v24.4S, v21.4S, v28.4S -trn2 v23.4S, v21.4S, v28.4S -trn1 v3.4S, v25.4S, v9.4S -trn2 v18.4S, v25.4S, v9.4S -trn2 v25.2D, v24.2D, v3.2D -trn2 v9.2D, v23.2D, v18.2D -trn1 v21.2D, v24.2D, v3.2D -trn1 v28.2D, v23.2D, v18.2D -sqrdmulh v18.4S, v25.4S, v30.4S -mul v25.4S, v25.4S,v6.4S -mla v25.4S, v18.4S, v31.s[0] -sub v18.4s, v21.4s, v25.4s -add v21.4s, v21.4s, v25.4s -sqrdmulh v25.4S, v9.4S, v30.4S -mul v9.4S, v9.4S,v6.4S -mla v9.4S, v25.4S, v31.s[0] -sub v25.4s, v28.4s, v9.4s -add v28.4s, v28.4s, v9.4s -sqrdmulh v9.4S, v28.4S, v19.4S -mul v28.4S, v28.4S,v26.4S -mla v28.4S, v9.4S, v31.s[0] -sub v9.4s, v21.4s, v28.4s -add v21.4s, v21.4s, v28.4s -sqrdmulh v28.4S, v25.4S, v10.4S -mul v25.4S, v25.4S,v22.4S -mla v25.4S, v28.4S, v31.s[0] -sub v28.4s, v18.4s, v25.4s -add v18.4s, v18.4s, v25.4s -str q21, [x0, #0] -str q9, [x0, #16] -str q18, [x0, #32] -str q28, [x0, #48] -ldr q28, [x17, #+256] -ldr q18, [x17, #+272] -ldr q9, [x17, #+288] -ldr q21, [x17, #+304] -ldr q25, [x17, #+320] -ldr q23, [x17, #+336] -ldr q3, [x17, #+352] -ldr q24, [x17, #+368] -ldr q10, [x0, #96] -ldr q22, [x0, #112] -ldr q19, [x0, #64] -ldr q26, [x0, #80] -sqrdmulh v30.4S, v10.4S, v18.s[0] -mul v10.4S, v10.4S,v28.s[0] -mla v10.4S, v30.4S, v31.s[0] -sub v30.4s, v19.4s, v10.4s -add v19.4s, v19.4s, v10.4s -sqrdmulh v10.4S, v22.4S, v18.s[0] -mul v22.4S, v22.4S,v28.s[0] -mla v22.4S, v10.4S, v31.s[0] -sub v10.4s, v26.4s, v22.4s -add v26.4s, v26.4s, v22.4s -sqrdmulh v22.4S, v26.4S, v18.s[1] -mul v26.4S, v26.4S,v28.s[1] -mla v26.4S, v22.4S, v31.s[0] -sub v22.4s, v19.4s, v26.4s -add v19.4s, v19.4s, v26.4s -sqrdmulh v26.4S, v10.4S, v18.s[2] -mul v10.4S, v10.4S,v28.s[2] -mla v10.4S, v26.4S, v31.s[0] -sub v26.4s, v30.4s, v10.4s -add v30.4s, v30.4s, v10.4s -trn1 v10.4S, v19.4S, v22.4S -trn2 v6.4S, v19.4S, v22.4S -trn1 v5.4S, v30.4S, v26.4S -trn2 v4.4S, v30.4S, v26.4S -trn2 v30.2D, v10.2D, v5.2D -trn2 v26.2D, v6.2D, v4.2D -trn1 v19.2D, v10.2D, v5.2D -trn1 v22.2D, v6.2D, v4.2D -sqrdmulh v4.4S, v30.4S, v21.4S -mul v30.4S, v30.4S,v9.4S -mla v30.4S, v4.4S, v31.s[0] -sub v4.4s, v19.4s, v30.4s -add v19.4s, v19.4s, v30.4s -sqrdmulh v30.4S, v26.4S, v21.4S -mul v26.4S, v26.4S,v9.4S -mla v26.4S, v30.4S, v31.s[0] -sub v30.4s, v22.4s, v26.4s -add v22.4s, v22.4s, v26.4s -sqrdmulh v26.4S, v22.4S, v23.4S -mul v22.4S, v22.4S,v25.4S -mla v22.4S, v26.4S, v31.s[0] -sub v26.4s, v19.4s, v22.4s -add v19.4s, v19.4s, v22.4s -sqrdmulh v22.4S, v30.4S, v24.4S -mul v30.4S, v30.4S,v3.4S -mla v30.4S, v22.4S, v31.s[0] -sub v22.4s, v4.4s, v30.4s -add v4.4s, v4.4s, v30.4s -str q19, [x0, #64] -str q26, [x0, #80] -str q4, [x0, #96] -str q22, [x0, #112] -ldr q22, [x17, #+384] -ldr q4, [x17, #+400] -ldr q26, [x17, #+416] -ldr q19, [x17, #+432] -ldr q30, [x17, #+448] -ldr q6, [x17, #+464] -ldr q5, [x17, #+480] -ldr q10, [x17, #+496] -ldr q24, [x0, #160] -ldr q3, [x0, #176] -ldr q23, [x0, #128] -ldr q25, [x0, #144] -sqrdmulh v21.4S, v24.4S, v4.s[0] -mul v24.4S, v24.4S,v22.s[0] -mla v24.4S, v21.4S, v31.s[0] -sub v21.4s, v23.4s, v24.4s -add v23.4s, v23.4s, v24.4s -sqrdmulh v24.4S, v3.4S, v4.s[0] -mul v3.4S, v3.4S,v22.s[0] -mla v3.4S, v24.4S, v31.s[0] -sub v24.4s, v25.4s, v3.4s -add v25.4s, v25.4s, v3.4s -sqrdmulh v3.4S, v25.4S, v4.s[1] -mul v25.4S, v25.4S,v22.s[1] -mla v25.4S, v3.4S, v31.s[0] -sub v3.4s, v23.4s, v25.4s -add v23.4s, v23.4s, v25.4s -sqrdmulh v25.4S, v24.4S, v4.s[2] -mul v24.4S, v24.4S,v22.s[2] -mla v24.4S, v25.4S, v31.s[0] -sub v25.4s, v21.4s, v24.4s -add v21.4s, v21.4s, v24.4s -trn1 v24.4S, v23.4S, v3.4S -trn2 v9.4S, v23.4S, v3.4S -trn1 v18.4S, v21.4S, v25.4S -trn2 v28.4S, v21.4S, v25.4S -trn2 v21.2D, v24.2D, v18.2D -trn2 v25.2D, v9.2D, v28.2D -trn1 v23.2D, v24.2D, v18.2D -trn1 v3.2D, v9.2D, v28.2D -sqrdmulh v28.4S, v21.4S, v19.4S -mul v21.4S, v21.4S,v26.4S -mla v21.4S, v28.4S, v31.s[0] -sub v28.4s, v23.4s, v21.4s -add v23.4s, v23.4s, v21.4s -sqrdmulh v21.4S, v25.4S, v19.4S -mul v25.4S, v25.4S,v26.4S -mla v25.4S, v21.4S, v31.s[0] -sub v21.4s, v3.4s, v25.4s -add v3.4s, v3.4s, v25.4s -sqrdmulh v25.4S, v3.4S, v6.4S -mul v3.4S, v3.4S,v30.4S -mla v3.4S, v25.4S, v31.s[0] -sub v25.4s, v23.4s, v3.4s -add v23.4s, v23.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v10.4S -mul v21.4S, v21.4S,v5.4S -mla v21.4S, v3.4S, v31.s[0] -sub v3.4s, v28.4s, v21.4s -add v28.4s, v28.4s, v21.4s -str q23, [x0, #128] -str q25, [x0, #144] -str q28, [x0, #160] -str q3, [x0, #176] -ldr q3, [x17, #+512] -ldr q28, [x17, #+528] -ldr q25, [x17, #+544] -ldr q23, [x17, #+560] -ldr q21, [x17, #+576] -ldr q9, [x17, #+592] -ldr q18, [x17, #+608] -ldr q24, [x17, #+624] -ldr q10, [x0, #224] -ldr q5, [x0, #240] -ldr q6, [x0, #192] -ldr q30, [x0, #208] -sqrdmulh v19.4S, v10.4S, v28.s[0] -mul v10.4S, v10.4S,v3.s[0] -mla v10.4S, v19.4S, v31.s[0] -sub v19.4s, v6.4s, v10.4s -add v6.4s, v6.4s, v10.4s -sqrdmulh v10.4S, v5.4S, v28.s[0] -mul v5.4S, v5.4S,v3.s[0] -mla v5.4S, v10.4S, v31.s[0] -sub v10.4s, v30.4s, v5.4s -add v30.4s, v30.4s, v5.4s -sqrdmulh v5.4S, v30.4S, v28.s[1] -mul v30.4S, v30.4S,v3.s[1] -mla v30.4S, v5.4S, v31.s[0] -sub v5.4s, v6.4s, v30.4s -add v6.4s, v6.4s, v30.4s -sqrdmulh v30.4S, v10.4S, v28.s[2] -mul v10.4S, v10.4S,v3.s[2] -mla v10.4S, v30.4S, v31.s[0] -sub v30.4s, v19.4s, v10.4s -add v19.4s, v19.4s, v10.4s -trn1 v10.4S, v6.4S, v5.4S -trn2 v26.4S, v6.4S, v5.4S -trn1 v4.4S, v19.4S, v30.4S -trn2 v22.4S, v19.4S, v30.4S -trn2 v19.2D, v10.2D, v4.2D -trn2 v30.2D, v26.2D, v22.2D -trn1 v6.2D, v10.2D, v4.2D -trn1 v5.2D, v26.2D, v22.2D -sqrdmulh v22.4S, v19.4S, v23.4S -mul v19.4S, v19.4S,v25.4S -mla v19.4S, v22.4S, v31.s[0] -sub v22.4s, v6.4s, v19.4s -add v6.4s, v6.4s, v19.4s -sqrdmulh v19.4S, v30.4S, v23.4S -mul v30.4S, v30.4S,v25.4S -mla v30.4S, v19.4S, v31.s[0] -sub v19.4s, v5.4s, v30.4s -add v5.4s, v5.4s, v30.4s -sqrdmulh v30.4S, v5.4S, v9.4S -mul v5.4S, v5.4S,v21.4S -mla v5.4S, v30.4S, v31.s[0] -sub v30.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v19.4S, v24.4S -mul v19.4S, v19.4S,v18.4S -mla v19.4S, v5.4S, v31.s[0] -sub v5.4s, v22.4s, v19.4s -add v22.4s, v22.4s, v19.4s -str q6, [x0, #192] -str q30, [x0, #208] -str q22, [x0, #224] -str q5, [x0, #240] -ldr q5, [x17, #+640] -ldr q22, [x17, #+656] -ldr q30, [x17, #+672] -ldr q6, [x17, #+688] -ldr q19, [x17, #+704] -ldr q26, [x17, #+720] -ldr q4, [x17, #+736] -ldr q10, [x17, #+752] -ldr q24, [x0, #288] -ldr q18, [x0, #304] -ldr q9, [x0, #256] -ldr q21, [x0, #272] -sqrdmulh v23.4S, v24.4S, v22.s[0] -mul v24.4S, v24.4S,v5.s[0] -mla v24.4S, v23.4S, v31.s[0] -sub v23.4s, v9.4s, v24.4s -add v9.4s, v9.4s, v24.4s -sqrdmulh v24.4S, v18.4S, v22.s[0] -mul v18.4S, v18.4S,v5.s[0] -mla v18.4S, v24.4S, v31.s[0] -sub v24.4s, v21.4s, v18.4s -add v21.4s, v21.4s, v18.4s -sqrdmulh v18.4S, v21.4S, v22.s[1] -mul v21.4S, v21.4S,v5.s[1] -mla v21.4S, v18.4S, v31.s[0] -sub v18.4s, v9.4s, v21.4s -add v9.4s, v9.4s, v21.4s -sqrdmulh v21.4S, v24.4S, v22.s[2] -mul v24.4S, v24.4S,v5.s[2] -mla v24.4S, v21.4S, v31.s[0] -sub v21.4s, v23.4s, v24.4s -add v23.4s, v23.4s, v24.4s -trn1 v24.4S, v9.4S, v18.4S -trn2 v25.4S, v9.4S, v18.4S -trn1 v28.4S, v23.4S, v21.4S -trn2 v3.4S, v23.4S, v21.4S -trn2 v23.2D, v24.2D, v28.2D -trn2 v21.2D, v25.2D, v3.2D -trn1 v9.2D, v24.2D, v28.2D -trn1 v18.2D, v25.2D, v3.2D -sqrdmulh v3.4S, v23.4S, v6.4S -mul v23.4S, v23.4S,v30.4S -mla v23.4S, v3.4S, v31.s[0] -sub v3.4s, v9.4s, v23.4s -add v9.4s, v9.4s, v23.4s -sqrdmulh v23.4S, v21.4S, v6.4S -mul v21.4S, v21.4S,v30.4S -mla v21.4S, v23.4S, v31.s[0] -sub v23.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v18.4S, v26.4S -mul v18.4S, v18.4S,v19.4S -mla v18.4S, v21.4S, v31.s[0] -sub v21.4s, v9.4s, v18.4s -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v23.4S, v10.4S -mul v23.4S, v23.4S,v4.4S -mla v23.4S, v18.4S, v31.s[0] -sub v18.4s, v3.4s, v23.4s -add v3.4s, v3.4s, v23.4s -str q9, [x0, #256] -str q21, [x0, #272] -str q3, [x0, #288] -str q18, [x0, #304] -ldr q18, [x17, #+768] -ldr q3, [x17, #+784] -ldr q21, [x17, #+800] -ldr q9, [x17, #+816] -ldr q23, [x17, #+832] -ldr q25, [x17, #+848] -ldr q28, [x17, #+864] -ldr q24, [x17, #+880] -ldr q10, [x0, #352] -ldr q4, [x0, #368] -ldr q26, [x0, #320] -ldr q19, [x0, #336] -sqrdmulh v6.4S, v10.4S, v3.s[0] -mul v10.4S, v10.4S,v18.s[0] -mla v10.4S, v6.4S, v31.s[0] -sub v6.4s, v26.4s, v10.4s -add v26.4s, v26.4s, v10.4s -sqrdmulh v10.4S, v4.4S, v3.s[0] -mul v4.4S, v4.4S,v18.s[0] -mla v4.4S, v10.4S, v31.s[0] -sub v10.4s, v19.4s, v4.4s -add v19.4s, v19.4s, v4.4s -sqrdmulh v4.4S, v19.4S, v3.s[1] -mul v19.4S, v19.4S,v18.s[1] -mla v19.4S, v4.4S, v31.s[0] -sub v4.4s, v26.4s, v19.4s -add v26.4s, v26.4s, v19.4s -sqrdmulh v19.4S, v10.4S, v3.s[2] -mul v10.4S, v10.4S,v18.s[2] -mla v10.4S, v19.4S, v31.s[0] -sub v19.4s, v6.4s, v10.4s -add v6.4s, v6.4s, v10.4s -trn1 v10.4S, v26.4S, v4.4S -trn2 v30.4S, v26.4S, v4.4S -trn1 v22.4S, v6.4S, v19.4S -trn2 v5.4S, v6.4S, v19.4S -trn2 v6.2D, v10.2D, v22.2D -trn2 v19.2D, v30.2D, v5.2D -trn1 v26.2D, v10.2D, v22.2D -trn1 v4.2D, v30.2D, v5.2D -sqrdmulh v5.4S, v6.4S, v9.4S -mul v6.4S, v6.4S,v21.4S -mla v6.4S, v5.4S, v31.s[0] -sub v5.4s, v26.4s, v6.4s -add v26.4s, v26.4s, v6.4s -sqrdmulh v6.4S, v19.4S, v9.4S -mul v19.4S, v19.4S,v21.4S -mla v19.4S, v6.4S, v31.s[0] -sub v6.4s, v4.4s, v19.4s -add v4.4s, v4.4s, v19.4s -sqrdmulh v19.4S, v4.4S, v25.4S -mul v4.4S, v4.4S,v23.4S -mla v4.4S, v19.4S, v31.s[0] -sub v19.4s, v26.4s, v4.4s -add v26.4s, v26.4s, v4.4s -sqrdmulh v4.4S, v6.4S, v24.4S -mul v6.4S, v6.4S,v28.4S -mla v6.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v6.4s -add v5.4s, v5.4s, v6.4s -str q26, [x0, #320] -str q19, [x0, #336] -str q5, [x0, #352] -str q4, [x0, #368] -ldr q4, [x17, #+896] -ldr q5, [x17, #+912] -ldr q19, [x17, #+928] -ldr q26, [x17, #+944] -ldr q6, [x17, #+960] -ldr q30, [x17, #+976] -ldr q22, [x17, #+992] -ldr q10, [x17, #+1008] -ldr q24, [x0, #416] -ldr q28, [x0, #432] -ldr q25, [x0, #384] -ldr q23, [x0, #400] -sqrdmulh v9.4S, v24.4S, v5.s[0] -mul v24.4S, v24.4S,v4.s[0] -mla v24.4S, v9.4S, v31.s[0] -sub v9.4s, v25.4s, v24.4s -add v25.4s, v25.4s, v24.4s -sqrdmulh v24.4S, v28.4S, v5.s[0] -mul v28.4S, v28.4S,v4.s[0] -mla v28.4S, v24.4S, v31.s[0] -sub v24.4s, v23.4s, v28.4s -add v23.4s, v23.4s, v28.4s -sqrdmulh v28.4S, v23.4S, v5.s[1] -mul v23.4S, v23.4S,v4.s[1] -mla v23.4S, v28.4S, v31.s[0] -sub v28.4s, v25.4s, v23.4s -add v25.4s, v25.4s, v23.4s -sqrdmulh v23.4S, v24.4S, v5.s[2] -mul v24.4S, v24.4S,v4.s[2] -mla v24.4S, v23.4S, v31.s[0] -sub v23.4s, v9.4s, v24.4s -add v9.4s, v9.4s, v24.4s -trn1 v24.4S, v25.4S, v28.4S -trn2 v21.4S, v25.4S, v28.4S -trn1 v3.4S, v9.4S, v23.4S -trn2 v18.4S, v9.4S, v23.4S -trn2 v9.2D, v24.2D, v3.2D -trn2 v23.2D, v21.2D, v18.2D -trn1 v25.2D, v24.2D, v3.2D -trn1 v28.2D, v21.2D, v18.2D -sqrdmulh v18.4S, v9.4S, v26.4S -mul v9.4S, v9.4S,v19.4S -mla v9.4S, v18.4S, v31.s[0] -sub v18.4s, v25.4s, v9.4s -add v25.4s, v25.4s, v9.4s -sqrdmulh v9.4S, v23.4S, v26.4S -mul v23.4S, v23.4S,v19.4S -mla v23.4S, v9.4S, v31.s[0] -sub v9.4s, v28.4s, v23.4s -add v28.4s, v28.4s, v23.4s -sqrdmulh v23.4S, v28.4S, v30.4S -mul v28.4S, v28.4S,v6.4S -mla v28.4S, v23.4S, v31.s[0] -sub v23.4s, v25.4s, v28.4s -add v25.4s, v25.4s, v28.4s -sqrdmulh v28.4S, v9.4S, v10.4S -mul v9.4S, v9.4S,v22.4S -mla v9.4S, v28.4S, v31.s[0] -sub v28.4s, v18.4s, v9.4s -add v18.4s, v18.4s, v9.4s -str q25, [x0, #384] -str q23, [x0, #400] -str q18, [x0, #416] -str q28, [x0, #432] -ldr q28, [x17, #+1024] -ldr q18, [x17, #+1040] -ldr q23, [x17, #+1056] -ldr q25, [x17, #+1072] -ldr q9, [x17, #+1088] -ldr q21, [x17, #+1104] -ldr q3, [x17, #+1120] -ldr q24, [x17, #+1136] -ldr q10, [x0, #480] -ldr q22, [x0, #496] -ldr q30, [x0, #448] -ldr q6, [x0, #464] -sqrdmulh v26.4S, v10.4S, v18.s[0] -mul v10.4S, v10.4S,v28.s[0] -mla v10.4S, v26.4S, v31.s[0] -sub v26.4s, v30.4s, v10.4s -add v30.4s, v30.4s, v10.4s -sqrdmulh v10.4S, v22.4S, v18.s[0] -mul v22.4S, v22.4S,v28.s[0] -mla v22.4S, v10.4S, v31.s[0] -sub v10.4s, v6.4s, v22.4s -add v6.4s, v6.4s, v22.4s -sqrdmulh v22.4S, v6.4S, v18.s[1] -mul v6.4S, v6.4S,v28.s[1] -mla v6.4S, v22.4S, v31.s[0] -sub v22.4s, v30.4s, v6.4s -add v30.4s, v30.4s, v6.4s -sqrdmulh v6.4S, v10.4S, v18.s[2] -mul v10.4S, v10.4S,v28.s[2] -mla v10.4S, v6.4S, v31.s[0] -sub v6.4s, v26.4s, v10.4s -add v26.4s, v26.4s, v10.4s -trn1 v10.4S, v30.4S, v22.4S -trn2 v19.4S, v30.4S, v22.4S -trn1 v5.4S, v26.4S, v6.4S -trn2 v4.4S, v26.4S, v6.4S -trn2 v26.2D, v10.2D, v5.2D -trn2 v6.2D, v19.2D, v4.2D -trn1 v30.2D, v10.2D, v5.2D -trn1 v22.2D, v19.2D, v4.2D -sqrdmulh v4.4S, v26.4S, v25.4S -mul v26.4S, v26.4S,v23.4S -mla v26.4S, v4.4S, v31.s[0] -sub v4.4s, v30.4s, v26.4s -add v30.4s, v30.4s, v26.4s -sqrdmulh v26.4S, v6.4S, v25.4S -mul v6.4S, v6.4S,v23.4S -mla v6.4S, v26.4S, v31.s[0] -sub v26.4s, v22.4s, v6.4s -add v22.4s, v22.4s, v6.4s -sqrdmulh v6.4S, v22.4S, v21.4S -mul v22.4S, v22.4S,v9.4S -mla v22.4S, v6.4S, v31.s[0] -sub v6.4s, v30.4s, v22.4s -add v30.4s, v30.4s, v22.4s -sqrdmulh v22.4S, v26.4S, v24.4S -mul v26.4S, v26.4S,v3.4S -mla v26.4S, v22.4S, v31.s[0] -sub v22.4s, v4.4s, v26.4s -add v4.4s, v4.4s, v26.4s -str q30, [x0, #448] -str q6, [x0, #464] -str q4, [x0, #480] -str q22, [x0, #496] -ldr q22, [x17, #+1152] -ldr q4, [x17, #+1168] -ldr q6, [x17, #+1184] -ldr q30, [x17, #+1200] -ldr q26, [x17, #+1216] -ldr q19, [x17, #+1232] -ldr q5, [x17, #+1248] -ldr q10, [x17, #+1264] -ldr q24, [x0, #544] -ldr q3, [x0, #560] -ldr q21, [x0, #512] -ldr q9, [x0, #528] -sqrdmulh v25.4S, v24.4S, v4.s[0] -mul v24.4S, v24.4S,v22.s[0] -mla v24.4S, v25.4S, v31.s[0] -sub v25.4s, v21.4s, v24.4s -add v21.4s, v21.4s, v24.4s -sqrdmulh v24.4S, v3.4S, v4.s[0] -mul v3.4S, v3.4S,v22.s[0] -mla v3.4S, v24.4S, v31.s[0] -sub v24.4s, v9.4s, v3.4s -add v9.4s, v9.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v4.s[1] -mul v9.4S, v9.4S,v22.s[1] -mla v9.4S, v3.4S, v31.s[0] -sub v3.4s, v21.4s, v9.4s -add v21.4s, v21.4s, v9.4s -sqrdmulh v9.4S, v24.4S, v4.s[2] -mul v24.4S, v24.4S,v22.s[2] -mla v24.4S, v9.4S, v31.s[0] -sub v9.4s, v25.4s, v24.4s -add v25.4s, v25.4s, v24.4s -trn1 v24.4S, v21.4S, v3.4S -trn2 v23.4S, v21.4S, v3.4S -trn1 v18.4S, v25.4S, v9.4S -trn2 v28.4S, v25.4S, v9.4S -trn2 v25.2D, v24.2D, v18.2D -trn2 v9.2D, v23.2D, v28.2D -trn1 v21.2D, v24.2D, v18.2D -trn1 v3.2D, v23.2D, v28.2D -sqrdmulh v28.4S, v25.4S, v30.4S -mul v25.4S, v25.4S,v6.4S -mla v25.4S, v28.4S, v31.s[0] -sub v28.4s, v21.4s, v25.4s -add v21.4s, v21.4s, v25.4s -sqrdmulh v25.4S, v9.4S, v30.4S -mul v9.4S, v9.4S,v6.4S -mla v9.4S, v25.4S, v31.s[0] -sub v25.4s, v3.4s, v9.4s -add v3.4s, v3.4s, v9.4s -sqrdmulh v9.4S, v3.4S, v19.4S -mul v3.4S, v3.4S,v26.4S -mla v3.4S, v9.4S, v31.s[0] -sub v9.4s, v21.4s, v3.4s -add v21.4s, v21.4s, v3.4s -sqrdmulh v3.4S, v25.4S, v10.4S -mul v25.4S, v25.4S,v5.4S -mla v25.4S, v3.4S, v31.s[0] -sub v3.4s, v28.4s, v25.4s -add v28.4s, v28.4s, v25.4s -str q21, [x0, #512] -str q9, [x0, #528] -str q28, [x0, #544] -str q3, [x0, #560] -ldr q3, [x17, #+1280] -ldr q28, [x17, #+1296] -ldr q9, [x17, #+1312] -ldr q21, [x17, #+1328] -ldr q25, [x17, #+1344] -ldr q23, [x17, #+1360] -ldr q18, [x17, #+1376] -ldr q24, [x17, #+1392] -ldr q10, [x0, #608] -ldr q5, [x0, #624] -ldr q19, [x0, #576] -ldr q26, [x0, #592] -sqrdmulh v30.4S, v10.4S, v28.s[0] -mul v10.4S, v10.4S,v3.s[0] -mla v10.4S, v30.4S, v31.s[0] -sub v30.4s, v19.4s, v10.4s -add v19.4s, v19.4s, v10.4s -sqrdmulh v10.4S, v5.4S, v28.s[0] -mul v5.4S, v5.4S,v3.s[0] -mla v5.4S, v10.4S, v31.s[0] -sub v10.4s, v26.4s, v5.4s -add v26.4s, v26.4s, v5.4s -sqrdmulh v5.4S, v26.4S, v28.s[1] -mul v26.4S, v26.4S,v3.s[1] -mla v26.4S, v5.4S, v31.s[0] -sub v5.4s, v19.4s, v26.4s -add v19.4s, v19.4s, v26.4s -sqrdmulh v26.4S, v10.4S, v28.s[2] -mul v10.4S, v10.4S,v3.s[2] -mla v10.4S, v26.4S, v31.s[0] -sub v26.4s, v30.4s, v10.4s -add v30.4s, v30.4s, v10.4s -trn1 v10.4S, v19.4S, v5.4S -trn2 v6.4S, v19.4S, v5.4S -trn1 v4.4S, v30.4S, v26.4S -trn2 v22.4S, v30.4S, v26.4S -trn2 v30.2D, v10.2D, v4.2D -trn2 v26.2D, v6.2D, v22.2D -trn1 v19.2D, v10.2D, v4.2D -trn1 v5.2D, v6.2D, v22.2D -sqrdmulh v22.4S, v30.4S, v21.4S -mul v30.4S, v30.4S,v9.4S -mla v30.4S, v22.4S, v31.s[0] -sub v22.4s, v19.4s, v30.4s -add v19.4s, v19.4s, v30.4s -sqrdmulh v30.4S, v26.4S, v21.4S -mul v26.4S, v26.4S,v9.4S -mla v26.4S, v30.4S, v31.s[0] -sub v30.4s, v5.4s, v26.4s -add v5.4s, v5.4s, v26.4s -sqrdmulh v26.4S, v5.4S, v23.4S -mul v5.4S, v5.4S,v25.4S -mla v5.4S, v26.4S, v31.s[0] -sub v26.4s, v19.4s, v5.4s -add v19.4s, v19.4s, v5.4s -sqrdmulh v5.4S, v30.4S, v24.4S -mul v30.4S, v30.4S,v18.4S -mla v30.4S, v5.4S, v31.s[0] -sub v5.4s, v22.4s, v30.4s -add v22.4s, v22.4s, v30.4s -str q19, [x0, #576] -str q26, [x0, #592] -str q22, [x0, #608] -str q5, [x0, #624] -ldr q5, [x17, #+1408] -ldr q22, [x17, #+1424] -ldr q26, [x17, #+1440] -ldr q19, [x17, #+1456] -ldr q30, [x17, #+1472] -ldr q6, [x17, #+1488] -ldr q4, [x17, #+1504] -ldr q10, [x17, #+1520] -ldr q24, [x0, #672] -ldr q18, [x0, #688] -ldr q23, [x0, #640] -ldr q25, [x0, #656] -sqrdmulh v21.4S, v24.4S, v22.s[0] -mul v24.4S, v24.4S,v5.s[0] -mla v24.4S, v21.4S, v31.s[0] -sub v21.4s, v23.4s, v24.4s -add v23.4s, v23.4s, v24.4s -sqrdmulh v24.4S, v18.4S, v22.s[0] -mul v18.4S, v18.4S,v5.s[0] -mla v18.4S, v24.4S, v31.s[0] -sub v24.4s, v25.4s, v18.4s -add v25.4s, v25.4s, v18.4s -sqrdmulh v18.4S, v25.4S, v22.s[1] -mul v25.4S, v25.4S,v5.s[1] -mla v25.4S, v18.4S, v31.s[0] -sub v18.4s, v23.4s, v25.4s -add v23.4s, v23.4s, v25.4s -sqrdmulh v25.4S, v24.4S, v22.s[2] -mul v24.4S, v24.4S,v5.s[2] -mla v24.4S, v25.4S, v31.s[0] -sub v25.4s, v21.4s, v24.4s -add v21.4s, v21.4s, v24.4s -trn1 v24.4S, v23.4S, v18.4S -trn2 v9.4S, v23.4S, v18.4S -trn1 v28.4S, v21.4S, v25.4S -trn2 v3.4S, v21.4S, v25.4S -trn2 v21.2D, v24.2D, v28.2D -trn2 v25.2D, v9.2D, v3.2D -trn1 v23.2D, v24.2D, v28.2D -trn1 v18.2D, v9.2D, v3.2D -sqrdmulh v3.4S, v21.4S, v19.4S -mul v21.4S, v21.4S,v26.4S -mla v21.4S, v3.4S, v31.s[0] -sub v3.4s, v23.4s, v21.4s -add v23.4s, v23.4s, v21.4s -sqrdmulh v21.4S, v25.4S, v19.4S -mul v25.4S, v25.4S,v26.4S -mla v25.4S, v21.4S, v31.s[0] -sub v21.4s, v18.4s, v25.4s -add v18.4s, v18.4s, v25.4s -sqrdmulh v25.4S, v18.4S, v6.4S -mul v18.4S, v18.4S,v30.4S -mla v18.4S, v25.4S, v31.s[0] -sub v25.4s, v23.4s, v18.4s -add v23.4s, v23.4s, v18.4s -sqrdmulh v18.4S, v21.4S, v10.4S -mul v21.4S, v21.4S,v4.4S -mla v21.4S, v18.4S, v31.s[0] -sub v18.4s, v3.4s, v21.4s -add v3.4s, v3.4s, v21.4s -str q23, [x0, #640] -str q25, [x0, #656] -str q3, [x0, #672] -str q18, [x0, #688] -ldr q18, [x17, #+1536] -ldr q3, [x17, #+1552] -ldr q25, [x17, #+1568] -ldr q23, [x17, #+1584] -ldr q21, [x17, #+1600] -ldr q9, [x17, #+1616] -ldr q28, [x17, #+1632] -ldr q24, [x17, #+1648] -ldr q10, [x0, #736] -ldr q4, [x0, #752] -ldr q6, [x0, #704] -ldr q30, [x0, #720] -sqrdmulh v19.4S, v10.4S, v3.s[0] -mul v10.4S, v10.4S,v18.s[0] -mla v10.4S, v19.4S, v31.s[0] -sub v19.4s, v6.4s, v10.4s -add v6.4s, v6.4s, v10.4s -sqrdmulh v10.4S, v4.4S, v3.s[0] -mul v4.4S, v4.4S,v18.s[0] -mla v4.4S, v10.4S, v31.s[0] -sub v10.4s, v30.4s, v4.4s -add v30.4s, v30.4s, v4.4s -sqrdmulh v4.4S, v30.4S, v3.s[1] -mul v30.4S, v30.4S,v18.s[1] -mla v30.4S, v4.4S, v31.s[0] -sub v4.4s, v6.4s, v30.4s -add v6.4s, v6.4s, v30.4s -sqrdmulh v30.4S, v10.4S, v3.s[2] -mul v10.4S, v10.4S,v18.s[2] -mla v10.4S, v30.4S, v31.s[0] -sub v30.4s, v19.4s, v10.4s -add v19.4s, v19.4s, v10.4s -trn1 v10.4S, v6.4S, v4.4S -trn2 v26.4S, v6.4S, v4.4S -trn1 v22.4S, v19.4S, v30.4S -trn2 v5.4S, v19.4S, v30.4S -trn2 v19.2D, v10.2D, v22.2D -trn2 v30.2D, v26.2D, v5.2D -trn1 v6.2D, v10.2D, v22.2D -trn1 v4.2D, v26.2D, v5.2D -sqrdmulh v5.4S, v19.4S, v23.4S -mul v19.4S, v19.4S,v25.4S -mla v19.4S, v5.4S, v31.s[0] -sub v5.4s, v6.4s, v19.4s -add v6.4s, v6.4s, v19.4s -sqrdmulh v19.4S, v30.4S, v23.4S -mul v30.4S, v30.4S,v25.4S -mla v30.4S, v19.4S, v31.s[0] -sub v19.4s, v4.4s, v30.4s -add v4.4s, v4.4s, v30.4s -sqrdmulh v30.4S, v4.4S, v9.4S -mul v4.4S, v4.4S,v21.4S -mla v4.4S, v30.4S, v31.s[0] -sub v30.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -sqrdmulh v4.4S, v19.4S, v24.4S -mul v19.4S, v19.4S,v28.4S -mla v19.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v19.4s -add v5.4s, v5.4s, v19.4s -str q6, [x0, #704] -str q30, [x0, #720] -str q5, [x0, #736] -str q4, [x0, #752] -ldr q4, [x17, #+1664] -ldr q5, [x17, #+1680] -ldr q30, [x17, #+1696] -ldr q6, [x17, #+1712] -ldr q19, [x17, #+1728] -ldr q26, [x17, #+1744] -ldr q22, [x17, #+1760] -ldr q10, [x17, #+1776] -ldr q24, [x0, #800] -ldr q28, [x0, #816] -ldr q9, [x0, #768] -ldr q21, [x0, #784] -sqrdmulh v23.4S, v24.4S, v5.s[0] -mul v24.4S, v24.4S,v4.s[0] -mla v24.4S, v23.4S, v31.s[0] -sub v23.4s, v9.4s, v24.4s -add v9.4s, v9.4s, v24.4s -sqrdmulh v24.4S, v28.4S, v5.s[0] -mul v28.4S, v28.4S,v4.s[0] -mla v28.4S, v24.4S, v31.s[0] -sub v24.4s, v21.4s, v28.4s -add v21.4s, v21.4s, v28.4s -sqrdmulh v28.4S, v21.4S, v5.s[1] -mul v21.4S, v21.4S,v4.s[1] -mla v21.4S, v28.4S, v31.s[0] -sub v28.4s, v9.4s, v21.4s -add v9.4s, v9.4s, v21.4s -sqrdmulh v21.4S, v24.4S, v5.s[2] -mul v24.4S, v24.4S,v4.s[2] -mla v24.4S, v21.4S, v31.s[0] -sub v21.4s, v23.4s, v24.4s -add v23.4s, v23.4s, v24.4s -trn1 v24.4S, v9.4S, v28.4S -trn2 v25.4S, v9.4S, v28.4S -trn1 v3.4S, v23.4S, v21.4S -trn2 v18.4S, v23.4S, v21.4S -trn2 v23.2D, v24.2D, v3.2D -trn2 v21.2D, v25.2D, v18.2D -trn1 v9.2D, v24.2D, v3.2D -trn1 v28.2D, v25.2D, v18.2D -sqrdmulh v18.4S, v23.4S, v6.4S -mul v23.4S, v23.4S,v30.4S -mla v23.4S, v18.4S, v31.s[0] -sub v18.4s, v9.4s, v23.4s -add v9.4s, v9.4s, v23.4s -sqrdmulh v23.4S, v21.4S, v6.4S -mul v21.4S, v21.4S,v30.4S -mla v21.4S, v23.4S, v31.s[0] -sub v23.4s, v28.4s, v21.4s -add v28.4s, v28.4s, v21.4s -sqrdmulh v21.4S, v28.4S, v26.4S -mul v28.4S, v28.4S,v19.4S -mla v28.4S, v21.4S, v31.s[0] -sub v21.4s, v9.4s, v28.4s -add v9.4s, v9.4s, v28.4s -sqrdmulh v28.4S, v23.4S, v10.4S -mul v23.4S, v23.4S,v22.4S -mla v23.4S, v28.4S, v31.s[0] -sub v28.4s, v18.4s, v23.4s -add v18.4s, v18.4s, v23.4s -str q9, [x0, #768] -str q21, [x0, #784] -str q18, [x0, #800] -str q28, [x0, #816] -ldr q28, [x17, #+1792] -ldr q18, [x17, #+1808] -ldr q21, [x17, #+1824] -ldr q9, [x17, #+1840] -ldr q23, [x17, #+1856] -ldr q25, [x17, #+1872] -ldr q3, [x17, #+1888] -ldr q24, [x17, #+1904] -ldr q10, [x0, #864] -ldr q22, [x0, #880] -ldr q26, [x0, #832] -ldr q19, [x0, #848] -sqrdmulh v6.4S, v10.4S, v18.s[0] -mul v10.4S, v10.4S,v28.s[0] -mla v10.4S, v6.4S, v31.s[0] -sub v6.4s, v26.4s, v10.4s -add v26.4s, v26.4s, v10.4s -sqrdmulh v10.4S, v22.4S, v18.s[0] -mul v22.4S, v22.4S,v28.s[0] -mla v22.4S, v10.4S, v31.s[0] -sub v10.4s, v19.4s, v22.4s -add v19.4s, v19.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v18.s[1] -mul v19.4S, v19.4S,v28.s[1] -mla v19.4S, v22.4S, v31.s[0] -sub v22.4s, v26.4s, v19.4s -add v26.4s, v26.4s, v19.4s -sqrdmulh v19.4S, v10.4S, v18.s[2] -mul v10.4S, v10.4S,v28.s[2] -mla v10.4S, v19.4S, v31.s[0] -sub v19.4s, v6.4s, v10.4s -add v6.4s, v6.4s, v10.4s -trn1 v10.4S, v26.4S, v22.4S -trn2 v30.4S, v26.4S, v22.4S -trn1 v5.4S, v6.4S, v19.4S -trn2 v4.4S, v6.4S, v19.4S -trn2 v6.2D, v10.2D, v5.2D -trn2 v19.2D, v30.2D, v4.2D -trn1 v26.2D, v10.2D, v5.2D -trn1 v22.2D, v30.2D, v4.2D -sqrdmulh v4.4S, v6.4S, v9.4S -mul v6.4S, v6.4S,v21.4S -mla v6.4S, v4.4S, v31.s[0] -sub v4.4s, v26.4s, v6.4s -add v26.4s, v26.4s, v6.4s -sqrdmulh v6.4S, v19.4S, v9.4S -mul v19.4S, v19.4S,v21.4S -mla v19.4S, v6.4S, v31.s[0] -sub v6.4s, v22.4s, v19.4s -add v22.4s, v22.4s, v19.4s -sqrdmulh v19.4S, v22.4S, v25.4S -mul v22.4S, v22.4S,v23.4S -mla v22.4S, v19.4S, v31.s[0] -sub v19.4s, v26.4s, v22.4s -add v26.4s, v26.4s, v22.4s -sqrdmulh v22.4S, v6.4S, v24.4S -mul v6.4S, v6.4S,v3.4S -mla v6.4S, v22.4S, v31.s[0] -sub v22.4s, v4.4s, v6.4s -add v4.4s, v4.4s, v6.4s -str q26, [x0, #832] -str q19, [x0, #848] -str q4, [x0, #864] -str q22, [x0, #880] -ldr q22, [x17, #+1920] -ldr q4, [x17, #+1936] -ldr q19, [x17, #+1952] -ldr q26, [x17, #+1968] -ldr q6, [x17, #+1984] -ldr q30, [x17, #+2000] -ldr q5, [x17, #+2016] -ldr q10, [x17, #+2032] -ldr q24, [x0, #928] -ldr q3, [x0, #944] -ldr q25, [x0, #896] -ldr q23, [x0, #912] -sqrdmulh v9.4S, v24.4S, v4.s[0] -mul v24.4S, v24.4S,v22.s[0] -mla v24.4S, v9.4S, v31.s[0] -sub v9.4s, v25.4s, v24.4s -add v25.4s, v25.4s, v24.4s -sqrdmulh v24.4S, v3.4S, v4.s[0] -mul v3.4S, v3.4S,v22.s[0] -mla v3.4S, v24.4S, v31.s[0] -sub v24.4s, v23.4s, v3.4s -add v23.4s, v23.4s, v3.4s -sqrdmulh v3.4S, v23.4S, v4.s[1] -mul v23.4S, v23.4S,v22.s[1] -mla v23.4S, v3.4S, v31.s[0] -sub v3.4s, v25.4s, v23.4s -add v25.4s, v25.4s, v23.4s -sqrdmulh v23.4S, v24.4S, v4.s[2] -mul v24.4S, v24.4S,v22.s[2] -mla v24.4S, v23.4S, v31.s[0] -sub v23.4s, v9.4s, v24.4s -add v9.4s, v9.4s, v24.4s -trn1 v24.4S, v25.4S, v3.4S -trn2 v21.4S, v25.4S, v3.4S -trn1 v18.4S, v9.4S, v23.4S -trn2 v28.4S, v9.4S, v23.4S -trn2 v9.2D, v24.2D, v18.2D -trn2 v23.2D, v21.2D, v28.2D -trn1 v25.2D, v24.2D, v18.2D -trn1 v3.2D, v21.2D, v28.2D -sqrdmulh v28.4S, v9.4S, v26.4S -mul v9.4S, v9.4S,v19.4S -mla v9.4S, v28.4S, v31.s[0] -sub v28.4s, v25.4s, v9.4s -add v25.4s, v25.4s, v9.4s -sqrdmulh v9.4S, v23.4S, v26.4S -mul v23.4S, v23.4S,v19.4S -mla v23.4S, v9.4S, v31.s[0] -sub v9.4s, v3.4s, v23.4s -add v3.4s, v3.4s, v23.4s -sqrdmulh v23.4S, v3.4S, v30.4S -mul v3.4S, v3.4S,v6.4S -mla v3.4S, v23.4S, v31.s[0] -sub v23.4s, v25.4s, v3.4s -add v25.4s, v25.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v10.4S -mul v9.4S, v9.4S,v5.4S -mla v9.4S, v3.4S, v31.s[0] -sub v3.4s, v28.4s, v9.4s -add v28.4s, v28.4s, v9.4s -str q25, [x0, #896] -str q23, [x0, #912] -str q28, [x0, #928] -str q3, [x0, #944] -ldr q3, [x17, #+2048] -ldr q28, [x17, #+2064] -ldr q23, [x17, #+2080] -ldr q25, [x17, #+2096] -ldr q9, [x17, #+2112] -ldr q21, [x17, #+2128] -ldr q18, [x17, #+2144] -ldr q24, [x17, #+2160] -ldr q10, [x0, #992] -ldr q5, [x0, #1008] -ldr q30, [x0, #960] -ldr q6, [x0, #976] -sqrdmulh v26.4S, v10.4S, v28.s[0] -mul v10.4S, v10.4S,v3.s[0] -mla v10.4S, v26.4S, v31.s[0] -sub v26.4s, v30.4s, v10.4s -add v30.4s, v30.4s, v10.4s -sqrdmulh v10.4S, v5.4S, v28.s[0] -mul v5.4S, v5.4S,v3.s[0] -mla v5.4S, v10.4S, v31.s[0] -sub v10.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v6.4S, v28.s[1] -mul v6.4S, v6.4S,v3.s[1] -mla v6.4S, v5.4S, v31.s[0] -sub v5.4s, v30.4s, v6.4s -add v30.4s, v30.4s, v6.4s -sqrdmulh v6.4S, v10.4S, v28.s[2] -mul v10.4S, v10.4S,v3.s[2] -mla v10.4S, v6.4S, v31.s[0] -sub v6.4s, v26.4s, v10.4s -add v26.4s, v26.4s, v10.4s -trn1 v10.4S, v30.4S, v5.4S -trn2 v19.4S, v30.4S, v5.4S -trn1 v4.4S, v26.4S, v6.4S -trn2 v22.4S, v26.4S, v6.4S -trn2 v26.2D, v10.2D, v4.2D -trn2 v6.2D, v19.2D, v22.2D -trn1 v30.2D, v10.2D, v4.2D -trn1 v5.2D, v19.2D, v22.2D -sqrdmulh v22.4S, v26.4S, v25.4S -mul v26.4S, v26.4S,v23.4S -mla v26.4S, v22.4S, v31.s[0] -sub v22.4s, v30.4s, v26.4s -add v30.4s, v30.4s, v26.4s -sqrdmulh v26.4S, v6.4S, v25.4S -mul v6.4S, v6.4S,v23.4S -mla v6.4S, v26.4S, v31.s[0] -sub v26.4s, v5.4s, v6.4s -add v5.4s, v5.4s, v6.4s -sqrdmulh v6.4S, v5.4S, v21.4S -mul v5.4S, v5.4S,v9.4S -mla v5.4S, v6.4S, v31.s[0] -sub v6.4s, v30.4s, v5.4s -add v30.4s, v30.4s, v5.4s -sqrdmulh v5.4S, v26.4S, v24.4S -mul v26.4S, v26.4S,v18.4S -mla v26.4S, v5.4S, v31.s[0] -sub v5.4s, v22.4s, v26.4s -add v22.4s, v22.4s, v26.4s -str q30, [x0, #960] -str q6, [x0, #976] -str q22, [x0, #992] -str q5, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2392 -// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_5_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_5_0.s deleted file mode 100644 index c47ecf1..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_5_0.s +++ /dev/null @@ -1,2422 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_5_0 -.global _ntt_u32_full_neon_asm_var_4_4_5_0 -ntt_u32_full_neon_asm_var_4_4_5_0: -_ntt_u32_full_neon_asm_var_4_4_5_0: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x0, #800] -ldr q29, [x0, #864] -ldr q28, [x0, #928] -ldr q27, [x0, #992] -ldr q26, [x0, #288] -ldr q25, [x0, #352] -ldr q24, [x0, #416] -ldr q23, [x0, #480] -ldr q22, [x0, #544] -ldr q21, [x0, #608] -ldr q20, [x0, #672] -ldr q19, [x0, #736] -ldr q18, [x0, #32] -ldr q17, [x0, #96] -ldr q16, [x0, #160] -ldr q3, [x0, #224] -ldr q2, [x17, #+0] -ldr q1, [x17, #+16] -ldr q0, [x17, #+32] -ldr q15, [x17, #+48] -ldr q14, [x17, #+64] -ldr q13, [x17, #+80] -ldr q12, [x17, #+96] -ldr q11, [x17, #+112] -sqrdmulh v10.4S, v30.4S, v1.s[0] -mul v30.4S, v30.4S,v2.s[0] -sqrdmulh v9.4S, v29.4S, v1.s[0] -mul v29.4S, v29.4S,v2.s[0] -sqrdmulh v8.4S, v28.4S, v1.s[0] -mul v28.4S, v28.4S,v2.s[0] -sqrdmulh v7.4S, v27.4S, v1.s[0] -mul v27.4S, v27.4S,v2.s[0] -mla v30.4S, v10.4S, v31.s[0] -mla v29.4S, v9.4S, v31.s[0] -mla v28.4S, v8.4S, v31.s[0] -mla v27.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v22.4S, v1.s[0] -mul v22.4S, v22.4S,v2.s[0] -sqrdmulh v8.4S, v21.4S, v1.s[0] -mul v21.4S, v21.4S,v2.s[0] -sqrdmulh v9.4S, v20.4S, v1.s[0] -mul v20.4S, v20.4S,v2.s[0] -sqrdmulh v10.4S, v19.4S, v1.s[0] -mul v19.4S, v19.4S,v2.s[0] -mla v22.4S, v7.4S, v31.s[0] -mla v21.4S, v8.4S, v31.s[0] -mla v20.4S, v9.4S, v31.s[0] -mla v19.4S, v10.4S, v31.s[0] -sub v10.4s, v26.4s, v30.4s -add v26.4s, v26.4s, v30.4s -sub v30.4s, v25.4s, v29.4s -add v25.4s, v25.4s, v29.4s -sub v29.4s, v24.4s, v28.4s -add v24.4s, v24.4s, v28.4s -sub v28.4s, v23.4s, v27.4s -add v23.4s, v23.4s, v27.4s -sub v27.4s, v18.4s, v22.4s -add v18.4s, v18.4s, v22.4s -sub v22.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -sub v21.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -sub v20.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v24.4S, v1.s[1] -mul v24.4S, v24.4S,v2.s[1] -sqrdmulh v9.4S, v23.4S, v1.s[1] -mul v23.4S, v23.4S,v2.s[1] -sqrdmulh v8.4S, v26.4S, v1.s[1] -mul v26.4S, v26.4S,v2.s[1] -sqrdmulh v7.4S, v25.4S, v1.s[1] -mul v25.4S, v25.4S,v2.s[1] -mla v24.4S, v19.4S, v31.s[0] -mla v23.4S, v9.4S, v31.s[0] -mla v26.4S, v8.4S, v31.s[0] -mla v25.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v29.4S, v1.s[2] -mul v29.4S, v29.4S,v2.s[2] -sqrdmulh v8.4S, v28.4S, v1.s[2] -mul v28.4S, v28.4S,v2.s[2] -sqrdmulh v9.4S, v10.4S, v1.s[2] -mul v10.4S, v10.4S,v2.s[2] -sqrdmulh v19.4S, v30.4S, v1.s[2] -mul v30.4S, v30.4S,v2.s[2] -mla v29.4S, v7.4S, v31.s[0] -mla v28.4S, v8.4S, v31.s[0] -mla v10.4S, v9.4S, v31.s[0] -mla v30.4S, v19.4S, v31.s[0] -sub v19.4s, v16.4s, v24.4s -add v16.4s, v16.4s, v24.4s -sub v24.4s, v3.4s, v23.4s -add v3.4s, v3.4s, v23.4s -sub v23.4s, v18.4s, v26.4s -add v18.4s, v18.4s, v26.4s -sub v26.4s, v17.4s, v25.4s -add v17.4s, v17.4s, v25.4s -sub v25.4s, v21.4s, v29.4s -add v21.4s, v21.4s, v29.4s -sub v29.4s, v20.4s, v28.4s -add v20.4s, v20.4s, v28.4s -sub v28.4s, v27.4s, v10.4s -add v27.4s, v27.4s, v10.4s -sub v10.4s, v22.4s, v30.4s -add v22.4s, v22.4s, v30.4s -sqrdmulh v30.4S, v16.4S, v15.s[0] -mul v16.4S, v16.4S,v0.s[0] -sqrdmulh v9.4S, v3.4S, v15.s[0] -mul v3.4S, v3.4S,v0.s[0] -sqrdmulh v8.4S, v19.4S, v15.s[1] -mul v19.4S, v19.4S,v0.s[1] -sqrdmulh v7.4S, v24.4S, v15.s[1] -mul v24.4S, v24.4S,v0.s[1] -mla v16.4S, v30.4S, v31.s[0] -mla v3.4S, v9.4S, v31.s[0] -mla v19.4S, v8.4S, v31.s[0] -mla v24.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v21.4S, v15.s[2] -mul v21.4S, v21.4S,v0.s[2] -sqrdmulh v8.4S, v20.4S, v15.s[2] -mul v20.4S, v20.4S,v0.s[2] -sqrdmulh v9.4S, v25.4S, v15.s[3] -mul v25.4S, v25.4S,v0.s[3] -sqrdmulh v30.4S, v29.4S, v15.s[3] -mul v29.4S, v29.4S,v0.s[3] -mla v21.4S, v7.4S, v31.s[0] -mla v20.4S, v8.4S, v31.s[0] -mla v25.4S, v9.4S, v31.s[0] -mla v29.4S, v30.4S, v31.s[0] -sub v30.4s, v18.4s, v16.4s -add v18.4s, v18.4s, v16.4s -sub v16.4s, v17.4s, v3.4s -add v17.4s, v17.4s, v3.4s -sub v3.4s, v23.4s, v19.4s -add v23.4s, v23.4s, v19.4s -sub v19.4s, v26.4s, v24.4s -add v26.4s, v26.4s, v24.4s -sub v24.4s, v27.4s, v21.4s -add v27.4s, v27.4s, v21.4s -sub v21.4s, v22.4s, v20.4s -add v22.4s, v22.4s, v20.4s -sub v20.4s, v28.4s, v25.4s -add v28.4s, v28.4s, v25.4s -sub v25.4s, v10.4s, v29.4s -add v10.4s, v10.4s, v29.4s -sqrdmulh v29.4S, v17.4S, v13.s[0] -mul v17.4S, v17.4S,v14.s[0] -sqrdmulh v9.4S, v16.4S, v13.s[1] -mul v16.4S, v16.4S,v14.s[1] -sqrdmulh v8.4S, v26.4S, v13.s[2] -mul v26.4S, v26.4S,v14.s[2] -sqrdmulh v7.4S, v19.4S, v13.s[3] -mul v19.4S, v19.4S,v14.s[3] -mla v17.4S, v29.4S, v31.s[0] -mla v16.4S, v9.4S, v31.s[0] -mla v26.4S, v8.4S, v31.s[0] -mla v19.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v22.4S, v11.s[0] -mul v22.4S, v22.4S,v12.s[0] -sqrdmulh v8.4S, v21.4S, v11.s[1] -mul v21.4S, v21.4S,v12.s[1] -sqrdmulh v9.4S, v10.4S, v11.s[2] -mul v10.4S, v10.4S,v12.s[2] -sqrdmulh v29.4S, v25.4S, v11.s[3] -mul v25.4S, v25.4S,v12.s[3] -mla v22.4S, v7.4S, v31.s[0] -mla v21.4S, v8.4S, v31.s[0] -mla v10.4S, v9.4S, v31.s[0] -mla v25.4S, v29.4S, v31.s[0] -sub v29.4s, v18.4s, v17.4s -add v18.4s, v18.4s, v17.4s -sub v17.4s, v30.4s, v16.4s -add v30.4s, v30.4s, v16.4s -sub v16.4s, v23.4s, v26.4s -add v23.4s, v23.4s, v26.4s -sub v26.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sub v19.4s, v27.4s, v22.4s -add v27.4s, v27.4s, v22.4s -sub v22.4s, v24.4s, v21.4s -add v24.4s, v24.4s, v21.4s -sub v21.4s, v28.4s, v10.4s -add v28.4s, v28.4s, v10.4s -sub v10.4s, v20.4s, v25.4s -add v20.4s, v20.4s, v25.4s -str q18, [x0, #32] -str q29, [x0, #96] -str q30, [x0, #160] -str q17, [x0, #224] -str q23, [x0, #288] -str q16, [x0, #352] -str q3, [x0, #416] -str q26, [x0, #480] -str q27, [x0, #544] -str q19, [x0, #608] -str q24, [x0, #672] -str q22, [x0, #736] -str q28, [x0, #800] -str q21, [x0, #864] -str q20, [x0, #928] -str q10, [x0, #992] -ldr q10, [x0, #816] -ldr q20, [x0, #880] -ldr q21, [x0, #944] -ldr q28, [x0, #1008] -ldr q22, [x0, #304] -ldr q24, [x0, #368] -ldr q19, [x0, #432] -ldr q27, [x0, #496] -ldr q26, [x0, #560] -ldr q3, [x0, #624] -ldr q16, [x0, #688] -ldr q23, [x0, #752] -ldr q17, [x0, #48] -ldr q30, [x0, #112] -ldr q29, [x0, #176] -ldr q18, [x0, #240] -sqrdmulh v25.4S, v10.4S, v1.s[0] -mul v10.4S, v10.4S,v2.s[0] -sqrdmulh v9.4S, v20.4S, v1.s[0] -mul v20.4S, v20.4S,v2.s[0] -sqrdmulh v8.4S, v21.4S, v1.s[0] -mul v21.4S, v21.4S,v2.s[0] -sqrdmulh v7.4S, v28.4S, v1.s[0] -mul v28.4S, v28.4S,v2.s[0] -mla v10.4S, v25.4S, v31.s[0] -mla v20.4S, v9.4S, v31.s[0] -mla v21.4S, v8.4S, v31.s[0] -mla v28.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v26.4S, v1.s[0] -mul v26.4S, v26.4S,v2.s[0] -sqrdmulh v8.4S, v3.4S, v1.s[0] -mul v3.4S, v3.4S,v2.s[0] -sqrdmulh v9.4S, v16.4S, v1.s[0] -mul v16.4S, v16.4S,v2.s[0] -sqrdmulh v25.4S, v23.4S, v1.s[0] -mul v23.4S, v23.4S,v2.s[0] -mla v26.4S, v7.4S, v31.s[0] -mla v3.4S, v8.4S, v31.s[0] -mla v16.4S, v9.4S, v31.s[0] -mla v23.4S, v25.4S, v31.s[0] -sub v25.4s, v22.4s, v10.4s -add v22.4s, v22.4s, v10.4s -sub v10.4s, v24.4s, v20.4s -add v24.4s, v24.4s, v20.4s -sub v20.4s, v19.4s, v21.4s -add v19.4s, v19.4s, v21.4s -sub v21.4s, v27.4s, v28.4s -add v27.4s, v27.4s, v28.4s -sub v28.4s, v17.4s, v26.4s -add v17.4s, v17.4s, v26.4s -sub v26.4s, v30.4s, v3.4s -add v30.4s, v30.4s, v3.4s -sub v3.4s, v29.4s, v16.4s -add v29.4s, v29.4s, v16.4s -sub v16.4s, v18.4s, v23.4s -add v18.4s, v18.4s, v23.4s -sqrdmulh v23.4S, v19.4S, v1.s[1] -mul v19.4S, v19.4S,v2.s[1] -sqrdmulh v9.4S, v27.4S, v1.s[1] -mul v27.4S, v27.4S,v2.s[1] -sqrdmulh v8.4S, v22.4S, v1.s[1] -mul v22.4S, v22.4S,v2.s[1] -sqrdmulh v7.4S, v24.4S, v1.s[1] -mul v24.4S, v24.4S,v2.s[1] -mla v19.4S, v23.4S, v31.s[0] -mla v27.4S, v9.4S, v31.s[0] -mla v22.4S, v8.4S, v31.s[0] -mla v24.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v20.4S, v1.s[2] -mul v20.4S, v20.4S,v2.s[2] -sqrdmulh v8.4S, v21.4S, v1.s[2] -mul v21.4S, v21.4S,v2.s[2] -sqrdmulh v9.4S, v25.4S, v1.s[2] -mul v25.4S, v25.4S,v2.s[2] -sqrdmulh v23.4S, v10.4S, v1.s[2] -mul v10.4S, v10.4S,v2.s[2] -mla v20.4S, v7.4S, v31.s[0] -mla v21.4S, v8.4S, v31.s[0] -mla v25.4S, v9.4S, v31.s[0] -mla v10.4S, v23.4S, v31.s[0] -sub v23.4s, v29.4s, v19.4s -add v29.4s, v29.4s, v19.4s -sub v19.4s, v18.4s, v27.4s -add v18.4s, v18.4s, v27.4s -sub v27.4s, v17.4s, v22.4s -add v17.4s, v17.4s, v22.4s -sub v22.4s, v30.4s, v24.4s -add v30.4s, v30.4s, v24.4s -sub v24.4s, v3.4s, v20.4s -add v3.4s, v3.4s, v20.4s -sub v20.4s, v16.4s, v21.4s -add v16.4s, v16.4s, v21.4s -sub v21.4s, v28.4s, v25.4s -add v28.4s, v28.4s, v25.4s -sub v25.4s, v26.4s, v10.4s -add v26.4s, v26.4s, v10.4s -sqrdmulh v10.4S, v29.4S, v15.s[0] -mul v29.4S, v29.4S,v0.s[0] -sqrdmulh v9.4S, v18.4S, v15.s[0] -mul v18.4S, v18.4S,v0.s[0] -sqrdmulh v8.4S, v23.4S, v15.s[1] -mul v23.4S, v23.4S,v0.s[1] -sqrdmulh v7.4S, v19.4S, v15.s[1] -mul v19.4S, v19.4S,v0.s[1] -mla v29.4S, v10.4S, v31.s[0] -mla v18.4S, v9.4S, v31.s[0] -mla v23.4S, v8.4S, v31.s[0] -mla v19.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v3.4S, v15.s[2] -mul v3.4S, v3.4S,v0.s[2] -sqrdmulh v8.4S, v16.4S, v15.s[2] -mul v16.4S, v16.4S,v0.s[2] -sqrdmulh v9.4S, v24.4S, v15.s[3] -mul v24.4S, v24.4S,v0.s[3] -sqrdmulh v10.4S, v20.4S, v15.s[3] -mul v20.4S, v20.4S,v0.s[3] -mla v3.4S, v7.4S, v31.s[0] -mla v16.4S, v8.4S, v31.s[0] -mla v24.4S, v9.4S, v31.s[0] -mla v20.4S, v10.4S, v31.s[0] -sub v10.4s, v17.4s, v29.4s -add v17.4s, v17.4s, v29.4s -sub v29.4s, v30.4s, v18.4s -add v30.4s, v30.4s, v18.4s -sub v18.4s, v27.4s, v23.4s -add v27.4s, v27.4s, v23.4s -sub v23.4s, v22.4s, v19.4s -add v22.4s, v22.4s, v19.4s -sub v19.4s, v28.4s, v3.4s -add v28.4s, v28.4s, v3.4s -sub v3.4s, v26.4s, v16.4s -add v26.4s, v26.4s, v16.4s -sub v16.4s, v21.4s, v24.4s -add v21.4s, v21.4s, v24.4s -sub v24.4s, v25.4s, v20.4s -add v25.4s, v25.4s, v20.4s -sqrdmulh v20.4S, v30.4S, v13.s[0] -mul v30.4S, v30.4S,v14.s[0] -sqrdmulh v9.4S, v29.4S, v13.s[1] -mul v29.4S, v29.4S,v14.s[1] -sqrdmulh v8.4S, v22.4S, v13.s[2] -mul v22.4S, v22.4S,v14.s[2] -sqrdmulh v7.4S, v23.4S, v13.s[3] -mul v23.4S, v23.4S,v14.s[3] -mla v30.4S, v20.4S, v31.s[0] -mla v29.4S, v9.4S, v31.s[0] -mla v22.4S, v8.4S, v31.s[0] -mla v23.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v26.4S, v11.s[0] -mul v26.4S, v26.4S,v12.s[0] -sqrdmulh v8.4S, v3.4S, v11.s[1] -mul v3.4S, v3.4S,v12.s[1] -sqrdmulh v9.4S, v25.4S, v11.s[2] -mul v25.4S, v25.4S,v12.s[2] -sqrdmulh v20.4S, v24.4S, v11.s[3] -mul v24.4S, v24.4S,v12.s[3] -mla v26.4S, v7.4S, v31.s[0] -mla v3.4S, v8.4S, v31.s[0] -mla v25.4S, v9.4S, v31.s[0] -mla v24.4S, v20.4S, v31.s[0] -sub v20.4s, v17.4s, v30.4s -add v17.4s, v17.4s, v30.4s -sub v30.4s, v10.4s, v29.4s -add v10.4s, v10.4s, v29.4s -sub v29.4s, v27.4s, v22.4s -add v27.4s, v27.4s, v22.4s -sub v22.4s, v18.4s, v23.4s -add v18.4s, v18.4s, v23.4s -sub v23.4s, v28.4s, v26.4s -add v28.4s, v28.4s, v26.4s -sub v26.4s, v19.4s, v3.4s -add v19.4s, v19.4s, v3.4s -sub v3.4s, v21.4s, v25.4s -add v21.4s, v21.4s, v25.4s -sub v25.4s, v16.4s, v24.4s -add v16.4s, v16.4s, v24.4s -str q17, [x0, #48] -str q20, [x0, #112] -str q10, [x0, #176] -str q30, [x0, #240] -str q27, [x0, #304] -str q29, [x0, #368] -str q18, [x0, #432] -str q22, [x0, #496] -str q28, [x0, #560] -str q23, [x0, #624] -str q19, [x0, #688] -str q26, [x0, #752] -str q21, [x0, #816] -str q3, [x0, #880] -str q16, [x0, #944] -str q25, [x0, #1008] -ldr q25, [x0, #768] -ldr q16, [x0, #832] -ldr q3, [x0, #896] -ldr q21, [x0, #960] -ldr q26, [x0, #256] -ldr q19, [x0, #320] -ldr q23, [x0, #384] -ldr q28, [x0, #448] -ldr q22, [x0, #512] -ldr q18, [x0, #576] -ldr q29, [x0, #640] -ldr q27, [x0, #704] -ldr q30, [x0, #0] -ldr q10, [x0, #64] -ldr q20, [x0, #128] -ldr q17, [x0, #192] -sqrdmulh v24.4S, v25.4S, v1.s[0] -mul v25.4S, v25.4S,v2.s[0] -sqrdmulh v9.4S, v16.4S, v1.s[0] -mul v16.4S, v16.4S,v2.s[0] -sqrdmulh v8.4S, v3.4S, v1.s[0] -mul v3.4S, v3.4S,v2.s[0] -sqrdmulh v7.4S, v21.4S, v1.s[0] -mul v21.4S, v21.4S,v2.s[0] -mla v25.4S, v24.4S, v31.s[0] -mla v16.4S, v9.4S, v31.s[0] -mla v3.4S, v8.4S, v31.s[0] -mla v21.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v22.4S, v1.s[0] -mul v22.4S, v22.4S,v2.s[0] -sqrdmulh v8.4S, v18.4S, v1.s[0] -mul v18.4S, v18.4S,v2.s[0] -sqrdmulh v9.4S, v29.4S, v1.s[0] -mul v29.4S, v29.4S,v2.s[0] -sqrdmulh v24.4S, v27.4S, v1.s[0] -mul v27.4S, v27.4S,v2.s[0] -mla v22.4S, v7.4S, v31.s[0] -mla v18.4S, v8.4S, v31.s[0] -mla v29.4S, v9.4S, v31.s[0] -mla v27.4S, v24.4S, v31.s[0] -sub v24.4s, v26.4s, v25.4s -add v26.4s, v26.4s, v25.4s -sub v25.4s, v19.4s, v16.4s -add v19.4s, v19.4s, v16.4s -sub v16.4s, v23.4s, v3.4s -add v23.4s, v23.4s, v3.4s -sub v3.4s, v28.4s, v21.4s -add v28.4s, v28.4s, v21.4s -sub v21.4s, v30.4s, v22.4s -add v30.4s, v30.4s, v22.4s -sub v22.4s, v10.4s, v18.4s -add v10.4s, v10.4s, v18.4s -sub v18.4s, v20.4s, v29.4s -add v20.4s, v20.4s, v29.4s -sub v29.4s, v17.4s, v27.4s -add v17.4s, v17.4s, v27.4s -sqrdmulh v27.4S, v23.4S, v1.s[1] -mul v23.4S, v23.4S,v2.s[1] -sqrdmulh v9.4S, v28.4S, v1.s[1] -mul v28.4S, v28.4S,v2.s[1] -sqrdmulh v8.4S, v26.4S, v1.s[1] -mul v26.4S, v26.4S,v2.s[1] -sqrdmulh v7.4S, v19.4S, v1.s[1] -mul v19.4S, v19.4S,v2.s[1] -mla v23.4S, v27.4S, v31.s[0] -mla v28.4S, v9.4S, v31.s[0] -mla v26.4S, v8.4S, v31.s[0] -mla v19.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v16.4S, v1.s[2] -mul v16.4S, v16.4S,v2.s[2] -sqrdmulh v8.4S, v3.4S, v1.s[2] -mul v3.4S, v3.4S,v2.s[2] -sqrdmulh v9.4S, v24.4S, v1.s[2] -mul v24.4S, v24.4S,v2.s[2] -sqrdmulh v27.4S, v25.4S, v1.s[2] -mul v25.4S, v25.4S,v2.s[2] -mla v16.4S, v7.4S, v31.s[0] -mla v3.4S, v8.4S, v31.s[0] -mla v24.4S, v9.4S, v31.s[0] -mla v25.4S, v27.4S, v31.s[0] -sub v27.4s, v20.4s, v23.4s -add v20.4s, v20.4s, v23.4s -sub v23.4s, v17.4s, v28.4s -add v17.4s, v17.4s, v28.4s -sub v28.4s, v30.4s, v26.4s -add v30.4s, v30.4s, v26.4s -sub v26.4s, v10.4s, v19.4s -add v10.4s, v10.4s, v19.4s -sub v19.4s, v18.4s, v16.4s -add v18.4s, v18.4s, v16.4s -sub v16.4s, v29.4s, v3.4s -add v29.4s, v29.4s, v3.4s -sub v3.4s, v21.4s, v24.4s -add v21.4s, v21.4s, v24.4s -sub v24.4s, v22.4s, v25.4s -add v22.4s, v22.4s, v25.4s -sqrdmulh v25.4S, v20.4S, v15.s[0] -mul v20.4S, v20.4S,v0.s[0] -sqrdmulh v9.4S, v17.4S, v15.s[0] -mul v17.4S, v17.4S,v0.s[0] -sqrdmulh v8.4S, v27.4S, v15.s[1] -mul v27.4S, v27.4S,v0.s[1] -sqrdmulh v7.4S, v23.4S, v15.s[1] -mul v23.4S, v23.4S,v0.s[1] -mla v20.4S, v25.4S, v31.s[0] -mla v17.4S, v9.4S, v31.s[0] -mla v27.4S, v8.4S, v31.s[0] -mla v23.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v18.4S, v15.s[2] -mul v18.4S, v18.4S,v0.s[2] -sqrdmulh v8.4S, v29.4S, v15.s[2] -mul v29.4S, v29.4S,v0.s[2] -sqrdmulh v9.4S, v19.4S, v15.s[3] -mul v19.4S, v19.4S,v0.s[3] -sqrdmulh v25.4S, v16.4S, v15.s[3] -mul v16.4S, v16.4S,v0.s[3] -mla v18.4S, v7.4S, v31.s[0] -mla v29.4S, v8.4S, v31.s[0] -mla v19.4S, v9.4S, v31.s[0] -mla v16.4S, v25.4S, v31.s[0] -sub v25.4s, v30.4s, v20.4s -add v30.4s, v30.4s, v20.4s -sub v20.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -sub v17.4s, v28.4s, v27.4s -add v28.4s, v28.4s, v27.4s -sub v27.4s, v26.4s, v23.4s -add v26.4s, v26.4s, v23.4s -sub v23.4s, v21.4s, v18.4s -add v21.4s, v21.4s, v18.4s -sub v18.4s, v22.4s, v29.4s -add v22.4s, v22.4s, v29.4s -sub v29.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sub v19.4s, v24.4s, v16.4s -add v24.4s, v24.4s, v16.4s -sqrdmulh v16.4S, v10.4S, v13.s[0] -mul v10.4S, v10.4S,v14.s[0] -sqrdmulh v9.4S, v20.4S, v13.s[1] -mul v20.4S, v20.4S,v14.s[1] -sqrdmulh v8.4S, v26.4S, v13.s[2] -mul v26.4S, v26.4S,v14.s[2] -sqrdmulh v7.4S, v27.4S, v13.s[3] -mul v27.4S, v27.4S,v14.s[3] -mla v10.4S, v16.4S, v31.s[0] -mla v20.4S, v9.4S, v31.s[0] -mla v26.4S, v8.4S, v31.s[0] -mla v27.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v22.4S, v11.s[0] -mul v22.4S, v22.4S,v12.s[0] -sqrdmulh v8.4S, v18.4S, v11.s[1] -mul v18.4S, v18.4S,v12.s[1] -sqrdmulh v9.4S, v24.4S, v11.s[2] -mul v24.4S, v24.4S,v12.s[2] -sqrdmulh v16.4S, v19.4S, v11.s[3] -mul v19.4S, v19.4S,v12.s[3] -mla v22.4S, v7.4S, v31.s[0] -mla v18.4S, v8.4S, v31.s[0] -mla v24.4S, v9.4S, v31.s[0] -mla v19.4S, v16.4S, v31.s[0] -sub v16.4s, v30.4s, v10.4s -add v30.4s, v30.4s, v10.4s -sub v10.4s, v25.4s, v20.4s -add v25.4s, v25.4s, v20.4s -sub v20.4s, v28.4s, v26.4s -add v28.4s, v28.4s, v26.4s -sub v26.4s, v17.4s, v27.4s -add v17.4s, v17.4s, v27.4s -sub v27.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sub v22.4s, v23.4s, v18.4s -add v23.4s, v23.4s, v18.4s -sub v18.4s, v3.4s, v24.4s -add v3.4s, v3.4s, v24.4s -sub v24.4s, v29.4s, v19.4s -add v29.4s, v29.4s, v19.4s -str q30, [x0, #0] -str q16, [x0, #64] -str q25, [x0, #128] -str q10, [x0, #192] -str q28, [x0, #256] -str q20, [x0, #320] -str q17, [x0, #384] -str q26, [x0, #448] -str q21, [x0, #512] -str q27, [x0, #576] -str q23, [x0, #640] -str q22, [x0, #704] -str q3, [x0, #768] -str q18, [x0, #832] -str q29, [x0, #896] -str q24, [x0, #960] -ldr q24, [x0, #784] -ldr q29, [x0, #848] -ldr q18, [x0, #912] -ldr q3, [x0, #976] -ldr q22, [x0, #272] -ldr q23, [x0, #336] -ldr q27, [x0, #400] -ldr q21, [x0, #464] -ldr q26, [x0, #528] -ldr q17, [x0, #592] -ldr q20, [x0, #656] -ldr q28, [x0, #720] -ldr q10, [x0, #16] -ldr q25, [x0, #80] -ldr q16, [x0, #144] -ldr q30, [x0, #208] -sqrdmulh v19.4S, v24.4S, v1.s[0] -mul v24.4S, v24.4S,v2.s[0] -sqrdmulh v9.4S, v29.4S, v1.s[0] -mul v29.4S, v29.4S,v2.s[0] -sqrdmulh v8.4S, v18.4S, v1.s[0] -mul v18.4S, v18.4S,v2.s[0] -sqrdmulh v7.4S, v3.4S, v1.s[0] -mul v3.4S, v3.4S,v2.s[0] -mla v24.4S, v19.4S, v31.s[0] -mla v29.4S, v9.4S, v31.s[0] -mla v18.4S, v8.4S, v31.s[0] -mla v3.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v26.4S, v1.s[0] -mul v26.4S, v26.4S,v2.s[0] -sqrdmulh v8.4S, v17.4S, v1.s[0] -mul v17.4S, v17.4S,v2.s[0] -sqrdmulh v9.4S, v20.4S, v1.s[0] -mul v20.4S, v20.4S,v2.s[0] -sqrdmulh v19.4S, v28.4S, v1.s[0] -mul v28.4S, v28.4S,v2.s[0] -mla v26.4S, v7.4S, v31.s[0] -mla v17.4S, v8.4S, v31.s[0] -mla v20.4S, v9.4S, v31.s[0] -mla v28.4S, v19.4S, v31.s[0] -sub v19.4s, v22.4s, v24.4s -add v22.4s, v22.4s, v24.4s -sub v24.4s, v23.4s, v29.4s -add v23.4s, v23.4s, v29.4s -sub v29.4s, v27.4s, v18.4s -add v27.4s, v27.4s, v18.4s -sub v18.4s, v21.4s, v3.4s -add v21.4s, v21.4s, v3.4s -sub v3.4s, v10.4s, v26.4s -add v10.4s, v10.4s, v26.4s -sub v26.4s, v25.4s, v17.4s -add v25.4s, v25.4s, v17.4s -sub v17.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -sub v20.4s, v30.4s, v28.4s -add v30.4s, v30.4s, v28.4s -sqrdmulh v28.4S, v27.4S, v1.s[1] -mul v27.4S, v27.4S,v2.s[1] -sqrdmulh v9.4S, v21.4S, v1.s[1] -mul v21.4S, v21.4S,v2.s[1] -sqrdmulh v8.4S, v22.4S, v1.s[1] -mul v22.4S, v22.4S,v2.s[1] -sqrdmulh v7.4S, v23.4S, v1.s[1] -mul v23.4S, v23.4S,v2.s[1] -mla v27.4S, v28.4S, v31.s[0] -mla v21.4S, v9.4S, v31.s[0] -mla v22.4S, v8.4S, v31.s[0] -mla v23.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v29.4S, v1.s[2] -mul v29.4S, v29.4S,v2.s[2] -sqrdmulh v8.4S, v18.4S, v1.s[2] -mul v18.4S, v18.4S,v2.s[2] -sqrdmulh v9.4S, v19.4S, v1.s[2] -mul v19.4S, v19.4S,v2.s[2] -sqrdmulh v28.4S, v24.4S, v1.s[2] -mul v24.4S, v24.4S,v2.s[2] -mla v29.4S, v7.4S, v31.s[0] -mla v18.4S, v8.4S, v31.s[0] -mla v19.4S, v9.4S, v31.s[0] -mla v24.4S, v28.4S, v31.s[0] -sub v28.4s, v16.4s, v27.4s -add v16.4s, v16.4s, v27.4s -sub v27.4s, v30.4s, v21.4s -add v30.4s, v30.4s, v21.4s -sub v21.4s, v10.4s, v22.4s -add v10.4s, v10.4s, v22.4s -sub v22.4s, v25.4s, v23.4s -add v25.4s, v25.4s, v23.4s -sub v23.4s, v17.4s, v29.4s -add v17.4s, v17.4s, v29.4s -sub v29.4s, v20.4s, v18.4s -add v20.4s, v20.4s, v18.4s -sub v18.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sub v19.4s, v26.4s, v24.4s -add v26.4s, v26.4s, v24.4s -sqrdmulh v24.4S, v16.4S, v15.s[0] -mul v16.4S, v16.4S,v0.s[0] -sqrdmulh v9.4S, v30.4S, v15.s[0] -mul v30.4S, v30.4S,v0.s[0] -sqrdmulh v8.4S, v28.4S, v15.s[1] -mul v28.4S, v28.4S,v0.s[1] -sqrdmulh v7.4S, v27.4S, v15.s[1] -mul v27.4S, v27.4S,v0.s[1] -mla v16.4S, v24.4S, v31.s[0] -mla v30.4S, v9.4S, v31.s[0] -mla v28.4S, v8.4S, v31.s[0] -mla v27.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v17.4S, v15.s[2] -mul v17.4S, v17.4S,v0.s[2] -sqrdmulh v8.4S, v20.4S, v15.s[2] -mul v20.4S, v20.4S,v0.s[2] -sqrdmulh v9.4S, v23.4S, v15.s[3] -mul v23.4S, v23.4S,v0.s[3] -sqrdmulh v24.4S, v29.4S, v15.s[3] -mul v29.4S, v29.4S,v0.s[3] -mla v17.4S, v7.4S, v31.s[0] -mla v20.4S, v8.4S, v31.s[0] -mla v23.4S, v9.4S, v31.s[0] -mla v29.4S, v24.4S, v31.s[0] -sub v24.4s, v10.4s, v16.4s -add v10.4s, v10.4s, v16.4s -sub v16.4s, v25.4s, v30.4s -add v25.4s, v25.4s, v30.4s -sub v30.4s, v21.4s, v28.4s -add v21.4s, v21.4s, v28.4s -sub v28.4s, v22.4s, v27.4s -add v22.4s, v22.4s, v27.4s -sub v27.4s, v3.4s, v17.4s -add v3.4s, v3.4s, v17.4s -sub v17.4s, v26.4s, v20.4s -add v26.4s, v26.4s, v20.4s -sub v20.4s, v18.4s, v23.4s -add v18.4s, v18.4s, v23.4s -sub v23.4s, v19.4s, v29.4s -add v19.4s, v19.4s, v29.4s -sqrdmulh v29.4S, v25.4S, v13.s[0] -mul v25.4S, v25.4S,v14.s[0] -sqrdmulh v9.4S, v16.4S, v13.s[1] -mul v16.4S, v16.4S,v14.s[1] -sqrdmulh v8.4S, v22.4S, v13.s[2] -mul v22.4S, v22.4S,v14.s[2] -sqrdmulh v7.4S, v28.4S, v13.s[3] -mul v28.4S, v28.4S,v14.s[3] -mla v25.4S, v29.4S, v31.s[0] -mla v16.4S, v9.4S, v31.s[0] -mla v22.4S, v8.4S, v31.s[0] -mla v28.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v26.4S, v11.s[0] -mul v26.4S, v26.4S,v12.s[0] -sqrdmulh v8.4S, v17.4S, v11.s[1] -mul v17.4S, v17.4S,v12.s[1] -sqrdmulh v9.4S, v19.4S, v11.s[2] -mul v19.4S, v19.4S,v12.s[2] -sqrdmulh v29.4S, v23.4S, v11.s[3] -mul v23.4S, v23.4S,v12.s[3] -mla v26.4S, v7.4S, v31.s[0] -mla v17.4S, v8.4S, v31.s[0] -mla v19.4S, v9.4S, v31.s[0] -mla v23.4S, v29.4S, v31.s[0] -sub v29.4s, v10.4s, v25.4s -add v10.4s, v10.4s, v25.4s -sub v25.4s, v24.4s, v16.4s -add v24.4s, v24.4s, v16.4s -sub v16.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sub v22.4s, v30.4s, v28.4s -add v30.4s, v30.4s, v28.4s -sub v28.4s, v3.4s, v26.4s -add v3.4s, v3.4s, v26.4s -sub v26.4s, v27.4s, v17.4s -add v27.4s, v27.4s, v17.4s -sub v17.4s, v18.4s, v19.4s -add v18.4s, v18.4s, v19.4s -sub v19.4s, v20.4s, v23.4s -add v20.4s, v20.4s, v23.4s -str q10, [x0, #16] -str q29, [x0, #80] -str q24, [x0, #144] -str q25, [x0, #208] -str q21, [x0, #272] -str q16, [x0, #336] -str q30, [x0, #400] -str q22, [x0, #464] -str q3, [x0, #528] -str q28, [x0, #592] -str q27, [x0, #656] -str q26, [x0, #720] -str q18, [x0, #784] -str q17, [x0, #848] -str q20, [x0, #912] -str q19, [x0, #976] -ldr q4, [x17, #+128] -ldr q5, [x17, #+144] -ldr q6, [x17, #+160] -ldr q7, [x17, #+176] -ldr q8, [x17, #+192] -ldr q9, [x17, #+208] -ldr q23, [x17, #+224] -ldr q10, [x17, #+240] -ldr q29, [x0, #32] -ldr q24, [x0, #48] -ldr q25, [x0, #0] -ldr q21, [x0, #16] -sqrdmulh v16.4S, v29.4S, v5.s[0] -mul v29.4S, v29.4S,v4.s[0] -mla v29.4S, v16.4S, v31.s[0] -sub v16.4s, v25.4s, v29.4s -add v25.4s, v25.4s, v29.4s -sqrdmulh v29.4S, v24.4S, v5.s[0] -mul v24.4S, v24.4S,v4.s[0] -mla v24.4S, v29.4S, v31.s[0] -sub v29.4s, v21.4s, v24.4s -add v21.4s, v21.4s, v24.4s -sqrdmulh v24.4S, v21.4S, v5.s[1] -mul v21.4S, v21.4S,v4.s[1] -mla v21.4S, v24.4S, v31.s[0] -sub v24.4s, v25.4s, v21.4s -add v25.4s, v25.4s, v21.4s -sqrdmulh v21.4S, v29.4S, v5.s[2] -mul v29.4S, v29.4S,v4.s[2] -mla v29.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v29.4s -add v16.4s, v16.4s, v29.4s -trn1 v29.4S, v25.4S, v24.4S -trn2 v30.4S, v25.4S, v24.4S -trn1 v22.4S, v16.4S, v21.4S -trn2 v3.4S, v16.4S, v21.4S -trn2 v16.2D, v29.2D, v22.2D -trn2 v21.2D, v30.2D, v3.2D -trn1 v25.2D, v29.2D, v22.2D -trn1 v24.2D, v30.2D, v3.2D -sqrdmulh v3.4S, v16.4S, v7.4S -mul v16.4S, v16.4S,v6.4S -mla v16.4S, v3.4S, v31.s[0] -sub v3.4s, v25.4s, v16.4s -add v25.4s, v25.4s, v16.4s -sqrdmulh v16.4S, v21.4S, v7.4S -mul v21.4S, v21.4S,v6.4S -mla v21.4S, v16.4S, v31.s[0] -sub v16.4s, v24.4s, v21.4s -add v24.4s, v24.4s, v21.4s -sqrdmulh v21.4S, v24.4S, v9.4S -mul v24.4S, v24.4S,v8.4S -mla v24.4S, v21.4S, v31.s[0] -sub v21.4s, v25.4s, v24.4s -add v25.4s, v25.4s, v24.4s -sqrdmulh v24.4S, v16.4S, v10.4S -mul v16.4S, v16.4S,v23.4S -mla v16.4S, v24.4S, v31.s[0] -sub v24.4s, v3.4s, v16.4s -add v3.4s, v3.4s, v16.4s -str q25, [x0, #0] -str q21, [x0, #16] -str q3, [x0, #32] -str q24, [x0, #48] -ldr q24, [x17, #+256] -ldr q3, [x17, #+272] -ldr q21, [x17, #+288] -ldr q25, [x17, #+304] -ldr q16, [x17, #+320] -ldr q30, [x17, #+336] -ldr q22, [x17, #+352] -ldr q29, [x17, #+368] -ldr q10, [x0, #96] -ldr q23, [x0, #112] -ldr q9, [x0, #64] -ldr q8, [x0, #80] -sqrdmulh v7.4S, v10.4S, v3.s[0] -mul v10.4S, v10.4S,v24.s[0] -mla v10.4S, v7.4S, v31.s[0] -sub v7.4s, v9.4s, v10.4s -add v9.4s, v9.4s, v10.4s -sqrdmulh v10.4S, v23.4S, v3.s[0] -mul v23.4S, v23.4S,v24.s[0] -mla v23.4S, v10.4S, v31.s[0] -sub v10.4s, v8.4s, v23.4s -add v8.4s, v8.4s, v23.4s -sqrdmulh v23.4S, v8.4S, v3.s[1] -mul v8.4S, v8.4S,v24.s[1] -mla v8.4S, v23.4S, v31.s[0] -sub v23.4s, v9.4s, v8.4s -add v9.4s, v9.4s, v8.4s -sqrdmulh v8.4S, v10.4S, v3.s[2] -mul v10.4S, v10.4S,v24.s[2] -mla v10.4S, v8.4S, v31.s[0] -sub v8.4s, v7.4s, v10.4s -add v7.4s, v7.4s, v10.4s -trn1 v10.4S, v9.4S, v23.4S -trn2 v6.4S, v9.4S, v23.4S -trn1 v5.4S, v7.4S, v8.4S -trn2 v4.4S, v7.4S, v8.4S -trn2 v7.2D, v10.2D, v5.2D -trn2 v8.2D, v6.2D, v4.2D -trn1 v9.2D, v10.2D, v5.2D -trn1 v23.2D, v6.2D, v4.2D -sqrdmulh v4.4S, v7.4S, v25.4S -mul v7.4S, v7.4S,v21.4S -mla v7.4S, v4.4S, v31.s[0] -sub v4.4s, v9.4s, v7.4s -add v9.4s, v9.4s, v7.4s -sqrdmulh v7.4S, v8.4S, v25.4S -mul v8.4S, v8.4S,v21.4S -mla v8.4S, v7.4S, v31.s[0] -sub v7.4s, v23.4s, v8.4s -add v23.4s, v23.4s, v8.4s -sqrdmulh v8.4S, v23.4S, v30.4S -mul v23.4S, v23.4S,v16.4S -mla v23.4S, v8.4S, v31.s[0] -sub v8.4s, v9.4s, v23.4s -add v9.4s, v9.4s, v23.4s -sqrdmulh v23.4S, v7.4S, v29.4S -mul v7.4S, v7.4S,v22.4S -mla v7.4S, v23.4S, v31.s[0] -sub v23.4s, v4.4s, v7.4s -add v4.4s, v4.4s, v7.4s -str q9, [x0, #64] -str q8, [x0, #80] -str q4, [x0, #96] -str q23, [x0, #112] -ldr q23, [x17, #+384] -ldr q4, [x17, #+400] -ldr q8, [x17, #+416] -ldr q9, [x17, #+432] -ldr q7, [x17, #+448] -ldr q6, [x17, #+464] -ldr q5, [x17, #+480] -ldr q10, [x17, #+496] -ldr q29, [x0, #160] -ldr q22, [x0, #176] -ldr q30, [x0, #128] -ldr q16, [x0, #144] -sqrdmulh v25.4S, v29.4S, v4.s[0] -mul v29.4S, v29.4S,v23.s[0] -mla v29.4S, v25.4S, v31.s[0] -sub v25.4s, v30.4s, v29.4s -add v30.4s, v30.4s, v29.4s -sqrdmulh v29.4S, v22.4S, v4.s[0] -mul v22.4S, v22.4S,v23.s[0] -mla v22.4S, v29.4S, v31.s[0] -sub v29.4s, v16.4s, v22.4s -add v16.4s, v16.4s, v22.4s -sqrdmulh v22.4S, v16.4S, v4.s[1] -mul v16.4S, v16.4S,v23.s[1] -mla v16.4S, v22.4S, v31.s[0] -sub v22.4s, v30.4s, v16.4s -add v30.4s, v30.4s, v16.4s -sqrdmulh v16.4S, v29.4S, v4.s[2] -mul v29.4S, v29.4S,v23.s[2] -mla v29.4S, v16.4S, v31.s[0] -sub v16.4s, v25.4s, v29.4s -add v25.4s, v25.4s, v29.4s -trn1 v29.4S, v30.4S, v22.4S -trn2 v21.4S, v30.4S, v22.4S -trn1 v3.4S, v25.4S, v16.4S -trn2 v24.4S, v25.4S, v16.4S -trn2 v25.2D, v29.2D, v3.2D -trn2 v16.2D, v21.2D, v24.2D -trn1 v30.2D, v29.2D, v3.2D -trn1 v22.2D, v21.2D, v24.2D -sqrdmulh v24.4S, v25.4S, v9.4S -mul v25.4S, v25.4S,v8.4S -mla v25.4S, v24.4S, v31.s[0] -sub v24.4s, v30.4s, v25.4s -add v30.4s, v30.4s, v25.4s -sqrdmulh v25.4S, v16.4S, v9.4S -mul v16.4S, v16.4S,v8.4S -mla v16.4S, v25.4S, v31.s[0] -sub v25.4s, v22.4s, v16.4s -add v22.4s, v22.4s, v16.4s -sqrdmulh v16.4S, v22.4S, v6.4S -mul v22.4S, v22.4S,v7.4S -mla v22.4S, v16.4S, v31.s[0] -sub v16.4s, v30.4s, v22.4s -add v30.4s, v30.4s, v22.4s -sqrdmulh v22.4S, v25.4S, v10.4S -mul v25.4S, v25.4S,v5.4S -mla v25.4S, v22.4S, v31.s[0] -sub v22.4s, v24.4s, v25.4s -add v24.4s, v24.4s, v25.4s -str q30, [x0, #128] -str q16, [x0, #144] -str q24, [x0, #160] -str q22, [x0, #176] -ldr q22, [x17, #+512] -ldr q24, [x17, #+528] -ldr q16, [x17, #+544] -ldr q30, [x17, #+560] -ldr q25, [x17, #+576] -ldr q21, [x17, #+592] -ldr q3, [x17, #+608] -ldr q29, [x17, #+624] -ldr q10, [x0, #224] -ldr q5, [x0, #240] -ldr q6, [x0, #192] -ldr q7, [x0, #208] -sqrdmulh v9.4S, v10.4S, v24.s[0] -mul v10.4S, v10.4S,v22.s[0] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v6.4s, v10.4s -add v6.4s, v6.4s, v10.4s -sqrdmulh v10.4S, v5.4S, v24.s[0] -mul v5.4S, v5.4S,v22.s[0] -mla v5.4S, v10.4S, v31.s[0] -sub v10.4s, v7.4s, v5.4s -add v7.4s, v7.4s, v5.4s -sqrdmulh v5.4S, v7.4S, v24.s[1] -mul v7.4S, v7.4S,v22.s[1] -mla v7.4S, v5.4S, v31.s[0] -sub v5.4s, v6.4s, v7.4s -add v6.4s, v6.4s, v7.4s -sqrdmulh v7.4S, v10.4S, v24.s[2] -mul v10.4S, v10.4S,v22.s[2] -mla v10.4S, v7.4S, v31.s[0] -sub v7.4s, v9.4s, v10.4s -add v9.4s, v9.4s, v10.4s -trn1 v10.4S, v6.4S, v5.4S -trn2 v8.4S, v6.4S, v5.4S -trn1 v4.4S, v9.4S, v7.4S -trn2 v23.4S, v9.4S, v7.4S -trn2 v9.2D, v10.2D, v4.2D -trn2 v7.2D, v8.2D, v23.2D -trn1 v6.2D, v10.2D, v4.2D -trn1 v5.2D, v8.2D, v23.2D -sqrdmulh v23.4S, v9.4S, v30.4S -mul v9.4S, v9.4S,v16.4S -mla v9.4S, v23.4S, v31.s[0] -sub v23.4s, v6.4s, v9.4s -add v6.4s, v6.4s, v9.4s -sqrdmulh v9.4S, v7.4S, v30.4S -mul v7.4S, v7.4S,v16.4S -mla v7.4S, v9.4S, v31.s[0] -sub v9.4s, v5.4s, v7.4s -add v5.4s, v5.4s, v7.4s -sqrdmulh v7.4S, v5.4S, v21.4S -mul v5.4S, v5.4S,v25.4S -mla v5.4S, v7.4S, v31.s[0] -sub v7.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v9.4S, v29.4S -mul v9.4S, v9.4S,v3.4S -mla v9.4S, v5.4S, v31.s[0] -sub v5.4s, v23.4s, v9.4s -add v23.4s, v23.4s, v9.4s -str q6, [x0, #192] -str q7, [x0, #208] -str q23, [x0, #224] -str q5, [x0, #240] -ldr q5, [x17, #+640] -ldr q23, [x17, #+656] -ldr q7, [x17, #+672] -ldr q6, [x17, #+688] -ldr q9, [x17, #+704] -ldr q8, [x17, #+720] -ldr q4, [x17, #+736] -ldr q10, [x17, #+752] -ldr q29, [x0, #288] -ldr q3, [x0, #304] -ldr q21, [x0, #256] -ldr q25, [x0, #272] -sqrdmulh v30.4S, v29.4S, v23.s[0] -mul v29.4S, v29.4S,v5.s[0] -mla v29.4S, v30.4S, v31.s[0] -sub v30.4s, v21.4s, v29.4s -add v21.4s, v21.4s, v29.4s -sqrdmulh v29.4S, v3.4S, v23.s[0] -mul v3.4S, v3.4S,v5.s[0] -mla v3.4S, v29.4S, v31.s[0] -sub v29.4s, v25.4s, v3.4s -add v25.4s, v25.4s, v3.4s -sqrdmulh v3.4S, v25.4S, v23.s[1] -mul v25.4S, v25.4S,v5.s[1] -mla v25.4S, v3.4S, v31.s[0] -sub v3.4s, v21.4s, v25.4s -add v21.4s, v21.4s, v25.4s -sqrdmulh v25.4S, v29.4S, v23.s[2] -mul v29.4S, v29.4S,v5.s[2] -mla v29.4S, v25.4S, v31.s[0] -sub v25.4s, v30.4s, v29.4s -add v30.4s, v30.4s, v29.4s -trn1 v29.4S, v21.4S, v3.4S -trn2 v16.4S, v21.4S, v3.4S -trn1 v24.4S, v30.4S, v25.4S -trn2 v22.4S, v30.4S, v25.4S -trn2 v30.2D, v29.2D, v24.2D -trn2 v25.2D, v16.2D, v22.2D -trn1 v21.2D, v29.2D, v24.2D -trn1 v3.2D, v16.2D, v22.2D -sqrdmulh v22.4S, v30.4S, v6.4S -mul v30.4S, v30.4S,v7.4S -mla v30.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v30.4s -add v21.4s, v21.4s, v30.4s -sqrdmulh v30.4S, v25.4S, v6.4S -mul v25.4S, v25.4S,v7.4S -mla v25.4S, v30.4S, v31.s[0] -sub v30.4s, v3.4s, v25.4s -add v3.4s, v3.4s, v25.4s -sqrdmulh v25.4S, v3.4S, v8.4S -mul v3.4S, v3.4S,v9.4S -mla v3.4S, v25.4S, v31.s[0] -sub v25.4s, v21.4s, v3.4s -add v21.4s, v21.4s, v3.4s -sqrdmulh v3.4S, v30.4S, v10.4S -mul v30.4S, v30.4S,v4.4S -mla v30.4S, v3.4S, v31.s[0] -sub v3.4s, v22.4s, v30.4s -add v22.4s, v22.4s, v30.4s -str q21, [x0, #256] -str q25, [x0, #272] -str q22, [x0, #288] -str q3, [x0, #304] -ldr q3, [x17, #+768] -ldr q22, [x17, #+784] -ldr q25, [x17, #+800] -ldr q21, [x17, #+816] -ldr q30, [x17, #+832] -ldr q16, [x17, #+848] -ldr q24, [x17, #+864] -ldr q29, [x17, #+880] -ldr q10, [x0, #352] -ldr q4, [x0, #368] -ldr q8, [x0, #320] -ldr q9, [x0, #336] -sqrdmulh v6.4S, v10.4S, v22.s[0] -mul v10.4S, v10.4S,v3.s[0] -mla v10.4S, v6.4S, v31.s[0] -sub v6.4s, v8.4s, v10.4s -add v8.4s, v8.4s, v10.4s -sqrdmulh v10.4S, v4.4S, v22.s[0] -mul v4.4S, v4.4S,v3.s[0] -mla v4.4S, v10.4S, v31.s[0] -sub v10.4s, v9.4s, v4.4s -add v9.4s, v9.4s, v4.4s -sqrdmulh v4.4S, v9.4S, v22.s[1] -mul v9.4S, v9.4S,v3.s[1] -mla v9.4S, v4.4S, v31.s[0] -sub v4.4s, v8.4s, v9.4s -add v8.4s, v8.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v22.s[2] -mul v10.4S, v10.4S,v3.s[2] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v6.4s, v10.4s -add v6.4s, v6.4s, v10.4s -trn1 v10.4S, v8.4S, v4.4S -trn2 v7.4S, v8.4S, v4.4S -trn1 v23.4S, v6.4S, v9.4S -trn2 v5.4S, v6.4S, v9.4S -trn2 v6.2D, v10.2D, v23.2D -trn2 v9.2D, v7.2D, v5.2D -trn1 v8.2D, v10.2D, v23.2D -trn1 v4.2D, v7.2D, v5.2D -sqrdmulh v5.4S, v6.4S, v21.4S -mul v6.4S, v6.4S,v25.4S -mla v6.4S, v5.4S, v31.s[0] -sub v5.4s, v8.4s, v6.4s -add v8.4s, v8.4s, v6.4s -sqrdmulh v6.4S, v9.4S, v21.4S -mul v9.4S, v9.4S,v25.4S -mla v9.4S, v6.4S, v31.s[0] -sub v6.4s, v4.4s, v9.4s -add v4.4s, v4.4s, v9.4s -sqrdmulh v9.4S, v4.4S, v16.4S -mul v4.4S, v4.4S,v30.4S -mla v4.4S, v9.4S, v31.s[0] -sub v9.4s, v8.4s, v4.4s -add v8.4s, v8.4s, v4.4s -sqrdmulh v4.4S, v6.4S, v29.4S -mul v6.4S, v6.4S,v24.4S -mla v6.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v6.4s -add v5.4s, v5.4s, v6.4s -str q8, [x0, #320] -str q9, [x0, #336] -str q5, [x0, #352] -str q4, [x0, #368] -ldr q4, [x17, #+896] -ldr q5, [x17, #+912] -ldr q9, [x17, #+928] -ldr q8, [x17, #+944] -ldr q6, [x17, #+960] -ldr q7, [x17, #+976] -ldr q23, [x17, #+992] -ldr q10, [x17, #+1008] -ldr q29, [x0, #416] -ldr q24, [x0, #432] -ldr q16, [x0, #384] -ldr q30, [x0, #400] -sqrdmulh v21.4S, v29.4S, v5.s[0] -mul v29.4S, v29.4S,v4.s[0] -mla v29.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v29.4s -add v16.4s, v16.4s, v29.4s -sqrdmulh v29.4S, v24.4S, v5.s[0] -mul v24.4S, v24.4S,v4.s[0] -mla v24.4S, v29.4S, v31.s[0] -sub v29.4s, v30.4s, v24.4s -add v30.4s, v30.4s, v24.4s -sqrdmulh v24.4S, v30.4S, v5.s[1] -mul v30.4S, v30.4S,v4.s[1] -mla v30.4S, v24.4S, v31.s[0] -sub v24.4s, v16.4s, v30.4s -add v16.4s, v16.4s, v30.4s -sqrdmulh v30.4S, v29.4S, v5.s[2] -mul v29.4S, v29.4S,v4.s[2] -mla v29.4S, v30.4S, v31.s[0] -sub v30.4s, v21.4s, v29.4s -add v21.4s, v21.4s, v29.4s -trn1 v29.4S, v16.4S, v24.4S -trn2 v25.4S, v16.4S, v24.4S -trn1 v22.4S, v21.4S, v30.4S -trn2 v3.4S, v21.4S, v30.4S -trn2 v21.2D, v29.2D, v22.2D -trn2 v30.2D, v25.2D, v3.2D -trn1 v16.2D, v29.2D, v22.2D -trn1 v24.2D, v25.2D, v3.2D -sqrdmulh v3.4S, v21.4S, v8.4S -mul v21.4S, v21.4S,v9.4S -mla v21.4S, v3.4S, v31.s[0] -sub v3.4s, v16.4s, v21.4s -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v30.4S, v8.4S -mul v30.4S, v30.4S,v9.4S -mla v30.4S, v21.4S, v31.s[0] -sub v21.4s, v24.4s, v30.4s -add v24.4s, v24.4s, v30.4s -sqrdmulh v30.4S, v24.4S, v7.4S -mul v24.4S, v24.4S,v6.4S -mla v24.4S, v30.4S, v31.s[0] -sub v30.4s, v16.4s, v24.4s -add v16.4s, v16.4s, v24.4s -sqrdmulh v24.4S, v21.4S, v10.4S -mul v21.4S, v21.4S,v23.4S -mla v21.4S, v24.4S, v31.s[0] -sub v24.4s, v3.4s, v21.4s -add v3.4s, v3.4s, v21.4s -str q16, [x0, #384] -str q30, [x0, #400] -str q3, [x0, #416] -str q24, [x0, #432] -ldr q24, [x17, #+1024] -ldr q3, [x17, #+1040] -ldr q30, [x17, #+1056] -ldr q16, [x17, #+1072] -ldr q21, [x17, #+1088] -ldr q25, [x17, #+1104] -ldr q22, [x17, #+1120] -ldr q29, [x17, #+1136] -ldr q10, [x0, #480] -ldr q23, [x0, #496] -ldr q7, [x0, #448] -ldr q6, [x0, #464] -sqrdmulh v8.4S, v10.4S, v3.s[0] -mul v10.4S, v10.4S,v24.s[0] -mla v10.4S, v8.4S, v31.s[0] -sub v8.4s, v7.4s, v10.4s -add v7.4s, v7.4s, v10.4s -sqrdmulh v10.4S, v23.4S, v3.s[0] -mul v23.4S, v23.4S,v24.s[0] -mla v23.4S, v10.4S, v31.s[0] -sub v10.4s, v6.4s, v23.4s -add v6.4s, v6.4s, v23.4s -sqrdmulh v23.4S, v6.4S, v3.s[1] -mul v6.4S, v6.4S,v24.s[1] -mla v6.4S, v23.4S, v31.s[0] -sub v23.4s, v7.4s, v6.4s -add v7.4s, v7.4s, v6.4s -sqrdmulh v6.4S, v10.4S, v3.s[2] -mul v10.4S, v10.4S,v24.s[2] -mla v10.4S, v6.4S, v31.s[0] -sub v6.4s, v8.4s, v10.4s -add v8.4s, v8.4s, v10.4s -trn1 v10.4S, v7.4S, v23.4S -trn2 v9.4S, v7.4S, v23.4S -trn1 v5.4S, v8.4S, v6.4S -trn2 v4.4S, v8.4S, v6.4S -trn2 v8.2D, v10.2D, v5.2D -trn2 v6.2D, v9.2D, v4.2D -trn1 v7.2D, v10.2D, v5.2D -trn1 v23.2D, v9.2D, v4.2D -sqrdmulh v4.4S, v8.4S, v16.4S -mul v8.4S, v8.4S,v30.4S -mla v8.4S, v4.4S, v31.s[0] -sub v4.4s, v7.4s, v8.4s -add v7.4s, v7.4s, v8.4s -sqrdmulh v8.4S, v6.4S, v16.4S -mul v6.4S, v6.4S,v30.4S -mla v6.4S, v8.4S, v31.s[0] -sub v8.4s, v23.4s, v6.4s -add v23.4s, v23.4s, v6.4s -sqrdmulh v6.4S, v23.4S, v25.4S -mul v23.4S, v23.4S,v21.4S -mla v23.4S, v6.4S, v31.s[0] -sub v6.4s, v7.4s, v23.4s -add v7.4s, v7.4s, v23.4s -sqrdmulh v23.4S, v8.4S, v29.4S -mul v8.4S, v8.4S,v22.4S -mla v8.4S, v23.4S, v31.s[0] -sub v23.4s, v4.4s, v8.4s -add v4.4s, v4.4s, v8.4s -str q7, [x0, #448] -str q6, [x0, #464] -str q4, [x0, #480] -str q23, [x0, #496] -ldr q23, [x17, #+1152] -ldr q4, [x17, #+1168] -ldr q6, [x17, #+1184] -ldr q7, [x17, #+1200] -ldr q8, [x17, #+1216] -ldr q9, [x17, #+1232] -ldr q5, [x17, #+1248] -ldr q10, [x17, #+1264] -ldr q29, [x0, #544] -ldr q22, [x0, #560] -ldr q25, [x0, #512] -ldr q21, [x0, #528] -sqrdmulh v16.4S, v29.4S, v4.s[0] -mul v29.4S, v29.4S,v23.s[0] -mla v29.4S, v16.4S, v31.s[0] -sub v16.4s, v25.4s, v29.4s -add v25.4s, v25.4s, v29.4s -sqrdmulh v29.4S, v22.4S, v4.s[0] -mul v22.4S, v22.4S,v23.s[0] -mla v22.4S, v29.4S, v31.s[0] -sub v29.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v4.s[1] -mul v21.4S, v21.4S,v23.s[1] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v25.4s, v21.4s -add v25.4s, v25.4s, v21.4s -sqrdmulh v21.4S, v29.4S, v4.s[2] -mul v29.4S, v29.4S,v23.s[2] -mla v29.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v29.4s -add v16.4s, v16.4s, v29.4s -trn1 v29.4S, v25.4S, v22.4S -trn2 v30.4S, v25.4S, v22.4S -trn1 v3.4S, v16.4S, v21.4S -trn2 v24.4S, v16.4S, v21.4S -trn2 v16.2D, v29.2D, v3.2D -trn2 v21.2D, v30.2D, v24.2D -trn1 v25.2D, v29.2D, v3.2D -trn1 v22.2D, v30.2D, v24.2D -sqrdmulh v24.4S, v16.4S, v7.4S -mul v16.4S, v16.4S,v6.4S -mla v16.4S, v24.4S, v31.s[0] -sub v24.4s, v25.4s, v16.4s -add v25.4s, v25.4s, v16.4s -sqrdmulh v16.4S, v21.4S, v7.4S -mul v21.4S, v21.4S,v6.4S -mla v21.4S, v16.4S, v31.s[0] -sub v16.4s, v22.4s, v21.4s -add v22.4s, v22.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v9.4S -mul v22.4S, v22.4S,v8.4S -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v25.4s, v22.4s -add v25.4s, v25.4s, v22.4s -sqrdmulh v22.4S, v16.4S, v10.4S -mul v16.4S, v16.4S,v5.4S -mla v16.4S, v22.4S, v31.s[0] -sub v22.4s, v24.4s, v16.4s -add v24.4s, v24.4s, v16.4s -str q25, [x0, #512] -str q21, [x0, #528] -str q24, [x0, #544] -str q22, [x0, #560] -ldr q22, [x17, #+1280] -ldr q24, [x17, #+1296] -ldr q21, [x17, #+1312] -ldr q25, [x17, #+1328] -ldr q16, [x17, #+1344] -ldr q30, [x17, #+1360] -ldr q3, [x17, #+1376] -ldr q29, [x17, #+1392] -ldr q10, [x0, #608] -ldr q5, [x0, #624] -ldr q9, [x0, #576] -ldr q8, [x0, #592] -sqrdmulh v7.4S, v10.4S, v24.s[0] -mul v10.4S, v10.4S,v22.s[0] -mla v10.4S, v7.4S, v31.s[0] -sub v7.4s, v9.4s, v10.4s -add v9.4s, v9.4s, v10.4s -sqrdmulh v10.4S, v5.4S, v24.s[0] -mul v5.4S, v5.4S,v22.s[0] -mla v5.4S, v10.4S, v31.s[0] -sub v10.4s, v8.4s, v5.4s -add v8.4s, v8.4s, v5.4s -sqrdmulh v5.4S, v8.4S, v24.s[1] -mul v8.4S, v8.4S,v22.s[1] -mla v8.4S, v5.4S, v31.s[0] -sub v5.4s, v9.4s, v8.4s -add v9.4s, v9.4s, v8.4s -sqrdmulh v8.4S, v10.4S, v24.s[2] -mul v10.4S, v10.4S,v22.s[2] -mla v10.4S, v8.4S, v31.s[0] -sub v8.4s, v7.4s, v10.4s -add v7.4s, v7.4s, v10.4s -trn1 v10.4S, v9.4S, v5.4S -trn2 v6.4S, v9.4S, v5.4S -trn1 v4.4S, v7.4S, v8.4S -trn2 v23.4S, v7.4S, v8.4S -trn2 v7.2D, v10.2D, v4.2D -trn2 v8.2D, v6.2D, v23.2D -trn1 v9.2D, v10.2D, v4.2D -trn1 v5.2D, v6.2D, v23.2D -sqrdmulh v23.4S, v7.4S, v25.4S -mul v7.4S, v7.4S,v21.4S -mla v7.4S, v23.4S, v31.s[0] -sub v23.4s, v9.4s, v7.4s -add v9.4s, v9.4s, v7.4s -sqrdmulh v7.4S, v8.4S, v25.4S -mul v8.4S, v8.4S,v21.4S -mla v8.4S, v7.4S, v31.s[0] -sub v7.4s, v5.4s, v8.4s -add v5.4s, v5.4s, v8.4s -sqrdmulh v8.4S, v5.4S, v30.4S -mul v5.4S, v5.4S,v16.4S -mla v5.4S, v8.4S, v31.s[0] -sub v8.4s, v9.4s, v5.4s -add v9.4s, v9.4s, v5.4s -sqrdmulh v5.4S, v7.4S, v29.4S -mul v7.4S, v7.4S,v3.4S -mla v7.4S, v5.4S, v31.s[0] -sub v5.4s, v23.4s, v7.4s -add v23.4s, v23.4s, v7.4s -str q9, [x0, #576] -str q8, [x0, #592] -str q23, [x0, #608] -str q5, [x0, #624] -ldr q5, [x17, #+1408] -ldr q23, [x17, #+1424] -ldr q8, [x17, #+1440] -ldr q9, [x17, #+1456] -ldr q7, [x17, #+1472] -ldr q6, [x17, #+1488] -ldr q4, [x17, #+1504] -ldr q10, [x17, #+1520] -ldr q29, [x0, #672] -ldr q3, [x0, #688] -ldr q30, [x0, #640] -ldr q16, [x0, #656] -sqrdmulh v25.4S, v29.4S, v23.s[0] -mul v29.4S, v29.4S,v5.s[0] -mla v29.4S, v25.4S, v31.s[0] -sub v25.4s, v30.4s, v29.4s -add v30.4s, v30.4s, v29.4s -sqrdmulh v29.4S, v3.4S, v23.s[0] -mul v3.4S, v3.4S,v5.s[0] -mla v3.4S, v29.4S, v31.s[0] -sub v29.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v16.4S, v23.s[1] -mul v16.4S, v16.4S,v5.s[1] -mla v16.4S, v3.4S, v31.s[0] -sub v3.4s, v30.4s, v16.4s -add v30.4s, v30.4s, v16.4s -sqrdmulh v16.4S, v29.4S, v23.s[2] -mul v29.4S, v29.4S,v5.s[2] -mla v29.4S, v16.4S, v31.s[0] -sub v16.4s, v25.4s, v29.4s -add v25.4s, v25.4s, v29.4s -trn1 v29.4S, v30.4S, v3.4S -trn2 v21.4S, v30.4S, v3.4S -trn1 v24.4S, v25.4S, v16.4S -trn2 v22.4S, v25.4S, v16.4S -trn2 v25.2D, v29.2D, v24.2D -trn2 v16.2D, v21.2D, v22.2D -trn1 v30.2D, v29.2D, v24.2D -trn1 v3.2D, v21.2D, v22.2D -sqrdmulh v22.4S, v25.4S, v9.4S -mul v25.4S, v25.4S,v8.4S -mla v25.4S, v22.4S, v31.s[0] -sub v22.4s, v30.4s, v25.4s -add v30.4s, v30.4s, v25.4s -sqrdmulh v25.4S, v16.4S, v9.4S -mul v16.4S, v16.4S,v8.4S -mla v16.4S, v25.4S, v31.s[0] -sub v25.4s, v3.4s, v16.4s -add v3.4s, v3.4s, v16.4s -sqrdmulh v16.4S, v3.4S, v6.4S -mul v3.4S, v3.4S,v7.4S -mla v3.4S, v16.4S, v31.s[0] -sub v16.4s, v30.4s, v3.4s -add v30.4s, v30.4s, v3.4s -sqrdmulh v3.4S, v25.4S, v10.4S -mul v25.4S, v25.4S,v4.4S -mla v25.4S, v3.4S, v31.s[0] -sub v3.4s, v22.4s, v25.4s -add v22.4s, v22.4s, v25.4s -str q30, [x0, #640] -str q16, [x0, #656] -str q22, [x0, #672] -str q3, [x0, #688] -ldr q3, [x17, #+1536] -ldr q22, [x17, #+1552] -ldr q16, [x17, #+1568] -ldr q30, [x17, #+1584] -ldr q25, [x17, #+1600] -ldr q21, [x17, #+1616] -ldr q24, [x17, #+1632] -ldr q29, [x17, #+1648] -ldr q10, [x0, #736] -ldr q4, [x0, #752] -ldr q6, [x0, #704] -ldr q7, [x0, #720] -sqrdmulh v9.4S, v10.4S, v22.s[0] -mul v10.4S, v10.4S,v3.s[0] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v6.4s, v10.4s -add v6.4s, v6.4s, v10.4s -sqrdmulh v10.4S, v4.4S, v22.s[0] -mul v4.4S, v4.4S,v3.s[0] -mla v4.4S, v10.4S, v31.s[0] -sub v10.4s, v7.4s, v4.4s -add v7.4s, v7.4s, v4.4s -sqrdmulh v4.4S, v7.4S, v22.s[1] -mul v7.4S, v7.4S,v3.s[1] -mla v7.4S, v4.4S, v31.s[0] -sub v4.4s, v6.4s, v7.4s -add v6.4s, v6.4s, v7.4s -sqrdmulh v7.4S, v10.4S, v22.s[2] -mul v10.4S, v10.4S,v3.s[2] -mla v10.4S, v7.4S, v31.s[0] -sub v7.4s, v9.4s, v10.4s -add v9.4s, v9.4s, v10.4s -trn1 v10.4S, v6.4S, v4.4S -trn2 v8.4S, v6.4S, v4.4S -trn1 v23.4S, v9.4S, v7.4S -trn2 v5.4S, v9.4S, v7.4S -trn2 v9.2D, v10.2D, v23.2D -trn2 v7.2D, v8.2D, v5.2D -trn1 v6.2D, v10.2D, v23.2D -trn1 v4.2D, v8.2D, v5.2D -sqrdmulh v5.4S, v9.4S, v30.4S -mul v9.4S, v9.4S,v16.4S -mla v9.4S, v5.4S, v31.s[0] -sub v5.4s, v6.4s, v9.4s -add v6.4s, v6.4s, v9.4s -sqrdmulh v9.4S, v7.4S, v30.4S -mul v7.4S, v7.4S,v16.4S -mla v7.4S, v9.4S, v31.s[0] -sub v9.4s, v4.4s, v7.4s -add v4.4s, v4.4s, v7.4s -sqrdmulh v7.4S, v4.4S, v21.4S -mul v4.4S, v4.4S,v25.4S -mla v4.4S, v7.4S, v31.s[0] -sub v7.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -sqrdmulh v4.4S, v9.4S, v29.4S -mul v9.4S, v9.4S,v24.4S -mla v9.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v9.4s -add v5.4s, v5.4s, v9.4s -str q6, [x0, #704] -str q7, [x0, #720] -str q5, [x0, #736] -str q4, [x0, #752] -ldr q4, [x17, #+1664] -ldr q5, [x17, #+1680] -ldr q7, [x17, #+1696] -ldr q6, [x17, #+1712] -ldr q9, [x17, #+1728] -ldr q8, [x17, #+1744] -ldr q23, [x17, #+1760] -ldr q10, [x17, #+1776] -ldr q29, [x0, #800] -ldr q24, [x0, #816] -ldr q21, [x0, #768] -ldr q25, [x0, #784] -sqrdmulh v30.4S, v29.4S, v5.s[0] -mul v29.4S, v29.4S,v4.s[0] -mla v29.4S, v30.4S, v31.s[0] -sub v30.4s, v21.4s, v29.4s -add v21.4s, v21.4s, v29.4s -sqrdmulh v29.4S, v24.4S, v5.s[0] -mul v24.4S, v24.4S,v4.s[0] -mla v24.4S, v29.4S, v31.s[0] -sub v29.4s, v25.4s, v24.4s -add v25.4s, v25.4s, v24.4s -sqrdmulh v24.4S, v25.4S, v5.s[1] -mul v25.4S, v25.4S,v4.s[1] -mla v25.4S, v24.4S, v31.s[0] -sub v24.4s, v21.4s, v25.4s -add v21.4s, v21.4s, v25.4s -sqrdmulh v25.4S, v29.4S, v5.s[2] -mul v29.4S, v29.4S,v4.s[2] -mla v29.4S, v25.4S, v31.s[0] -sub v25.4s, v30.4s, v29.4s -add v30.4s, v30.4s, v29.4s -trn1 v29.4S, v21.4S, v24.4S -trn2 v16.4S, v21.4S, v24.4S -trn1 v22.4S, v30.4S, v25.4S -trn2 v3.4S, v30.4S, v25.4S -trn2 v30.2D, v29.2D, v22.2D -trn2 v25.2D, v16.2D, v3.2D -trn1 v21.2D, v29.2D, v22.2D -trn1 v24.2D, v16.2D, v3.2D -sqrdmulh v3.4S, v30.4S, v6.4S -mul v30.4S, v30.4S,v7.4S -mla v30.4S, v3.4S, v31.s[0] -sub v3.4s, v21.4s, v30.4s -add v21.4s, v21.4s, v30.4s -sqrdmulh v30.4S, v25.4S, v6.4S -mul v25.4S, v25.4S,v7.4S -mla v25.4S, v30.4S, v31.s[0] -sub v30.4s, v24.4s, v25.4s -add v24.4s, v24.4s, v25.4s -sqrdmulh v25.4S, v24.4S, v8.4S -mul v24.4S, v24.4S,v9.4S -mla v24.4S, v25.4S, v31.s[0] -sub v25.4s, v21.4s, v24.4s -add v21.4s, v21.4s, v24.4s -sqrdmulh v24.4S, v30.4S, v10.4S -mul v30.4S, v30.4S,v23.4S -mla v30.4S, v24.4S, v31.s[0] -sub v24.4s, v3.4s, v30.4s -add v3.4s, v3.4s, v30.4s -str q21, [x0, #768] -str q25, [x0, #784] -str q3, [x0, #800] -str q24, [x0, #816] -ldr q24, [x17, #+1792] -ldr q3, [x17, #+1808] -ldr q25, [x17, #+1824] -ldr q21, [x17, #+1840] -ldr q30, [x17, #+1856] -ldr q16, [x17, #+1872] -ldr q22, [x17, #+1888] -ldr q29, [x17, #+1904] -ldr q10, [x0, #864] -ldr q23, [x0, #880] -ldr q8, [x0, #832] -ldr q9, [x0, #848] -sqrdmulh v6.4S, v10.4S, v3.s[0] -mul v10.4S, v10.4S,v24.s[0] -mla v10.4S, v6.4S, v31.s[0] -sub v6.4s, v8.4s, v10.4s -add v8.4s, v8.4s, v10.4s -sqrdmulh v10.4S, v23.4S, v3.s[0] -mul v23.4S, v23.4S,v24.s[0] -mla v23.4S, v10.4S, v31.s[0] -sub v10.4s, v9.4s, v23.4s -add v9.4s, v9.4s, v23.4s -sqrdmulh v23.4S, v9.4S, v3.s[1] -mul v9.4S, v9.4S,v24.s[1] -mla v9.4S, v23.4S, v31.s[0] -sub v23.4s, v8.4s, v9.4s -add v8.4s, v8.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v3.s[2] -mul v10.4S, v10.4S,v24.s[2] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v6.4s, v10.4s -add v6.4s, v6.4s, v10.4s -trn1 v10.4S, v8.4S, v23.4S -trn2 v7.4S, v8.4S, v23.4S -trn1 v5.4S, v6.4S, v9.4S -trn2 v4.4S, v6.4S, v9.4S -trn2 v6.2D, v10.2D, v5.2D -trn2 v9.2D, v7.2D, v4.2D -trn1 v8.2D, v10.2D, v5.2D -trn1 v23.2D, v7.2D, v4.2D -sqrdmulh v4.4S, v6.4S, v21.4S -mul v6.4S, v6.4S,v25.4S -mla v6.4S, v4.4S, v31.s[0] -sub v4.4s, v8.4s, v6.4s -add v8.4s, v8.4s, v6.4s -sqrdmulh v6.4S, v9.4S, v21.4S -mul v9.4S, v9.4S,v25.4S -mla v9.4S, v6.4S, v31.s[0] -sub v6.4s, v23.4s, v9.4s -add v23.4s, v23.4s, v9.4s -sqrdmulh v9.4S, v23.4S, v16.4S -mul v23.4S, v23.4S,v30.4S -mla v23.4S, v9.4S, v31.s[0] -sub v9.4s, v8.4s, v23.4s -add v8.4s, v8.4s, v23.4s -sqrdmulh v23.4S, v6.4S, v29.4S -mul v6.4S, v6.4S,v22.4S -mla v6.4S, v23.4S, v31.s[0] -sub v23.4s, v4.4s, v6.4s -add v4.4s, v4.4s, v6.4s -str q8, [x0, #832] -str q9, [x0, #848] -str q4, [x0, #864] -str q23, [x0, #880] -ldr q23, [x17, #+1920] -ldr q4, [x17, #+1936] -ldr q9, [x17, #+1952] -ldr q8, [x17, #+1968] -ldr q6, [x17, #+1984] -ldr q7, [x17, #+2000] -ldr q5, [x17, #+2016] -ldr q10, [x17, #+2032] -ldr q29, [x0, #928] -ldr q22, [x0, #944] -ldr q16, [x0, #896] -ldr q30, [x0, #912] -sqrdmulh v21.4S, v29.4S, v4.s[0] -mul v29.4S, v29.4S,v23.s[0] -mla v29.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v29.4s -add v16.4s, v16.4s, v29.4s -sqrdmulh v29.4S, v22.4S, v4.s[0] -mul v22.4S, v22.4S,v23.s[0] -mla v22.4S, v29.4S, v31.s[0] -sub v29.4s, v30.4s, v22.4s -add v30.4s, v30.4s, v22.4s -sqrdmulh v22.4S, v30.4S, v4.s[1] -mul v30.4S, v30.4S,v23.s[1] -mla v30.4S, v22.4S, v31.s[0] -sub v22.4s, v16.4s, v30.4s -add v16.4s, v16.4s, v30.4s -sqrdmulh v30.4S, v29.4S, v4.s[2] -mul v29.4S, v29.4S,v23.s[2] -mla v29.4S, v30.4S, v31.s[0] -sub v30.4s, v21.4s, v29.4s -add v21.4s, v21.4s, v29.4s -trn1 v29.4S, v16.4S, v22.4S -trn2 v25.4S, v16.4S, v22.4S -trn1 v3.4S, v21.4S, v30.4S -trn2 v24.4S, v21.4S, v30.4S -trn2 v21.2D, v29.2D, v3.2D -trn2 v30.2D, v25.2D, v24.2D -trn1 v16.2D, v29.2D, v3.2D -trn1 v22.2D, v25.2D, v24.2D -sqrdmulh v24.4S, v21.4S, v8.4S -mul v21.4S, v21.4S,v9.4S -mla v21.4S, v24.4S, v31.s[0] -sub v24.4s, v16.4s, v21.4s -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v30.4S, v8.4S -mul v30.4S, v30.4S,v9.4S -mla v30.4S, v21.4S, v31.s[0] -sub v21.4s, v22.4s, v30.4s -add v22.4s, v22.4s, v30.4s -sqrdmulh v30.4S, v22.4S, v7.4S -mul v22.4S, v22.4S,v6.4S -mla v22.4S, v30.4S, v31.s[0] -sub v30.4s, v16.4s, v22.4s -add v16.4s, v16.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v10.4S -mul v21.4S, v21.4S,v5.4S -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v24.4s, v21.4s -add v24.4s, v24.4s, v21.4s -str q16, [x0, #896] -str q30, [x0, #912] -str q24, [x0, #928] -str q22, [x0, #944] -ldr q22, [x17, #+2048] -ldr q24, [x17, #+2064] -ldr q30, [x17, #+2080] -ldr q16, [x17, #+2096] -ldr q21, [x17, #+2112] -ldr q25, [x17, #+2128] -ldr q3, [x17, #+2144] -ldr q29, [x17, #+2160] -ldr q10, [x0, #992] -ldr q5, [x0, #1008] -ldr q7, [x0, #960] -ldr q6, [x0, #976] -sqrdmulh v8.4S, v10.4S, v24.s[0] -mul v10.4S, v10.4S,v22.s[0] -mla v10.4S, v8.4S, v31.s[0] -sub v8.4s, v7.4s, v10.4s -add v7.4s, v7.4s, v10.4s -sqrdmulh v10.4S, v5.4S, v24.s[0] -mul v5.4S, v5.4S,v22.s[0] -mla v5.4S, v10.4S, v31.s[0] -sub v10.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v6.4S, v24.s[1] -mul v6.4S, v6.4S,v22.s[1] -mla v6.4S, v5.4S, v31.s[0] -sub v5.4s, v7.4s, v6.4s -add v7.4s, v7.4s, v6.4s -sqrdmulh v6.4S, v10.4S, v24.s[2] -mul v10.4S, v10.4S,v22.s[2] -mla v10.4S, v6.4S, v31.s[0] -sub v6.4s, v8.4s, v10.4s -add v8.4s, v8.4s, v10.4s -trn1 v10.4S, v7.4S, v5.4S -trn2 v9.4S, v7.4S, v5.4S -trn1 v4.4S, v8.4S, v6.4S -trn2 v23.4S, v8.4S, v6.4S -trn2 v8.2D, v10.2D, v4.2D -trn2 v6.2D, v9.2D, v23.2D -trn1 v7.2D, v10.2D, v4.2D -trn1 v5.2D, v9.2D, v23.2D -sqrdmulh v23.4S, v8.4S, v16.4S -mul v8.4S, v8.4S,v30.4S -mla v8.4S, v23.4S, v31.s[0] -sub v23.4s, v7.4s, v8.4s -add v7.4s, v7.4s, v8.4s -sqrdmulh v8.4S, v6.4S, v16.4S -mul v6.4S, v6.4S,v30.4S -mla v6.4S, v8.4S, v31.s[0] -sub v8.4s, v5.4s, v6.4s -add v5.4s, v5.4s, v6.4s -sqrdmulh v6.4S, v5.4S, v25.4S -mul v5.4S, v5.4S,v21.4S -mla v5.4S, v6.4S, v31.s[0] -sub v6.4s, v7.4s, v5.4s -add v7.4s, v7.4s, v5.4s -sqrdmulh v5.4S, v8.4S, v29.4S -mul v8.4S, v8.4S,v3.4S -mla v8.4S, v5.4S, v31.s[0] -sub v5.4s, v23.4s, v8.4s -add v23.4s, v23.4s, v8.4s -str q7, [x0, #960] -str q6, [x0, #976] -str q23, [x0, #992] -str q5, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2392 -// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_6_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_6_0.s deleted file mode 100644 index b0b458f..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_6_0.s +++ /dev/null @@ -1,2422 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_6_0 -.global _ntt_u32_full_neon_asm_var_4_4_6_0 -ntt_u32_full_neon_asm_var_4_4_6_0: -_ntt_u32_full_neon_asm_var_4_4_6_0: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x0, #800] -ldr q29, [x0, #864] -ldr q28, [x0, #928] -ldr q27, [x0, #992] -ldr q26, [x0, #288] -ldr q25, [x0, #352] -ldr q24, [x0, #416] -ldr q23, [x0, #480] -ldr q22, [x17, #+0] -ldr q21, [x17, #+16] -ldr q20, [x17, #+32] -ldr q19, [x17, #+48] -ldr q18, [x17, #+64] -ldr q17, [x17, #+80] -ldr q16, [x17, #+96] -ldr q3, [x17, #+112] -sqrdmulh v2.4S, v30.4S, v21.s[0] -ldr q1, [x0, #544] -ldr q0, [x0, #608] -mul v30.4S, v30.4S,v22.s[0] -ldr q15, [x0, #672] -ldr q14, [x0, #736] -sqrdmulh v13.4S, v29.4S, v21.s[0] -ldr q12, [x0, #32] -mul v29.4S, v29.4S,v22.s[0] -ldr q11, [x0, #96] -sqrdmulh v10.4S, v28.4S, v21.s[0] -ldr q9, [x0, #160] -mul v28.4S, v28.4S,v22.s[0] -ldr q8, [x0, #224] -sqrdmulh v7.4S, v27.4S, v21.s[0] -mul v27.4S, v27.4S,v22.s[0] -mla v30.4S, v2.4S, v31.s[0] -mla v29.4S, v13.4S, v31.s[0] -mla v28.4S, v10.4S, v31.s[0] -mla v27.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v1.4S, v21.s[0] -mul v1.4S, v1.4S,v22.s[0] -sub v10.4s, v26.4s, v30.4s -add v26.4s, v26.4s, v30.4s -sqrdmulh v30.4S, v0.4S, v21.s[0] -mul v0.4S, v0.4S,v22.s[0] -sub v13.4s, v25.4s, v29.4s -add v25.4s, v25.4s, v29.4s -sqrdmulh v29.4S, v15.4S, v21.s[0] -mul v15.4S, v15.4S,v22.s[0] -sub v2.4s, v24.4s, v28.4s -add v24.4s, v24.4s, v28.4s -sqrdmulh v28.4S, v14.4S, v21.s[0] -mul v14.4S, v14.4S,v22.s[0] -mla v1.4S, v7.4S, v31.s[0] -sub v7.4s, v23.4s, v27.4s -mla v0.4S, v30.4S, v31.s[0] -add v23.4s, v23.4s, v27.4s -mla v15.4S, v29.4S, v31.s[0] -mla v14.4S, v28.4S, v31.s[0] -sqrdmulh v28.4S, v24.4S, v21.s[1] -mul v24.4S, v24.4S,v22.s[1] -sub v29.4s, v12.4s, v1.4s -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v23.4S, v21.s[1] -mul v23.4S, v23.4S,v22.s[1] -sub v27.4s, v11.4s, v0.4s -add v11.4s, v11.4s, v0.4s -sqrdmulh v0.4S, v26.4S, v21.s[1] -mul v26.4S, v26.4S,v22.s[1] -sub v30.4s, v9.4s, v15.4s -add v9.4s, v9.4s, v15.4s -sqrdmulh v15.4S, v25.4S, v21.s[1] -mul v25.4S, v25.4S,v22.s[1] -mla v24.4S, v28.4S, v31.s[0] -sub v28.4s, v8.4s, v14.4s -add v8.4s, v8.4s, v14.4s -mla v23.4S, v1.4S, v31.s[0] -mla v26.4S, v0.4S, v31.s[0] -mla v25.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v2.4S, v21.s[2] -mul v2.4S, v2.4S,v22.s[2] -sub v0.4s, v9.4s, v24.4s -add v9.4s, v9.4s, v24.4s -sqrdmulh v24.4S, v7.4S, v21.s[2] -mul v7.4S, v7.4S,v22.s[2] -sub v1.4s, v8.4s, v23.4s -add v8.4s, v8.4s, v23.4s -sqrdmulh v23.4S, v10.4S, v21.s[2] -mul v10.4S, v10.4S,v22.s[2] -sub v14.4s, v12.4s, v26.4s -add v12.4s, v12.4s, v26.4s -sqrdmulh v26.4S, v13.4S, v21.s[2] -mul v13.4S, v13.4S,v22.s[2] -mla v2.4S, v15.4S, v31.s[0] -sub v15.4s, v11.4s, v25.4s -mla v7.4S, v24.4S, v31.s[0] -add v11.4s, v11.4s, v25.4s -mla v10.4S, v23.4S, v31.s[0] -mla v13.4S, v26.4S, v31.s[0] -sqrdmulh v26.4S, v9.4S, v19.s[0] -mul v9.4S, v9.4S,v20.s[0] -sub v23.4s, v30.4s, v2.4s -add v30.4s, v30.4s, v2.4s -sqrdmulh v2.4S, v8.4S, v19.s[0] -mul v8.4S, v8.4S,v20.s[0] -sub v25.4s, v28.4s, v7.4s -add v28.4s, v28.4s, v7.4s -sqrdmulh v7.4S, v0.4S, v19.s[1] -mul v0.4S, v0.4S,v20.s[1] -sub v24.4s, v29.4s, v10.4s -add v29.4s, v29.4s, v10.4s -sqrdmulh v10.4S, v1.4S, v19.s[1] -mul v1.4S, v1.4S,v20.s[1] -mla v9.4S, v26.4S, v31.s[0] -sub v26.4s, v27.4s, v13.4s -add v27.4s, v27.4s, v13.4s -mla v8.4S, v2.4S, v31.s[0] -mla v0.4S, v7.4S, v31.s[0] -mla v1.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v30.4S, v19.s[2] -mul v30.4S, v30.4S,v20.s[2] -sub v7.4s, v12.4s, v9.4s -add v12.4s, v12.4s, v9.4s -sqrdmulh v9.4S, v28.4S, v19.s[2] -mul v28.4S, v28.4S,v20.s[2] -sub v2.4s, v11.4s, v8.4s -add v11.4s, v11.4s, v8.4s -sqrdmulh v8.4S, v23.4S, v19.s[3] -mul v23.4S, v23.4S,v20.s[3] -sub v13.4s, v14.4s, v0.4s -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v25.4S, v19.s[3] -mul v25.4S, v25.4S,v20.s[3] -mla v30.4S, v10.4S, v31.s[0] -sub v10.4s, v15.4s, v1.4s -mla v28.4S, v9.4S, v31.s[0] -add v15.4s, v15.4s, v1.4s -mla v23.4S, v8.4S, v31.s[0] -mla v25.4S, v0.4S, v31.s[0] -sqrdmulh v0.4S, v11.4S, v17.s[0] -mul v11.4S, v11.4S,v18.s[0] -sub v8.4s, v29.4s, v30.4s -add v29.4s, v29.4s, v30.4s -sqrdmulh v30.4S, v2.4S, v17.s[1] -mul v2.4S, v2.4S,v18.s[1] -sub v1.4s, v27.4s, v28.4s -add v27.4s, v27.4s, v28.4s -sqrdmulh v28.4S, v15.4S, v17.s[2] -mul v15.4S, v15.4S,v18.s[2] -sub v9.4s, v24.4s, v23.4s -add v24.4s, v24.4s, v23.4s -sqrdmulh v23.4S, v10.4S, v17.s[3] -mul v10.4S, v10.4S,v18.s[3] -mla v11.4S, v0.4S, v31.s[0] -sub v0.4s, v26.4s, v25.4s -add v26.4s, v26.4s, v25.4s -mla v2.4S, v30.4S, v31.s[0] -mla v15.4S, v28.4S, v31.s[0] -mla v10.4S, v23.4S, v31.s[0] -sqrdmulh v23.4S, v27.4S, v3.s[0] -mul v27.4S, v27.4S,v16.s[0] -sub v28.4s, v12.4s, v11.4s -add v12.4s, v12.4s, v11.4s -str q12, [x0, #32] -str q28, [x0, #96] -sqrdmulh v28.4S, v1.4S, v3.s[1] -mul v1.4S, v1.4S,v16.s[1] -ldr q12, [x0, #816] -ldr q11, [x0, #880] -sub v30.4s, v7.4s, v2.4s -add v7.4s, v7.4s, v2.4s -str q7, [x0, #160] -str q30, [x0, #224] -sqrdmulh v30.4S, v26.4S, v3.s[2] -mul v26.4S, v26.4S,v16.s[2] -ldr q7, [x0, #944] -ldr q2, [x0, #1008] -sub v25.4s, v14.4s, v15.4s -add v14.4s, v14.4s, v15.4s -str q14, [x0, #288] -str q25, [x0, #352] -sqrdmulh v25.4S, v0.4S, v3.s[3] -mul v0.4S, v0.4S,v16.s[3] -ldr q14, [x0, #304] -ldr q15, [x0, #368] -mla v27.4S, v23.4S, v31.s[0] -sub v23.4s, v13.4s, v10.4s -mla v1.4S, v28.4S, v31.s[0] -add v13.4s, v13.4s, v10.4s -str q13, [x0, #416] -str q23, [x0, #480] -mla v26.4S, v30.4S, v31.s[0] -ldr q30, [x0, #432] -ldr q23, [x0, #496] -mla v0.4S, v25.4S, v31.s[0] -sub v25.4s, v29.4s, v27.4s -add v29.4s, v29.4s, v27.4s -sub v27.4s, v8.4s, v1.4s -add v8.4s, v8.4s, v1.4s -sub v1.4s, v24.4s, v26.4s -add v24.4s, v24.4s, v26.4s -str q29, [x0, #544] -str q25, [x0, #608] -str q8, [x0, #672] -str q27, [x0, #736] -str q24, [x0, #800] -str q1, [x0, #864] -sqrdmulh v1.4S, v12.4S, v21.s[0] -ldr q24, [x0, #560] -ldr q27, [x0, #624] -mul v12.4S, v12.4S,v22.s[0] -sub v8.4s, v9.4s, v0.4s -add v9.4s, v9.4s, v0.4s -str q9, [x0, #928] -str q8, [x0, #992] -ldr q8, [x0, #688] -ldr q9, [x0, #752] -sqrdmulh v0.4S, v11.4S, v21.s[0] -ldr q25, [x0, #48] -mul v11.4S, v11.4S,v22.s[0] -ldr q29, [x0, #112] -sqrdmulh v26.4S, v7.4S, v21.s[0] -ldr q13, [x0, #176] -mul v7.4S, v7.4S,v22.s[0] -ldr q10, [x0, #240] -sqrdmulh v28.4S, v2.4S, v21.s[0] -mul v2.4S, v2.4S,v22.s[0] -mla v12.4S, v1.4S, v31.s[0] -mla v11.4S, v0.4S, v31.s[0] -mla v7.4S, v26.4S, v31.s[0] -mla v2.4S, v28.4S, v31.s[0] -sqrdmulh v28.4S, v24.4S, v21.s[0] -mul v24.4S, v24.4S,v22.s[0] -sub v26.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -sqrdmulh v12.4S, v27.4S, v21.s[0] -mul v27.4S, v27.4S,v22.s[0] -sub v0.4s, v15.4s, v11.4s -add v15.4s, v15.4s, v11.4s -sqrdmulh v11.4S, v8.4S, v21.s[0] -mul v8.4S, v8.4S,v22.s[0] -sub v1.4s, v30.4s, v7.4s -add v30.4s, v30.4s, v7.4s -sqrdmulh v7.4S, v9.4S, v21.s[0] -mul v9.4S, v9.4S,v22.s[0] -mla v24.4S, v28.4S, v31.s[0] -sub v28.4s, v23.4s, v2.4s -mla v27.4S, v12.4S, v31.s[0] -add v23.4s, v23.4s, v2.4s -mla v8.4S, v11.4S, v31.s[0] -mla v9.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v30.4S, v21.s[1] -mul v30.4S, v30.4S,v22.s[1] -sub v11.4s, v25.4s, v24.4s -add v25.4s, v25.4s, v24.4s -sqrdmulh v24.4S, v23.4S, v21.s[1] -mul v23.4S, v23.4S,v22.s[1] -sub v2.4s, v29.4s, v27.4s -add v29.4s, v29.4s, v27.4s -sqrdmulh v27.4S, v14.4S, v21.s[1] -mul v14.4S, v14.4S,v22.s[1] -sub v12.4s, v13.4s, v8.4s -add v13.4s, v13.4s, v8.4s -sqrdmulh v8.4S, v15.4S, v21.s[1] -mul v15.4S, v15.4S,v22.s[1] -mla v30.4S, v7.4S, v31.s[0] -sub v7.4s, v10.4s, v9.4s -add v10.4s, v10.4s, v9.4s -mla v23.4S, v24.4S, v31.s[0] -mla v14.4S, v27.4S, v31.s[0] -mla v15.4S, v8.4S, v31.s[0] -sqrdmulh v8.4S, v1.4S, v21.s[2] -mul v1.4S, v1.4S,v22.s[2] -sub v27.4s, v13.4s, v30.4s -add v13.4s, v13.4s, v30.4s -sqrdmulh v30.4S, v28.4S, v21.s[2] -mul v28.4S, v28.4S,v22.s[2] -sub v24.4s, v10.4s, v23.4s -add v10.4s, v10.4s, v23.4s -sqrdmulh v23.4S, v26.4S, v21.s[2] -mul v26.4S, v26.4S,v22.s[2] -sub v9.4s, v25.4s, v14.4s -add v25.4s, v25.4s, v14.4s -sqrdmulh v14.4S, v0.4S, v21.s[2] -mul v0.4S, v0.4S,v22.s[2] -mla v1.4S, v8.4S, v31.s[0] -sub v8.4s, v29.4s, v15.4s -mla v28.4S, v30.4S, v31.s[0] -add v29.4s, v29.4s, v15.4s -mla v26.4S, v23.4S, v31.s[0] -mla v0.4S, v14.4S, v31.s[0] -sqrdmulh v14.4S, v13.4S, v19.s[0] -mul v13.4S, v13.4S,v20.s[0] -sub v23.4s, v12.4s, v1.4s -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v10.4S, v19.s[0] -mul v10.4S, v10.4S,v20.s[0] -sub v15.4s, v7.4s, v28.4s -add v7.4s, v7.4s, v28.4s -sqrdmulh v28.4S, v27.4S, v19.s[1] -mul v27.4S, v27.4S,v20.s[1] -sub v30.4s, v11.4s, v26.4s -add v11.4s, v11.4s, v26.4s -sqrdmulh v26.4S, v24.4S, v19.s[1] -mul v24.4S, v24.4S,v20.s[1] -mla v13.4S, v14.4S, v31.s[0] -sub v14.4s, v2.4s, v0.4s -add v2.4s, v2.4s, v0.4s -mla v10.4S, v1.4S, v31.s[0] -mla v27.4S, v28.4S, v31.s[0] -mla v24.4S, v26.4S, v31.s[0] -sqrdmulh v26.4S, v12.4S, v19.s[2] -mul v12.4S, v12.4S,v20.s[2] -sub v28.4s, v25.4s, v13.4s -add v25.4s, v25.4s, v13.4s -sqrdmulh v13.4S, v7.4S, v19.s[2] -mul v7.4S, v7.4S,v20.s[2] -sub v1.4s, v29.4s, v10.4s -add v29.4s, v29.4s, v10.4s -sqrdmulh v10.4S, v23.4S, v19.s[3] -mul v23.4S, v23.4S,v20.s[3] -sub v0.4s, v9.4s, v27.4s -add v9.4s, v9.4s, v27.4s -sqrdmulh v27.4S, v15.4S, v19.s[3] -mul v15.4S, v15.4S,v20.s[3] -mla v12.4S, v26.4S, v31.s[0] -sub v26.4s, v8.4s, v24.4s -mla v7.4S, v13.4S, v31.s[0] -add v8.4s, v8.4s, v24.4s -mla v23.4S, v10.4S, v31.s[0] -mla v15.4S, v27.4S, v31.s[0] -sqrdmulh v27.4S, v29.4S, v17.s[0] -mul v29.4S, v29.4S,v18.s[0] -sub v10.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -sqrdmulh v12.4S, v1.4S, v17.s[1] -mul v1.4S, v1.4S,v18.s[1] -sub v24.4s, v2.4s, v7.4s -add v2.4s, v2.4s, v7.4s -sqrdmulh v7.4S, v8.4S, v17.s[2] -mul v8.4S, v8.4S,v18.s[2] -sub v13.4s, v30.4s, v23.4s -add v30.4s, v30.4s, v23.4s -sqrdmulh v23.4S, v26.4S, v17.s[3] -mul v26.4S, v26.4S,v18.s[3] -mla v29.4S, v27.4S, v31.s[0] -sub v27.4s, v14.4s, v15.4s -add v14.4s, v14.4s, v15.4s -mla v1.4S, v12.4S, v31.s[0] -mla v8.4S, v7.4S, v31.s[0] -mla v26.4S, v23.4S, v31.s[0] -sqrdmulh v23.4S, v2.4S, v3.s[0] -mul v2.4S, v2.4S,v16.s[0] -sub v7.4s, v25.4s, v29.4s -add v25.4s, v25.4s, v29.4s -str q25, [x0, #48] -str q7, [x0, #112] -sqrdmulh v7.4S, v24.4S, v3.s[1] -mul v24.4S, v24.4S,v16.s[1] -ldr q25, [x0, #768] -ldr q29, [x0, #832] -sub v12.4s, v28.4s, v1.4s -add v28.4s, v28.4s, v1.4s -str q28, [x0, #176] -str q12, [x0, #240] -sqrdmulh v12.4S, v14.4S, v3.s[2] -mul v14.4S, v14.4S,v16.s[2] -ldr q28, [x0, #896] -ldr q1, [x0, #960] -sub v15.4s, v9.4s, v8.4s -add v9.4s, v9.4s, v8.4s -str q9, [x0, #304] -str q15, [x0, #368] -sqrdmulh v15.4S, v27.4S, v3.s[3] -mul v27.4S, v27.4S,v16.s[3] -ldr q9, [x0, #256] -ldr q8, [x0, #320] -mla v2.4S, v23.4S, v31.s[0] -sub v23.4s, v0.4s, v26.4s -mla v24.4S, v7.4S, v31.s[0] -add v0.4s, v0.4s, v26.4s -str q0, [x0, #432] -str q23, [x0, #496] -mla v14.4S, v12.4S, v31.s[0] -ldr q12, [x0, #384] -ldr q23, [x0, #448] -mla v27.4S, v15.4S, v31.s[0] -sub v15.4s, v11.4s, v2.4s -add v11.4s, v11.4s, v2.4s -sub v2.4s, v10.4s, v24.4s -add v10.4s, v10.4s, v24.4s -sub v24.4s, v30.4s, v14.4s -add v30.4s, v30.4s, v14.4s -str q11, [x0, #560] -str q15, [x0, #624] -str q10, [x0, #688] -str q2, [x0, #752] -str q30, [x0, #816] -str q24, [x0, #880] -sqrdmulh v24.4S, v25.4S, v21.s[0] -ldr q30, [x0, #512] -ldr q2, [x0, #576] -mul v25.4S, v25.4S,v22.s[0] -sub v10.4s, v13.4s, v27.4s -add v13.4s, v13.4s, v27.4s -str q13, [x0, #944] -str q10, [x0, #1008] -ldr q10, [x0, #640] -ldr q13, [x0, #704] -sqrdmulh v27.4S, v29.4S, v21.s[0] -ldr q15, [x0, #0] -mul v29.4S, v29.4S,v22.s[0] -ldr q11, [x0, #64] -sqrdmulh v14.4S, v28.4S, v21.s[0] -ldr q0, [x0, #128] -mul v28.4S, v28.4S,v22.s[0] -ldr q26, [x0, #192] -sqrdmulh v7.4S, v1.4S, v21.s[0] -mul v1.4S, v1.4S,v22.s[0] -mla v25.4S, v24.4S, v31.s[0] -mla v29.4S, v27.4S, v31.s[0] -mla v28.4S, v14.4S, v31.s[0] -mla v1.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v30.4S, v21.s[0] -mul v30.4S, v30.4S,v22.s[0] -sub v14.4s, v9.4s, v25.4s -add v9.4s, v9.4s, v25.4s -sqrdmulh v25.4S, v2.4S, v21.s[0] -mul v2.4S, v2.4S,v22.s[0] -sub v27.4s, v8.4s, v29.4s -add v8.4s, v8.4s, v29.4s -sqrdmulh v29.4S, v10.4S, v21.s[0] -mul v10.4S, v10.4S,v22.s[0] -sub v24.4s, v12.4s, v28.4s -add v12.4s, v12.4s, v28.4s -sqrdmulh v28.4S, v13.4S, v21.s[0] -mul v13.4S, v13.4S,v22.s[0] -mla v30.4S, v7.4S, v31.s[0] -sub v7.4s, v23.4s, v1.4s -mla v2.4S, v25.4S, v31.s[0] -add v23.4s, v23.4s, v1.4s -mla v10.4S, v29.4S, v31.s[0] -mla v13.4S, v28.4S, v31.s[0] -sqrdmulh v28.4S, v12.4S, v21.s[1] -mul v12.4S, v12.4S,v22.s[1] -sub v29.4s, v15.4s, v30.4s -add v15.4s, v15.4s, v30.4s -sqrdmulh v30.4S, v23.4S, v21.s[1] -mul v23.4S, v23.4S,v22.s[1] -sub v1.4s, v11.4s, v2.4s -add v11.4s, v11.4s, v2.4s -sqrdmulh v2.4S, v9.4S, v21.s[1] -mul v9.4S, v9.4S,v22.s[1] -sub v25.4s, v0.4s, v10.4s -add v0.4s, v0.4s, v10.4s -sqrdmulh v10.4S, v8.4S, v21.s[1] -mul v8.4S, v8.4S,v22.s[1] -mla v12.4S, v28.4S, v31.s[0] -sub v28.4s, v26.4s, v13.4s -add v26.4s, v26.4s, v13.4s -mla v23.4S, v30.4S, v31.s[0] -mla v9.4S, v2.4S, v31.s[0] -mla v8.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v24.4S, v21.s[2] -mul v24.4S, v24.4S,v22.s[2] -sub v2.4s, v0.4s, v12.4s -add v0.4s, v0.4s, v12.4s -sqrdmulh v12.4S, v7.4S, v21.s[2] -mul v7.4S, v7.4S,v22.s[2] -sub v30.4s, v26.4s, v23.4s -add v26.4s, v26.4s, v23.4s -sqrdmulh v23.4S, v14.4S, v21.s[2] -mul v14.4S, v14.4S,v22.s[2] -sub v13.4s, v15.4s, v9.4s -add v15.4s, v15.4s, v9.4s -sqrdmulh v9.4S, v27.4S, v21.s[2] -mul v27.4S, v27.4S,v22.s[2] -mla v24.4S, v10.4S, v31.s[0] -sub v10.4s, v11.4s, v8.4s -mla v7.4S, v12.4S, v31.s[0] -add v11.4s, v11.4s, v8.4s -mla v14.4S, v23.4S, v31.s[0] -mla v27.4S, v9.4S, v31.s[0] -sqrdmulh v9.4S, v0.4S, v19.s[0] -mul v0.4S, v0.4S,v20.s[0] -sub v23.4s, v25.4s, v24.4s -add v25.4s, v25.4s, v24.4s -sqrdmulh v24.4S, v26.4S, v19.s[0] -mul v26.4S, v26.4S,v20.s[0] -sub v8.4s, v28.4s, v7.4s -add v28.4s, v28.4s, v7.4s -sqrdmulh v7.4S, v2.4S, v19.s[1] -mul v2.4S, v2.4S,v20.s[1] -sub v12.4s, v29.4s, v14.4s -add v29.4s, v29.4s, v14.4s -sqrdmulh v14.4S, v30.4S, v19.s[1] -mul v30.4S, v30.4S,v20.s[1] -mla v0.4S, v9.4S, v31.s[0] -sub v9.4s, v1.4s, v27.4s -add v1.4s, v1.4s, v27.4s -mla v26.4S, v24.4S, v31.s[0] -mla v2.4S, v7.4S, v31.s[0] -mla v30.4S, v14.4S, v31.s[0] -sqrdmulh v14.4S, v25.4S, v19.s[2] -mul v25.4S, v25.4S,v20.s[2] -sub v7.4s, v15.4s, v0.4s -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v28.4S, v19.s[2] -mul v28.4S, v28.4S,v20.s[2] -sub v24.4s, v11.4s, v26.4s -add v11.4s, v11.4s, v26.4s -sqrdmulh v26.4S, v23.4S, v19.s[3] -mul v23.4S, v23.4S,v20.s[3] -sub v27.4s, v13.4s, v2.4s -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v8.4S, v19.s[3] -mul v8.4S, v8.4S,v20.s[3] -mla v25.4S, v14.4S, v31.s[0] -sub v14.4s, v10.4s, v30.4s -mla v28.4S, v0.4S, v31.s[0] -add v10.4s, v10.4s, v30.4s -mla v23.4S, v26.4S, v31.s[0] -mla v8.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v11.4S, v17.s[0] -mul v11.4S, v11.4S,v18.s[0] -sub v26.4s, v29.4s, v25.4s -add v29.4s, v29.4s, v25.4s -sqrdmulh v25.4S, v24.4S, v17.s[1] -mul v24.4S, v24.4S,v18.s[1] -sub v30.4s, v1.4s, v28.4s -add v1.4s, v1.4s, v28.4s -sqrdmulh v28.4S, v10.4S, v17.s[2] -mul v10.4S, v10.4S,v18.s[2] -sub v0.4s, v12.4s, v23.4s -add v12.4s, v12.4s, v23.4s -sqrdmulh v23.4S, v14.4S, v17.s[3] -mul v14.4S, v14.4S,v18.s[3] -mla v11.4S, v2.4S, v31.s[0] -sub v2.4s, v9.4s, v8.4s -add v9.4s, v9.4s, v8.4s -mla v24.4S, v25.4S, v31.s[0] -mla v10.4S, v28.4S, v31.s[0] -mla v14.4S, v23.4S, v31.s[0] -sqrdmulh v23.4S, v1.4S, v3.s[0] -mul v1.4S, v1.4S,v16.s[0] -sub v28.4s, v15.4s, v11.4s -add v15.4s, v15.4s, v11.4s -str q15, [x0, #0] -str q28, [x0, #64] -sqrdmulh v28.4S, v30.4S, v3.s[1] -mul v30.4S, v30.4S,v16.s[1] -ldr q15, [x0, #784] -ldr q11, [x0, #848] -sub v25.4s, v7.4s, v24.4s -add v7.4s, v7.4s, v24.4s -str q7, [x0, #128] -str q25, [x0, #192] -sqrdmulh v25.4S, v9.4S, v3.s[2] -mul v9.4S, v9.4S,v16.s[2] -ldr q7, [x0, #912] -ldr q24, [x0, #976] -sub v8.4s, v13.4s, v10.4s -add v13.4s, v13.4s, v10.4s -str q13, [x0, #256] -str q8, [x0, #320] -sqrdmulh v8.4S, v2.4S, v3.s[3] -mul v2.4S, v2.4S,v16.s[3] -ldr q13, [x0, #272] -ldr q10, [x0, #336] -mla v1.4S, v23.4S, v31.s[0] -sub v23.4s, v27.4s, v14.4s -mla v30.4S, v28.4S, v31.s[0] -add v27.4s, v27.4s, v14.4s -str q27, [x0, #384] -str q23, [x0, #448] -mla v9.4S, v25.4S, v31.s[0] -ldr q25, [x0, #400] -ldr q23, [x0, #464] -mla v2.4S, v8.4S, v31.s[0] -sub v8.4s, v29.4s, v1.4s -add v29.4s, v29.4s, v1.4s -sub v1.4s, v26.4s, v30.4s -add v26.4s, v26.4s, v30.4s -sub v30.4s, v12.4s, v9.4s -add v12.4s, v12.4s, v9.4s -str q29, [x0, #512] -str q8, [x0, #576] -str q26, [x0, #640] -str q1, [x0, #704] -str q12, [x0, #768] -str q30, [x0, #832] -sqrdmulh v30.4S, v15.4S, v21.s[0] -ldr q12, [x0, #528] -ldr q1, [x0, #592] -mul v15.4S, v15.4S,v22.s[0] -sub v26.4s, v0.4s, v2.4s -add v0.4s, v0.4s, v2.4s -str q0, [x0, #896] -str q26, [x0, #960] -ldr q26, [x0, #656] -ldr q0, [x0, #720] -sqrdmulh v2.4S, v11.4S, v21.s[0] -ldr q8, [x0, #16] -mul v11.4S, v11.4S,v22.s[0] -ldr q29, [x0, #80] -sqrdmulh v9.4S, v7.4S, v21.s[0] -ldr q27, [x0, #144] -mul v7.4S, v7.4S,v22.s[0] -ldr q14, [x0, #208] -sqrdmulh v28.4S, v24.4S, v21.s[0] -mul v24.4S, v24.4S,v22.s[0] -mla v15.4S, v30.4S, v31.s[0] -mla v11.4S, v2.4S, v31.s[0] -mla v7.4S, v9.4S, v31.s[0] -mla v24.4S, v28.4S, v31.s[0] -sqrdmulh v28.4S, v12.4S, v21.s[0] -mul v12.4S, v12.4S,v22.s[0] -sub v9.4s, v13.4s, v15.4s -add v13.4s, v13.4s, v15.4s -sqrdmulh v15.4S, v1.4S, v21.s[0] -mul v1.4S, v1.4S,v22.s[0] -sub v2.4s, v10.4s, v11.4s -add v10.4s, v10.4s, v11.4s -sqrdmulh v11.4S, v26.4S, v21.s[0] -mul v26.4S, v26.4S,v22.s[0] -sub v30.4s, v25.4s, v7.4s -add v25.4s, v25.4s, v7.4s -sqrdmulh v7.4S, v0.4S, v21.s[0] -mul v0.4S, v0.4S,v22.s[0] -mla v12.4S, v28.4S, v31.s[0] -sub v28.4s, v23.4s, v24.4s -mla v1.4S, v15.4S, v31.s[0] -add v23.4s, v23.4s, v24.4s -mla v26.4S, v11.4S, v31.s[0] -mla v0.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v25.4S, v21.s[1] -mul v25.4S, v25.4S,v22.s[1] -sub v11.4s, v8.4s, v12.4s -add v8.4s, v8.4s, v12.4s -sqrdmulh v12.4S, v23.4S, v21.s[1] -mul v23.4S, v23.4S,v22.s[1] -sub v24.4s, v29.4s, v1.4s -add v29.4s, v29.4s, v1.4s -sqrdmulh v1.4S, v13.4S, v21.s[1] -mul v13.4S, v13.4S,v22.s[1] -sub v15.4s, v27.4s, v26.4s -add v27.4s, v27.4s, v26.4s -sqrdmulh v26.4S, v10.4S, v21.s[1] -mul v10.4S, v10.4S,v22.s[1] -mla v25.4S, v7.4S, v31.s[0] -sub v7.4s, v14.4s, v0.4s -add v14.4s, v14.4s, v0.4s -mla v23.4S, v12.4S, v31.s[0] -mla v13.4S, v1.4S, v31.s[0] -mla v10.4S, v26.4S, v31.s[0] -sqrdmulh v26.4S, v30.4S, v21.s[2] -mul v30.4S, v30.4S,v22.s[2] -sub v1.4s, v27.4s, v25.4s -add v27.4s, v27.4s, v25.4s -sqrdmulh v25.4S, v28.4S, v21.s[2] -mul v28.4S, v28.4S,v22.s[2] -sub v12.4s, v14.4s, v23.4s -add v14.4s, v14.4s, v23.4s -sqrdmulh v23.4S, v9.4S, v21.s[2] -mul v9.4S, v9.4S,v22.s[2] -sub v0.4s, v8.4s, v13.4s -add v8.4s, v8.4s, v13.4s -sqrdmulh v13.4S, v2.4S, v21.s[2] -mul v2.4S, v2.4S,v22.s[2] -mla v30.4S, v26.4S, v31.s[0] -sub v26.4s, v29.4s, v10.4s -mla v28.4S, v25.4S, v31.s[0] -add v29.4s, v29.4s, v10.4s -mla v9.4S, v23.4S, v31.s[0] -mla v2.4S, v13.4S, v31.s[0] -sqrdmulh v13.4S, v27.4S, v19.s[0] -mul v27.4S, v27.4S,v20.s[0] -sub v23.4s, v15.4s, v30.4s -add v15.4s, v15.4s, v30.4s -sqrdmulh v30.4S, v14.4S, v19.s[0] -mul v14.4S, v14.4S,v20.s[0] -sub v10.4s, v7.4s, v28.4s -add v7.4s, v7.4s, v28.4s -sqrdmulh v28.4S, v1.4S, v19.s[1] -mul v1.4S, v1.4S,v20.s[1] -sub v25.4s, v11.4s, v9.4s -add v11.4s, v11.4s, v9.4s -sqrdmulh v9.4S, v12.4S, v19.s[1] -mul v12.4S, v12.4S,v20.s[1] -mla v27.4S, v13.4S, v31.s[0] -sub v13.4s, v24.4s, v2.4s -add v24.4s, v24.4s, v2.4s -mla v14.4S, v30.4S, v31.s[0] -mla v1.4S, v28.4S, v31.s[0] -mla v12.4S, v9.4S, v31.s[0] -sqrdmulh v9.4S, v15.4S, v19.s[2] -mul v15.4S, v15.4S,v20.s[2] -sub v28.4s, v8.4s, v27.4s -add v8.4s, v8.4s, v27.4s -sqrdmulh v27.4S, v7.4S, v19.s[2] -mul v7.4S, v7.4S,v20.s[2] -sub v30.4s, v29.4s, v14.4s -add v29.4s, v29.4s, v14.4s -sqrdmulh v14.4S, v23.4S, v19.s[3] -mul v23.4S, v23.4S,v20.s[3] -sub v2.4s, v0.4s, v1.4s -add v0.4s, v0.4s, v1.4s -sqrdmulh v1.4S, v10.4S, v19.s[3] -mul v10.4S, v10.4S,v20.s[3] -mla v15.4S, v9.4S, v31.s[0] -sub v9.4s, v26.4s, v12.4s -mla v7.4S, v27.4S, v31.s[0] -add v26.4s, v26.4s, v12.4s -mla v23.4S, v14.4S, v31.s[0] -mla v10.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v29.4S, v17.s[0] -mul v29.4S, v29.4S,v18.s[0] -sub v14.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v30.4S, v17.s[1] -mul v30.4S, v30.4S,v18.s[1] -sub v12.4s, v24.4s, v7.4s -add v24.4s, v24.4s, v7.4s -sqrdmulh v7.4S, v26.4S, v17.s[2] -mul v26.4S, v26.4S,v18.s[2] -sub v27.4s, v25.4s, v23.4s -add v25.4s, v25.4s, v23.4s -sqrdmulh v23.4S, v9.4S, v17.s[3] -mul v9.4S, v9.4S,v18.s[3] -mla v29.4S, v1.4S, v31.s[0] -sub v1.4s, v13.4s, v10.4s -add v13.4s, v13.4s, v10.4s -mla v30.4S, v15.4S, v31.s[0] -mla v26.4S, v7.4S, v31.s[0] -mla v9.4S, v23.4S, v31.s[0] -sqrdmulh v23.4S, v24.4S, v3.s[0] -mul v24.4S, v24.4S,v16.s[0] -sub v7.4s, v8.4s, v29.4s -add v8.4s, v8.4s, v29.4s -str q8, [x0, #16] -str q7, [x0, #80] -sqrdmulh v7.4S, v12.4S, v3.s[1] -mul v12.4S, v12.4S,v16.s[1] -sub v8.4s, v28.4s, v30.4s -add v28.4s, v28.4s, v30.4s -str q28, [x0, #144] -str q8, [x0, #208] -sqrdmulh v8.4S, v13.4S, v3.s[2] -mul v13.4S, v13.4S,v16.s[2] -sub v28.4s, v0.4s, v26.4s -add v0.4s, v0.4s, v26.4s -str q0, [x0, #272] -str q28, [x0, #336] -sqrdmulh v28.4S, v1.4S, v3.s[3] -mul v1.4S, v1.4S,v16.s[3] -mla v24.4S, v23.4S, v31.s[0] -sub v23.4s, v2.4s, v9.4s -mla v12.4S, v7.4S, v31.s[0] -add v2.4s, v2.4s, v9.4s -str q2, [x0, #400] -str q23, [x0, #464] -mla v13.4S, v8.4S, v31.s[0] -mla v1.4S, v28.4S, v31.s[0] -sub v28.4s, v11.4s, v24.4s -add v11.4s, v11.4s, v24.4s -sub v24.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -sub v12.4s, v25.4s, v13.4s -add v25.4s, v25.4s, v13.4s -str q11, [x0, #528] -str q28, [x0, #592] -str q14, [x0, #656] -str q24, [x0, #720] -str q25, [x0, #784] -str q12, [x0, #848] -sub v3.4s, v27.4s, v1.4s -add v27.4s, v27.4s, v1.4s -str q27, [x0, #912] -str q3, [x0, #976] -ldr q4, [x17, #+128] -ldr q5, [x17, #+144] -ldr q6, [x17, #+160] -ldr q10, [x17, #+176] -ldr q15, [x17, #+192] -ldr q29, [x17, #+208] -ldr q30, [x17, #+224] -ldr q26, [x17, #+240] -ldr q0, [x0, #32] -ldr q7, [x0, #48] -ldr q9, [x0, #0] -ldr q2, [x0, #16] -sqrdmulh v23.4S, v0.4S, v5.s[0] -mul v0.4S, v0.4S,v4.s[0] -mla v0.4S, v23.4S, v31.s[0] -sub v23.4s, v9.4s, v0.4s -add v9.4s, v9.4s, v0.4s -sqrdmulh v0.4S, v7.4S, v5.s[0] -mul v7.4S, v7.4S,v4.s[0] -mla v7.4S, v0.4S, v31.s[0] -sub v0.4s, v2.4s, v7.4s -add v2.4s, v2.4s, v7.4s -sqrdmulh v7.4S, v2.4S, v5.s[1] -mul v2.4S, v2.4S,v4.s[1] -mla v2.4S, v7.4S, v31.s[0] -sub v7.4s, v9.4s, v2.4s -add v9.4s, v9.4s, v2.4s -sqrdmulh v2.4S, v0.4S, v5.s[2] -mul v0.4S, v0.4S,v4.s[2] -mla v0.4S, v2.4S, v31.s[0] -sub v2.4s, v23.4s, v0.4s -add v23.4s, v23.4s, v0.4s -trn1 v0.4S, v9.4S, v7.4S -trn2 v8.4S, v9.4S, v7.4S -trn1 v13.4S, v23.4S, v2.4S -trn2 v11.4S, v23.4S, v2.4S -trn2 v23.2D, v0.2D, v13.2D -trn2 v2.2D, v8.2D, v11.2D -trn1 v9.2D, v0.2D, v13.2D -trn1 v7.2D, v8.2D, v11.2D -sqrdmulh v11.4S, v23.4S, v10.4S -mul v23.4S, v23.4S,v6.4S -mla v23.4S, v11.4S, v31.s[0] -sub v11.4s, v9.4s, v23.4s -add v9.4s, v9.4s, v23.4s -sqrdmulh v23.4S, v2.4S, v10.4S -mul v2.4S, v2.4S,v6.4S -mla v2.4S, v23.4S, v31.s[0] -sub v23.4s, v7.4s, v2.4s -add v7.4s, v7.4s, v2.4s -sqrdmulh v2.4S, v7.4S, v29.4S -mul v7.4S, v7.4S,v15.4S -mla v7.4S, v2.4S, v31.s[0] -sub v2.4s, v9.4s, v7.4s -add v9.4s, v9.4s, v7.4s -sqrdmulh v7.4S, v23.4S, v26.4S -mul v23.4S, v23.4S,v30.4S -mla v23.4S, v7.4S, v31.s[0] -sub v7.4s, v11.4s, v23.4s -add v11.4s, v11.4s, v23.4s -str q9, [x0, #0] -str q2, [x0, #16] -str q11, [x0, #32] -str q7, [x0, #48] -ldr q7, [x17, #+256] -ldr q11, [x17, #+272] -ldr q2, [x17, #+288] -ldr q9, [x17, #+304] -ldr q23, [x17, #+320] -ldr q8, [x17, #+336] -ldr q13, [x17, #+352] -ldr q0, [x17, #+368] -ldr q26, [x0, #96] -ldr q30, [x0, #112] -ldr q29, [x0, #64] -ldr q15, [x0, #80] -sqrdmulh v10.4S, v26.4S, v11.s[0] -mul v26.4S, v26.4S,v7.s[0] -mla v26.4S, v10.4S, v31.s[0] -sub v10.4s, v29.4s, v26.4s -add v29.4s, v29.4s, v26.4s -sqrdmulh v26.4S, v30.4S, v11.s[0] -mul v30.4S, v30.4S,v7.s[0] -mla v30.4S, v26.4S, v31.s[0] -sub v26.4s, v15.4s, v30.4s -add v15.4s, v15.4s, v30.4s -sqrdmulh v30.4S, v15.4S, v11.s[1] -mul v15.4S, v15.4S,v7.s[1] -mla v15.4S, v30.4S, v31.s[0] -sub v30.4s, v29.4s, v15.4s -add v29.4s, v29.4s, v15.4s -sqrdmulh v15.4S, v26.4S, v11.s[2] -mul v26.4S, v26.4S,v7.s[2] -mla v26.4S, v15.4S, v31.s[0] -sub v15.4s, v10.4s, v26.4s -add v10.4s, v10.4s, v26.4s -trn1 v26.4S, v29.4S, v30.4S -trn2 v6.4S, v29.4S, v30.4S -trn1 v5.4S, v10.4S, v15.4S -trn2 v4.4S, v10.4S, v15.4S -trn2 v10.2D, v26.2D, v5.2D -trn2 v15.2D, v6.2D, v4.2D -trn1 v29.2D, v26.2D, v5.2D -trn1 v30.2D, v6.2D, v4.2D -sqrdmulh v4.4S, v10.4S, v9.4S -mul v10.4S, v10.4S,v2.4S -mla v10.4S, v4.4S, v31.s[0] -sub v4.4s, v29.4s, v10.4s -add v29.4s, v29.4s, v10.4s -sqrdmulh v10.4S, v15.4S, v9.4S -mul v15.4S, v15.4S,v2.4S -mla v15.4S, v10.4S, v31.s[0] -sub v10.4s, v30.4s, v15.4s -add v30.4s, v30.4s, v15.4s -sqrdmulh v15.4S, v30.4S, v8.4S -mul v30.4S, v30.4S,v23.4S -mla v30.4S, v15.4S, v31.s[0] -sub v15.4s, v29.4s, v30.4s -add v29.4s, v29.4s, v30.4s -sqrdmulh v30.4S, v10.4S, v0.4S -mul v10.4S, v10.4S,v13.4S -mla v10.4S, v30.4S, v31.s[0] -sub v30.4s, v4.4s, v10.4s -add v4.4s, v4.4s, v10.4s -str q29, [x0, #64] -str q15, [x0, #80] -str q4, [x0, #96] -str q30, [x0, #112] -ldr q30, [x17, #+384] -ldr q4, [x17, #+400] -ldr q15, [x17, #+416] -ldr q29, [x17, #+432] -ldr q10, [x17, #+448] -ldr q6, [x17, #+464] -ldr q5, [x17, #+480] -ldr q26, [x17, #+496] -ldr q0, [x0, #160] -ldr q13, [x0, #176] -ldr q8, [x0, #128] -ldr q23, [x0, #144] -sqrdmulh v9.4S, v0.4S, v4.s[0] -mul v0.4S, v0.4S,v30.s[0] -mla v0.4S, v9.4S, v31.s[0] -sub v9.4s, v8.4s, v0.4s -add v8.4s, v8.4s, v0.4s -sqrdmulh v0.4S, v13.4S, v4.s[0] -mul v13.4S, v13.4S,v30.s[0] -mla v13.4S, v0.4S, v31.s[0] -sub v0.4s, v23.4s, v13.4s -add v23.4s, v23.4s, v13.4s -sqrdmulh v13.4S, v23.4S, v4.s[1] -mul v23.4S, v23.4S,v30.s[1] -mla v23.4S, v13.4S, v31.s[0] -sub v13.4s, v8.4s, v23.4s -add v8.4s, v8.4s, v23.4s -sqrdmulh v23.4S, v0.4S, v4.s[2] -mul v0.4S, v0.4S,v30.s[2] -mla v0.4S, v23.4S, v31.s[0] -sub v23.4s, v9.4s, v0.4s -add v9.4s, v9.4s, v0.4s -trn1 v0.4S, v8.4S, v13.4S -trn2 v2.4S, v8.4S, v13.4S -trn1 v11.4S, v9.4S, v23.4S -trn2 v7.4S, v9.4S, v23.4S -trn2 v9.2D, v0.2D, v11.2D -trn2 v23.2D, v2.2D, v7.2D -trn1 v8.2D, v0.2D, v11.2D -trn1 v13.2D, v2.2D, v7.2D -sqrdmulh v7.4S, v9.4S, v29.4S -mul v9.4S, v9.4S,v15.4S -mla v9.4S, v7.4S, v31.s[0] -sub v7.4s, v8.4s, v9.4s -add v8.4s, v8.4s, v9.4s -sqrdmulh v9.4S, v23.4S, v29.4S -mul v23.4S, v23.4S,v15.4S -mla v23.4S, v9.4S, v31.s[0] -sub v9.4s, v13.4s, v23.4s -add v13.4s, v13.4s, v23.4s -sqrdmulh v23.4S, v13.4S, v6.4S -mul v13.4S, v13.4S,v10.4S -mla v13.4S, v23.4S, v31.s[0] -sub v23.4s, v8.4s, v13.4s -add v8.4s, v8.4s, v13.4s -sqrdmulh v13.4S, v9.4S, v26.4S -mul v9.4S, v9.4S,v5.4S -mla v9.4S, v13.4S, v31.s[0] -sub v13.4s, v7.4s, v9.4s -add v7.4s, v7.4s, v9.4s -str q8, [x0, #128] -str q23, [x0, #144] -str q7, [x0, #160] -str q13, [x0, #176] -ldr q13, [x17, #+512] -ldr q7, [x17, #+528] -ldr q23, [x17, #+544] -ldr q8, [x17, #+560] -ldr q9, [x17, #+576] -ldr q2, [x17, #+592] -ldr q11, [x17, #+608] -ldr q0, [x17, #+624] -ldr q26, [x0, #224] -ldr q5, [x0, #240] -ldr q6, [x0, #192] -ldr q10, [x0, #208] -sqrdmulh v29.4S, v26.4S, v7.s[0] -mul v26.4S, v26.4S,v13.s[0] -mla v26.4S, v29.4S, v31.s[0] -sub v29.4s, v6.4s, v26.4s -add v6.4s, v6.4s, v26.4s -sqrdmulh v26.4S, v5.4S, v7.s[0] -mul v5.4S, v5.4S,v13.s[0] -mla v5.4S, v26.4S, v31.s[0] -sub v26.4s, v10.4s, v5.4s -add v10.4s, v10.4s, v5.4s -sqrdmulh v5.4S, v10.4S, v7.s[1] -mul v10.4S, v10.4S,v13.s[1] -mla v10.4S, v5.4S, v31.s[0] -sub v5.4s, v6.4s, v10.4s -add v6.4s, v6.4s, v10.4s -sqrdmulh v10.4S, v26.4S, v7.s[2] -mul v26.4S, v26.4S,v13.s[2] -mla v26.4S, v10.4S, v31.s[0] -sub v10.4s, v29.4s, v26.4s -add v29.4s, v29.4s, v26.4s -trn1 v26.4S, v6.4S, v5.4S -trn2 v15.4S, v6.4S, v5.4S -trn1 v4.4S, v29.4S, v10.4S -trn2 v30.4S, v29.4S, v10.4S -trn2 v29.2D, v26.2D, v4.2D -trn2 v10.2D, v15.2D, v30.2D -trn1 v6.2D, v26.2D, v4.2D -trn1 v5.2D, v15.2D, v30.2D -sqrdmulh v30.4S, v29.4S, v8.4S -mul v29.4S, v29.4S,v23.4S -mla v29.4S, v30.4S, v31.s[0] -sub v30.4s, v6.4s, v29.4s -add v6.4s, v6.4s, v29.4s -sqrdmulh v29.4S, v10.4S, v8.4S -mul v10.4S, v10.4S,v23.4S -mla v10.4S, v29.4S, v31.s[0] -sub v29.4s, v5.4s, v10.4s -add v5.4s, v5.4s, v10.4s -sqrdmulh v10.4S, v5.4S, v2.4S -mul v5.4S, v5.4S,v9.4S -mla v5.4S, v10.4S, v31.s[0] -sub v10.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v29.4S, v0.4S -mul v29.4S, v29.4S,v11.4S -mla v29.4S, v5.4S, v31.s[0] -sub v5.4s, v30.4s, v29.4s -add v30.4s, v30.4s, v29.4s -str q6, [x0, #192] -str q10, [x0, #208] -str q30, [x0, #224] -str q5, [x0, #240] -ldr q5, [x17, #+640] -ldr q30, [x17, #+656] -ldr q10, [x17, #+672] -ldr q6, [x17, #+688] -ldr q29, [x17, #+704] -ldr q15, [x17, #+720] -ldr q4, [x17, #+736] -ldr q26, [x17, #+752] -ldr q0, [x0, #288] -ldr q11, [x0, #304] -ldr q2, [x0, #256] -ldr q9, [x0, #272] -sqrdmulh v8.4S, v0.4S, v30.s[0] -mul v0.4S, v0.4S,v5.s[0] -mla v0.4S, v8.4S, v31.s[0] -sub v8.4s, v2.4s, v0.4s -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v11.4S, v30.s[0] -mul v11.4S, v11.4S,v5.s[0] -mla v11.4S, v0.4S, v31.s[0] -sub v0.4s, v9.4s, v11.4s -add v9.4s, v9.4s, v11.4s -sqrdmulh v11.4S, v9.4S, v30.s[1] -mul v9.4S, v9.4S,v5.s[1] -mla v9.4S, v11.4S, v31.s[0] -sub v11.4s, v2.4s, v9.4s -add v2.4s, v2.4s, v9.4s -sqrdmulh v9.4S, v0.4S, v30.s[2] -mul v0.4S, v0.4S,v5.s[2] -mla v0.4S, v9.4S, v31.s[0] -sub v9.4s, v8.4s, v0.4s -add v8.4s, v8.4s, v0.4s -trn1 v0.4S, v2.4S, v11.4S -trn2 v23.4S, v2.4S, v11.4S -trn1 v7.4S, v8.4S, v9.4S -trn2 v13.4S, v8.4S, v9.4S -trn2 v8.2D, v0.2D, v7.2D -trn2 v9.2D, v23.2D, v13.2D -trn1 v2.2D, v0.2D, v7.2D -trn1 v11.2D, v23.2D, v13.2D -sqrdmulh v13.4S, v8.4S, v6.4S -mul v8.4S, v8.4S,v10.4S -mla v8.4S, v13.4S, v31.s[0] -sub v13.4s, v2.4s, v8.4s -add v2.4s, v2.4s, v8.4s -sqrdmulh v8.4S, v9.4S, v6.4S -mul v9.4S, v9.4S,v10.4S -mla v9.4S, v8.4S, v31.s[0] -sub v8.4s, v11.4s, v9.4s -add v11.4s, v11.4s, v9.4s -sqrdmulh v9.4S, v11.4S, v15.4S -mul v11.4S, v11.4S,v29.4S -mla v11.4S, v9.4S, v31.s[0] -sub v9.4s, v2.4s, v11.4s -add v2.4s, v2.4s, v11.4s -sqrdmulh v11.4S, v8.4S, v26.4S -mul v8.4S, v8.4S,v4.4S -mla v8.4S, v11.4S, v31.s[0] -sub v11.4s, v13.4s, v8.4s -add v13.4s, v13.4s, v8.4s -str q2, [x0, #256] -str q9, [x0, #272] -str q13, [x0, #288] -str q11, [x0, #304] -ldr q11, [x17, #+768] -ldr q13, [x17, #+784] -ldr q9, [x17, #+800] -ldr q2, [x17, #+816] -ldr q8, [x17, #+832] -ldr q23, [x17, #+848] -ldr q7, [x17, #+864] -ldr q0, [x17, #+880] -ldr q26, [x0, #352] -ldr q4, [x0, #368] -ldr q15, [x0, #320] -ldr q29, [x0, #336] -sqrdmulh v6.4S, v26.4S, v13.s[0] -mul v26.4S, v26.4S,v11.s[0] -mla v26.4S, v6.4S, v31.s[0] -sub v6.4s, v15.4s, v26.4s -add v15.4s, v15.4s, v26.4s -sqrdmulh v26.4S, v4.4S, v13.s[0] -mul v4.4S, v4.4S,v11.s[0] -mla v4.4S, v26.4S, v31.s[0] -sub v26.4s, v29.4s, v4.4s -add v29.4s, v29.4s, v4.4s -sqrdmulh v4.4S, v29.4S, v13.s[1] -mul v29.4S, v29.4S,v11.s[1] -mla v29.4S, v4.4S, v31.s[0] -sub v4.4s, v15.4s, v29.4s -add v15.4s, v15.4s, v29.4s -sqrdmulh v29.4S, v26.4S, v13.s[2] -mul v26.4S, v26.4S,v11.s[2] -mla v26.4S, v29.4S, v31.s[0] -sub v29.4s, v6.4s, v26.4s -add v6.4s, v6.4s, v26.4s -trn1 v26.4S, v15.4S, v4.4S -trn2 v10.4S, v15.4S, v4.4S -trn1 v30.4S, v6.4S, v29.4S -trn2 v5.4S, v6.4S, v29.4S -trn2 v6.2D, v26.2D, v30.2D -trn2 v29.2D, v10.2D, v5.2D -trn1 v15.2D, v26.2D, v30.2D -trn1 v4.2D, v10.2D, v5.2D -sqrdmulh v5.4S, v6.4S, v2.4S -mul v6.4S, v6.4S,v9.4S -mla v6.4S, v5.4S, v31.s[0] -sub v5.4s, v15.4s, v6.4s -add v15.4s, v15.4s, v6.4s -sqrdmulh v6.4S, v29.4S, v2.4S -mul v29.4S, v29.4S,v9.4S -mla v29.4S, v6.4S, v31.s[0] -sub v6.4s, v4.4s, v29.4s -add v4.4s, v4.4s, v29.4s -sqrdmulh v29.4S, v4.4S, v23.4S -mul v4.4S, v4.4S,v8.4S -mla v4.4S, v29.4S, v31.s[0] -sub v29.4s, v15.4s, v4.4s -add v15.4s, v15.4s, v4.4s -sqrdmulh v4.4S, v6.4S, v0.4S -mul v6.4S, v6.4S,v7.4S -mla v6.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v6.4s -add v5.4s, v5.4s, v6.4s -str q15, [x0, #320] -str q29, [x0, #336] -str q5, [x0, #352] -str q4, [x0, #368] -ldr q4, [x17, #+896] -ldr q5, [x17, #+912] -ldr q29, [x17, #+928] -ldr q15, [x17, #+944] -ldr q6, [x17, #+960] -ldr q10, [x17, #+976] -ldr q30, [x17, #+992] -ldr q26, [x17, #+1008] -ldr q0, [x0, #416] -ldr q7, [x0, #432] -ldr q23, [x0, #384] -ldr q8, [x0, #400] -sqrdmulh v2.4S, v0.4S, v5.s[0] -mul v0.4S, v0.4S,v4.s[0] -mla v0.4S, v2.4S, v31.s[0] -sub v2.4s, v23.4s, v0.4s -add v23.4s, v23.4s, v0.4s -sqrdmulh v0.4S, v7.4S, v5.s[0] -mul v7.4S, v7.4S,v4.s[0] -mla v7.4S, v0.4S, v31.s[0] -sub v0.4s, v8.4s, v7.4s -add v8.4s, v8.4s, v7.4s -sqrdmulh v7.4S, v8.4S, v5.s[1] -mul v8.4S, v8.4S,v4.s[1] -mla v8.4S, v7.4S, v31.s[0] -sub v7.4s, v23.4s, v8.4s -add v23.4s, v23.4s, v8.4s -sqrdmulh v8.4S, v0.4S, v5.s[2] -mul v0.4S, v0.4S,v4.s[2] -mla v0.4S, v8.4S, v31.s[0] -sub v8.4s, v2.4s, v0.4s -add v2.4s, v2.4s, v0.4s -trn1 v0.4S, v23.4S, v7.4S -trn2 v9.4S, v23.4S, v7.4S -trn1 v13.4S, v2.4S, v8.4S -trn2 v11.4S, v2.4S, v8.4S -trn2 v2.2D, v0.2D, v13.2D -trn2 v8.2D, v9.2D, v11.2D -trn1 v23.2D, v0.2D, v13.2D -trn1 v7.2D, v9.2D, v11.2D -sqrdmulh v11.4S, v2.4S, v15.4S -mul v2.4S, v2.4S,v29.4S -mla v2.4S, v11.4S, v31.s[0] -sub v11.4s, v23.4s, v2.4s -add v23.4s, v23.4s, v2.4s -sqrdmulh v2.4S, v8.4S, v15.4S -mul v8.4S, v8.4S,v29.4S -mla v8.4S, v2.4S, v31.s[0] -sub v2.4s, v7.4s, v8.4s -add v7.4s, v7.4s, v8.4s -sqrdmulh v8.4S, v7.4S, v10.4S -mul v7.4S, v7.4S,v6.4S -mla v7.4S, v8.4S, v31.s[0] -sub v8.4s, v23.4s, v7.4s -add v23.4s, v23.4s, v7.4s -sqrdmulh v7.4S, v2.4S, v26.4S -mul v2.4S, v2.4S,v30.4S -mla v2.4S, v7.4S, v31.s[0] -sub v7.4s, v11.4s, v2.4s -add v11.4s, v11.4s, v2.4s -str q23, [x0, #384] -str q8, [x0, #400] -str q11, [x0, #416] -str q7, [x0, #432] -ldr q7, [x17, #+1024] -ldr q11, [x17, #+1040] -ldr q8, [x17, #+1056] -ldr q23, [x17, #+1072] -ldr q2, [x17, #+1088] -ldr q9, [x17, #+1104] -ldr q13, [x17, #+1120] -ldr q0, [x17, #+1136] -ldr q26, [x0, #480] -ldr q30, [x0, #496] -ldr q10, [x0, #448] -ldr q6, [x0, #464] -sqrdmulh v15.4S, v26.4S, v11.s[0] -mul v26.4S, v26.4S,v7.s[0] -mla v26.4S, v15.4S, v31.s[0] -sub v15.4s, v10.4s, v26.4s -add v10.4s, v10.4s, v26.4s -sqrdmulh v26.4S, v30.4S, v11.s[0] -mul v30.4S, v30.4S,v7.s[0] -mla v30.4S, v26.4S, v31.s[0] -sub v26.4s, v6.4s, v30.4s -add v6.4s, v6.4s, v30.4s -sqrdmulh v30.4S, v6.4S, v11.s[1] -mul v6.4S, v6.4S,v7.s[1] -mla v6.4S, v30.4S, v31.s[0] -sub v30.4s, v10.4s, v6.4s -add v10.4s, v10.4s, v6.4s -sqrdmulh v6.4S, v26.4S, v11.s[2] -mul v26.4S, v26.4S,v7.s[2] -mla v26.4S, v6.4S, v31.s[0] -sub v6.4s, v15.4s, v26.4s -add v15.4s, v15.4s, v26.4s -trn1 v26.4S, v10.4S, v30.4S -trn2 v29.4S, v10.4S, v30.4S -trn1 v5.4S, v15.4S, v6.4S -trn2 v4.4S, v15.4S, v6.4S -trn2 v15.2D, v26.2D, v5.2D -trn2 v6.2D, v29.2D, v4.2D -trn1 v10.2D, v26.2D, v5.2D -trn1 v30.2D, v29.2D, v4.2D -sqrdmulh v4.4S, v15.4S, v23.4S -mul v15.4S, v15.4S,v8.4S -mla v15.4S, v4.4S, v31.s[0] -sub v4.4s, v10.4s, v15.4s -add v10.4s, v10.4s, v15.4s -sqrdmulh v15.4S, v6.4S, v23.4S -mul v6.4S, v6.4S,v8.4S -mla v6.4S, v15.4S, v31.s[0] -sub v15.4s, v30.4s, v6.4s -add v30.4s, v30.4s, v6.4s -sqrdmulh v6.4S, v30.4S, v9.4S -mul v30.4S, v30.4S,v2.4S -mla v30.4S, v6.4S, v31.s[0] -sub v6.4s, v10.4s, v30.4s -add v10.4s, v10.4s, v30.4s -sqrdmulh v30.4S, v15.4S, v0.4S -mul v15.4S, v15.4S,v13.4S -mla v15.4S, v30.4S, v31.s[0] -sub v30.4s, v4.4s, v15.4s -add v4.4s, v4.4s, v15.4s -str q10, [x0, #448] -str q6, [x0, #464] -str q4, [x0, #480] -str q30, [x0, #496] -ldr q30, [x17, #+1152] -ldr q4, [x17, #+1168] -ldr q6, [x17, #+1184] -ldr q10, [x17, #+1200] -ldr q15, [x17, #+1216] -ldr q29, [x17, #+1232] -ldr q5, [x17, #+1248] -ldr q26, [x17, #+1264] -ldr q0, [x0, #544] -ldr q13, [x0, #560] -ldr q9, [x0, #512] -ldr q2, [x0, #528] -sqrdmulh v23.4S, v0.4S, v4.s[0] -mul v0.4S, v0.4S,v30.s[0] -mla v0.4S, v23.4S, v31.s[0] -sub v23.4s, v9.4s, v0.4s -add v9.4s, v9.4s, v0.4s -sqrdmulh v0.4S, v13.4S, v4.s[0] -mul v13.4S, v13.4S,v30.s[0] -mla v13.4S, v0.4S, v31.s[0] -sub v0.4s, v2.4s, v13.4s -add v2.4s, v2.4s, v13.4s -sqrdmulh v13.4S, v2.4S, v4.s[1] -mul v2.4S, v2.4S,v30.s[1] -mla v2.4S, v13.4S, v31.s[0] -sub v13.4s, v9.4s, v2.4s -add v9.4s, v9.4s, v2.4s -sqrdmulh v2.4S, v0.4S, v4.s[2] -mul v0.4S, v0.4S,v30.s[2] -mla v0.4S, v2.4S, v31.s[0] -sub v2.4s, v23.4s, v0.4s -add v23.4s, v23.4s, v0.4s -trn1 v0.4S, v9.4S, v13.4S -trn2 v8.4S, v9.4S, v13.4S -trn1 v11.4S, v23.4S, v2.4S -trn2 v7.4S, v23.4S, v2.4S -trn2 v23.2D, v0.2D, v11.2D -trn2 v2.2D, v8.2D, v7.2D -trn1 v9.2D, v0.2D, v11.2D -trn1 v13.2D, v8.2D, v7.2D -sqrdmulh v7.4S, v23.4S, v10.4S -mul v23.4S, v23.4S,v6.4S -mla v23.4S, v7.4S, v31.s[0] -sub v7.4s, v9.4s, v23.4s -add v9.4s, v9.4s, v23.4s -sqrdmulh v23.4S, v2.4S, v10.4S -mul v2.4S, v2.4S,v6.4S -mla v2.4S, v23.4S, v31.s[0] -sub v23.4s, v13.4s, v2.4s -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v13.4S, v29.4S -mul v13.4S, v13.4S,v15.4S -mla v13.4S, v2.4S, v31.s[0] -sub v2.4s, v9.4s, v13.4s -add v9.4s, v9.4s, v13.4s -sqrdmulh v13.4S, v23.4S, v26.4S -mul v23.4S, v23.4S,v5.4S -mla v23.4S, v13.4S, v31.s[0] -sub v13.4s, v7.4s, v23.4s -add v7.4s, v7.4s, v23.4s -str q9, [x0, #512] -str q2, [x0, #528] -str q7, [x0, #544] -str q13, [x0, #560] -ldr q13, [x17, #+1280] -ldr q7, [x17, #+1296] -ldr q2, [x17, #+1312] -ldr q9, [x17, #+1328] -ldr q23, [x17, #+1344] -ldr q8, [x17, #+1360] -ldr q11, [x17, #+1376] -ldr q0, [x17, #+1392] -ldr q26, [x0, #608] -ldr q5, [x0, #624] -ldr q29, [x0, #576] -ldr q15, [x0, #592] -sqrdmulh v10.4S, v26.4S, v7.s[0] -mul v26.4S, v26.4S,v13.s[0] -mla v26.4S, v10.4S, v31.s[0] -sub v10.4s, v29.4s, v26.4s -add v29.4s, v29.4s, v26.4s -sqrdmulh v26.4S, v5.4S, v7.s[0] -mul v5.4S, v5.4S,v13.s[0] -mla v5.4S, v26.4S, v31.s[0] -sub v26.4s, v15.4s, v5.4s -add v15.4s, v15.4s, v5.4s -sqrdmulh v5.4S, v15.4S, v7.s[1] -mul v15.4S, v15.4S,v13.s[1] -mla v15.4S, v5.4S, v31.s[0] -sub v5.4s, v29.4s, v15.4s -add v29.4s, v29.4s, v15.4s -sqrdmulh v15.4S, v26.4S, v7.s[2] -mul v26.4S, v26.4S,v13.s[2] -mla v26.4S, v15.4S, v31.s[0] -sub v15.4s, v10.4s, v26.4s -add v10.4s, v10.4s, v26.4s -trn1 v26.4S, v29.4S, v5.4S -trn2 v6.4S, v29.4S, v5.4S -trn1 v4.4S, v10.4S, v15.4S -trn2 v30.4S, v10.4S, v15.4S -trn2 v10.2D, v26.2D, v4.2D -trn2 v15.2D, v6.2D, v30.2D -trn1 v29.2D, v26.2D, v4.2D -trn1 v5.2D, v6.2D, v30.2D -sqrdmulh v30.4S, v10.4S, v9.4S -mul v10.4S, v10.4S,v2.4S -mla v10.4S, v30.4S, v31.s[0] -sub v30.4s, v29.4s, v10.4s -add v29.4s, v29.4s, v10.4s -sqrdmulh v10.4S, v15.4S, v9.4S -mul v15.4S, v15.4S,v2.4S -mla v15.4S, v10.4S, v31.s[0] -sub v10.4s, v5.4s, v15.4s -add v5.4s, v5.4s, v15.4s -sqrdmulh v15.4S, v5.4S, v8.4S -mul v5.4S, v5.4S,v23.4S -mla v5.4S, v15.4S, v31.s[0] -sub v15.4s, v29.4s, v5.4s -add v29.4s, v29.4s, v5.4s -sqrdmulh v5.4S, v10.4S, v0.4S -mul v10.4S, v10.4S,v11.4S -mla v10.4S, v5.4S, v31.s[0] -sub v5.4s, v30.4s, v10.4s -add v30.4s, v30.4s, v10.4s -str q29, [x0, #576] -str q15, [x0, #592] -str q30, [x0, #608] -str q5, [x0, #624] -ldr q5, [x17, #+1408] -ldr q30, [x17, #+1424] -ldr q15, [x17, #+1440] -ldr q29, [x17, #+1456] -ldr q10, [x17, #+1472] -ldr q6, [x17, #+1488] -ldr q4, [x17, #+1504] -ldr q26, [x17, #+1520] -ldr q0, [x0, #672] -ldr q11, [x0, #688] -ldr q8, [x0, #640] -ldr q23, [x0, #656] -sqrdmulh v9.4S, v0.4S, v30.s[0] -mul v0.4S, v0.4S,v5.s[0] -mla v0.4S, v9.4S, v31.s[0] -sub v9.4s, v8.4s, v0.4s -add v8.4s, v8.4s, v0.4s -sqrdmulh v0.4S, v11.4S, v30.s[0] -mul v11.4S, v11.4S,v5.s[0] -mla v11.4S, v0.4S, v31.s[0] -sub v0.4s, v23.4s, v11.4s -add v23.4s, v23.4s, v11.4s -sqrdmulh v11.4S, v23.4S, v30.s[1] -mul v23.4S, v23.4S,v5.s[1] -mla v23.4S, v11.4S, v31.s[0] -sub v11.4s, v8.4s, v23.4s -add v8.4s, v8.4s, v23.4s -sqrdmulh v23.4S, v0.4S, v30.s[2] -mul v0.4S, v0.4S,v5.s[2] -mla v0.4S, v23.4S, v31.s[0] -sub v23.4s, v9.4s, v0.4s -add v9.4s, v9.4s, v0.4s -trn1 v0.4S, v8.4S, v11.4S -trn2 v2.4S, v8.4S, v11.4S -trn1 v7.4S, v9.4S, v23.4S -trn2 v13.4S, v9.4S, v23.4S -trn2 v9.2D, v0.2D, v7.2D -trn2 v23.2D, v2.2D, v13.2D -trn1 v8.2D, v0.2D, v7.2D -trn1 v11.2D, v2.2D, v13.2D -sqrdmulh v13.4S, v9.4S, v29.4S -mul v9.4S, v9.4S,v15.4S -mla v9.4S, v13.4S, v31.s[0] -sub v13.4s, v8.4s, v9.4s -add v8.4s, v8.4s, v9.4s -sqrdmulh v9.4S, v23.4S, v29.4S -mul v23.4S, v23.4S,v15.4S -mla v23.4S, v9.4S, v31.s[0] -sub v9.4s, v11.4s, v23.4s -add v11.4s, v11.4s, v23.4s -sqrdmulh v23.4S, v11.4S, v6.4S -mul v11.4S, v11.4S,v10.4S -mla v11.4S, v23.4S, v31.s[0] -sub v23.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -sqrdmulh v11.4S, v9.4S, v26.4S -mul v9.4S, v9.4S,v4.4S -mla v9.4S, v11.4S, v31.s[0] -sub v11.4s, v13.4s, v9.4s -add v13.4s, v13.4s, v9.4s -str q8, [x0, #640] -str q23, [x0, #656] -str q13, [x0, #672] -str q11, [x0, #688] -ldr q11, [x17, #+1536] -ldr q13, [x17, #+1552] -ldr q23, [x17, #+1568] -ldr q8, [x17, #+1584] -ldr q9, [x17, #+1600] -ldr q2, [x17, #+1616] -ldr q7, [x17, #+1632] -ldr q0, [x17, #+1648] -ldr q26, [x0, #736] -ldr q4, [x0, #752] -ldr q6, [x0, #704] -ldr q10, [x0, #720] -sqrdmulh v29.4S, v26.4S, v13.s[0] -mul v26.4S, v26.4S,v11.s[0] -mla v26.4S, v29.4S, v31.s[0] -sub v29.4s, v6.4s, v26.4s -add v6.4s, v6.4s, v26.4s -sqrdmulh v26.4S, v4.4S, v13.s[0] -mul v4.4S, v4.4S,v11.s[0] -mla v4.4S, v26.4S, v31.s[0] -sub v26.4s, v10.4s, v4.4s -add v10.4s, v10.4s, v4.4s -sqrdmulh v4.4S, v10.4S, v13.s[1] -mul v10.4S, v10.4S,v11.s[1] -mla v10.4S, v4.4S, v31.s[0] -sub v4.4s, v6.4s, v10.4s -add v6.4s, v6.4s, v10.4s -sqrdmulh v10.4S, v26.4S, v13.s[2] -mul v26.4S, v26.4S,v11.s[2] -mla v26.4S, v10.4S, v31.s[0] -sub v10.4s, v29.4s, v26.4s -add v29.4s, v29.4s, v26.4s -trn1 v26.4S, v6.4S, v4.4S -trn2 v15.4S, v6.4S, v4.4S -trn1 v30.4S, v29.4S, v10.4S -trn2 v5.4S, v29.4S, v10.4S -trn2 v29.2D, v26.2D, v30.2D -trn2 v10.2D, v15.2D, v5.2D -trn1 v6.2D, v26.2D, v30.2D -trn1 v4.2D, v15.2D, v5.2D -sqrdmulh v5.4S, v29.4S, v8.4S -mul v29.4S, v29.4S,v23.4S -mla v29.4S, v5.4S, v31.s[0] -sub v5.4s, v6.4s, v29.4s -add v6.4s, v6.4s, v29.4s -sqrdmulh v29.4S, v10.4S, v8.4S -mul v10.4S, v10.4S,v23.4S -mla v10.4S, v29.4S, v31.s[0] -sub v29.4s, v4.4s, v10.4s -add v4.4s, v4.4s, v10.4s -sqrdmulh v10.4S, v4.4S, v2.4S -mul v4.4S, v4.4S,v9.4S -mla v4.4S, v10.4S, v31.s[0] -sub v10.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -sqrdmulh v4.4S, v29.4S, v0.4S -mul v29.4S, v29.4S,v7.4S -mla v29.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v29.4s -add v5.4s, v5.4s, v29.4s -str q6, [x0, #704] -str q10, [x0, #720] -str q5, [x0, #736] -str q4, [x0, #752] -ldr q4, [x17, #+1664] -ldr q5, [x17, #+1680] -ldr q10, [x17, #+1696] -ldr q6, [x17, #+1712] -ldr q29, [x17, #+1728] -ldr q15, [x17, #+1744] -ldr q30, [x17, #+1760] -ldr q26, [x17, #+1776] -ldr q0, [x0, #800] -ldr q7, [x0, #816] -ldr q2, [x0, #768] -ldr q9, [x0, #784] -sqrdmulh v8.4S, v0.4S, v5.s[0] -mul v0.4S, v0.4S,v4.s[0] -mla v0.4S, v8.4S, v31.s[0] -sub v8.4s, v2.4s, v0.4s -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v7.4S, v5.s[0] -mul v7.4S, v7.4S,v4.s[0] -mla v7.4S, v0.4S, v31.s[0] -sub v0.4s, v9.4s, v7.4s -add v9.4s, v9.4s, v7.4s -sqrdmulh v7.4S, v9.4S, v5.s[1] -mul v9.4S, v9.4S,v4.s[1] -mla v9.4S, v7.4S, v31.s[0] -sub v7.4s, v2.4s, v9.4s -add v2.4s, v2.4s, v9.4s -sqrdmulh v9.4S, v0.4S, v5.s[2] -mul v0.4S, v0.4S,v4.s[2] -mla v0.4S, v9.4S, v31.s[0] -sub v9.4s, v8.4s, v0.4s -add v8.4s, v8.4s, v0.4s -trn1 v0.4S, v2.4S, v7.4S -trn2 v23.4S, v2.4S, v7.4S -trn1 v13.4S, v8.4S, v9.4S -trn2 v11.4S, v8.4S, v9.4S -trn2 v8.2D, v0.2D, v13.2D -trn2 v9.2D, v23.2D, v11.2D -trn1 v2.2D, v0.2D, v13.2D -trn1 v7.2D, v23.2D, v11.2D -sqrdmulh v11.4S, v8.4S, v6.4S -mul v8.4S, v8.4S,v10.4S -mla v8.4S, v11.4S, v31.s[0] -sub v11.4s, v2.4s, v8.4s -add v2.4s, v2.4s, v8.4s -sqrdmulh v8.4S, v9.4S, v6.4S -mul v9.4S, v9.4S,v10.4S -mla v9.4S, v8.4S, v31.s[0] -sub v8.4s, v7.4s, v9.4s -add v7.4s, v7.4s, v9.4s -sqrdmulh v9.4S, v7.4S, v15.4S -mul v7.4S, v7.4S,v29.4S -mla v7.4S, v9.4S, v31.s[0] -sub v9.4s, v2.4s, v7.4s -add v2.4s, v2.4s, v7.4s -sqrdmulh v7.4S, v8.4S, v26.4S -mul v8.4S, v8.4S,v30.4S -mla v8.4S, v7.4S, v31.s[0] -sub v7.4s, v11.4s, v8.4s -add v11.4s, v11.4s, v8.4s -str q2, [x0, #768] -str q9, [x0, #784] -str q11, [x0, #800] -str q7, [x0, #816] -ldr q7, [x17, #+1792] -ldr q11, [x17, #+1808] -ldr q9, [x17, #+1824] -ldr q2, [x17, #+1840] -ldr q8, [x17, #+1856] -ldr q23, [x17, #+1872] -ldr q13, [x17, #+1888] -ldr q0, [x17, #+1904] -ldr q26, [x0, #864] -ldr q30, [x0, #880] -ldr q15, [x0, #832] -ldr q29, [x0, #848] -sqrdmulh v6.4S, v26.4S, v11.s[0] -mul v26.4S, v26.4S,v7.s[0] -mla v26.4S, v6.4S, v31.s[0] -sub v6.4s, v15.4s, v26.4s -add v15.4s, v15.4s, v26.4s -sqrdmulh v26.4S, v30.4S, v11.s[0] -mul v30.4S, v30.4S,v7.s[0] -mla v30.4S, v26.4S, v31.s[0] -sub v26.4s, v29.4s, v30.4s -add v29.4s, v29.4s, v30.4s -sqrdmulh v30.4S, v29.4S, v11.s[1] -mul v29.4S, v29.4S,v7.s[1] -mla v29.4S, v30.4S, v31.s[0] -sub v30.4s, v15.4s, v29.4s -add v15.4s, v15.4s, v29.4s -sqrdmulh v29.4S, v26.4S, v11.s[2] -mul v26.4S, v26.4S,v7.s[2] -mla v26.4S, v29.4S, v31.s[0] -sub v29.4s, v6.4s, v26.4s -add v6.4s, v6.4s, v26.4s -trn1 v26.4S, v15.4S, v30.4S -trn2 v10.4S, v15.4S, v30.4S -trn1 v5.4S, v6.4S, v29.4S -trn2 v4.4S, v6.4S, v29.4S -trn2 v6.2D, v26.2D, v5.2D -trn2 v29.2D, v10.2D, v4.2D -trn1 v15.2D, v26.2D, v5.2D -trn1 v30.2D, v10.2D, v4.2D -sqrdmulh v4.4S, v6.4S, v2.4S -mul v6.4S, v6.4S,v9.4S -mla v6.4S, v4.4S, v31.s[0] -sub v4.4s, v15.4s, v6.4s -add v15.4s, v15.4s, v6.4s -sqrdmulh v6.4S, v29.4S, v2.4S -mul v29.4S, v29.4S,v9.4S -mla v29.4S, v6.4S, v31.s[0] -sub v6.4s, v30.4s, v29.4s -add v30.4s, v30.4s, v29.4s -sqrdmulh v29.4S, v30.4S, v23.4S -mul v30.4S, v30.4S,v8.4S -mla v30.4S, v29.4S, v31.s[0] -sub v29.4s, v15.4s, v30.4s -add v15.4s, v15.4s, v30.4s -sqrdmulh v30.4S, v6.4S, v0.4S -mul v6.4S, v6.4S,v13.4S -mla v6.4S, v30.4S, v31.s[0] -sub v30.4s, v4.4s, v6.4s -add v4.4s, v4.4s, v6.4s -str q15, [x0, #832] -str q29, [x0, #848] -str q4, [x0, #864] -str q30, [x0, #880] -ldr q30, [x17, #+1920] -ldr q4, [x17, #+1936] -ldr q29, [x17, #+1952] -ldr q15, [x17, #+1968] -ldr q6, [x17, #+1984] -ldr q10, [x17, #+2000] -ldr q5, [x17, #+2016] -ldr q26, [x17, #+2032] -ldr q0, [x0, #928] -ldr q13, [x0, #944] -ldr q23, [x0, #896] -ldr q8, [x0, #912] -sqrdmulh v2.4S, v0.4S, v4.s[0] -mul v0.4S, v0.4S,v30.s[0] -mla v0.4S, v2.4S, v31.s[0] -sub v2.4s, v23.4s, v0.4s -add v23.4s, v23.4s, v0.4s -sqrdmulh v0.4S, v13.4S, v4.s[0] -mul v13.4S, v13.4S,v30.s[0] -mla v13.4S, v0.4S, v31.s[0] -sub v0.4s, v8.4s, v13.4s -add v8.4s, v8.4s, v13.4s -sqrdmulh v13.4S, v8.4S, v4.s[1] -mul v8.4S, v8.4S,v30.s[1] -mla v8.4S, v13.4S, v31.s[0] -sub v13.4s, v23.4s, v8.4s -add v23.4s, v23.4s, v8.4s -sqrdmulh v8.4S, v0.4S, v4.s[2] -mul v0.4S, v0.4S,v30.s[2] -mla v0.4S, v8.4S, v31.s[0] -sub v8.4s, v2.4s, v0.4s -add v2.4s, v2.4s, v0.4s -trn1 v0.4S, v23.4S, v13.4S -trn2 v9.4S, v23.4S, v13.4S -trn1 v11.4S, v2.4S, v8.4S -trn2 v7.4S, v2.4S, v8.4S -trn2 v2.2D, v0.2D, v11.2D -trn2 v8.2D, v9.2D, v7.2D -trn1 v23.2D, v0.2D, v11.2D -trn1 v13.2D, v9.2D, v7.2D -sqrdmulh v7.4S, v2.4S, v15.4S -mul v2.4S, v2.4S,v29.4S -mla v2.4S, v7.4S, v31.s[0] -sub v7.4s, v23.4s, v2.4s -add v23.4s, v23.4s, v2.4s -sqrdmulh v2.4S, v8.4S, v15.4S -mul v8.4S, v8.4S,v29.4S -mla v8.4S, v2.4S, v31.s[0] -sub v2.4s, v13.4s, v8.4s -add v13.4s, v13.4s, v8.4s -sqrdmulh v8.4S, v13.4S, v10.4S -mul v13.4S, v13.4S,v6.4S -mla v13.4S, v8.4S, v31.s[0] -sub v8.4s, v23.4s, v13.4s -add v23.4s, v23.4s, v13.4s -sqrdmulh v13.4S, v2.4S, v26.4S -mul v2.4S, v2.4S,v5.4S -mla v2.4S, v13.4S, v31.s[0] -sub v13.4s, v7.4s, v2.4s -add v7.4s, v7.4s, v2.4s -str q23, [x0, #896] -str q8, [x0, #912] -str q7, [x0, #928] -str q13, [x0, #944] -ldr q13, [x17, #+2048] -ldr q7, [x17, #+2064] -ldr q8, [x17, #+2080] -ldr q23, [x17, #+2096] -ldr q2, [x17, #+2112] -ldr q9, [x17, #+2128] -ldr q11, [x17, #+2144] -ldr q0, [x17, #+2160] -ldr q26, [x0, #992] -ldr q5, [x0, #1008] -ldr q10, [x0, #960] -ldr q6, [x0, #976] -sqrdmulh v15.4S, v26.4S, v7.s[0] -mul v26.4S, v26.4S,v13.s[0] -mla v26.4S, v15.4S, v31.s[0] -sub v15.4s, v10.4s, v26.4s -add v10.4s, v10.4s, v26.4s -sqrdmulh v26.4S, v5.4S, v7.s[0] -mul v5.4S, v5.4S,v13.s[0] -mla v5.4S, v26.4S, v31.s[0] -sub v26.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v6.4S, v7.s[1] -mul v6.4S, v6.4S,v13.s[1] -mla v6.4S, v5.4S, v31.s[0] -sub v5.4s, v10.4s, v6.4s -add v10.4s, v10.4s, v6.4s -sqrdmulh v6.4S, v26.4S, v7.s[2] -mul v26.4S, v26.4S,v13.s[2] -mla v26.4S, v6.4S, v31.s[0] -sub v6.4s, v15.4s, v26.4s -add v15.4s, v15.4s, v26.4s -trn1 v26.4S, v10.4S, v5.4S -trn2 v29.4S, v10.4S, v5.4S -trn1 v4.4S, v15.4S, v6.4S -trn2 v30.4S, v15.4S, v6.4S -trn2 v15.2D, v26.2D, v4.2D -trn2 v6.2D, v29.2D, v30.2D -trn1 v10.2D, v26.2D, v4.2D -trn1 v5.2D, v29.2D, v30.2D -sqrdmulh v30.4S, v15.4S, v23.4S -mul v15.4S, v15.4S,v8.4S -mla v15.4S, v30.4S, v31.s[0] -sub v30.4s, v10.4s, v15.4s -add v10.4s, v10.4s, v15.4s -sqrdmulh v15.4S, v6.4S, v23.4S -mul v6.4S, v6.4S,v8.4S -mla v6.4S, v15.4S, v31.s[0] -sub v15.4s, v5.4s, v6.4s -add v5.4s, v5.4s, v6.4s -sqrdmulh v6.4S, v5.4S, v9.4S -mul v5.4S, v5.4S,v2.4S -mla v5.4S, v6.4S, v31.s[0] -sub v6.4s, v10.4s, v5.4s -add v10.4s, v10.4s, v5.4s -sqrdmulh v5.4S, v15.4S, v0.4S -mul v15.4S, v15.4S,v11.4S -mla v15.4S, v5.4S, v31.s[0] -sub v5.4s, v30.4s, v15.4s -add v30.4s, v30.4s, v15.4s -str q10, [x0, #960] -str q6, [x0, #976] -str q30, [x0, #992] -str q5, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2392 -// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_7_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_7_0.s deleted file mode 100644 index 1628189..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_7_0.s +++ /dev/null @@ -1,2422 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_7_0 -.global _ntt_u32_full_neon_asm_var_4_4_7_0 -ntt_u32_full_neon_asm_var_4_4_7_0: -_ntt_u32_full_neon_asm_var_4_4_7_0: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #928] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #992] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q18, [x0, #800] -sqrdmulh v17.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -ldr q16, [x0, #864] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -mla v22.4S, v21.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -mla v18.4S, v17.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -ldr q3, [x0, #544] -sqrdmulh v17.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -ldr q19, [x0, #608] -sqrdmulh v21.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q2, [x0, #672] -ldr q1, [x0, #416] -sqrdmulh v0.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v15.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -ldr q22, [x0, #736] -ldr q14, [x0, #480] -sqrdmulh v13.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v12.4s, v14.4s, v20.4s -add v14.4s, v14.4s, v20.4s -ldr q20, [x0, #288] -mla v3.4S, v17.4S, v31.s[0] -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v20.4s, v18.4s -mla v2.4S, v0.4S, v31.s[0] -mla v22.4S, v13.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -ldr q18, [x0, #352] -sqrdmulh v13.4S, v1.4S, v29.s[1] -mul v1.4S, v1.4S,v30.s[1] -sub v0.4s, v18.4s, v16.4s -sqrdmulh v17.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -add v18.4s, v18.4s, v16.4s -ldr q16, [x0, #32] -sqrdmulh v11.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v10.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -ldr q3, [x0, #96] -sqrdmulh v9.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v8.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -ldr q19, [x0, #160] -mla v1.4S, v13.4S, v31.s[0] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v19.4s, v2.4s -mla v20.4S, v11.4S, v31.s[0] -mla v18.4S, v9.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -ldr q2, [x0, #224] -sqrdmulh v9.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v11.4s, v2.4s, v22.4s -sqrdmulh v13.4S, v12.4S, v29.s[2] -mul v12.4S, v12.4S,v30.s[2] -add v2.4s, v2.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v7.4s, v19.4s, v1.4s -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v29.s[2] -mul v0.4S, v0.4S,v30.s[2] -sub v6.4s, v2.4s, v14.4s -add v2.4s, v2.4s, v14.4s -mla v15.4S, v9.4S, v31.s[0] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v16.4s, v20.4s -mla v21.4S, v22.4S, v31.s[0] -mla v0.4S, v1.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -sub v1.4s, v3.4s, v18.4s -sqrdmulh v22.4S, v6.4S, v27.s[1] -mul v6.4S, v6.4S,v28.s[1] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v19.4S, v27.s[0] -mul v19.4S, v19.4S,v28.s[0] -sub v9.4s, v17.4s, v15.4s -add v17.4s, v17.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v14.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -mla v7.4S, v20.4S, v31.s[0] -mla v6.4S, v22.4S, v31.s[0] -sub v22.4s, v10.4s, v21.4s -mla v19.4S, v18.4S, v31.s[0] -mla v2.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v15.4s, v8.4s, v0.4s -sqrdmulh v18.4S, v11.4S, v27.s[2] -mul v11.4S, v11.4S,v28.s[2] -add v8.4s, v8.4s, v0.4s -sqrdmulh v0.4S, v9.4S, v27.s[3] -mul v9.4S, v9.4S,v28.s[3] -sub v20.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[3] -mul v14.4S, v14.4S,v28.s[3] -sub v12.4s, v1.4s, v6.4s -add v1.4s, v1.4s, v6.4s -mla v17.4S, v21.4S, v31.s[0] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v16.4s, v19.4s -mla v9.4S, v0.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v16.4s, v16.4s, v19.4s -sqrdmulh v19.4S, v1.4S, v25.s[2] -mul v1.4S, v1.4S,v26.s[2] -sub v7.4s, v3.4s, v2.4s -sqrdmulh v0.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v7.4S, v25.s[1] -mul v7.4S, v7.4S,v26.s[1] -sub v21.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -sqrdmulh v17.4S, v3.4S, v25.s[0] -mul v3.4S, v3.4S,v26.s[0] -sub v6.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -mla v1.4S, v19.4S, v31.s[0] -mla v12.4S, v0.4S, v31.s[0] -sub v0.4s, v22.4s, v9.4s -mla v7.4S, v2.4S, v31.s[0] -mla v3.4S, v17.4S, v31.s[0] -add v22.4s, v22.4s, v9.4s -sqrdmulh v9.4S, v8.4S, v23.s[0] -mul v8.4S, v8.4S,v24.s[0] -sub v17.4s, v15.4s, v14.4s -sqrdmulh v2.4S, v6.4S, v23.s[1] -mul v6.4S, v6.4S,v24.s[1] -add v15.4s, v15.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v23.s[2] -mul v15.4S, v15.4S,v24.s[2] -sub v19.4s, v13.4s, v1.4s -add v13.4s, v13.4s, v1.4s -sqrdmulh v1.4S, v17.4S, v23.s[3] -mul v17.4S, v17.4S,v24.s[3] -sub v11.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -mla v8.4S, v9.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -sub v2.4s, v18.4s, v7.4s -str q13, [x0, #288] -mla v15.4S, v14.4S, v31.s[0] -mla v17.4S, v1.4S, v31.s[0] -add v18.4s, v18.4s, v7.4s -str q19, [x0, #352] -ldr q19, [x0, #944] -sqrdmulh v7.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v1.4s, v16.4s, v3.4s -str q20, [x0, #416] -ldr q20, [x0, #1008] -sqrdmulh v14.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v16.4s, v16.4s, v3.4s -str q11, [x0, #480] -ldr q11, [x0, #816] -sqrdmulh v3.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -sub v13.4s, v10.4s, v8.4s -add v10.4s, v10.4s, v8.4s -ldr q8, [x0, #880] -sqrdmulh v9.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v12.4s, v21.4s, v6.4s -add v21.4s, v21.4s, v6.4s -mla v19.4S, v7.4S, v31.s[0] -mla v20.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v15.4s -str q18, [x0, #160] -mla v11.4S, v3.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -str q2, [x0, #224] -ldr q2, [x0, #560] -sqrdmulh v15.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v9.4s, v0.4s, v17.4s -str q16, [x0, #32] -ldr q16, [x0, #624] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -add v0.4s, v0.4s, v17.4s -str q1, [x0, #96] -ldr q1, [x0, #688] -ldr q17, [x0, #432] -sqrdmulh v18.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -sub v7.4s, v17.4s, v19.4s -add v17.4s, v17.4s, v19.4s -ldr q19, [x0, #752] -ldr q6, [x0, #496] -sqrdmulh v5.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v4.4s, v6.4s, v20.4s -add v6.4s, v6.4s, v20.4s -ldr q20, [x0, #304] -mla v2.4S, v15.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -sub v3.4s, v20.4s, v11.4s -str q10, [x0, #544] -mla v1.4S, v18.4S, v31.s[0] -mla v19.4S, v5.4S, v31.s[0] -add v20.4s, v20.4s, v11.4s -str q13, [x0, #608] -ldr q13, [x0, #368] -sqrdmulh v11.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v5.4s, v13.4s, v8.4s -str q21, [x0, #672] -sqrdmulh v21.4S, v6.4S, v29.s[1] -mul v6.4S, v6.4S,v30.s[1] -add v13.4s, v13.4s, v8.4s -str q12, [x0, #736] -ldr q12, [x0, #48] -sqrdmulh v8.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v18.4s, v12.4s, v2.4s -add v12.4s, v12.4s, v2.4s -ldr q2, [x0, #112] -sqrdmulh v10.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v15.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -ldr q16, [x0, #176] -mla v17.4S, v11.4S, v31.s[0] -mla v6.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v1.4s -str q22, [x0, #800] -mla v20.4S, v8.4S, v31.s[0] -mla v13.4S, v10.4S, v31.s[0] -add v16.4s, v16.4s, v1.4s -str q14, [x0, #864] -ldr q14, [x0, #240] -sqrdmulh v1.4S, v7.4S, v29.s[2] -mul v7.4S, v7.4S,v30.s[2] -sub v10.4s, v14.4s, v19.4s -str q0, [x0, #928] -sqrdmulh v0.4S, v4.4S, v29.s[2] -mul v4.4S, v4.4S,v30.s[2] -add v14.4s, v14.4s, v19.4s -str q9, [x0, #992] -sqrdmulh v9.4S, v3.4S, v29.s[2] -mul v3.4S, v3.4S,v30.s[2] -sub v19.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -sqrdmulh v17.4S, v5.4S, v29.s[2] -mul v5.4S, v5.4S,v30.s[2] -sub v8.4s, v14.4s, v6.4s -add v14.4s, v14.4s, v6.4s -mla v7.4S, v1.4S, v31.s[0] -mla v4.4S, v0.4S, v31.s[0] -sub v0.4s, v12.4s, v20.4s -mla v3.4S, v9.4S, v31.s[0] -mla v5.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -sub v17.4s, v2.4s, v13.4s -sqrdmulh v9.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -add v2.4s, v2.4s, v13.4s -sqrdmulh v13.4S, v16.4S, v27.s[0] -mul v16.4S, v16.4S,v28.s[0] -sub v1.4s, v21.4s, v7.4s -add v21.4s, v21.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v6.4s, v10.4s, v4.4s -add v10.4s, v10.4s, v4.4s -mla v19.4S, v20.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -sub v9.4s, v18.4s, v3.4s -mla v16.4S, v13.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[2] -mul v21.4S, v21.4S,v28.s[2] -sub v7.4s, v15.4s, v5.4s -sqrdmulh v13.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -add v15.4s, v15.4s, v5.4s -sqrdmulh v5.4S, v1.4S, v27.s[3] -mul v1.4S, v1.4S,v28.s[3] -sub v20.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v27.s[3] -mul v6.4S, v6.4S,v28.s[3] -sub v4.4s, v17.4s, v8.4s -add v17.4s, v17.4s, v8.4s -mla v21.4S, v3.4S, v31.s[0] -mla v10.4S, v13.4S, v31.s[0] -sub v13.4s, v12.4s, v16.4s -mla v1.4S, v5.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v25.s[2] -mul v17.4S, v17.4S,v26.s[2] -sub v19.4s, v2.4s, v14.4s -sqrdmulh v5.4S, v4.4S, v25.s[3] -mul v4.4S, v4.4S,v26.s[3] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v3.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v15.4s, v10.4s -add v15.4s, v15.4s, v10.4s -mla v17.4S, v16.4S, v31.s[0] -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v9.4s, v1.4s -mla v19.4S, v14.4S, v31.s[0] -mla v2.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v23.s[0] -mul v15.4S, v15.4S,v24.s[0] -sub v21.4s, v7.4s, v6.4s -sqrdmulh v14.4S, v8.4S, v23.s[1] -mul v8.4S, v8.4S,v24.s[1] -add v7.4s, v7.4s, v6.4s -sqrdmulh v6.4S, v7.4S, v23.s[2] -mul v7.4S, v7.4S,v24.s[2] -sub v16.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v23.s[3] -mul v21.4S, v21.4S,v24.s[3] -sub v10.4s, v20.4s, v4.4s -add v20.4s, v20.4s, v4.4s -mla v15.4S, v1.4S, v31.s[0] -mla v8.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v19.4s -str q0, [x0, #304] -mla v7.4S, v6.4S, v31.s[0] -mla v21.4S, v17.4S, v31.s[0] -add v13.4s, v13.4s, v19.4s -str q16, [x0, #368] -ldr q16, [x0, #896] -sqrdmulh v19.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v17.4s, v12.4s, v2.4s -str q20, [x0, #432] -ldr q20, [x0, #960] -sqrdmulh v6.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v12.4s, v12.4s, v2.4s -str q10, [x0, #496] -ldr q10, [x0, #768] -sqrdmulh v2.4S, v10.4S, v29.s[0] -mul v10.4S, v10.4S,v30.s[0] -sub v0.4s, v18.4s, v15.4s -add v18.4s, v18.4s, v15.4s -ldr q15, [x0, #832] -sqrdmulh v1.4S, v15.4S, v29.s[0] -mul v15.4S, v15.4S,v30.s[0] -sub v4.4s, v3.4s, v8.4s -add v3.4s, v3.4s, v8.4s -mla v16.4S, v19.4S, v31.s[0] -mla v20.4S, v6.4S, v31.s[0] -sub v6.4s, v9.4s, v7.4s -str q13, [x0, #176] -mla v10.4S, v2.4S, v31.s[0] -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v7.4s -str q14, [x0, #240] -ldr q14, [x0, #512] -sqrdmulh v7.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v1.4s, v5.4s, v21.4s -str q12, [x0, #48] -ldr q12, [x0, #576] -sqrdmulh v2.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -add v5.4s, v5.4s, v21.4s -str q17, [x0, #112] -ldr q17, [x0, #640] -ldr q21, [x0, #384] -sqrdmulh v13.4S, v17.4S, v29.s[0] -mul v17.4S, v17.4S,v30.s[0] -sub v19.4s, v21.4s, v16.4s -add v21.4s, v21.4s, v16.4s -ldr q16, [x0, #704] -ldr q8, [x0, #448] -sqrdmulh v22.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v11.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -ldr q20, [x0, #256] -mla v14.4S, v7.4S, v31.s[0] -mla v12.4S, v2.4S, v31.s[0] -sub v2.4s, v20.4s, v10.4s -str q18, [x0, #560] -mla v17.4S, v13.4S, v31.s[0] -mla v16.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -str q0, [x0, #624] -ldr q0, [x0, #320] -sqrdmulh v10.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v22.4s, v0.4s, v15.4s -str q3, [x0, #688] -sqrdmulh v3.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -add v0.4s, v0.4s, v15.4s -str q4, [x0, #752] -ldr q4, [x0, #0] -sqrdmulh v15.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v13.4s, v4.4s, v14.4s -add v4.4s, v4.4s, v14.4s -ldr q14, [x0, #64] -sqrdmulh v18.4S, v0.4S, v29.s[1] -mul v0.4S, v0.4S,v30.s[1] -sub v7.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -ldr q12, [x0, #128] -mla v21.4S, v10.4S, v31.s[0] -mla v8.4S, v3.4S, v31.s[0] -sub v3.4s, v12.4s, v17.4s -str q9, [x0, #816] -mla v20.4S, v15.4S, v31.s[0] -mla v0.4S, v18.4S, v31.s[0] -add v12.4s, v12.4s, v17.4s -str q6, [x0, #880] -ldr q6, [x0, #192] -sqrdmulh v17.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v18.4s, v6.4s, v16.4s -str q5, [x0, #944] -sqrdmulh v5.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -add v6.4s, v6.4s, v16.4s -str q1, [x0, #1008] -sqrdmulh v1.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v16.4s, v12.4s, v21.4s -add v12.4s, v12.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v15.4s, v6.4s, v8.4s -add v6.4s, v6.4s, v8.4s -mla v19.4S, v17.4S, v31.s[0] -mla v11.4S, v5.4S, v31.s[0] -sub v5.4s, v4.4s, v20.4s -mla v2.4S, v1.4S, v31.s[0] -mla v22.4S, v21.4S, v31.s[0] -add v4.4s, v4.4s, v20.4s -sqrdmulh v20.4S, v16.4S, v27.s[1] -mul v16.4S, v16.4S,v28.s[1] -sub v21.4s, v14.4s, v0.4s -sqrdmulh v1.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v17.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v27.s[0] -mul v6.4S, v6.4S,v28.s[0] -sub v8.4s, v18.4s, v11.4s -add v18.4s, v18.4s, v11.4s -mla v16.4S, v20.4S, v31.s[0] -mla v15.4S, v1.4S, v31.s[0] -sub v1.4s, v13.4s, v2.4s -mla v12.4S, v0.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v3.4S, v27.s[2] -mul v3.4S, v3.4S,v28.s[2] -sub v19.4s, v7.4s, v22.4s -sqrdmulh v0.4S, v18.4S, v27.s[2] -mul v18.4S, v18.4S,v28.s[2] -add v7.4s, v7.4s, v22.4s -sqrdmulh v22.4S, v17.4S, v27.s[3] -mul v17.4S, v17.4S,v28.s[3] -sub v20.4s, v5.4s, v16.4s -add v5.4s, v5.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v11.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -mla v3.4S, v2.4S, v31.s[0] -mla v18.4S, v0.4S, v31.s[0] -sub v0.4s, v4.4s, v12.4s -mla v17.4S, v22.4S, v31.s[0] -mla v8.4S, v16.4S, v31.s[0] -add v4.4s, v4.4s, v12.4s -sqrdmulh v12.4S, v21.4S, v25.s[2] -mul v21.4S, v21.4S,v26.s[2] -sub v16.4s, v14.4s, v6.4s -sqrdmulh v22.4S, v11.4S, v25.s[3] -mul v11.4S, v11.4S,v26.s[3] -add v14.4s, v14.4s, v6.4s -sqrdmulh v6.4S, v16.4S, v25.s[1] -mul v16.4S, v16.4S,v26.s[1] -sub v2.4s, v13.4s, v3.4s -add v13.4s, v13.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v25.s[0] -mul v14.4S, v14.4S,v26.s[0] -sub v15.4s, v7.4s, v18.4s -add v7.4s, v7.4s, v18.4s -mla v21.4S, v12.4S, v31.s[0] -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v1.4s, v17.4s -mla v16.4S, v6.4S, v31.s[0] -mla v14.4S, v3.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -sqrdmulh v17.4S, v7.4S, v23.s[0] -mul v7.4S, v7.4S,v24.s[0] -sub v3.4s, v19.4s, v8.4s -sqrdmulh v6.4S, v15.4S, v23.s[1] -mul v15.4S, v15.4S,v24.s[1] -add v19.4s, v19.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v23.s[2] -mul v19.4S, v19.4S,v24.s[2] -sub v12.4s, v5.4s, v21.4s -add v5.4s, v5.4s, v21.4s -sqrdmulh v21.4S, v3.4S, v23.s[3] -mul v3.4S, v3.4S,v24.s[3] -sub v18.4s, v20.4s, v11.4s -add v20.4s, v20.4s, v11.4s -mla v7.4S, v17.4S, v31.s[0] -mla v15.4S, v6.4S, v31.s[0] -sub v6.4s, v0.4s, v16.4s -str q5, [x0, #256] -mla v19.4S, v8.4S, v31.s[0] -mla v3.4S, v21.4S, v31.s[0] -add v0.4s, v0.4s, v16.4s -str q12, [x0, #320] -ldr q12, [x0, #912] -sqrdmulh v16.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -sub v21.4s, v4.4s, v14.4s -str q20, [x0, #384] -ldr q20, [x0, #976] -sqrdmulh v8.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v4.4s, v4.4s, v14.4s -str q18, [x0, #448] -ldr q18, [x0, #784] -sqrdmulh v14.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -sub v5.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -ldr q7, [x0, #848] -sqrdmulh v17.4S, v7.4S, v29.s[0] -mul v7.4S, v7.4S,v30.s[0] -sub v11.4s, v2.4s, v15.4s -add v2.4s, v2.4s, v15.4s -mla v12.4S, v16.4S, v31.s[0] -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v1.4s, v19.4s -str q0, [x0, #128] -mla v18.4S, v14.4S, v31.s[0] -mla v7.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v19.4s -str q6, [x0, #192] -ldr q6, [x0, #528] -sqrdmulh v19.4S, v6.4S, v29.s[0] -mul v6.4S, v6.4S,v30.s[0] -sub v17.4s, v22.4s, v3.4s -str q4, [x0, #0] -ldr q4, [x0, #592] -sqrdmulh v14.4S, v4.4S, v29.s[0] -mul v4.4S, v4.4S,v30.s[0] -add v22.4s, v22.4s, v3.4s -str q21, [x0, #64] -ldr q21, [x0, #656] -ldr q3, [x0, #400] -sqrdmulh v0.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -sub v16.4s, v3.4s, v12.4s -add v3.4s, v3.4s, v12.4s -ldr q12, [x0, #720] -ldr q15, [x0, #464] -sqrdmulh v9.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -sub v10.4s, v15.4s, v20.4s -add v15.4s, v15.4s, v20.4s -ldr q20, [x0, #272] -mla v6.4S, v19.4S, v31.s[0] -mla v4.4S, v14.4S, v31.s[0] -sub v14.4s, v20.4s, v18.4s -str q13, [x0, #512] -mla v21.4S, v0.4S, v31.s[0] -mla v12.4S, v9.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -str q5, [x0, #576] -ldr q5, [x0, #336] -sqrdmulh v18.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v9.4s, v5.4s, v7.4s -str q2, [x0, #640] -sqrdmulh v2.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -add v5.4s, v5.4s, v7.4s -str q11, [x0, #704] -ldr q11, [x0, #16] -sqrdmulh v7.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v0.4s, v11.4s, v6.4s -add v11.4s, v11.4s, v6.4s -ldr q6, [x0, #80] -sqrdmulh v13.4S, v5.4S, v29.s[1] -mul v5.4S, v5.4S,v30.s[1] -sub v19.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -ldr q4, [x0, #144] -mla v3.4S, v18.4S, v31.s[0] -mla v15.4S, v2.4S, v31.s[0] -sub v2.4s, v4.4s, v21.4s -str q1, [x0, #768] -mla v20.4S, v7.4S, v31.s[0] -mla v5.4S, v13.4S, v31.s[0] -add v4.4s, v4.4s, v21.4s -str q8, [x0, #832] -ldr q8, [x0, #208] -sqrdmulh v21.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -sub v13.4s, v8.4s, v12.4s -str q22, [x0, #896] -sqrdmulh v22.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -add v8.4s, v8.4s, v12.4s -str q17, [x0, #960] -sqrdmulh v17.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v12.4s, v4.4s, v3.4s -add v4.4s, v4.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v7.4s, v8.4s, v15.4s -add v8.4s, v8.4s, v15.4s -mla v16.4S, v21.4S, v31.s[0] -mla v10.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v20.4s -mla v14.4S, v17.4S, v31.s[0] -mla v9.4S, v3.4S, v31.s[0] -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v12.4S, v27.s[1] -mul v12.4S, v12.4S,v28.s[1] -sub v3.4s, v6.4s, v5.4s -sqrdmulh v17.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v4.4S, v27.s[0] -mul v4.4S, v4.4S,v28.s[0] -sub v21.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v27.s[0] -mul v8.4S, v8.4S,v28.s[0] -sub v15.4s, v13.4s, v10.4s -add v13.4s, v13.4s, v10.4s -mla v12.4S, v20.4S, v31.s[0] -mla v7.4S, v17.4S, v31.s[0] -sub v17.4s, v0.4s, v14.4s -mla v4.4S, v5.4S, v31.s[0] -mla v8.4S, v16.4S, v31.s[0] -add v0.4s, v0.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v16.4s, v19.4s, v9.4s -sqrdmulh v5.4S, v13.4S, v27.s[2] -mul v13.4S, v13.4S,v28.s[2] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v10.4s, v3.4s, v7.4s -add v3.4s, v3.4s, v7.4s -mla v2.4S, v14.4S, v31.s[0] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v11.4s, v4.4s -mla v21.4S, v9.4S, v31.s[0] -mla v15.4S, v12.4S, v31.s[0] -add v11.4s, v11.4s, v4.4s -sqrdmulh v4.4S, v3.4S, v25.s[2] -mul v3.4S, v3.4S,v26.s[2] -sub v12.4s, v6.4s, v8.4s -sqrdmulh v9.4S, v10.4S, v25.s[3] -mul v10.4S, v10.4S,v26.s[3] -add v6.4s, v6.4s, v8.4s -sqrdmulh v8.4S, v12.4S, v25.s[1] -mul v12.4S, v12.4S,v26.s[1] -sub v14.4s, v0.4s, v2.4s -add v0.4s, v0.4s, v2.4s -sqrdmulh v2.4S, v6.4S, v25.s[0] -mul v6.4S, v6.4S,v26.s[0] -sub v7.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -mla v3.4S, v4.4S, v31.s[0] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v17.4s, v21.4s -mla v12.4S, v8.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v19.4S, v23.s[0] -mul v19.4S, v19.4S,v24.s[0] -sub v2.4s, v16.4s, v15.4s -sqrdmulh v8.4S, v7.4S, v23.s[1] -mul v7.4S, v7.4S,v24.s[1] -add v16.4s, v16.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v23.s[2] -mul v16.4S, v16.4S,v24.s[2] -sub v4.4s, v22.4s, v3.4s -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v2.4S, v23.s[3] -mul v2.4S, v2.4S,v24.s[3] -sub v13.4s, v20.4s, v10.4s -add v20.4s, v20.4s, v10.4s -mla v19.4S, v21.4S, v31.s[0] -mla v7.4S, v8.4S, v31.s[0] -sub v8.4s, v5.4s, v12.4s -str q22, [x0, #272] -mla v16.4S, v15.4S, v31.s[0] -mla v2.4S, v3.4S, v31.s[0] -add v5.4s, v5.4s, v12.4s -str q4, [x0, #336] -sub v23.4s, v11.4s, v6.4s -str q20, [x0, #400] -add v11.4s, v11.4s, v6.4s -str q13, [x0, #464] -sub v13.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sub v19.4s, v14.4s, v7.4s -add v14.4s, v14.4s, v7.4s -sub v7.4s, v17.4s, v16.4s -str q5, [x0, #144] -add v17.4s, v17.4s, v16.4s -str q8, [x0, #208] -sub v8.4s, v9.4s, v2.4s -str q11, [x0, #16] -add v9.4s, v9.4s, v2.4s -str q23, [x0, #80] -str q0, [x0, #528] -str q13, [x0, #592] -str q14, [x0, #656] -str q19, [x0, #720] -str q17, [x0, #784] -str q7, [x0, #848] -str q9, [x0, #912] -str q8, [x0, #976] -ldr q18, [x17, #+128] -ldr q1, [x17, #+144] -ldr q10, [x17, #+160] -ldr q21, [x17, #+176] -ldr q22, [x17, #+192] -ldr q15, [x17, #+208] -ldr q3, [x17, #+224] -ldr q12, [x17, #+240] -ldr q4, [x0, #32] -ldr q30, [x0, #48] -ldr q29, [x0, #0] -ldr q28, [x0, #16] -sqrdmulh v27.4S, v4.4S, v1.s[0] -mul v4.4S, v4.4S,v18.s[0] -mla v4.4S, v27.4S, v31.s[0] -sub v27.4s, v29.4s, v4.4s -add v29.4s, v29.4s, v4.4s -sqrdmulh v4.4S, v30.4S, v1.s[0] -mul v30.4S, v30.4S,v18.s[0] -mla v30.4S, v4.4S, v31.s[0] -sub v4.4s, v28.4s, v30.4s -add v28.4s, v28.4s, v30.4s -sqrdmulh v30.4S, v28.4S, v1.s[1] -mul v28.4S, v28.4S,v18.s[1] -mla v28.4S, v30.4S, v31.s[0] -sub v30.4s, v29.4s, v28.4s -add v29.4s, v29.4s, v28.4s -sqrdmulh v28.4S, v4.4S, v1.s[2] -mul v4.4S, v4.4S,v18.s[2] -mla v4.4S, v28.4S, v31.s[0] -sub v28.4s, v27.4s, v4.4s -add v27.4s, v27.4s, v4.4s -trn1 v4.4S, v29.4S, v30.4S -trn2 v26.4S, v29.4S, v30.4S -trn1 v25.4S, v27.4S, v28.4S -trn2 v24.4S, v27.4S, v28.4S -trn2 v27.2D, v4.2D, v25.2D -trn2 v28.2D, v26.2D, v24.2D -trn1 v29.2D, v4.2D, v25.2D -trn1 v30.2D, v26.2D, v24.2D -sqrdmulh v24.4S, v27.4S, v21.4S -mul v27.4S, v27.4S,v10.4S -mla v27.4S, v24.4S, v31.s[0] -sub v24.4s, v29.4s, v27.4s -add v29.4s, v29.4s, v27.4s -sqrdmulh v27.4S, v28.4S, v21.4S -mul v28.4S, v28.4S,v10.4S -mla v28.4S, v27.4S, v31.s[0] -sub v27.4s, v30.4s, v28.4s -add v30.4s, v30.4s, v28.4s -sqrdmulh v28.4S, v30.4S, v15.4S -mul v30.4S, v30.4S,v22.4S -mla v30.4S, v28.4S, v31.s[0] -sub v28.4s, v29.4s, v30.4s -add v29.4s, v29.4s, v30.4s -sqrdmulh v30.4S, v27.4S, v12.4S -mul v27.4S, v27.4S,v3.4S -mla v27.4S, v30.4S, v31.s[0] -sub v30.4s, v24.4s, v27.4s -add v24.4s, v24.4s, v27.4s -str q29, [x0, #0] -str q28, [x0, #16] -str q24, [x0, #32] -str q30, [x0, #48] -ldr q30, [x17, #+256] -ldr q24, [x17, #+272] -ldr q28, [x17, #+288] -ldr q29, [x17, #+304] -ldr q27, [x17, #+320] -ldr q26, [x17, #+336] -ldr q25, [x17, #+352] -ldr q4, [x17, #+368] -ldr q12, [x0, #96] -ldr q3, [x0, #112] -ldr q15, [x0, #64] -ldr q22, [x0, #80] -sqrdmulh v21.4S, v12.4S, v24.s[0] -mul v12.4S, v12.4S,v30.s[0] -mla v12.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v12.4s -add v15.4s, v15.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v24.s[0] -mul v3.4S, v3.4S,v30.s[0] -mla v3.4S, v12.4S, v31.s[0] -sub v12.4s, v22.4s, v3.4s -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v22.4S, v24.s[1] -mul v22.4S, v22.4S,v30.s[1] -mla v22.4S, v3.4S, v31.s[0] -sub v3.4s, v15.4s, v22.4s -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v12.4S, v24.s[2] -mul v12.4S, v12.4S,v30.s[2] -mla v12.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -trn1 v12.4S, v15.4S, v3.4S -trn2 v10.4S, v15.4S, v3.4S -trn1 v1.4S, v21.4S, v22.4S -trn2 v18.4S, v21.4S, v22.4S -trn2 v21.2D, v12.2D, v1.2D -trn2 v22.2D, v10.2D, v18.2D -trn1 v15.2D, v12.2D, v1.2D -trn1 v3.2D, v10.2D, v18.2D -sqrdmulh v18.4S, v21.4S, v29.4S -mul v21.4S, v21.4S,v28.4S -mla v21.4S, v18.4S, v31.s[0] -sub v18.4s, v15.4s, v21.4s -add v15.4s, v15.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v29.4S -mul v22.4S, v22.4S,v28.4S -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v3.4s, v22.4s -add v3.4s, v3.4s, v22.4s -sqrdmulh v22.4S, v3.4S, v26.4S -mul v3.4S, v3.4S,v27.4S -mla v3.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v3.4s -add v15.4s, v15.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v4.4S -mul v21.4S, v21.4S,v25.4S -mla v21.4S, v3.4S, v31.s[0] -sub v3.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -str q15, [x0, #64] -str q22, [x0, #80] -str q18, [x0, #96] -str q3, [x0, #112] -ldr q3, [x17, #+384] -ldr q18, [x17, #+400] -ldr q22, [x17, #+416] -ldr q15, [x17, #+432] -ldr q21, [x17, #+448] -ldr q10, [x17, #+464] -ldr q1, [x17, #+480] -ldr q12, [x17, #+496] -ldr q4, [x0, #160] -ldr q25, [x0, #176] -ldr q26, [x0, #128] -ldr q27, [x0, #144] -sqrdmulh v29.4S, v4.4S, v18.s[0] -mul v4.4S, v4.4S,v3.s[0] -mla v4.4S, v29.4S, v31.s[0] -sub v29.4s, v26.4s, v4.4s -add v26.4s, v26.4s, v4.4s -sqrdmulh v4.4S, v25.4S, v18.s[0] -mul v25.4S, v25.4S,v3.s[0] -mla v25.4S, v4.4S, v31.s[0] -sub v4.4s, v27.4s, v25.4s -add v27.4s, v27.4s, v25.4s -sqrdmulh v25.4S, v27.4S, v18.s[1] -mul v27.4S, v27.4S,v3.s[1] -mla v27.4S, v25.4S, v31.s[0] -sub v25.4s, v26.4s, v27.4s -add v26.4s, v26.4s, v27.4s -sqrdmulh v27.4S, v4.4S, v18.s[2] -mul v4.4S, v4.4S,v3.s[2] -mla v4.4S, v27.4S, v31.s[0] -sub v27.4s, v29.4s, v4.4s -add v29.4s, v29.4s, v4.4s -trn1 v4.4S, v26.4S, v25.4S -trn2 v28.4S, v26.4S, v25.4S -trn1 v24.4S, v29.4S, v27.4S -trn2 v30.4S, v29.4S, v27.4S -trn2 v29.2D, v4.2D, v24.2D -trn2 v27.2D, v28.2D, v30.2D -trn1 v26.2D, v4.2D, v24.2D -trn1 v25.2D, v28.2D, v30.2D -sqrdmulh v30.4S, v29.4S, v15.4S -mul v29.4S, v29.4S,v22.4S -mla v29.4S, v30.4S, v31.s[0] -sub v30.4s, v26.4s, v29.4s -add v26.4s, v26.4s, v29.4s -sqrdmulh v29.4S, v27.4S, v15.4S -mul v27.4S, v27.4S,v22.4S -mla v27.4S, v29.4S, v31.s[0] -sub v29.4s, v25.4s, v27.4s -add v25.4s, v25.4s, v27.4s -sqrdmulh v27.4S, v25.4S, v10.4S -mul v25.4S, v25.4S,v21.4S -mla v25.4S, v27.4S, v31.s[0] -sub v27.4s, v26.4s, v25.4s -add v26.4s, v26.4s, v25.4s -sqrdmulh v25.4S, v29.4S, v12.4S -mul v29.4S, v29.4S,v1.4S -mla v29.4S, v25.4S, v31.s[0] -sub v25.4s, v30.4s, v29.4s -add v30.4s, v30.4s, v29.4s -str q26, [x0, #128] -str q27, [x0, #144] -str q30, [x0, #160] -str q25, [x0, #176] -ldr q25, [x17, #+512] -ldr q30, [x17, #+528] -ldr q27, [x17, #+544] -ldr q26, [x17, #+560] -ldr q29, [x17, #+576] -ldr q28, [x17, #+592] -ldr q24, [x17, #+608] -ldr q4, [x17, #+624] -ldr q12, [x0, #224] -ldr q1, [x0, #240] -ldr q10, [x0, #192] -ldr q21, [x0, #208] -sqrdmulh v15.4S, v12.4S, v30.s[0] -mul v12.4S, v12.4S,v25.s[0] -mla v12.4S, v15.4S, v31.s[0] -sub v15.4s, v10.4s, v12.4s -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v1.4S, v30.s[0] -mul v1.4S, v1.4S,v25.s[0] -mla v1.4S, v12.4S, v31.s[0] -sub v12.4s, v21.4s, v1.4s -add v21.4s, v21.4s, v1.4s -sqrdmulh v1.4S, v21.4S, v30.s[1] -mul v21.4S, v21.4S,v25.s[1] -mla v21.4S, v1.4S, v31.s[0] -sub v1.4s, v10.4s, v21.4s -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v12.4S, v30.s[2] -mul v12.4S, v12.4S,v25.s[2] -mla v12.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v12.4s -add v15.4s, v15.4s, v12.4s -trn1 v12.4S, v10.4S, v1.4S -trn2 v22.4S, v10.4S, v1.4S -trn1 v18.4S, v15.4S, v21.4S -trn2 v3.4S, v15.4S, v21.4S -trn2 v15.2D, v12.2D, v18.2D -trn2 v21.2D, v22.2D, v3.2D -trn1 v10.2D, v12.2D, v18.2D -trn1 v1.2D, v22.2D, v3.2D -sqrdmulh v3.4S, v15.4S, v26.4S -mul v15.4S, v15.4S,v27.4S -mla v15.4S, v3.4S, v31.s[0] -sub v3.4s, v10.4s, v15.4s -add v10.4s, v10.4s, v15.4s -sqrdmulh v15.4S, v21.4S, v26.4S -mul v21.4S, v21.4S,v27.4S -mla v21.4S, v15.4S, v31.s[0] -sub v15.4s, v1.4s, v21.4s -add v1.4s, v1.4s, v21.4s -sqrdmulh v21.4S, v1.4S, v28.4S -mul v1.4S, v1.4S,v29.4S -mla v1.4S, v21.4S, v31.s[0] -sub v21.4s, v10.4s, v1.4s -add v10.4s, v10.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v4.4S -mul v15.4S, v15.4S,v24.4S -mla v15.4S, v1.4S, v31.s[0] -sub v1.4s, v3.4s, v15.4s -add v3.4s, v3.4s, v15.4s -str q10, [x0, #192] -str q21, [x0, #208] -str q3, [x0, #224] -str q1, [x0, #240] -ldr q1, [x17, #+640] -ldr q3, [x17, #+656] -ldr q21, [x17, #+672] -ldr q10, [x17, #+688] -ldr q15, [x17, #+704] -ldr q22, [x17, #+720] -ldr q18, [x17, #+736] -ldr q12, [x17, #+752] -ldr q4, [x0, #288] -ldr q24, [x0, #304] -ldr q28, [x0, #256] -ldr q29, [x0, #272] -sqrdmulh v26.4S, v4.4S, v3.s[0] -mul v4.4S, v4.4S,v1.s[0] -mla v4.4S, v26.4S, v31.s[0] -sub v26.4s, v28.4s, v4.4s -add v28.4s, v28.4s, v4.4s -sqrdmulh v4.4S, v24.4S, v3.s[0] -mul v24.4S, v24.4S,v1.s[0] -mla v24.4S, v4.4S, v31.s[0] -sub v4.4s, v29.4s, v24.4s -add v29.4s, v29.4s, v24.4s -sqrdmulh v24.4S, v29.4S, v3.s[1] -mul v29.4S, v29.4S,v1.s[1] -mla v29.4S, v24.4S, v31.s[0] -sub v24.4s, v28.4s, v29.4s -add v28.4s, v28.4s, v29.4s -sqrdmulh v29.4S, v4.4S, v3.s[2] -mul v4.4S, v4.4S,v1.s[2] -mla v4.4S, v29.4S, v31.s[0] -sub v29.4s, v26.4s, v4.4s -add v26.4s, v26.4s, v4.4s -trn1 v4.4S, v28.4S, v24.4S -trn2 v27.4S, v28.4S, v24.4S -trn1 v30.4S, v26.4S, v29.4S -trn2 v25.4S, v26.4S, v29.4S -trn2 v26.2D, v4.2D, v30.2D -trn2 v29.2D, v27.2D, v25.2D -trn1 v28.2D, v4.2D, v30.2D -trn1 v24.2D, v27.2D, v25.2D -sqrdmulh v25.4S, v26.4S, v10.4S -mul v26.4S, v26.4S,v21.4S -mla v26.4S, v25.4S, v31.s[0] -sub v25.4s, v28.4s, v26.4s -add v28.4s, v28.4s, v26.4s -sqrdmulh v26.4S, v29.4S, v10.4S -mul v29.4S, v29.4S,v21.4S -mla v29.4S, v26.4S, v31.s[0] -sub v26.4s, v24.4s, v29.4s -add v24.4s, v24.4s, v29.4s -sqrdmulh v29.4S, v24.4S, v22.4S -mul v24.4S, v24.4S,v15.4S -mla v24.4S, v29.4S, v31.s[0] -sub v29.4s, v28.4s, v24.4s -add v28.4s, v28.4s, v24.4s -sqrdmulh v24.4S, v26.4S, v12.4S -mul v26.4S, v26.4S,v18.4S -mla v26.4S, v24.4S, v31.s[0] -sub v24.4s, v25.4s, v26.4s -add v25.4s, v25.4s, v26.4s -str q28, [x0, #256] -str q29, [x0, #272] -str q25, [x0, #288] -str q24, [x0, #304] -ldr q24, [x17, #+768] -ldr q25, [x17, #+784] -ldr q29, [x17, #+800] -ldr q28, [x17, #+816] -ldr q26, [x17, #+832] -ldr q27, [x17, #+848] -ldr q30, [x17, #+864] -ldr q4, [x17, #+880] -ldr q12, [x0, #352] -ldr q18, [x0, #368] -ldr q22, [x0, #320] -ldr q15, [x0, #336] -sqrdmulh v10.4S, v12.4S, v25.s[0] -mul v12.4S, v12.4S,v24.s[0] -mla v12.4S, v10.4S, v31.s[0] -sub v10.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v18.4S, v25.s[0] -mul v18.4S, v18.4S,v24.s[0] -mla v18.4S, v12.4S, v31.s[0] -sub v12.4s, v15.4s, v18.4s -add v15.4s, v15.4s, v18.4s -sqrdmulh v18.4S, v15.4S, v25.s[1] -mul v15.4S, v15.4S,v24.s[1] -mla v15.4S, v18.4S, v31.s[0] -sub v18.4s, v22.4s, v15.4s -add v22.4s, v22.4s, v15.4s -sqrdmulh v15.4S, v12.4S, v25.s[2] -mul v12.4S, v12.4S,v24.s[2] -mla v12.4S, v15.4S, v31.s[0] -sub v15.4s, v10.4s, v12.4s -add v10.4s, v10.4s, v12.4s -trn1 v12.4S, v22.4S, v18.4S -trn2 v21.4S, v22.4S, v18.4S -trn1 v3.4S, v10.4S, v15.4S -trn2 v1.4S, v10.4S, v15.4S -trn2 v10.2D, v12.2D, v3.2D -trn2 v15.2D, v21.2D, v1.2D -trn1 v22.2D, v12.2D, v3.2D -trn1 v18.2D, v21.2D, v1.2D -sqrdmulh v1.4S, v10.4S, v28.4S -mul v10.4S, v10.4S,v29.4S -mla v10.4S, v1.4S, v31.s[0] -sub v1.4s, v22.4s, v10.4s -add v22.4s, v22.4s, v10.4s -sqrdmulh v10.4S, v15.4S, v28.4S -mul v15.4S, v15.4S,v29.4S -mla v15.4S, v10.4S, v31.s[0] -sub v10.4s, v18.4s, v15.4s -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v18.4S, v27.4S -mul v18.4S, v18.4S,v26.4S -mla v18.4S, v15.4S, v31.s[0] -sub v15.4s, v22.4s, v18.4s -add v22.4s, v22.4s, v18.4s -sqrdmulh v18.4S, v10.4S, v4.4S -mul v10.4S, v10.4S,v30.4S -mla v10.4S, v18.4S, v31.s[0] -sub v18.4s, v1.4s, v10.4s -add v1.4s, v1.4s, v10.4s -str q22, [x0, #320] -str q15, [x0, #336] -str q1, [x0, #352] -str q18, [x0, #368] -ldr q18, [x17, #+896] -ldr q1, [x17, #+912] -ldr q15, [x17, #+928] -ldr q22, [x17, #+944] -ldr q10, [x17, #+960] -ldr q21, [x17, #+976] -ldr q3, [x17, #+992] -ldr q12, [x17, #+1008] -ldr q4, [x0, #416] -ldr q30, [x0, #432] -ldr q27, [x0, #384] -ldr q26, [x0, #400] -sqrdmulh v28.4S, v4.4S, v1.s[0] -mul v4.4S, v4.4S,v18.s[0] -mla v4.4S, v28.4S, v31.s[0] -sub v28.4s, v27.4s, v4.4s -add v27.4s, v27.4s, v4.4s -sqrdmulh v4.4S, v30.4S, v1.s[0] -mul v30.4S, v30.4S,v18.s[0] -mla v30.4S, v4.4S, v31.s[0] -sub v4.4s, v26.4s, v30.4s -add v26.4s, v26.4s, v30.4s -sqrdmulh v30.4S, v26.4S, v1.s[1] -mul v26.4S, v26.4S,v18.s[1] -mla v26.4S, v30.4S, v31.s[0] -sub v30.4s, v27.4s, v26.4s -add v27.4s, v27.4s, v26.4s -sqrdmulh v26.4S, v4.4S, v1.s[2] -mul v4.4S, v4.4S,v18.s[2] -mla v4.4S, v26.4S, v31.s[0] -sub v26.4s, v28.4s, v4.4s -add v28.4s, v28.4s, v4.4s -trn1 v4.4S, v27.4S, v30.4S -trn2 v29.4S, v27.4S, v30.4S -trn1 v25.4S, v28.4S, v26.4S -trn2 v24.4S, v28.4S, v26.4S -trn2 v28.2D, v4.2D, v25.2D -trn2 v26.2D, v29.2D, v24.2D -trn1 v27.2D, v4.2D, v25.2D -trn1 v30.2D, v29.2D, v24.2D -sqrdmulh v24.4S, v28.4S, v22.4S -mul v28.4S, v28.4S,v15.4S -mla v28.4S, v24.4S, v31.s[0] -sub v24.4s, v27.4s, v28.4s -add v27.4s, v27.4s, v28.4s -sqrdmulh v28.4S, v26.4S, v22.4S -mul v26.4S, v26.4S,v15.4S -mla v26.4S, v28.4S, v31.s[0] -sub v28.4s, v30.4s, v26.4s -add v30.4s, v30.4s, v26.4s -sqrdmulh v26.4S, v30.4S, v21.4S -mul v30.4S, v30.4S,v10.4S -mla v30.4S, v26.4S, v31.s[0] -sub v26.4s, v27.4s, v30.4s -add v27.4s, v27.4s, v30.4s -sqrdmulh v30.4S, v28.4S, v12.4S -mul v28.4S, v28.4S,v3.4S -mla v28.4S, v30.4S, v31.s[0] -sub v30.4s, v24.4s, v28.4s -add v24.4s, v24.4s, v28.4s -str q27, [x0, #384] -str q26, [x0, #400] -str q24, [x0, #416] -str q30, [x0, #432] -ldr q30, [x17, #+1024] -ldr q24, [x17, #+1040] -ldr q26, [x17, #+1056] -ldr q27, [x17, #+1072] -ldr q28, [x17, #+1088] -ldr q29, [x17, #+1104] -ldr q25, [x17, #+1120] -ldr q4, [x17, #+1136] -ldr q12, [x0, #480] -ldr q3, [x0, #496] -ldr q21, [x0, #448] -ldr q10, [x0, #464] -sqrdmulh v22.4S, v12.4S, v24.s[0] -mul v12.4S, v12.4S,v30.s[0] -mla v12.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v24.s[0] -mul v3.4S, v3.4S,v30.s[0] -mla v3.4S, v12.4S, v31.s[0] -sub v12.4s, v10.4s, v3.4s -add v10.4s, v10.4s, v3.4s -sqrdmulh v3.4S, v10.4S, v24.s[1] -mul v10.4S, v10.4S,v30.s[1] -mla v10.4S, v3.4S, v31.s[0] -sub v3.4s, v21.4s, v10.4s -add v21.4s, v21.4s, v10.4s -sqrdmulh v10.4S, v12.4S, v24.s[2] -mul v12.4S, v12.4S,v30.s[2] -mla v12.4S, v10.4S, v31.s[0] -sub v10.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -trn1 v12.4S, v21.4S, v3.4S -trn2 v15.4S, v21.4S, v3.4S -trn1 v1.4S, v22.4S, v10.4S -trn2 v18.4S, v22.4S, v10.4S -trn2 v22.2D, v12.2D, v1.2D -trn2 v10.2D, v15.2D, v18.2D -trn1 v21.2D, v12.2D, v1.2D -trn1 v3.2D, v15.2D, v18.2D -sqrdmulh v18.4S, v22.4S, v27.4S -mul v22.4S, v22.4S,v26.4S -mla v22.4S, v18.4S, v31.s[0] -sub v18.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v10.4S, v27.4S -mul v10.4S, v10.4S,v26.4S -mla v10.4S, v22.4S, v31.s[0] -sub v22.4s, v3.4s, v10.4s -add v3.4s, v3.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v29.4S -mul v3.4S, v3.4S,v28.4S -mla v3.4S, v10.4S, v31.s[0] -sub v10.4s, v21.4s, v3.4s -add v21.4s, v21.4s, v3.4s -sqrdmulh v3.4S, v22.4S, v4.4S -mul v22.4S, v22.4S,v25.4S -mla v22.4S, v3.4S, v31.s[0] -sub v3.4s, v18.4s, v22.4s -add v18.4s, v18.4s, v22.4s -str q21, [x0, #448] -str q10, [x0, #464] -str q18, [x0, #480] -str q3, [x0, #496] -ldr q3, [x17, #+1152] -ldr q18, [x17, #+1168] -ldr q10, [x17, #+1184] -ldr q21, [x17, #+1200] -ldr q22, [x17, #+1216] -ldr q15, [x17, #+1232] -ldr q1, [x17, #+1248] -ldr q12, [x17, #+1264] -ldr q4, [x0, #544] -ldr q25, [x0, #560] -ldr q29, [x0, #512] -ldr q28, [x0, #528] -sqrdmulh v27.4S, v4.4S, v18.s[0] -mul v4.4S, v4.4S,v3.s[0] -mla v4.4S, v27.4S, v31.s[0] -sub v27.4s, v29.4s, v4.4s -add v29.4s, v29.4s, v4.4s -sqrdmulh v4.4S, v25.4S, v18.s[0] -mul v25.4S, v25.4S,v3.s[0] -mla v25.4S, v4.4S, v31.s[0] -sub v4.4s, v28.4s, v25.4s -add v28.4s, v28.4s, v25.4s -sqrdmulh v25.4S, v28.4S, v18.s[1] -mul v28.4S, v28.4S,v3.s[1] -mla v28.4S, v25.4S, v31.s[0] -sub v25.4s, v29.4s, v28.4s -add v29.4s, v29.4s, v28.4s -sqrdmulh v28.4S, v4.4S, v18.s[2] -mul v4.4S, v4.4S,v3.s[2] -mla v4.4S, v28.4S, v31.s[0] -sub v28.4s, v27.4s, v4.4s -add v27.4s, v27.4s, v4.4s -trn1 v4.4S, v29.4S, v25.4S -trn2 v26.4S, v29.4S, v25.4S -trn1 v24.4S, v27.4S, v28.4S -trn2 v30.4S, v27.4S, v28.4S -trn2 v27.2D, v4.2D, v24.2D -trn2 v28.2D, v26.2D, v30.2D -trn1 v29.2D, v4.2D, v24.2D -trn1 v25.2D, v26.2D, v30.2D -sqrdmulh v30.4S, v27.4S, v21.4S -mul v27.4S, v27.4S,v10.4S -mla v27.4S, v30.4S, v31.s[0] -sub v30.4s, v29.4s, v27.4s -add v29.4s, v29.4s, v27.4s -sqrdmulh v27.4S, v28.4S, v21.4S -mul v28.4S, v28.4S,v10.4S -mla v28.4S, v27.4S, v31.s[0] -sub v27.4s, v25.4s, v28.4s -add v25.4s, v25.4s, v28.4s -sqrdmulh v28.4S, v25.4S, v15.4S -mul v25.4S, v25.4S,v22.4S -mla v25.4S, v28.4S, v31.s[0] -sub v28.4s, v29.4s, v25.4s -add v29.4s, v29.4s, v25.4s -sqrdmulh v25.4S, v27.4S, v12.4S -mul v27.4S, v27.4S,v1.4S -mla v27.4S, v25.4S, v31.s[0] -sub v25.4s, v30.4s, v27.4s -add v30.4s, v30.4s, v27.4s -str q29, [x0, #512] -str q28, [x0, #528] -str q30, [x0, #544] -str q25, [x0, #560] -ldr q25, [x17, #+1280] -ldr q30, [x17, #+1296] -ldr q28, [x17, #+1312] -ldr q29, [x17, #+1328] -ldr q27, [x17, #+1344] -ldr q26, [x17, #+1360] -ldr q24, [x17, #+1376] -ldr q4, [x17, #+1392] -ldr q12, [x0, #608] -ldr q1, [x0, #624] -ldr q15, [x0, #576] -ldr q22, [x0, #592] -sqrdmulh v21.4S, v12.4S, v30.s[0] -mul v12.4S, v12.4S,v25.s[0] -mla v12.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v12.4s -add v15.4s, v15.4s, v12.4s -sqrdmulh v12.4S, v1.4S, v30.s[0] -mul v1.4S, v1.4S,v25.s[0] -mla v1.4S, v12.4S, v31.s[0] -sub v12.4s, v22.4s, v1.4s -add v22.4s, v22.4s, v1.4s -sqrdmulh v1.4S, v22.4S, v30.s[1] -mul v22.4S, v22.4S,v25.s[1] -mla v22.4S, v1.4S, v31.s[0] -sub v1.4s, v15.4s, v22.4s -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v12.4S, v30.s[2] -mul v12.4S, v12.4S,v25.s[2] -mla v12.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -trn1 v12.4S, v15.4S, v1.4S -trn2 v10.4S, v15.4S, v1.4S -trn1 v18.4S, v21.4S, v22.4S -trn2 v3.4S, v21.4S, v22.4S -trn2 v21.2D, v12.2D, v18.2D -trn2 v22.2D, v10.2D, v3.2D -trn1 v15.2D, v12.2D, v18.2D -trn1 v1.2D, v10.2D, v3.2D -sqrdmulh v3.4S, v21.4S, v29.4S -mul v21.4S, v21.4S,v28.4S -mla v21.4S, v3.4S, v31.s[0] -sub v3.4s, v15.4s, v21.4s -add v15.4s, v15.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v29.4S -mul v22.4S, v22.4S,v28.4S -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v1.4S, v26.4S -mul v1.4S, v1.4S,v27.4S -mla v1.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v1.4s -add v15.4s, v15.4s, v1.4s -sqrdmulh v1.4S, v21.4S, v4.4S -mul v21.4S, v21.4S,v24.4S -mla v21.4S, v1.4S, v31.s[0] -sub v1.4s, v3.4s, v21.4s -add v3.4s, v3.4s, v21.4s -str q15, [x0, #576] -str q22, [x0, #592] -str q3, [x0, #608] -str q1, [x0, #624] -ldr q1, [x17, #+1408] -ldr q3, [x17, #+1424] -ldr q22, [x17, #+1440] -ldr q15, [x17, #+1456] -ldr q21, [x17, #+1472] -ldr q10, [x17, #+1488] -ldr q18, [x17, #+1504] -ldr q12, [x17, #+1520] -ldr q4, [x0, #672] -ldr q24, [x0, #688] -ldr q26, [x0, #640] -ldr q27, [x0, #656] -sqrdmulh v29.4S, v4.4S, v3.s[0] -mul v4.4S, v4.4S,v1.s[0] -mla v4.4S, v29.4S, v31.s[0] -sub v29.4s, v26.4s, v4.4s -add v26.4s, v26.4s, v4.4s -sqrdmulh v4.4S, v24.4S, v3.s[0] -mul v24.4S, v24.4S,v1.s[0] -mla v24.4S, v4.4S, v31.s[0] -sub v4.4s, v27.4s, v24.4s -add v27.4s, v27.4s, v24.4s -sqrdmulh v24.4S, v27.4S, v3.s[1] -mul v27.4S, v27.4S,v1.s[1] -mla v27.4S, v24.4S, v31.s[0] -sub v24.4s, v26.4s, v27.4s -add v26.4s, v26.4s, v27.4s -sqrdmulh v27.4S, v4.4S, v3.s[2] -mul v4.4S, v4.4S,v1.s[2] -mla v4.4S, v27.4S, v31.s[0] -sub v27.4s, v29.4s, v4.4s -add v29.4s, v29.4s, v4.4s -trn1 v4.4S, v26.4S, v24.4S -trn2 v28.4S, v26.4S, v24.4S -trn1 v30.4S, v29.4S, v27.4S -trn2 v25.4S, v29.4S, v27.4S -trn2 v29.2D, v4.2D, v30.2D -trn2 v27.2D, v28.2D, v25.2D -trn1 v26.2D, v4.2D, v30.2D -trn1 v24.2D, v28.2D, v25.2D -sqrdmulh v25.4S, v29.4S, v15.4S -mul v29.4S, v29.4S,v22.4S -mla v29.4S, v25.4S, v31.s[0] -sub v25.4s, v26.4s, v29.4s -add v26.4s, v26.4s, v29.4s -sqrdmulh v29.4S, v27.4S, v15.4S -mul v27.4S, v27.4S,v22.4S -mla v27.4S, v29.4S, v31.s[0] -sub v29.4s, v24.4s, v27.4s -add v24.4s, v24.4s, v27.4s -sqrdmulh v27.4S, v24.4S, v10.4S -mul v24.4S, v24.4S,v21.4S -mla v24.4S, v27.4S, v31.s[0] -sub v27.4s, v26.4s, v24.4s -add v26.4s, v26.4s, v24.4s -sqrdmulh v24.4S, v29.4S, v12.4S -mul v29.4S, v29.4S,v18.4S -mla v29.4S, v24.4S, v31.s[0] -sub v24.4s, v25.4s, v29.4s -add v25.4s, v25.4s, v29.4s -str q26, [x0, #640] -str q27, [x0, #656] -str q25, [x0, #672] -str q24, [x0, #688] -ldr q24, [x17, #+1536] -ldr q25, [x17, #+1552] -ldr q27, [x17, #+1568] -ldr q26, [x17, #+1584] -ldr q29, [x17, #+1600] -ldr q28, [x17, #+1616] -ldr q30, [x17, #+1632] -ldr q4, [x17, #+1648] -ldr q12, [x0, #736] -ldr q18, [x0, #752] -ldr q10, [x0, #704] -ldr q21, [x0, #720] -sqrdmulh v15.4S, v12.4S, v25.s[0] -mul v12.4S, v12.4S,v24.s[0] -mla v12.4S, v15.4S, v31.s[0] -sub v15.4s, v10.4s, v12.4s -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v18.4S, v25.s[0] -mul v18.4S, v18.4S,v24.s[0] -mla v18.4S, v12.4S, v31.s[0] -sub v12.4s, v21.4s, v18.4s -add v21.4s, v21.4s, v18.4s -sqrdmulh v18.4S, v21.4S, v25.s[1] -mul v21.4S, v21.4S,v24.s[1] -mla v21.4S, v18.4S, v31.s[0] -sub v18.4s, v10.4s, v21.4s -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v12.4S, v25.s[2] -mul v12.4S, v12.4S,v24.s[2] -mla v12.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v12.4s -add v15.4s, v15.4s, v12.4s -trn1 v12.4S, v10.4S, v18.4S -trn2 v22.4S, v10.4S, v18.4S -trn1 v3.4S, v15.4S, v21.4S -trn2 v1.4S, v15.4S, v21.4S -trn2 v15.2D, v12.2D, v3.2D -trn2 v21.2D, v22.2D, v1.2D -trn1 v10.2D, v12.2D, v3.2D -trn1 v18.2D, v22.2D, v1.2D -sqrdmulh v1.4S, v15.4S, v26.4S -mul v15.4S, v15.4S,v27.4S -mla v15.4S, v1.4S, v31.s[0] -sub v1.4s, v10.4s, v15.4s -add v10.4s, v10.4s, v15.4s -sqrdmulh v15.4S, v21.4S, v26.4S -mul v21.4S, v21.4S,v27.4S -mla v21.4S, v15.4S, v31.s[0] -sub v15.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v18.4S, v28.4S -mul v18.4S, v18.4S,v29.4S -mla v18.4S, v21.4S, v31.s[0] -sub v21.4s, v10.4s, v18.4s -add v10.4s, v10.4s, v18.4s -sqrdmulh v18.4S, v15.4S, v4.4S -mul v15.4S, v15.4S,v30.4S -mla v15.4S, v18.4S, v31.s[0] -sub v18.4s, v1.4s, v15.4s -add v1.4s, v1.4s, v15.4s -str q10, [x0, #704] -str q21, [x0, #720] -str q1, [x0, #736] -str q18, [x0, #752] -ldr q18, [x17, #+1664] -ldr q1, [x17, #+1680] -ldr q21, [x17, #+1696] -ldr q10, [x17, #+1712] -ldr q15, [x17, #+1728] -ldr q22, [x17, #+1744] -ldr q3, [x17, #+1760] -ldr q12, [x17, #+1776] -ldr q4, [x0, #800] -ldr q30, [x0, #816] -ldr q28, [x0, #768] -ldr q29, [x0, #784] -sqrdmulh v26.4S, v4.4S, v1.s[0] -mul v4.4S, v4.4S,v18.s[0] -mla v4.4S, v26.4S, v31.s[0] -sub v26.4s, v28.4s, v4.4s -add v28.4s, v28.4s, v4.4s -sqrdmulh v4.4S, v30.4S, v1.s[0] -mul v30.4S, v30.4S,v18.s[0] -mla v30.4S, v4.4S, v31.s[0] -sub v4.4s, v29.4s, v30.4s -add v29.4s, v29.4s, v30.4s -sqrdmulh v30.4S, v29.4S, v1.s[1] -mul v29.4S, v29.4S,v18.s[1] -mla v29.4S, v30.4S, v31.s[0] -sub v30.4s, v28.4s, v29.4s -add v28.4s, v28.4s, v29.4s -sqrdmulh v29.4S, v4.4S, v1.s[2] -mul v4.4S, v4.4S,v18.s[2] -mla v4.4S, v29.4S, v31.s[0] -sub v29.4s, v26.4s, v4.4s -add v26.4s, v26.4s, v4.4s -trn1 v4.4S, v28.4S, v30.4S -trn2 v27.4S, v28.4S, v30.4S -trn1 v25.4S, v26.4S, v29.4S -trn2 v24.4S, v26.4S, v29.4S -trn2 v26.2D, v4.2D, v25.2D -trn2 v29.2D, v27.2D, v24.2D -trn1 v28.2D, v4.2D, v25.2D -trn1 v30.2D, v27.2D, v24.2D -sqrdmulh v24.4S, v26.4S, v10.4S -mul v26.4S, v26.4S,v21.4S -mla v26.4S, v24.4S, v31.s[0] -sub v24.4s, v28.4s, v26.4s -add v28.4s, v28.4s, v26.4s -sqrdmulh v26.4S, v29.4S, v10.4S -mul v29.4S, v29.4S,v21.4S -mla v29.4S, v26.4S, v31.s[0] -sub v26.4s, v30.4s, v29.4s -add v30.4s, v30.4s, v29.4s -sqrdmulh v29.4S, v30.4S, v22.4S -mul v30.4S, v30.4S,v15.4S -mla v30.4S, v29.4S, v31.s[0] -sub v29.4s, v28.4s, v30.4s -add v28.4s, v28.4s, v30.4s -sqrdmulh v30.4S, v26.4S, v12.4S -mul v26.4S, v26.4S,v3.4S -mla v26.4S, v30.4S, v31.s[0] -sub v30.4s, v24.4s, v26.4s -add v24.4s, v24.4s, v26.4s -str q28, [x0, #768] -str q29, [x0, #784] -str q24, [x0, #800] -str q30, [x0, #816] -ldr q30, [x17, #+1792] -ldr q24, [x17, #+1808] -ldr q29, [x17, #+1824] -ldr q28, [x17, #+1840] -ldr q26, [x17, #+1856] -ldr q27, [x17, #+1872] -ldr q25, [x17, #+1888] -ldr q4, [x17, #+1904] -ldr q12, [x0, #864] -ldr q3, [x0, #880] -ldr q22, [x0, #832] -ldr q15, [x0, #848] -sqrdmulh v10.4S, v12.4S, v24.s[0] -mul v12.4S, v12.4S,v30.s[0] -mla v12.4S, v10.4S, v31.s[0] -sub v10.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v24.s[0] -mul v3.4S, v3.4S,v30.s[0] -mla v3.4S, v12.4S, v31.s[0] -sub v12.4s, v15.4s, v3.4s -add v15.4s, v15.4s, v3.4s -sqrdmulh v3.4S, v15.4S, v24.s[1] -mul v15.4S, v15.4S,v30.s[1] -mla v15.4S, v3.4S, v31.s[0] -sub v3.4s, v22.4s, v15.4s -add v22.4s, v22.4s, v15.4s -sqrdmulh v15.4S, v12.4S, v24.s[2] -mul v12.4S, v12.4S,v30.s[2] -mla v12.4S, v15.4S, v31.s[0] -sub v15.4s, v10.4s, v12.4s -add v10.4s, v10.4s, v12.4s -trn1 v12.4S, v22.4S, v3.4S -trn2 v21.4S, v22.4S, v3.4S -trn1 v1.4S, v10.4S, v15.4S -trn2 v18.4S, v10.4S, v15.4S -trn2 v10.2D, v12.2D, v1.2D -trn2 v15.2D, v21.2D, v18.2D -trn1 v22.2D, v12.2D, v1.2D -trn1 v3.2D, v21.2D, v18.2D -sqrdmulh v18.4S, v10.4S, v28.4S -mul v10.4S, v10.4S,v29.4S -mla v10.4S, v18.4S, v31.s[0] -sub v18.4s, v22.4s, v10.4s -add v22.4s, v22.4s, v10.4s -sqrdmulh v10.4S, v15.4S, v28.4S -mul v15.4S, v15.4S,v29.4S -mla v15.4S, v10.4S, v31.s[0] -sub v10.4s, v3.4s, v15.4s -add v3.4s, v3.4s, v15.4s -sqrdmulh v15.4S, v3.4S, v27.4S -mul v3.4S, v3.4S,v26.4S -mla v3.4S, v15.4S, v31.s[0] -sub v15.4s, v22.4s, v3.4s -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v10.4S, v4.4S -mul v10.4S, v10.4S,v25.4S -mla v10.4S, v3.4S, v31.s[0] -sub v3.4s, v18.4s, v10.4s -add v18.4s, v18.4s, v10.4s -str q22, [x0, #832] -str q15, [x0, #848] -str q18, [x0, #864] -str q3, [x0, #880] -ldr q3, [x17, #+1920] -ldr q18, [x17, #+1936] -ldr q15, [x17, #+1952] -ldr q22, [x17, #+1968] -ldr q10, [x17, #+1984] -ldr q21, [x17, #+2000] -ldr q1, [x17, #+2016] -ldr q12, [x17, #+2032] -ldr q4, [x0, #928] -ldr q25, [x0, #944] -ldr q27, [x0, #896] -ldr q26, [x0, #912] -sqrdmulh v28.4S, v4.4S, v18.s[0] -mul v4.4S, v4.4S,v3.s[0] -mla v4.4S, v28.4S, v31.s[0] -sub v28.4s, v27.4s, v4.4s -add v27.4s, v27.4s, v4.4s -sqrdmulh v4.4S, v25.4S, v18.s[0] -mul v25.4S, v25.4S,v3.s[0] -mla v25.4S, v4.4S, v31.s[0] -sub v4.4s, v26.4s, v25.4s -add v26.4s, v26.4s, v25.4s -sqrdmulh v25.4S, v26.4S, v18.s[1] -mul v26.4S, v26.4S,v3.s[1] -mla v26.4S, v25.4S, v31.s[0] -sub v25.4s, v27.4s, v26.4s -add v27.4s, v27.4s, v26.4s -sqrdmulh v26.4S, v4.4S, v18.s[2] -mul v4.4S, v4.4S,v3.s[2] -mla v4.4S, v26.4S, v31.s[0] -sub v26.4s, v28.4s, v4.4s -add v28.4s, v28.4s, v4.4s -trn1 v4.4S, v27.4S, v25.4S -trn2 v29.4S, v27.4S, v25.4S -trn1 v24.4S, v28.4S, v26.4S -trn2 v30.4S, v28.4S, v26.4S -trn2 v28.2D, v4.2D, v24.2D -trn2 v26.2D, v29.2D, v30.2D -trn1 v27.2D, v4.2D, v24.2D -trn1 v25.2D, v29.2D, v30.2D -sqrdmulh v30.4S, v28.4S, v22.4S -mul v28.4S, v28.4S,v15.4S -mla v28.4S, v30.4S, v31.s[0] -sub v30.4s, v27.4s, v28.4s -add v27.4s, v27.4s, v28.4s -sqrdmulh v28.4S, v26.4S, v22.4S -mul v26.4S, v26.4S,v15.4S -mla v26.4S, v28.4S, v31.s[0] -sub v28.4s, v25.4s, v26.4s -add v25.4s, v25.4s, v26.4s -sqrdmulh v26.4S, v25.4S, v21.4S -mul v25.4S, v25.4S,v10.4S -mla v25.4S, v26.4S, v31.s[0] -sub v26.4s, v27.4s, v25.4s -add v27.4s, v27.4s, v25.4s -sqrdmulh v25.4S, v28.4S, v12.4S -mul v28.4S, v28.4S,v1.4S -mla v28.4S, v25.4S, v31.s[0] -sub v25.4s, v30.4s, v28.4s -add v30.4s, v30.4s, v28.4s -str q27, [x0, #896] -str q26, [x0, #912] -str q30, [x0, #928] -str q25, [x0, #944] -ldr q25, [x17, #+2048] -ldr q30, [x17, #+2064] -ldr q26, [x17, #+2080] -ldr q27, [x17, #+2096] -ldr q28, [x17, #+2112] -ldr q29, [x17, #+2128] -ldr q24, [x17, #+2144] -ldr q4, [x17, #+2160] -ldr q12, [x0, #992] -ldr q1, [x0, #1008] -ldr q21, [x0, #960] -ldr q10, [x0, #976] -sqrdmulh v22.4S, v12.4S, v30.s[0] -mul v12.4S, v12.4S,v25.s[0] -mla v12.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v1.4S, v30.s[0] -mul v1.4S, v1.4S,v25.s[0] -mla v1.4S, v12.4S, v31.s[0] -sub v12.4s, v10.4s, v1.4s -add v10.4s, v10.4s, v1.4s -sqrdmulh v1.4S, v10.4S, v30.s[1] -mul v10.4S, v10.4S,v25.s[1] -mla v10.4S, v1.4S, v31.s[0] -sub v1.4s, v21.4s, v10.4s -add v21.4s, v21.4s, v10.4s -sqrdmulh v10.4S, v12.4S, v30.s[2] -mul v12.4S, v12.4S,v25.s[2] -mla v12.4S, v10.4S, v31.s[0] -sub v10.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -trn1 v12.4S, v21.4S, v1.4S -trn2 v15.4S, v21.4S, v1.4S -trn1 v18.4S, v22.4S, v10.4S -trn2 v3.4S, v22.4S, v10.4S -trn2 v22.2D, v12.2D, v18.2D -trn2 v10.2D, v15.2D, v3.2D -trn1 v21.2D, v12.2D, v18.2D -trn1 v1.2D, v15.2D, v3.2D -sqrdmulh v3.4S, v22.4S, v27.4S -mul v22.4S, v22.4S,v26.4S -mla v22.4S, v3.4S, v31.s[0] -sub v3.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v10.4S, v27.4S -mul v10.4S, v10.4S,v26.4S -mla v10.4S, v22.4S, v31.s[0] -sub v22.4s, v1.4s, v10.4s -add v1.4s, v1.4s, v10.4s -sqrdmulh v10.4S, v1.4S, v29.4S -mul v1.4S, v1.4S,v28.4S -mla v1.4S, v10.4S, v31.s[0] -sub v10.4s, v21.4s, v1.4s -add v21.4s, v21.4s, v1.4s -sqrdmulh v1.4S, v22.4S, v4.4S -mul v22.4S, v22.4S,v24.4S -mla v22.4S, v1.4S, v31.s[0] -sub v1.4s, v3.4s, v22.4s -add v3.4s, v3.4s, v22.4s -str q21, [x0, #960] -str q10, [x0, #976] -str q3, [x0, #992] -str q1, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2392 -// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_8_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_8_0.s deleted file mode 100644 index 761b80d..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_8_0.s +++ /dev/null @@ -1,2422 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_8_0 -.global _ntt_u32_full_neon_asm_var_4_4_8_0 -ntt_u32_full_neon_asm_var_4_4_8_0: -_ntt_u32_full_neon_asm_var_4_4_8_0: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #928] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #992] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q18, [x0, #800] -sqrdmulh v17.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -ldr q16, [x0, #864] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -mla v22.4S, v21.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -mla v18.4S, v17.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -ldr q3, [x0, #544] -sqrdmulh v17.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -ldr q19, [x0, #608] -sqrdmulh v21.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q2, [x0, #672] -ldr q1, [x0, #416] -sqrdmulh v0.4S, v2.4S, v29.s[0] -sub v15.4s, v1.4s, v22.4s -mul v2.4S, v2.4S,v30.s[0] -add v1.4s, v1.4s, v22.4s -ldr q22, [x0, #736] -ldr q14, [x0, #480] -sqrdmulh v13.4S, v22.4S, v29.s[0] -sub v12.4s, v14.4s, v20.4s -mul v22.4S, v22.4S,v30.s[0] -add v14.4s, v14.4s, v20.4s -ldr q20, [x0, #288] -mla v3.4S, v17.4S, v31.s[0] -sub v17.4s, v20.4s, v18.4s -mla v19.4S, v21.4S, v31.s[0] -mla v2.4S, v0.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -mla v22.4S, v13.4S, v31.s[0] -ldr q13, [x0, #352] -sqrdmulh v18.4S, v1.4S, v29.s[1] -sub v0.4s, v13.4s, v16.4s -mul v1.4S, v1.4S,v30.s[1] -sqrdmulh v21.4S, v14.4S, v29.s[1] -add v13.4s, v13.4s, v16.4s -mul v14.4S, v14.4S,v30.s[1] -ldr q16, [x0, #32] -sqrdmulh v11.4S, v20.4S, v29.s[1] -sub v10.4s, v16.4s, v3.4s -mul v20.4S, v20.4S,v30.s[1] -add v16.4s, v16.4s, v3.4s -ldr q3, [x0, #96] -sqrdmulh v9.4S, v13.4S, v29.s[1] -sub v8.4s, v3.4s, v19.4s -mul v13.4S, v13.4S,v30.s[1] -add v3.4s, v3.4s, v19.4s -ldr q19, [x0, #160] -mla v1.4S, v18.4S, v31.s[0] -sub v18.4s, v19.4s, v2.4s -mla v14.4S, v21.4S, v31.s[0] -mla v20.4S, v11.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -mla v13.4S, v9.4S, v31.s[0] -ldr q9, [x0, #224] -sqrdmulh v2.4S, v15.4S, v29.s[2] -sub v11.4s, v9.4s, v22.4s -mul v15.4S, v15.4S,v30.s[2] -sqrdmulh v21.4S, v12.4S, v29.s[2] -add v9.4s, v9.4s, v22.4s -mul v12.4S, v12.4S,v30.s[2] -sqrdmulh v22.4S, v17.4S, v29.s[2] -sub v7.4s, v19.4s, v1.4s -mul v17.4S, v17.4S,v30.s[2] -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v29.s[2] -sub v6.4s, v9.4s, v14.4s -mul v0.4S, v0.4S,v30.s[2] -add v9.4s, v9.4s, v14.4s -mla v15.4S, v2.4S, v31.s[0] -sub v2.4s, v16.4s, v20.4s -mla v12.4S, v21.4S, v31.s[0] -mla v17.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -mla v0.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v7.4S, v27.s[1] -sub v20.4s, v3.4s, v13.4s -mul v7.4S, v7.4S,v28.s[1] -sqrdmulh v22.4S, v6.4S, v27.s[1] -add v3.4s, v3.4s, v13.4s -mul v6.4S, v6.4S,v28.s[1] -sqrdmulh v13.4S, v19.4S, v27.s[0] -sub v21.4s, v18.4s, v15.4s -mul v19.4S, v19.4S,v28.s[0] -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v9.4S, v27.s[0] -sub v14.4s, v11.4s, v12.4s -mul v9.4S, v9.4S,v28.s[0] -add v11.4s, v11.4s, v12.4s -mla v7.4S, v1.4S, v31.s[0] -sub v1.4s, v10.4s, v17.4s -mla v6.4S, v22.4S, v31.s[0] -mla v19.4S, v13.4S, v31.s[0] -add v10.4s, v10.4s, v17.4s -mla v9.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v18.4S, v27.s[2] -sub v17.4s, v8.4s, v0.4s -mul v18.4S, v18.4S,v28.s[2] -sqrdmulh v13.4S, v11.4S, v27.s[2] -add v8.4s, v8.4s, v0.4s -mul v11.4S, v11.4S,v28.s[2] -sqrdmulh v0.4S, v21.4S, v27.s[3] -sub v22.4s, v2.4s, v7.4s -mul v21.4S, v21.4S,v28.s[3] -add v2.4s, v2.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[3] -sub v12.4s, v20.4s, v6.4s -mul v14.4S, v14.4S,v28.s[3] -add v20.4s, v20.4s, v6.4s -mla v18.4S, v15.4S, v31.s[0] -sub v15.4s, v16.4s, v19.4s -mla v11.4S, v13.4S, v31.s[0] -mla v21.4S, v0.4S, v31.s[0] -add v16.4s, v16.4s, v19.4s -mla v14.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v20.4S, v25.s[2] -sub v19.4s, v3.4s, v9.4s -mul v20.4S, v20.4S,v26.s[2] -sqrdmulh v0.4S, v12.4S, v25.s[3] -add v3.4s, v3.4s, v9.4s -mul v12.4S, v12.4S,v26.s[3] -sqrdmulh v9.4S, v19.4S, v25.s[1] -sub v13.4s, v10.4s, v18.4s -mul v19.4S, v19.4S,v26.s[1] -add v10.4s, v10.4s, v18.4s -sqrdmulh v18.4S, v3.4S, v25.s[0] -sub v6.4s, v8.4s, v11.4s -mul v3.4S, v3.4S,v26.s[0] -add v8.4s, v8.4s, v11.4s -mla v20.4S, v7.4S, v31.s[0] -sub v7.4s, v1.4s, v21.4s -mla v12.4S, v0.4S, v31.s[0] -mla v19.4S, v9.4S, v31.s[0] -add v1.4s, v1.4s, v21.4s -mla v3.4S, v18.4S, v31.s[0] -sqrdmulh v18.4S, v8.4S, v23.s[0] -sub v21.4s, v17.4s, v14.4s -mul v8.4S, v8.4S,v24.s[0] -sqrdmulh v9.4S, v6.4S, v23.s[1] -add v17.4s, v17.4s, v14.4s -mul v6.4S, v6.4S,v24.s[1] -sqrdmulh v14.4S, v17.4S, v23.s[2] -sub v0.4s, v2.4s, v20.4s -mul v17.4S, v17.4S,v24.s[2] -add v2.4s, v2.4s, v20.4s -sqrdmulh v20.4S, v21.4S, v23.s[3] -sub v11.4s, v22.4s, v12.4s -mul v21.4S, v21.4S,v24.s[3] -add v22.4s, v22.4s, v12.4s -mla v8.4S, v18.4S, v31.s[0] -sub v18.4s, v15.4s, v19.4s -mla v6.4S, v9.4S, v31.s[0] -str q2, [x0, #288] -mla v17.4S, v14.4S, v31.s[0] -add v15.4s, v15.4s, v19.4s -mla v21.4S, v20.4S, v31.s[0] -str q0, [x0, #352] -ldr q0, [x0, #944] -sqrdmulh v20.4S, v0.4S, v29.s[0] -sub v19.4s, v16.4s, v3.4s -mul v0.4S, v0.4S,v30.s[0] -str q22, [x0, #416] -ldr q22, [x0, #1008] -sqrdmulh v14.4S, v22.4S, v29.s[0] -add v16.4s, v16.4s, v3.4s -mul v22.4S, v22.4S,v30.s[0] -str q11, [x0, #480] -ldr q11, [x0, #816] -sqrdmulh v3.4S, v11.4S, v29.s[0] -sub v2.4s, v10.4s, v8.4s -mul v11.4S, v11.4S,v30.s[0] -add v10.4s, v10.4s, v8.4s -ldr q8, [x0, #880] -sqrdmulh v9.4S, v8.4S, v29.s[0] -sub v12.4s, v13.4s, v6.4s -mul v8.4S, v8.4S,v30.s[0] -add v13.4s, v13.4s, v6.4s -mla v0.4S, v20.4S, v31.s[0] -sub v20.4s, v1.4s, v17.4s -mla v22.4S, v14.4S, v31.s[0] -str q15, [x0, #160] -mla v11.4S, v3.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -mla v8.4S, v9.4S, v31.s[0] -str q18, [x0, #224] -ldr q18, [x0, #560] -sqrdmulh v9.4S, v18.4S, v29.s[0] -sub v17.4s, v7.4s, v21.4s -mul v18.4S, v18.4S,v30.s[0] -str q16, [x0, #32] -ldr q16, [x0, #624] -sqrdmulh v3.4S, v16.4S, v29.s[0] -add v7.4s, v7.4s, v21.4s -mul v16.4S, v16.4S,v30.s[0] -str q19, [x0, #96] -ldr q19, [x0, #688] -ldr q21, [x0, #432] -sqrdmulh v15.4S, v19.4S, v29.s[0] -sub v14.4s, v21.4s, v0.4s -mul v19.4S, v19.4S,v30.s[0] -add v21.4s, v21.4s, v0.4s -ldr q0, [x0, #752] -ldr q6, [x0, #496] -sqrdmulh v5.4S, v0.4S, v29.s[0] -sub v4.4s, v6.4s, v22.4s -mul v0.4S, v0.4S,v30.s[0] -add v6.4s, v6.4s, v22.4s -ldr q22, [x0, #304] -mla v18.4S, v9.4S, v31.s[0] -sub v9.4s, v22.4s, v11.4s -mla v16.4S, v3.4S, v31.s[0] -str q10, [x0, #544] -mla v19.4S, v15.4S, v31.s[0] -add v22.4s, v22.4s, v11.4s -mla v0.4S, v5.4S, v31.s[0] -str q2, [x0, #608] -ldr q2, [x0, #368] -sqrdmulh v5.4S, v21.4S, v29.s[1] -sub v11.4s, v2.4s, v8.4s -mul v21.4S, v21.4S,v30.s[1] -str q13, [x0, #672] -sqrdmulh v13.4S, v6.4S, v29.s[1] -add v2.4s, v2.4s, v8.4s -mul v6.4S, v6.4S,v30.s[1] -str q12, [x0, #736] -ldr q12, [x0, #48] -sqrdmulh v8.4S, v22.4S, v29.s[1] -sub v15.4s, v12.4s, v18.4s -mul v22.4S, v22.4S,v30.s[1] -add v12.4s, v12.4s, v18.4s -ldr q18, [x0, #112] -sqrdmulh v10.4S, v2.4S, v29.s[1] -sub v3.4s, v18.4s, v16.4s -mul v2.4S, v2.4S,v30.s[1] -add v18.4s, v18.4s, v16.4s -ldr q16, [x0, #176] -mla v21.4S, v5.4S, v31.s[0] -sub v5.4s, v16.4s, v19.4s -mla v6.4S, v13.4S, v31.s[0] -str q1, [x0, #800] -mla v22.4S, v8.4S, v31.s[0] -add v16.4s, v16.4s, v19.4s -mla v2.4S, v10.4S, v31.s[0] -str q20, [x0, #864] -ldr q20, [x0, #240] -sqrdmulh v10.4S, v14.4S, v29.s[2] -sub v19.4s, v20.4s, v0.4s -mul v14.4S, v14.4S,v30.s[2] -str q7, [x0, #928] -sqrdmulh v7.4S, v4.4S, v29.s[2] -add v20.4s, v20.4s, v0.4s -mul v4.4S, v4.4S,v30.s[2] -str q17, [x0, #992] -sqrdmulh v17.4S, v9.4S, v29.s[2] -sub v0.4s, v16.4s, v21.4s -mul v9.4S, v9.4S,v30.s[2] -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v11.4S, v29.s[2] -sub v8.4s, v20.4s, v6.4s -mul v11.4S, v11.4S,v30.s[2] -add v20.4s, v20.4s, v6.4s -mla v14.4S, v10.4S, v31.s[0] -sub v10.4s, v12.4s, v22.4s -mla v4.4S, v7.4S, v31.s[0] -mla v9.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v22.4s -mla v11.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v0.4S, v27.s[1] -sub v22.4s, v18.4s, v2.4s -mul v0.4S, v0.4S,v28.s[1] -sqrdmulh v17.4S, v8.4S, v27.s[1] -add v18.4s, v18.4s, v2.4s -mul v8.4S, v8.4S,v28.s[1] -sqrdmulh v2.4S, v16.4S, v27.s[0] -sub v7.4s, v5.4s, v14.4s -mul v16.4S, v16.4S,v28.s[0] -add v5.4s, v5.4s, v14.4s -sqrdmulh v14.4S, v20.4S, v27.s[0] -sub v6.4s, v19.4s, v4.4s -mul v20.4S, v20.4S,v28.s[0] -add v19.4s, v19.4s, v4.4s -mla v0.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v9.4s -mla v8.4S, v17.4S, v31.s[0] -mla v16.4S, v2.4S, v31.s[0] -add v15.4s, v15.4s, v9.4s -mla v20.4S, v14.4S, v31.s[0] -sqrdmulh v14.4S, v5.4S, v27.s[2] -sub v9.4s, v3.4s, v11.4s -mul v5.4S, v5.4S,v28.s[2] -sqrdmulh v2.4S, v19.4S, v27.s[2] -add v3.4s, v3.4s, v11.4s -mul v19.4S, v19.4S,v28.s[2] -sqrdmulh v11.4S, v7.4S, v27.s[3] -sub v17.4s, v10.4s, v0.4s -mul v7.4S, v7.4S,v28.s[3] -add v10.4s, v10.4s, v0.4s -sqrdmulh v0.4S, v6.4S, v27.s[3] -sub v4.4s, v22.4s, v8.4s -mul v6.4S, v6.4S,v28.s[3] -add v22.4s, v22.4s, v8.4s -mla v5.4S, v14.4S, v31.s[0] -sub v14.4s, v12.4s, v16.4s -mla v19.4S, v2.4S, v31.s[0] -mla v7.4S, v11.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -mla v6.4S, v0.4S, v31.s[0] -sqrdmulh v0.4S, v22.4S, v25.s[2] -sub v16.4s, v18.4s, v20.4s -mul v22.4S, v22.4S,v26.s[2] -sqrdmulh v11.4S, v4.4S, v25.s[3] -add v18.4s, v18.4s, v20.4s -mul v4.4S, v4.4S,v26.s[3] -sqrdmulh v20.4S, v16.4S, v25.s[1] -sub v2.4s, v15.4s, v5.4s -mul v16.4S, v16.4S,v26.s[1] -add v15.4s, v15.4s, v5.4s -sqrdmulh v5.4S, v18.4S, v25.s[0] -sub v8.4s, v3.4s, v19.4s -mul v18.4S, v18.4S,v26.s[0] -add v3.4s, v3.4s, v19.4s -mla v22.4S, v0.4S, v31.s[0] -sub v0.4s, v21.4s, v7.4s -mla v4.4S, v11.4S, v31.s[0] -mla v16.4S, v20.4S, v31.s[0] -add v21.4s, v21.4s, v7.4s -mla v18.4S, v5.4S, v31.s[0] -sqrdmulh v5.4S, v3.4S, v23.s[0] -sub v7.4s, v9.4s, v6.4s -mul v3.4S, v3.4S,v24.s[0] -sqrdmulh v20.4S, v8.4S, v23.s[1] -add v9.4s, v9.4s, v6.4s -mul v8.4S, v8.4S,v24.s[1] -sqrdmulh v6.4S, v9.4S, v23.s[2] -sub v11.4s, v10.4s, v22.4s -mul v9.4S, v9.4S,v24.s[2] -add v10.4s, v10.4s, v22.4s -sqrdmulh v22.4S, v7.4S, v23.s[3] -sub v19.4s, v17.4s, v4.4s -mul v7.4S, v7.4S,v24.s[3] -add v17.4s, v17.4s, v4.4s -mla v3.4S, v5.4S, v31.s[0] -sub v5.4s, v14.4s, v16.4s -mla v8.4S, v20.4S, v31.s[0] -str q10, [x0, #304] -mla v9.4S, v6.4S, v31.s[0] -add v14.4s, v14.4s, v16.4s -mla v7.4S, v22.4S, v31.s[0] -str q11, [x0, #368] -ldr q11, [x0, #896] -sqrdmulh v22.4S, v11.4S, v29.s[0] -sub v16.4s, v12.4s, v18.4s -mul v11.4S, v11.4S,v30.s[0] -str q17, [x0, #432] -ldr q17, [x0, #960] -sqrdmulh v6.4S, v17.4S, v29.s[0] -add v12.4s, v12.4s, v18.4s -mul v17.4S, v17.4S,v30.s[0] -str q19, [x0, #496] -ldr q19, [x0, #768] -sqrdmulh v18.4S, v19.4S, v29.s[0] -sub v10.4s, v15.4s, v3.4s -mul v19.4S, v19.4S,v30.s[0] -add v15.4s, v15.4s, v3.4s -ldr q3, [x0, #832] -sqrdmulh v20.4S, v3.4S, v29.s[0] -sub v4.4s, v2.4s, v8.4s -mul v3.4S, v3.4S,v30.s[0] -add v2.4s, v2.4s, v8.4s -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v9.4s -mla v17.4S, v6.4S, v31.s[0] -str q14, [x0, #176] -mla v19.4S, v18.4S, v31.s[0] -add v21.4s, v21.4s, v9.4s -mla v3.4S, v20.4S, v31.s[0] -str q5, [x0, #240] -ldr q5, [x0, #512] -sqrdmulh v20.4S, v5.4S, v29.s[0] -sub v9.4s, v0.4s, v7.4s -mul v5.4S, v5.4S,v30.s[0] -str q12, [x0, #48] -ldr q12, [x0, #576] -sqrdmulh v18.4S, v12.4S, v29.s[0] -add v0.4s, v0.4s, v7.4s -mul v12.4S, v12.4S,v30.s[0] -str q16, [x0, #112] -ldr q16, [x0, #640] -ldr q7, [x0, #384] -sqrdmulh v14.4S, v16.4S, v29.s[0] -sub v6.4s, v7.4s, v11.4s -mul v16.4S, v16.4S,v30.s[0] -add v7.4s, v7.4s, v11.4s -ldr q11, [x0, #704] -ldr q8, [x0, #448] -sqrdmulh v1.4S, v11.4S, v29.s[0] -sub v13.4s, v8.4s, v17.4s -mul v11.4S, v11.4S,v30.s[0] -add v8.4s, v8.4s, v17.4s -ldr q17, [x0, #256] -mla v5.4S, v20.4S, v31.s[0] -sub v20.4s, v17.4s, v19.4s -mla v12.4S, v18.4S, v31.s[0] -str q15, [x0, #560] -mla v16.4S, v14.4S, v31.s[0] -add v17.4s, v17.4s, v19.4s -mla v11.4S, v1.4S, v31.s[0] -str q10, [x0, #624] -ldr q10, [x0, #320] -sqrdmulh v1.4S, v7.4S, v29.s[1] -sub v19.4s, v10.4s, v3.4s -mul v7.4S, v7.4S,v30.s[1] -str q2, [x0, #688] -sqrdmulh v2.4S, v8.4S, v29.s[1] -add v10.4s, v10.4s, v3.4s -mul v8.4S, v8.4S,v30.s[1] -str q4, [x0, #752] -ldr q4, [x0, #0] -sqrdmulh v3.4S, v17.4S, v29.s[1] -sub v14.4s, v4.4s, v5.4s -mul v17.4S, v17.4S,v30.s[1] -add v4.4s, v4.4s, v5.4s -ldr q5, [x0, #64] -sqrdmulh v15.4S, v10.4S, v29.s[1] -sub v18.4s, v5.4s, v12.4s -mul v10.4S, v10.4S,v30.s[1] -add v5.4s, v5.4s, v12.4s -ldr q12, [x0, #128] -mla v7.4S, v1.4S, v31.s[0] -sub v1.4s, v12.4s, v16.4s -mla v8.4S, v2.4S, v31.s[0] -str q21, [x0, #816] -mla v17.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -mla v10.4S, v15.4S, v31.s[0] -str q22, [x0, #880] -ldr q22, [x0, #192] -sqrdmulh v15.4S, v6.4S, v29.s[2] -sub v16.4s, v22.4s, v11.4s -mul v6.4S, v6.4S,v30.s[2] -str q0, [x0, #944] -sqrdmulh v0.4S, v13.4S, v29.s[2] -add v22.4s, v22.4s, v11.4s -mul v13.4S, v13.4S,v30.s[2] -str q9, [x0, #1008] -sqrdmulh v9.4S, v20.4S, v29.s[2] -sub v11.4s, v12.4s, v7.4s -mul v20.4S, v20.4S,v30.s[2] -add v12.4s, v12.4s, v7.4s -sqrdmulh v7.4S, v19.4S, v29.s[2] -sub v3.4s, v22.4s, v8.4s -mul v19.4S, v19.4S,v30.s[2] -add v22.4s, v22.4s, v8.4s -mla v6.4S, v15.4S, v31.s[0] -sub v15.4s, v4.4s, v17.4s -mla v13.4S, v0.4S, v31.s[0] -mla v20.4S, v9.4S, v31.s[0] -add v4.4s, v4.4s, v17.4s -mla v19.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v11.4S, v27.s[1] -sub v17.4s, v5.4s, v10.4s -mul v11.4S, v11.4S,v28.s[1] -sqrdmulh v9.4S, v3.4S, v27.s[1] -add v5.4s, v5.4s, v10.4s -mul v3.4S, v3.4S,v28.s[1] -sqrdmulh v10.4S, v12.4S, v27.s[0] -sub v0.4s, v1.4s, v6.4s -mul v12.4S, v12.4S,v28.s[0] -add v1.4s, v1.4s, v6.4s -sqrdmulh v6.4S, v22.4S, v27.s[0] -sub v8.4s, v16.4s, v13.4s -mul v22.4S, v22.4S,v28.s[0] -add v16.4s, v16.4s, v13.4s -mla v11.4S, v7.4S, v31.s[0] -sub v7.4s, v14.4s, v20.4s -mla v3.4S, v9.4S, v31.s[0] -mla v12.4S, v10.4S, v31.s[0] -add v14.4s, v14.4s, v20.4s -mla v22.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v1.4S, v27.s[2] -sub v20.4s, v18.4s, v19.4s -mul v1.4S, v1.4S,v28.s[2] -sqrdmulh v10.4S, v16.4S, v27.s[2] -add v18.4s, v18.4s, v19.4s -mul v16.4S, v16.4S,v28.s[2] -sqrdmulh v19.4S, v0.4S, v27.s[3] -sub v9.4s, v15.4s, v11.4s -mul v0.4S, v0.4S,v28.s[3] -add v15.4s, v15.4s, v11.4s -sqrdmulh v11.4S, v8.4S, v27.s[3] -sub v13.4s, v17.4s, v3.4s -mul v8.4S, v8.4S,v28.s[3] -add v17.4s, v17.4s, v3.4s -mla v1.4S, v6.4S, v31.s[0] -sub v6.4s, v4.4s, v12.4s -mla v16.4S, v10.4S, v31.s[0] -mla v0.4S, v19.4S, v31.s[0] -add v4.4s, v4.4s, v12.4s -mla v8.4S, v11.4S, v31.s[0] -sqrdmulh v11.4S, v17.4S, v25.s[2] -sub v12.4s, v5.4s, v22.4s -mul v17.4S, v17.4S,v26.s[2] -sqrdmulh v19.4S, v13.4S, v25.s[3] -add v5.4s, v5.4s, v22.4s -mul v13.4S, v13.4S,v26.s[3] -sqrdmulh v22.4S, v12.4S, v25.s[1] -sub v10.4s, v14.4s, v1.4s -mul v12.4S, v12.4S,v26.s[1] -add v14.4s, v14.4s, v1.4s -sqrdmulh v1.4S, v5.4S, v25.s[0] -sub v3.4s, v18.4s, v16.4s -mul v5.4S, v5.4S,v26.s[0] -add v18.4s, v18.4s, v16.4s -mla v17.4S, v11.4S, v31.s[0] -sub v11.4s, v7.4s, v0.4s -mla v13.4S, v19.4S, v31.s[0] -mla v12.4S, v22.4S, v31.s[0] -add v7.4s, v7.4s, v0.4s -mla v5.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v18.4S, v23.s[0] -sub v0.4s, v20.4s, v8.4s -mul v18.4S, v18.4S,v24.s[0] -sqrdmulh v22.4S, v3.4S, v23.s[1] -add v20.4s, v20.4s, v8.4s -mul v3.4S, v3.4S,v24.s[1] -sqrdmulh v8.4S, v20.4S, v23.s[2] -sub v19.4s, v15.4s, v17.4s -mul v20.4S, v20.4S,v24.s[2] -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v23.s[3] -sub v16.4s, v9.4s, v13.4s -mul v0.4S, v0.4S,v24.s[3] -add v9.4s, v9.4s, v13.4s -mla v18.4S, v1.4S, v31.s[0] -sub v1.4s, v6.4s, v12.4s -mla v3.4S, v22.4S, v31.s[0] -str q15, [x0, #256] -mla v20.4S, v8.4S, v31.s[0] -add v6.4s, v6.4s, v12.4s -mla v0.4S, v17.4S, v31.s[0] -str q19, [x0, #320] -ldr q19, [x0, #912] -sqrdmulh v17.4S, v19.4S, v29.s[0] -sub v12.4s, v4.4s, v5.4s -mul v19.4S, v19.4S,v30.s[0] -str q9, [x0, #384] -ldr q9, [x0, #976] -sqrdmulh v8.4S, v9.4S, v29.s[0] -add v4.4s, v4.4s, v5.4s -mul v9.4S, v9.4S,v30.s[0] -str q16, [x0, #448] -ldr q16, [x0, #784] -sqrdmulh v5.4S, v16.4S, v29.s[0] -sub v15.4s, v14.4s, v18.4s -mul v16.4S, v16.4S,v30.s[0] -add v14.4s, v14.4s, v18.4s -ldr q18, [x0, #848] -sqrdmulh v22.4S, v18.4S, v29.s[0] -sub v13.4s, v10.4s, v3.4s -mul v18.4S, v18.4S,v30.s[0] -add v10.4s, v10.4s, v3.4s -mla v19.4S, v17.4S, v31.s[0] -sub v17.4s, v7.4s, v20.4s -mla v9.4S, v8.4S, v31.s[0] -str q6, [x0, #128] -mla v16.4S, v5.4S, v31.s[0] -add v7.4s, v7.4s, v20.4s -mla v18.4S, v22.4S, v31.s[0] -str q1, [x0, #192] -ldr q1, [x0, #528] -sqrdmulh v22.4S, v1.4S, v29.s[0] -sub v20.4s, v11.4s, v0.4s -mul v1.4S, v1.4S,v30.s[0] -str q4, [x0, #0] -ldr q4, [x0, #592] -sqrdmulh v5.4S, v4.4S, v29.s[0] -add v11.4s, v11.4s, v0.4s -mul v4.4S, v4.4S,v30.s[0] -str q12, [x0, #64] -ldr q12, [x0, #656] -ldr q0, [x0, #400] -sqrdmulh v6.4S, v12.4S, v29.s[0] -sub v8.4s, v0.4s, v19.4s -mul v12.4S, v12.4S,v30.s[0] -add v0.4s, v0.4s, v19.4s -ldr q19, [x0, #720] -ldr q3, [x0, #464] -sqrdmulh v21.4S, v19.4S, v29.s[0] -sub v2.4s, v3.4s, v9.4s -mul v19.4S, v19.4S,v30.s[0] -add v3.4s, v3.4s, v9.4s -ldr q9, [x0, #272] -mla v1.4S, v22.4S, v31.s[0] -sub v22.4s, v9.4s, v16.4s -mla v4.4S, v5.4S, v31.s[0] -str q14, [x0, #512] -mla v12.4S, v6.4S, v31.s[0] -add v9.4s, v9.4s, v16.4s -mla v19.4S, v21.4S, v31.s[0] -str q15, [x0, #576] -ldr q15, [x0, #336] -sqrdmulh v21.4S, v0.4S, v29.s[1] -sub v16.4s, v15.4s, v18.4s -mul v0.4S, v0.4S,v30.s[1] -str q10, [x0, #640] -sqrdmulh v10.4S, v3.4S, v29.s[1] -add v15.4s, v15.4s, v18.4s -mul v3.4S, v3.4S,v30.s[1] -str q13, [x0, #704] -ldr q13, [x0, #16] -sqrdmulh v18.4S, v9.4S, v29.s[1] -sub v6.4s, v13.4s, v1.4s -mul v9.4S, v9.4S,v30.s[1] -add v13.4s, v13.4s, v1.4s -ldr q1, [x0, #80] -sqrdmulh v14.4S, v15.4S, v29.s[1] -sub v5.4s, v1.4s, v4.4s -mul v15.4S, v15.4S,v30.s[1] -add v1.4s, v1.4s, v4.4s -ldr q4, [x0, #144] -mla v0.4S, v21.4S, v31.s[0] -sub v21.4s, v4.4s, v12.4s -mla v3.4S, v10.4S, v31.s[0] -str q7, [x0, #768] -mla v9.4S, v18.4S, v31.s[0] -add v4.4s, v4.4s, v12.4s -mla v15.4S, v14.4S, v31.s[0] -str q17, [x0, #832] -ldr q17, [x0, #208] -sqrdmulh v14.4S, v8.4S, v29.s[2] -sub v12.4s, v17.4s, v19.4s -mul v8.4S, v8.4S,v30.s[2] -str q11, [x0, #896] -sqrdmulh v11.4S, v2.4S, v29.s[2] -add v17.4s, v17.4s, v19.4s -mul v2.4S, v2.4S,v30.s[2] -str q20, [x0, #960] -sqrdmulh v20.4S, v22.4S, v29.s[2] -sub v19.4s, v4.4s, v0.4s -mul v22.4S, v22.4S,v30.s[2] -add v4.4s, v4.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[2] -sub v18.4s, v17.4s, v3.4s -mul v16.4S, v16.4S,v30.s[2] -add v17.4s, v17.4s, v3.4s -mla v8.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v9.4s -mla v2.4S, v11.4S, v31.s[0] -mla v22.4S, v20.4S, v31.s[0] -add v13.4s, v13.4s, v9.4s -mla v16.4S, v0.4S, v31.s[0] -sqrdmulh v0.4S, v19.4S, v27.s[1] -sub v9.4s, v1.4s, v15.4s -mul v19.4S, v19.4S,v28.s[1] -sqrdmulh v20.4S, v18.4S, v27.s[1] -add v1.4s, v1.4s, v15.4s -mul v18.4S, v18.4S,v28.s[1] -sqrdmulh v15.4S, v4.4S, v27.s[0] -sub v11.4s, v21.4s, v8.4s -mul v4.4S, v4.4S,v28.s[0] -add v21.4s, v21.4s, v8.4s -sqrdmulh v8.4S, v17.4S, v27.s[0] -sub v3.4s, v12.4s, v2.4s -mul v17.4S, v17.4S,v28.s[0] -add v12.4s, v12.4s, v2.4s -mla v19.4S, v0.4S, v31.s[0] -sub v0.4s, v6.4s, v22.4s -mla v18.4S, v20.4S, v31.s[0] -mla v4.4S, v15.4S, v31.s[0] -add v6.4s, v6.4s, v22.4s -mla v17.4S, v8.4S, v31.s[0] -sqrdmulh v8.4S, v21.4S, v27.s[2] -sub v22.4s, v5.4s, v16.4s -mul v21.4S, v21.4S,v28.s[2] -sqrdmulh v15.4S, v12.4S, v27.s[2] -add v5.4s, v5.4s, v16.4s -mul v12.4S, v12.4S,v28.s[2] -sqrdmulh v16.4S, v11.4S, v27.s[3] -sub v20.4s, v14.4s, v19.4s -mul v11.4S, v11.4S,v28.s[3] -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v3.4S, v27.s[3] -sub v2.4s, v9.4s, v18.4s -mul v3.4S, v3.4S,v28.s[3] -add v9.4s, v9.4s, v18.4s -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v13.4s, v4.4s -mla v12.4S, v15.4S, v31.s[0] -mla v11.4S, v16.4S, v31.s[0] -add v13.4s, v13.4s, v4.4s -mla v3.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v9.4S, v25.s[2] -sub v4.4s, v1.4s, v17.4s -mul v9.4S, v9.4S,v26.s[2] -sqrdmulh v16.4S, v2.4S, v25.s[3] -add v1.4s, v1.4s, v17.4s -mul v2.4S, v2.4S,v26.s[3] -sqrdmulh v17.4S, v4.4S, v25.s[1] -sub v15.4s, v6.4s, v21.4s -mul v4.4S, v4.4S,v26.s[1] -add v6.4s, v6.4s, v21.4s -sqrdmulh v21.4S, v1.4S, v25.s[0] -sub v18.4s, v5.4s, v12.4s -mul v1.4S, v1.4S,v26.s[0] -add v5.4s, v5.4s, v12.4s -mla v9.4S, v19.4S, v31.s[0] -sub v19.4s, v0.4s, v11.4s -mla v2.4S, v16.4S, v31.s[0] -mla v4.4S, v17.4S, v31.s[0] -add v0.4s, v0.4s, v11.4s -mla v1.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v5.4S, v23.s[0] -sub v11.4s, v22.4s, v3.4s -mul v5.4S, v5.4S,v24.s[0] -sqrdmulh v17.4S, v18.4S, v23.s[1] -add v22.4s, v22.4s, v3.4s -mul v18.4S, v18.4S,v24.s[1] -sqrdmulh v3.4S, v22.4S, v23.s[2] -sub v16.4s, v14.4s, v9.4s -mul v22.4S, v22.4S,v24.s[2] -add v14.4s, v14.4s, v9.4s -sqrdmulh v9.4S, v11.4S, v23.s[3] -sub v12.4s, v20.4s, v2.4s -mul v11.4S, v11.4S,v24.s[3] -add v20.4s, v20.4s, v2.4s -mla v5.4S, v21.4S, v31.s[0] -sub v21.4s, v8.4s, v4.4s -mla v18.4S, v17.4S, v31.s[0] -str q14, [x0, #272] -mla v22.4S, v3.4S, v31.s[0] -add v8.4s, v8.4s, v4.4s -mla v11.4S, v9.4S, v31.s[0] -str q16, [x0, #336] -sub v23.4s, v13.4s, v1.4s -str q20, [x0, #400] -add v13.4s, v13.4s, v1.4s -str q12, [x0, #464] -sub v12.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sub v5.4s, v15.4s, v18.4s -add v15.4s, v15.4s, v18.4s -sub v18.4s, v0.4s, v22.4s -str q8, [x0, #144] -add v0.4s, v0.4s, v22.4s -str q21, [x0, #208] -sub v21.4s, v19.4s, v11.4s -str q13, [x0, #16] -add v19.4s, v19.4s, v11.4s -str q23, [x0, #80] -str q6, [x0, #528] -str q12, [x0, #592] -str q15, [x0, #656] -str q5, [x0, #720] -str q0, [x0, #784] -str q18, [x0, #848] -str q19, [x0, #912] -str q21, [x0, #976] -ldr q10, [x17, #+128] -ldr q7, [x17, #+144] -ldr q2, [x17, #+160] -ldr q17, [x17, #+176] -ldr q14, [x17, #+192] -ldr q3, [x17, #+208] -ldr q4, [x17, #+224] -ldr q9, [x17, #+240] -ldr q16, [x0, #32] -ldr q30, [x0, #48] -ldr q29, [x0, #0] -ldr q28, [x0, #16] -sqrdmulh v27.4S, v16.4S, v7.s[0] -mul v16.4S, v16.4S,v10.s[0] -mla v16.4S, v27.4S, v31.s[0] -sub v27.4s, v29.4s, v16.4s -add v29.4s, v29.4s, v16.4s -sqrdmulh v16.4S, v30.4S, v7.s[0] -mul v30.4S, v30.4S,v10.s[0] -mla v30.4S, v16.4S, v31.s[0] -sub v16.4s, v28.4s, v30.4s -add v28.4s, v28.4s, v30.4s -sqrdmulh v30.4S, v28.4S, v7.s[1] -mul v28.4S, v28.4S,v10.s[1] -mla v28.4S, v30.4S, v31.s[0] -sub v30.4s, v29.4s, v28.4s -add v29.4s, v29.4s, v28.4s -sqrdmulh v28.4S, v16.4S, v7.s[2] -mul v16.4S, v16.4S,v10.s[2] -mla v16.4S, v28.4S, v31.s[0] -sub v28.4s, v27.4s, v16.4s -add v27.4s, v27.4s, v16.4s -trn1 v16.4S, v29.4S, v30.4S -trn2 v26.4S, v29.4S, v30.4S -trn1 v25.4S, v27.4S, v28.4S -trn2 v24.4S, v27.4S, v28.4S -trn2 v27.2D, v16.2D, v25.2D -trn2 v28.2D, v26.2D, v24.2D -trn1 v29.2D, v16.2D, v25.2D -trn1 v30.2D, v26.2D, v24.2D -sqrdmulh v24.4S, v27.4S, v17.4S -mul v27.4S, v27.4S,v2.4S -mla v27.4S, v24.4S, v31.s[0] -sub v24.4s, v29.4s, v27.4s -add v29.4s, v29.4s, v27.4s -sqrdmulh v27.4S, v28.4S, v17.4S -mul v28.4S, v28.4S,v2.4S -mla v28.4S, v27.4S, v31.s[0] -sub v27.4s, v30.4s, v28.4s -add v30.4s, v30.4s, v28.4s -sqrdmulh v28.4S, v30.4S, v3.4S -mul v30.4S, v30.4S,v14.4S -mla v30.4S, v28.4S, v31.s[0] -sub v28.4s, v29.4s, v30.4s -add v29.4s, v29.4s, v30.4s -sqrdmulh v30.4S, v27.4S, v9.4S -mul v27.4S, v27.4S,v4.4S -mla v27.4S, v30.4S, v31.s[0] -sub v30.4s, v24.4s, v27.4s -add v24.4s, v24.4s, v27.4s -str q29, [x0, #0] -str q28, [x0, #16] -str q24, [x0, #32] -str q30, [x0, #48] -ldr q30, [x17, #+256] -ldr q24, [x17, #+272] -ldr q28, [x17, #+288] -ldr q29, [x17, #+304] -ldr q27, [x17, #+320] -ldr q26, [x17, #+336] -ldr q25, [x17, #+352] -ldr q16, [x17, #+368] -ldr q9, [x0, #96] -ldr q4, [x0, #112] -ldr q3, [x0, #64] -ldr q14, [x0, #80] -sqrdmulh v17.4S, v9.4S, v24.s[0] -mul v9.4S, v9.4S,v30.s[0] -mla v9.4S, v17.4S, v31.s[0] -sub v17.4s, v3.4s, v9.4s -add v3.4s, v3.4s, v9.4s -sqrdmulh v9.4S, v4.4S, v24.s[0] -mul v4.4S, v4.4S,v30.s[0] -mla v4.4S, v9.4S, v31.s[0] -sub v9.4s, v14.4s, v4.4s -add v14.4s, v14.4s, v4.4s -sqrdmulh v4.4S, v14.4S, v24.s[1] -mul v14.4S, v14.4S,v30.s[1] -mla v14.4S, v4.4S, v31.s[0] -sub v4.4s, v3.4s, v14.4s -add v3.4s, v3.4s, v14.4s -sqrdmulh v14.4S, v9.4S, v24.s[2] -mul v9.4S, v9.4S,v30.s[2] -mla v9.4S, v14.4S, v31.s[0] -sub v14.4s, v17.4s, v9.4s -add v17.4s, v17.4s, v9.4s -trn1 v9.4S, v3.4S, v4.4S -trn2 v2.4S, v3.4S, v4.4S -trn1 v7.4S, v17.4S, v14.4S -trn2 v10.4S, v17.4S, v14.4S -trn2 v17.2D, v9.2D, v7.2D -trn2 v14.2D, v2.2D, v10.2D -trn1 v3.2D, v9.2D, v7.2D -trn1 v4.2D, v2.2D, v10.2D -sqrdmulh v10.4S, v17.4S, v29.4S -mul v17.4S, v17.4S,v28.4S -mla v17.4S, v10.4S, v31.s[0] -sub v10.4s, v3.4s, v17.4s -add v3.4s, v3.4s, v17.4s -sqrdmulh v17.4S, v14.4S, v29.4S -mul v14.4S, v14.4S,v28.4S -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v4.4s, v14.4s -add v4.4s, v4.4s, v14.4s -sqrdmulh v14.4S, v4.4S, v26.4S -mul v4.4S, v4.4S,v27.4S -mla v4.4S, v14.4S, v31.s[0] -sub v14.4s, v3.4s, v4.4s -add v3.4s, v3.4s, v4.4s -sqrdmulh v4.4S, v17.4S, v16.4S -mul v17.4S, v17.4S,v25.4S -mla v17.4S, v4.4S, v31.s[0] -sub v4.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -str q3, [x0, #64] -str q14, [x0, #80] -str q10, [x0, #96] -str q4, [x0, #112] -ldr q4, [x17, #+384] -ldr q10, [x17, #+400] -ldr q14, [x17, #+416] -ldr q3, [x17, #+432] -ldr q17, [x17, #+448] -ldr q2, [x17, #+464] -ldr q7, [x17, #+480] -ldr q9, [x17, #+496] -ldr q16, [x0, #160] -ldr q25, [x0, #176] -ldr q26, [x0, #128] -ldr q27, [x0, #144] -sqrdmulh v29.4S, v16.4S, v10.s[0] -mul v16.4S, v16.4S,v4.s[0] -mla v16.4S, v29.4S, v31.s[0] -sub v29.4s, v26.4s, v16.4s -add v26.4s, v26.4s, v16.4s -sqrdmulh v16.4S, v25.4S, v10.s[0] -mul v25.4S, v25.4S,v4.s[0] -mla v25.4S, v16.4S, v31.s[0] -sub v16.4s, v27.4s, v25.4s -add v27.4s, v27.4s, v25.4s -sqrdmulh v25.4S, v27.4S, v10.s[1] -mul v27.4S, v27.4S,v4.s[1] -mla v27.4S, v25.4S, v31.s[0] -sub v25.4s, v26.4s, v27.4s -add v26.4s, v26.4s, v27.4s -sqrdmulh v27.4S, v16.4S, v10.s[2] -mul v16.4S, v16.4S,v4.s[2] -mla v16.4S, v27.4S, v31.s[0] -sub v27.4s, v29.4s, v16.4s -add v29.4s, v29.4s, v16.4s -trn1 v16.4S, v26.4S, v25.4S -trn2 v28.4S, v26.4S, v25.4S -trn1 v24.4S, v29.4S, v27.4S -trn2 v30.4S, v29.4S, v27.4S -trn2 v29.2D, v16.2D, v24.2D -trn2 v27.2D, v28.2D, v30.2D -trn1 v26.2D, v16.2D, v24.2D -trn1 v25.2D, v28.2D, v30.2D -sqrdmulh v30.4S, v29.4S, v3.4S -mul v29.4S, v29.4S,v14.4S -mla v29.4S, v30.4S, v31.s[0] -sub v30.4s, v26.4s, v29.4s -add v26.4s, v26.4s, v29.4s -sqrdmulh v29.4S, v27.4S, v3.4S -mul v27.4S, v27.4S,v14.4S -mla v27.4S, v29.4S, v31.s[0] -sub v29.4s, v25.4s, v27.4s -add v25.4s, v25.4s, v27.4s -sqrdmulh v27.4S, v25.4S, v2.4S -mul v25.4S, v25.4S,v17.4S -mla v25.4S, v27.4S, v31.s[0] -sub v27.4s, v26.4s, v25.4s -add v26.4s, v26.4s, v25.4s -sqrdmulh v25.4S, v29.4S, v9.4S -mul v29.4S, v29.4S,v7.4S -mla v29.4S, v25.4S, v31.s[0] -sub v25.4s, v30.4s, v29.4s -add v30.4s, v30.4s, v29.4s -str q26, [x0, #128] -str q27, [x0, #144] -str q30, [x0, #160] -str q25, [x0, #176] -ldr q25, [x17, #+512] -ldr q30, [x17, #+528] -ldr q27, [x17, #+544] -ldr q26, [x17, #+560] -ldr q29, [x17, #+576] -ldr q28, [x17, #+592] -ldr q24, [x17, #+608] -ldr q16, [x17, #+624] -ldr q9, [x0, #224] -ldr q7, [x0, #240] -ldr q2, [x0, #192] -ldr q17, [x0, #208] -sqrdmulh v3.4S, v9.4S, v30.s[0] -mul v9.4S, v9.4S,v25.s[0] -mla v9.4S, v3.4S, v31.s[0] -sub v3.4s, v2.4s, v9.4s -add v2.4s, v2.4s, v9.4s -sqrdmulh v9.4S, v7.4S, v30.s[0] -mul v7.4S, v7.4S,v25.s[0] -mla v7.4S, v9.4S, v31.s[0] -sub v9.4s, v17.4s, v7.4s -add v17.4s, v17.4s, v7.4s -sqrdmulh v7.4S, v17.4S, v30.s[1] -mul v17.4S, v17.4S,v25.s[1] -mla v17.4S, v7.4S, v31.s[0] -sub v7.4s, v2.4s, v17.4s -add v2.4s, v2.4s, v17.4s -sqrdmulh v17.4S, v9.4S, v30.s[2] -mul v9.4S, v9.4S,v25.s[2] -mla v9.4S, v17.4S, v31.s[0] -sub v17.4s, v3.4s, v9.4s -add v3.4s, v3.4s, v9.4s -trn1 v9.4S, v2.4S, v7.4S -trn2 v14.4S, v2.4S, v7.4S -trn1 v10.4S, v3.4S, v17.4S -trn2 v4.4S, v3.4S, v17.4S -trn2 v3.2D, v9.2D, v10.2D -trn2 v17.2D, v14.2D, v4.2D -trn1 v2.2D, v9.2D, v10.2D -trn1 v7.2D, v14.2D, v4.2D -sqrdmulh v4.4S, v3.4S, v26.4S -mul v3.4S, v3.4S,v27.4S -mla v3.4S, v4.4S, v31.s[0] -sub v4.4s, v2.4s, v3.4s -add v2.4s, v2.4s, v3.4s -sqrdmulh v3.4S, v17.4S, v26.4S -mul v17.4S, v17.4S,v27.4S -mla v17.4S, v3.4S, v31.s[0] -sub v3.4s, v7.4s, v17.4s -add v7.4s, v7.4s, v17.4s -sqrdmulh v17.4S, v7.4S, v28.4S -mul v7.4S, v7.4S,v29.4S -mla v7.4S, v17.4S, v31.s[0] -sub v17.4s, v2.4s, v7.4s -add v2.4s, v2.4s, v7.4s -sqrdmulh v7.4S, v3.4S, v16.4S -mul v3.4S, v3.4S,v24.4S -mla v3.4S, v7.4S, v31.s[0] -sub v7.4s, v4.4s, v3.4s -add v4.4s, v4.4s, v3.4s -str q2, [x0, #192] -str q17, [x0, #208] -str q4, [x0, #224] -str q7, [x0, #240] -ldr q7, [x17, #+640] -ldr q4, [x17, #+656] -ldr q17, [x17, #+672] -ldr q2, [x17, #+688] -ldr q3, [x17, #+704] -ldr q14, [x17, #+720] -ldr q10, [x17, #+736] -ldr q9, [x17, #+752] -ldr q16, [x0, #288] -ldr q24, [x0, #304] -ldr q28, [x0, #256] -ldr q29, [x0, #272] -sqrdmulh v26.4S, v16.4S, v4.s[0] -mul v16.4S, v16.4S,v7.s[0] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v28.4s, v16.4s -add v28.4s, v28.4s, v16.4s -sqrdmulh v16.4S, v24.4S, v4.s[0] -mul v24.4S, v24.4S,v7.s[0] -mla v24.4S, v16.4S, v31.s[0] -sub v16.4s, v29.4s, v24.4s -add v29.4s, v29.4s, v24.4s -sqrdmulh v24.4S, v29.4S, v4.s[1] -mul v29.4S, v29.4S,v7.s[1] -mla v29.4S, v24.4S, v31.s[0] -sub v24.4s, v28.4s, v29.4s -add v28.4s, v28.4s, v29.4s -sqrdmulh v29.4S, v16.4S, v4.s[2] -mul v16.4S, v16.4S,v7.s[2] -mla v16.4S, v29.4S, v31.s[0] -sub v29.4s, v26.4s, v16.4s -add v26.4s, v26.4s, v16.4s -trn1 v16.4S, v28.4S, v24.4S -trn2 v27.4S, v28.4S, v24.4S -trn1 v30.4S, v26.4S, v29.4S -trn2 v25.4S, v26.4S, v29.4S -trn2 v26.2D, v16.2D, v30.2D -trn2 v29.2D, v27.2D, v25.2D -trn1 v28.2D, v16.2D, v30.2D -trn1 v24.2D, v27.2D, v25.2D -sqrdmulh v25.4S, v26.4S, v2.4S -mul v26.4S, v26.4S,v17.4S -mla v26.4S, v25.4S, v31.s[0] -sub v25.4s, v28.4s, v26.4s -add v28.4s, v28.4s, v26.4s -sqrdmulh v26.4S, v29.4S, v2.4S -mul v29.4S, v29.4S,v17.4S -mla v29.4S, v26.4S, v31.s[0] -sub v26.4s, v24.4s, v29.4s -add v24.4s, v24.4s, v29.4s -sqrdmulh v29.4S, v24.4S, v14.4S -mul v24.4S, v24.4S,v3.4S -mla v24.4S, v29.4S, v31.s[0] -sub v29.4s, v28.4s, v24.4s -add v28.4s, v28.4s, v24.4s -sqrdmulh v24.4S, v26.4S, v9.4S -mul v26.4S, v26.4S,v10.4S -mla v26.4S, v24.4S, v31.s[0] -sub v24.4s, v25.4s, v26.4s -add v25.4s, v25.4s, v26.4s -str q28, [x0, #256] -str q29, [x0, #272] -str q25, [x0, #288] -str q24, [x0, #304] -ldr q24, [x17, #+768] -ldr q25, [x17, #+784] -ldr q29, [x17, #+800] -ldr q28, [x17, #+816] -ldr q26, [x17, #+832] -ldr q27, [x17, #+848] -ldr q30, [x17, #+864] -ldr q16, [x17, #+880] -ldr q9, [x0, #352] -ldr q10, [x0, #368] -ldr q14, [x0, #320] -ldr q3, [x0, #336] -sqrdmulh v2.4S, v9.4S, v25.s[0] -mul v9.4S, v9.4S,v24.s[0] -mla v9.4S, v2.4S, v31.s[0] -sub v2.4s, v14.4s, v9.4s -add v14.4s, v14.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v25.s[0] -mul v10.4S, v10.4S,v24.s[0] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v3.4s, v10.4s -add v3.4s, v3.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v25.s[1] -mul v3.4S, v3.4S,v24.s[1] -mla v3.4S, v10.4S, v31.s[0] -sub v10.4s, v14.4s, v3.4s -add v14.4s, v14.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v25.s[2] -mul v9.4S, v9.4S,v24.s[2] -mla v9.4S, v3.4S, v31.s[0] -sub v3.4s, v2.4s, v9.4s -add v2.4s, v2.4s, v9.4s -trn1 v9.4S, v14.4S, v10.4S -trn2 v17.4S, v14.4S, v10.4S -trn1 v4.4S, v2.4S, v3.4S -trn2 v7.4S, v2.4S, v3.4S -trn2 v2.2D, v9.2D, v4.2D -trn2 v3.2D, v17.2D, v7.2D -trn1 v14.2D, v9.2D, v4.2D -trn1 v10.2D, v17.2D, v7.2D -sqrdmulh v7.4S, v2.4S, v28.4S -mul v2.4S, v2.4S,v29.4S -mla v2.4S, v7.4S, v31.s[0] -sub v7.4s, v14.4s, v2.4s -add v14.4s, v14.4s, v2.4s -sqrdmulh v2.4S, v3.4S, v28.4S -mul v3.4S, v3.4S,v29.4S -mla v3.4S, v2.4S, v31.s[0] -sub v2.4s, v10.4s, v3.4s -add v10.4s, v10.4s, v3.4s -sqrdmulh v3.4S, v10.4S, v27.4S -mul v10.4S, v10.4S,v26.4S -mla v10.4S, v3.4S, v31.s[0] -sub v3.4s, v14.4s, v10.4s -add v14.4s, v14.4s, v10.4s -sqrdmulh v10.4S, v2.4S, v16.4S -mul v2.4S, v2.4S,v30.4S -mla v2.4S, v10.4S, v31.s[0] -sub v10.4s, v7.4s, v2.4s -add v7.4s, v7.4s, v2.4s -str q14, [x0, #320] -str q3, [x0, #336] -str q7, [x0, #352] -str q10, [x0, #368] -ldr q10, [x17, #+896] -ldr q7, [x17, #+912] -ldr q3, [x17, #+928] -ldr q14, [x17, #+944] -ldr q2, [x17, #+960] -ldr q17, [x17, #+976] -ldr q4, [x17, #+992] -ldr q9, [x17, #+1008] -ldr q16, [x0, #416] -ldr q30, [x0, #432] -ldr q27, [x0, #384] -ldr q26, [x0, #400] -sqrdmulh v28.4S, v16.4S, v7.s[0] -mul v16.4S, v16.4S,v10.s[0] -mla v16.4S, v28.4S, v31.s[0] -sub v28.4s, v27.4s, v16.4s -add v27.4s, v27.4s, v16.4s -sqrdmulh v16.4S, v30.4S, v7.s[0] -mul v30.4S, v30.4S,v10.s[0] -mla v30.4S, v16.4S, v31.s[0] -sub v16.4s, v26.4s, v30.4s -add v26.4s, v26.4s, v30.4s -sqrdmulh v30.4S, v26.4S, v7.s[1] -mul v26.4S, v26.4S,v10.s[1] -mla v26.4S, v30.4S, v31.s[0] -sub v30.4s, v27.4s, v26.4s -add v27.4s, v27.4s, v26.4s -sqrdmulh v26.4S, v16.4S, v7.s[2] -mul v16.4S, v16.4S,v10.s[2] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v28.4s, v16.4s -add v28.4s, v28.4s, v16.4s -trn1 v16.4S, v27.4S, v30.4S -trn2 v29.4S, v27.4S, v30.4S -trn1 v25.4S, v28.4S, v26.4S -trn2 v24.4S, v28.4S, v26.4S -trn2 v28.2D, v16.2D, v25.2D -trn2 v26.2D, v29.2D, v24.2D -trn1 v27.2D, v16.2D, v25.2D -trn1 v30.2D, v29.2D, v24.2D -sqrdmulh v24.4S, v28.4S, v14.4S -mul v28.4S, v28.4S,v3.4S -mla v28.4S, v24.4S, v31.s[0] -sub v24.4s, v27.4s, v28.4s -add v27.4s, v27.4s, v28.4s -sqrdmulh v28.4S, v26.4S, v14.4S -mul v26.4S, v26.4S,v3.4S -mla v26.4S, v28.4S, v31.s[0] -sub v28.4s, v30.4s, v26.4s -add v30.4s, v30.4s, v26.4s -sqrdmulh v26.4S, v30.4S, v17.4S -mul v30.4S, v30.4S,v2.4S -mla v30.4S, v26.4S, v31.s[0] -sub v26.4s, v27.4s, v30.4s -add v27.4s, v27.4s, v30.4s -sqrdmulh v30.4S, v28.4S, v9.4S -mul v28.4S, v28.4S,v4.4S -mla v28.4S, v30.4S, v31.s[0] -sub v30.4s, v24.4s, v28.4s -add v24.4s, v24.4s, v28.4s -str q27, [x0, #384] -str q26, [x0, #400] -str q24, [x0, #416] -str q30, [x0, #432] -ldr q30, [x17, #+1024] -ldr q24, [x17, #+1040] -ldr q26, [x17, #+1056] -ldr q27, [x17, #+1072] -ldr q28, [x17, #+1088] -ldr q29, [x17, #+1104] -ldr q25, [x17, #+1120] -ldr q16, [x17, #+1136] -ldr q9, [x0, #480] -ldr q4, [x0, #496] -ldr q17, [x0, #448] -ldr q2, [x0, #464] -sqrdmulh v14.4S, v9.4S, v24.s[0] -mul v9.4S, v9.4S,v30.s[0] -mla v9.4S, v14.4S, v31.s[0] -sub v14.4s, v17.4s, v9.4s -add v17.4s, v17.4s, v9.4s -sqrdmulh v9.4S, v4.4S, v24.s[0] -mul v4.4S, v4.4S,v30.s[0] -mla v4.4S, v9.4S, v31.s[0] -sub v9.4s, v2.4s, v4.4s -add v2.4s, v2.4s, v4.4s -sqrdmulh v4.4S, v2.4S, v24.s[1] -mul v2.4S, v2.4S,v30.s[1] -mla v2.4S, v4.4S, v31.s[0] -sub v4.4s, v17.4s, v2.4s -add v17.4s, v17.4s, v2.4s -sqrdmulh v2.4S, v9.4S, v24.s[2] -mul v9.4S, v9.4S,v30.s[2] -mla v9.4S, v2.4S, v31.s[0] -sub v2.4s, v14.4s, v9.4s -add v14.4s, v14.4s, v9.4s -trn1 v9.4S, v17.4S, v4.4S -trn2 v3.4S, v17.4S, v4.4S -trn1 v7.4S, v14.4S, v2.4S -trn2 v10.4S, v14.4S, v2.4S -trn2 v14.2D, v9.2D, v7.2D -trn2 v2.2D, v3.2D, v10.2D -trn1 v17.2D, v9.2D, v7.2D -trn1 v4.2D, v3.2D, v10.2D -sqrdmulh v10.4S, v14.4S, v27.4S -mul v14.4S, v14.4S,v26.4S -mla v14.4S, v10.4S, v31.s[0] -sub v10.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v27.4S -mul v2.4S, v2.4S,v26.4S -mla v2.4S, v14.4S, v31.s[0] -sub v14.4s, v4.4s, v2.4s -add v4.4s, v4.4s, v2.4s -sqrdmulh v2.4S, v4.4S, v29.4S -mul v4.4S, v4.4S,v28.4S -mla v4.4S, v2.4S, v31.s[0] -sub v2.4s, v17.4s, v4.4s -add v17.4s, v17.4s, v4.4s -sqrdmulh v4.4S, v14.4S, v16.4S -mul v14.4S, v14.4S,v25.4S -mla v14.4S, v4.4S, v31.s[0] -sub v4.4s, v10.4s, v14.4s -add v10.4s, v10.4s, v14.4s -str q17, [x0, #448] -str q2, [x0, #464] -str q10, [x0, #480] -str q4, [x0, #496] -ldr q4, [x17, #+1152] -ldr q10, [x17, #+1168] -ldr q2, [x17, #+1184] -ldr q17, [x17, #+1200] -ldr q14, [x17, #+1216] -ldr q3, [x17, #+1232] -ldr q7, [x17, #+1248] -ldr q9, [x17, #+1264] -ldr q16, [x0, #544] -ldr q25, [x0, #560] -ldr q29, [x0, #512] -ldr q28, [x0, #528] -sqrdmulh v27.4S, v16.4S, v10.s[0] -mul v16.4S, v16.4S,v4.s[0] -mla v16.4S, v27.4S, v31.s[0] -sub v27.4s, v29.4s, v16.4s -add v29.4s, v29.4s, v16.4s -sqrdmulh v16.4S, v25.4S, v10.s[0] -mul v25.4S, v25.4S,v4.s[0] -mla v25.4S, v16.4S, v31.s[0] -sub v16.4s, v28.4s, v25.4s -add v28.4s, v28.4s, v25.4s -sqrdmulh v25.4S, v28.4S, v10.s[1] -mul v28.4S, v28.4S,v4.s[1] -mla v28.4S, v25.4S, v31.s[0] -sub v25.4s, v29.4s, v28.4s -add v29.4s, v29.4s, v28.4s -sqrdmulh v28.4S, v16.4S, v10.s[2] -mul v16.4S, v16.4S,v4.s[2] -mla v16.4S, v28.4S, v31.s[0] -sub v28.4s, v27.4s, v16.4s -add v27.4s, v27.4s, v16.4s -trn1 v16.4S, v29.4S, v25.4S -trn2 v26.4S, v29.4S, v25.4S -trn1 v24.4S, v27.4S, v28.4S -trn2 v30.4S, v27.4S, v28.4S -trn2 v27.2D, v16.2D, v24.2D -trn2 v28.2D, v26.2D, v30.2D -trn1 v29.2D, v16.2D, v24.2D -trn1 v25.2D, v26.2D, v30.2D -sqrdmulh v30.4S, v27.4S, v17.4S -mul v27.4S, v27.4S,v2.4S -mla v27.4S, v30.4S, v31.s[0] -sub v30.4s, v29.4s, v27.4s -add v29.4s, v29.4s, v27.4s -sqrdmulh v27.4S, v28.4S, v17.4S -mul v28.4S, v28.4S,v2.4S -mla v28.4S, v27.4S, v31.s[0] -sub v27.4s, v25.4s, v28.4s -add v25.4s, v25.4s, v28.4s -sqrdmulh v28.4S, v25.4S, v3.4S -mul v25.4S, v25.4S,v14.4S -mla v25.4S, v28.4S, v31.s[0] -sub v28.4s, v29.4s, v25.4s -add v29.4s, v29.4s, v25.4s -sqrdmulh v25.4S, v27.4S, v9.4S -mul v27.4S, v27.4S,v7.4S -mla v27.4S, v25.4S, v31.s[0] -sub v25.4s, v30.4s, v27.4s -add v30.4s, v30.4s, v27.4s -str q29, [x0, #512] -str q28, [x0, #528] -str q30, [x0, #544] -str q25, [x0, #560] -ldr q25, [x17, #+1280] -ldr q30, [x17, #+1296] -ldr q28, [x17, #+1312] -ldr q29, [x17, #+1328] -ldr q27, [x17, #+1344] -ldr q26, [x17, #+1360] -ldr q24, [x17, #+1376] -ldr q16, [x17, #+1392] -ldr q9, [x0, #608] -ldr q7, [x0, #624] -ldr q3, [x0, #576] -ldr q14, [x0, #592] -sqrdmulh v17.4S, v9.4S, v30.s[0] -mul v9.4S, v9.4S,v25.s[0] -mla v9.4S, v17.4S, v31.s[0] -sub v17.4s, v3.4s, v9.4s -add v3.4s, v3.4s, v9.4s -sqrdmulh v9.4S, v7.4S, v30.s[0] -mul v7.4S, v7.4S,v25.s[0] -mla v7.4S, v9.4S, v31.s[0] -sub v9.4s, v14.4s, v7.4s -add v14.4s, v14.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v30.s[1] -mul v14.4S, v14.4S,v25.s[1] -mla v14.4S, v7.4S, v31.s[0] -sub v7.4s, v3.4s, v14.4s -add v3.4s, v3.4s, v14.4s -sqrdmulh v14.4S, v9.4S, v30.s[2] -mul v9.4S, v9.4S,v25.s[2] -mla v9.4S, v14.4S, v31.s[0] -sub v14.4s, v17.4s, v9.4s -add v17.4s, v17.4s, v9.4s -trn1 v9.4S, v3.4S, v7.4S -trn2 v2.4S, v3.4S, v7.4S -trn1 v10.4S, v17.4S, v14.4S -trn2 v4.4S, v17.4S, v14.4S -trn2 v17.2D, v9.2D, v10.2D -trn2 v14.2D, v2.2D, v4.2D -trn1 v3.2D, v9.2D, v10.2D -trn1 v7.2D, v2.2D, v4.2D -sqrdmulh v4.4S, v17.4S, v29.4S -mul v17.4S, v17.4S,v28.4S -mla v17.4S, v4.4S, v31.s[0] -sub v4.4s, v3.4s, v17.4s -add v3.4s, v3.4s, v17.4s -sqrdmulh v17.4S, v14.4S, v29.4S -mul v14.4S, v14.4S,v28.4S -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v7.4s, v14.4s -add v7.4s, v7.4s, v14.4s -sqrdmulh v14.4S, v7.4S, v26.4S -mul v7.4S, v7.4S,v27.4S -mla v7.4S, v14.4S, v31.s[0] -sub v14.4s, v3.4s, v7.4s -add v3.4s, v3.4s, v7.4s -sqrdmulh v7.4S, v17.4S, v16.4S -mul v17.4S, v17.4S,v24.4S -mla v17.4S, v7.4S, v31.s[0] -sub v7.4s, v4.4s, v17.4s -add v4.4s, v4.4s, v17.4s -str q3, [x0, #576] -str q14, [x0, #592] -str q4, [x0, #608] -str q7, [x0, #624] -ldr q7, [x17, #+1408] -ldr q4, [x17, #+1424] -ldr q14, [x17, #+1440] -ldr q3, [x17, #+1456] -ldr q17, [x17, #+1472] -ldr q2, [x17, #+1488] -ldr q10, [x17, #+1504] -ldr q9, [x17, #+1520] -ldr q16, [x0, #672] -ldr q24, [x0, #688] -ldr q26, [x0, #640] -ldr q27, [x0, #656] -sqrdmulh v29.4S, v16.4S, v4.s[0] -mul v16.4S, v16.4S,v7.s[0] -mla v16.4S, v29.4S, v31.s[0] -sub v29.4s, v26.4s, v16.4s -add v26.4s, v26.4s, v16.4s -sqrdmulh v16.4S, v24.4S, v4.s[0] -mul v24.4S, v24.4S,v7.s[0] -mla v24.4S, v16.4S, v31.s[0] -sub v16.4s, v27.4s, v24.4s -add v27.4s, v27.4s, v24.4s -sqrdmulh v24.4S, v27.4S, v4.s[1] -mul v27.4S, v27.4S,v7.s[1] -mla v27.4S, v24.4S, v31.s[0] -sub v24.4s, v26.4s, v27.4s -add v26.4s, v26.4s, v27.4s -sqrdmulh v27.4S, v16.4S, v4.s[2] -mul v16.4S, v16.4S,v7.s[2] -mla v16.4S, v27.4S, v31.s[0] -sub v27.4s, v29.4s, v16.4s -add v29.4s, v29.4s, v16.4s -trn1 v16.4S, v26.4S, v24.4S -trn2 v28.4S, v26.4S, v24.4S -trn1 v30.4S, v29.4S, v27.4S -trn2 v25.4S, v29.4S, v27.4S -trn2 v29.2D, v16.2D, v30.2D -trn2 v27.2D, v28.2D, v25.2D -trn1 v26.2D, v16.2D, v30.2D -trn1 v24.2D, v28.2D, v25.2D -sqrdmulh v25.4S, v29.4S, v3.4S -mul v29.4S, v29.4S,v14.4S -mla v29.4S, v25.4S, v31.s[0] -sub v25.4s, v26.4s, v29.4s -add v26.4s, v26.4s, v29.4s -sqrdmulh v29.4S, v27.4S, v3.4S -mul v27.4S, v27.4S,v14.4S -mla v27.4S, v29.4S, v31.s[0] -sub v29.4s, v24.4s, v27.4s -add v24.4s, v24.4s, v27.4s -sqrdmulh v27.4S, v24.4S, v2.4S -mul v24.4S, v24.4S,v17.4S -mla v24.4S, v27.4S, v31.s[0] -sub v27.4s, v26.4s, v24.4s -add v26.4s, v26.4s, v24.4s -sqrdmulh v24.4S, v29.4S, v9.4S -mul v29.4S, v29.4S,v10.4S -mla v29.4S, v24.4S, v31.s[0] -sub v24.4s, v25.4s, v29.4s -add v25.4s, v25.4s, v29.4s -str q26, [x0, #640] -str q27, [x0, #656] -str q25, [x0, #672] -str q24, [x0, #688] -ldr q24, [x17, #+1536] -ldr q25, [x17, #+1552] -ldr q27, [x17, #+1568] -ldr q26, [x17, #+1584] -ldr q29, [x17, #+1600] -ldr q28, [x17, #+1616] -ldr q30, [x17, #+1632] -ldr q16, [x17, #+1648] -ldr q9, [x0, #736] -ldr q10, [x0, #752] -ldr q2, [x0, #704] -ldr q17, [x0, #720] -sqrdmulh v3.4S, v9.4S, v25.s[0] -mul v9.4S, v9.4S,v24.s[0] -mla v9.4S, v3.4S, v31.s[0] -sub v3.4s, v2.4s, v9.4s -add v2.4s, v2.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v25.s[0] -mul v10.4S, v10.4S,v24.s[0] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v17.4s, v10.4s -add v17.4s, v17.4s, v10.4s -sqrdmulh v10.4S, v17.4S, v25.s[1] -mul v17.4S, v17.4S,v24.s[1] -mla v17.4S, v10.4S, v31.s[0] -sub v10.4s, v2.4s, v17.4s -add v2.4s, v2.4s, v17.4s -sqrdmulh v17.4S, v9.4S, v25.s[2] -mul v9.4S, v9.4S,v24.s[2] -mla v9.4S, v17.4S, v31.s[0] -sub v17.4s, v3.4s, v9.4s -add v3.4s, v3.4s, v9.4s -trn1 v9.4S, v2.4S, v10.4S -trn2 v14.4S, v2.4S, v10.4S -trn1 v4.4S, v3.4S, v17.4S -trn2 v7.4S, v3.4S, v17.4S -trn2 v3.2D, v9.2D, v4.2D -trn2 v17.2D, v14.2D, v7.2D -trn1 v2.2D, v9.2D, v4.2D -trn1 v10.2D, v14.2D, v7.2D -sqrdmulh v7.4S, v3.4S, v26.4S -mul v3.4S, v3.4S,v27.4S -mla v3.4S, v7.4S, v31.s[0] -sub v7.4s, v2.4s, v3.4s -add v2.4s, v2.4s, v3.4s -sqrdmulh v3.4S, v17.4S, v26.4S -mul v17.4S, v17.4S,v27.4S -mla v17.4S, v3.4S, v31.s[0] -sub v3.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -sqrdmulh v17.4S, v10.4S, v28.4S -mul v10.4S, v10.4S,v29.4S -mla v10.4S, v17.4S, v31.s[0] -sub v17.4s, v2.4s, v10.4s -add v2.4s, v2.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v16.4S -mul v3.4S, v3.4S,v30.4S -mla v3.4S, v10.4S, v31.s[0] -sub v10.4s, v7.4s, v3.4s -add v7.4s, v7.4s, v3.4s -str q2, [x0, #704] -str q17, [x0, #720] -str q7, [x0, #736] -str q10, [x0, #752] -ldr q10, [x17, #+1664] -ldr q7, [x17, #+1680] -ldr q17, [x17, #+1696] -ldr q2, [x17, #+1712] -ldr q3, [x17, #+1728] -ldr q14, [x17, #+1744] -ldr q4, [x17, #+1760] -ldr q9, [x17, #+1776] -ldr q16, [x0, #800] -ldr q30, [x0, #816] -ldr q28, [x0, #768] -ldr q29, [x0, #784] -sqrdmulh v26.4S, v16.4S, v7.s[0] -mul v16.4S, v16.4S,v10.s[0] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v28.4s, v16.4s -add v28.4s, v28.4s, v16.4s -sqrdmulh v16.4S, v30.4S, v7.s[0] -mul v30.4S, v30.4S,v10.s[0] -mla v30.4S, v16.4S, v31.s[0] -sub v16.4s, v29.4s, v30.4s -add v29.4s, v29.4s, v30.4s -sqrdmulh v30.4S, v29.4S, v7.s[1] -mul v29.4S, v29.4S,v10.s[1] -mla v29.4S, v30.4S, v31.s[0] -sub v30.4s, v28.4s, v29.4s -add v28.4s, v28.4s, v29.4s -sqrdmulh v29.4S, v16.4S, v7.s[2] -mul v16.4S, v16.4S,v10.s[2] -mla v16.4S, v29.4S, v31.s[0] -sub v29.4s, v26.4s, v16.4s -add v26.4s, v26.4s, v16.4s -trn1 v16.4S, v28.4S, v30.4S -trn2 v27.4S, v28.4S, v30.4S -trn1 v25.4S, v26.4S, v29.4S -trn2 v24.4S, v26.4S, v29.4S -trn2 v26.2D, v16.2D, v25.2D -trn2 v29.2D, v27.2D, v24.2D -trn1 v28.2D, v16.2D, v25.2D -trn1 v30.2D, v27.2D, v24.2D -sqrdmulh v24.4S, v26.4S, v2.4S -mul v26.4S, v26.4S,v17.4S -mla v26.4S, v24.4S, v31.s[0] -sub v24.4s, v28.4s, v26.4s -add v28.4s, v28.4s, v26.4s -sqrdmulh v26.4S, v29.4S, v2.4S -mul v29.4S, v29.4S,v17.4S -mla v29.4S, v26.4S, v31.s[0] -sub v26.4s, v30.4s, v29.4s -add v30.4s, v30.4s, v29.4s -sqrdmulh v29.4S, v30.4S, v14.4S -mul v30.4S, v30.4S,v3.4S -mla v30.4S, v29.4S, v31.s[0] -sub v29.4s, v28.4s, v30.4s -add v28.4s, v28.4s, v30.4s -sqrdmulh v30.4S, v26.4S, v9.4S -mul v26.4S, v26.4S,v4.4S -mla v26.4S, v30.4S, v31.s[0] -sub v30.4s, v24.4s, v26.4s -add v24.4s, v24.4s, v26.4s -str q28, [x0, #768] -str q29, [x0, #784] -str q24, [x0, #800] -str q30, [x0, #816] -ldr q30, [x17, #+1792] -ldr q24, [x17, #+1808] -ldr q29, [x17, #+1824] -ldr q28, [x17, #+1840] -ldr q26, [x17, #+1856] -ldr q27, [x17, #+1872] -ldr q25, [x17, #+1888] -ldr q16, [x17, #+1904] -ldr q9, [x0, #864] -ldr q4, [x0, #880] -ldr q14, [x0, #832] -ldr q3, [x0, #848] -sqrdmulh v2.4S, v9.4S, v24.s[0] -mul v9.4S, v9.4S,v30.s[0] -mla v9.4S, v2.4S, v31.s[0] -sub v2.4s, v14.4s, v9.4s -add v14.4s, v14.4s, v9.4s -sqrdmulh v9.4S, v4.4S, v24.s[0] -mul v4.4S, v4.4S,v30.s[0] -mla v4.4S, v9.4S, v31.s[0] -sub v9.4s, v3.4s, v4.4s -add v3.4s, v3.4s, v4.4s -sqrdmulh v4.4S, v3.4S, v24.s[1] -mul v3.4S, v3.4S,v30.s[1] -mla v3.4S, v4.4S, v31.s[0] -sub v4.4s, v14.4s, v3.4s -add v14.4s, v14.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v24.s[2] -mul v9.4S, v9.4S,v30.s[2] -mla v9.4S, v3.4S, v31.s[0] -sub v3.4s, v2.4s, v9.4s -add v2.4s, v2.4s, v9.4s -trn1 v9.4S, v14.4S, v4.4S -trn2 v17.4S, v14.4S, v4.4S -trn1 v7.4S, v2.4S, v3.4S -trn2 v10.4S, v2.4S, v3.4S -trn2 v2.2D, v9.2D, v7.2D -trn2 v3.2D, v17.2D, v10.2D -trn1 v14.2D, v9.2D, v7.2D -trn1 v4.2D, v17.2D, v10.2D -sqrdmulh v10.4S, v2.4S, v28.4S -mul v2.4S, v2.4S,v29.4S -mla v2.4S, v10.4S, v31.s[0] -sub v10.4s, v14.4s, v2.4s -add v14.4s, v14.4s, v2.4s -sqrdmulh v2.4S, v3.4S, v28.4S -mul v3.4S, v3.4S,v29.4S -mla v3.4S, v2.4S, v31.s[0] -sub v2.4s, v4.4s, v3.4s -add v4.4s, v4.4s, v3.4s -sqrdmulh v3.4S, v4.4S, v27.4S -mul v4.4S, v4.4S,v26.4S -mla v4.4S, v3.4S, v31.s[0] -sub v3.4s, v14.4s, v4.4s -add v14.4s, v14.4s, v4.4s -sqrdmulh v4.4S, v2.4S, v16.4S -mul v2.4S, v2.4S,v25.4S -mla v2.4S, v4.4S, v31.s[0] -sub v4.4s, v10.4s, v2.4s -add v10.4s, v10.4s, v2.4s -str q14, [x0, #832] -str q3, [x0, #848] -str q10, [x0, #864] -str q4, [x0, #880] -ldr q4, [x17, #+1920] -ldr q10, [x17, #+1936] -ldr q3, [x17, #+1952] -ldr q14, [x17, #+1968] -ldr q2, [x17, #+1984] -ldr q17, [x17, #+2000] -ldr q7, [x17, #+2016] -ldr q9, [x17, #+2032] -ldr q16, [x0, #928] -ldr q25, [x0, #944] -ldr q27, [x0, #896] -ldr q26, [x0, #912] -sqrdmulh v28.4S, v16.4S, v10.s[0] -mul v16.4S, v16.4S,v4.s[0] -mla v16.4S, v28.4S, v31.s[0] -sub v28.4s, v27.4s, v16.4s -add v27.4s, v27.4s, v16.4s -sqrdmulh v16.4S, v25.4S, v10.s[0] -mul v25.4S, v25.4S,v4.s[0] -mla v25.4S, v16.4S, v31.s[0] -sub v16.4s, v26.4s, v25.4s -add v26.4s, v26.4s, v25.4s -sqrdmulh v25.4S, v26.4S, v10.s[1] -mul v26.4S, v26.4S,v4.s[1] -mla v26.4S, v25.4S, v31.s[0] -sub v25.4s, v27.4s, v26.4s -add v27.4s, v27.4s, v26.4s -sqrdmulh v26.4S, v16.4S, v10.s[2] -mul v16.4S, v16.4S,v4.s[2] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v28.4s, v16.4s -add v28.4s, v28.4s, v16.4s -trn1 v16.4S, v27.4S, v25.4S -trn2 v29.4S, v27.4S, v25.4S -trn1 v24.4S, v28.4S, v26.4S -trn2 v30.4S, v28.4S, v26.4S -trn2 v28.2D, v16.2D, v24.2D -trn2 v26.2D, v29.2D, v30.2D -trn1 v27.2D, v16.2D, v24.2D -trn1 v25.2D, v29.2D, v30.2D -sqrdmulh v30.4S, v28.4S, v14.4S -mul v28.4S, v28.4S,v3.4S -mla v28.4S, v30.4S, v31.s[0] -sub v30.4s, v27.4s, v28.4s -add v27.4s, v27.4s, v28.4s -sqrdmulh v28.4S, v26.4S, v14.4S -mul v26.4S, v26.4S,v3.4S -mla v26.4S, v28.4S, v31.s[0] -sub v28.4s, v25.4s, v26.4s -add v25.4s, v25.4s, v26.4s -sqrdmulh v26.4S, v25.4S, v17.4S -mul v25.4S, v25.4S,v2.4S -mla v25.4S, v26.4S, v31.s[0] -sub v26.4s, v27.4s, v25.4s -add v27.4s, v27.4s, v25.4s -sqrdmulh v25.4S, v28.4S, v9.4S -mul v28.4S, v28.4S,v7.4S -mla v28.4S, v25.4S, v31.s[0] -sub v25.4s, v30.4s, v28.4s -add v30.4s, v30.4s, v28.4s -str q27, [x0, #896] -str q26, [x0, #912] -str q30, [x0, #928] -str q25, [x0, #944] -ldr q25, [x17, #+2048] -ldr q30, [x17, #+2064] -ldr q26, [x17, #+2080] -ldr q27, [x17, #+2096] -ldr q28, [x17, #+2112] -ldr q29, [x17, #+2128] -ldr q24, [x17, #+2144] -ldr q16, [x17, #+2160] -ldr q9, [x0, #992] -ldr q7, [x0, #1008] -ldr q17, [x0, #960] -ldr q2, [x0, #976] -sqrdmulh v14.4S, v9.4S, v30.s[0] -mul v9.4S, v9.4S,v25.s[0] -mla v9.4S, v14.4S, v31.s[0] -sub v14.4s, v17.4s, v9.4s -add v17.4s, v17.4s, v9.4s -sqrdmulh v9.4S, v7.4S, v30.s[0] -mul v7.4S, v7.4S,v25.s[0] -mla v7.4S, v9.4S, v31.s[0] -sub v9.4s, v2.4s, v7.4s -add v2.4s, v2.4s, v7.4s -sqrdmulh v7.4S, v2.4S, v30.s[1] -mul v2.4S, v2.4S,v25.s[1] -mla v2.4S, v7.4S, v31.s[0] -sub v7.4s, v17.4s, v2.4s -add v17.4s, v17.4s, v2.4s -sqrdmulh v2.4S, v9.4S, v30.s[2] -mul v9.4S, v9.4S,v25.s[2] -mla v9.4S, v2.4S, v31.s[0] -sub v2.4s, v14.4s, v9.4s -add v14.4s, v14.4s, v9.4s -trn1 v9.4S, v17.4S, v7.4S -trn2 v3.4S, v17.4S, v7.4S -trn1 v10.4S, v14.4S, v2.4S -trn2 v4.4S, v14.4S, v2.4S -trn2 v14.2D, v9.2D, v10.2D -trn2 v2.2D, v3.2D, v4.2D -trn1 v17.2D, v9.2D, v10.2D -trn1 v7.2D, v3.2D, v4.2D -sqrdmulh v4.4S, v14.4S, v27.4S -mul v14.4S, v14.4S,v26.4S -mla v14.4S, v4.4S, v31.s[0] -sub v4.4s, v17.4s, v14.4s -add v17.4s, v17.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v27.4S -mul v2.4S, v2.4S,v26.4S -mla v2.4S, v14.4S, v31.s[0] -sub v14.4s, v7.4s, v2.4s -add v7.4s, v7.4s, v2.4s -sqrdmulh v2.4S, v7.4S, v29.4S -mul v7.4S, v7.4S,v28.4S -mla v7.4S, v2.4S, v31.s[0] -sub v2.4s, v17.4s, v7.4s -add v17.4s, v17.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v16.4S -mul v14.4S, v14.4S,v24.4S -mla v14.4S, v7.4S, v31.s[0] -sub v7.4s, v4.4s, v14.4s -add v4.4s, v4.4s, v14.4s -str q17, [x0, #960] -str q2, [x0, #976] -str q4, [x0, #992] -str q7, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2392 -// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_9_0.s b/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_9_0.s deleted file mode 100644 index df18890..0000000 --- a/tests/ntt_neon/auto/ntt_u32_full_33556993_28678040_var_4_4_9_0.s +++ /dev/null @@ -1,2422 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 26036764 // Layer 6, block 0 -.word 7065381 // Layer 6, block 1 -.word 11280567 // Layer 6, block 2 -.word 19695786 // Layer 6, block 3 -.word 1666225723 // Layer 6, block 0 -.word 452149874 // Layer 6, block 1 -.word 721901190 // Layer 6, block 2 -.word 1260434103 // Layer 6, block 3 -.word 28678040 // Layer 7, block 0 -.word 5637166 // Layer 7, block 2 -.word 18759424 // Layer 7, block 4 -.word 8648030 // Layer 7, block 6 -.word 1835254486 // Layer 7, block 0 -.word 360751090 // Layer 7, block 2 -.word 1200511508 // Layer 7, block 4 -.word 553431680 // Layer 7, block 6 -.word 7232147 // Layer 7, block 1 -.word 7430689 // Layer 7, block 3 -.word 14819378 // Layer 7, block 5 -.word 22112339 // Layer 7, block 7 -.word 462822084 // Layer 7, block 1 -.word 475527802 // Layer 7, block 3 -.word 948367809 // Layer 7, block 5 -.word 1415081692 // Layer 7, block 7 -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14834498 // Layer 6, block 4 -.word 22861321 // Layer 6, block 5 -.word 23033862 // Layer 6, block 6 -.word 32211066 // Layer 6, block 7 -.word 949335415 // Layer 6, block 4 -.word 1463012881 // Layer 6, block 5 -.word 1474054663 // Layer 6, block 6 -.word 2061350894 // Layer 6, block 7 -.word 7103825 // Layer 7, block 8 -.word 24338119 // Layer 7, block 10 -.word 6674394 // Layer 7, block 12 -.word 3716128 // Layer 7, block 14 -.word 454610102 // Layer 7, block 8 -.word 1557520740 // Layer 7, block 10 -.word 427128616 // Layer 7, block 12 -.word 237814041 // Layer 7, block 14 -.word 18577393 // Layer 7, block 9 -.word 17042091 // Layer 7, block 11 -.word 6574213 // Layer 7, block 13 -.word 24666803 // Layer 7, block 15 -.word 1188862414 // Layer 7, block 9 -.word 1090610585 // Layer 7, block 11 -.word 420717521 // Layer 7, block 13 -.word 1578554911 // Layer 7, block 15 -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 11253846 // Layer 6, block 8 -.word 16151303 // Layer 6, block 9 -.word 1821442 // Layer 6, block 10 -.word 23358663 // Layer 6, block 11 -.word 720191176 // Layer 6, block 8 -.word 1033604503 // Layer 6, block 9 -.word 116563391 // Layer 6, block 10 -.word 1494840340 // Layer 6, block 11 -.word 32787475 // Layer 7, block 16 -.word 8269259 // Layer 7, block 18 -.word 20826321 // Layer 7, block 20 -.word 21194054 // Layer 7, block 22 -.word 2098238255 // Layer 7, block 16 -.word 529192186 // Layer 7, block 18 -.word 1332782821 // Layer 7, block 20 -.word 1356315937 // Layer 7, block 22 -.word 28400654 // Layer 7, block 17 -.word 31090287 // Layer 7, block 19 -.word 26776841 // Layer 7, block 21 -.word 22281074 // Layer 7, block 23 -.word 1817503137 // Layer 7, block 17 -.word 1989626512 // Layer 7, block 19 -.word 1713587037 // Layer 7, block 21 -.word 1425879908 // Layer 7, block 23 -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 20504641 // Layer 6, block 12 -.word 7735096 // Layer 6, block 13 -.word 29463916 // Layer 6, block 14 -.word 23172067 // Layer 6, block 15 -.word 1312196872 // Layer 6, block 12 -.word 495008363 // Layer 6, block 13 -.word 1885546712 // Layer 6, block 14 -.word 1482899108 // Layer 6, block 15 -.word 1953000 // Layer 7, block 24 -.word 12766243 // Layer 7, block 26 -.word 16292342 // Layer 7, block 28 -.word 25143337 // Layer 7, block 30 -.word 124982461 // Layer 7, block 24 -.word 816977197 // Layer 7, block 26 -.word 1042630311 // Layer 7, block 28 -.word 1609050759 // Layer 7, block 30 -.word 12486848 // Layer 7, block 25 -.word 31556661 // Layer 7, block 27 -.word 28330310 // Layer 7, block 29 -.word 15137961 // Layer 7, block 31 -.word 799097282 // Layer 7, block 25 -.word 2019472170 // Layer 7, block 27 -.word 1813001465 // Layer 7, block 29 -.word 968755565 // Layer 7, block 31 -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 18663828 // Layer 6, block 16 -.word 25765932 // Layer 6, block 17 -.word 11779122 // Layer 6, block 18 -.word 29112305 // Layer 6, block 19 -.word 1194393831 // Layer 6, block 16 -.word 1648893798 // Layer 6, block 17 -.word 753806275 // Layer 6, block 18 -.word 1863045325 // Layer 6, block 19 -.word 33163184 // Layer 7, block 32 -.word 11550623 // Layer 7, block 34 -.word 25375595 // Layer 7, block 36 -.word 18254638 // Layer 7, block 38 -.word 2122281795 // Layer 7, block 32 -.word 739183455 // Layer 7, block 34 -.word 1623914137 // Layer 7, block 36 -.word 1168207670 // Layer 7, block 38 -.word 9551359 // Layer 7, block 33 -.word 33257316 // Layer 7, block 35 -.word 10387700 // Layer 7, block 37 -.word 4263629 // Layer 7, block 39 -.word 611240324 // Layer 7, block 33 -.word 2128305784 // Layer 7, block 35 -.word 664762063 // Layer 7, block 37 -.word 272851431 // Layer 7, block 39 -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 596073 // Layer 6, block 20 -.word 29039358 // Layer 6, block 21 -.word 6760262 // Layer 6, block 22 -.word 2228887 // Layer 6, block 23 -.word 38145761 // Layer 6, block 20 -.word 1858377074 // Layer 6, block 21 -.word 432623749 // Layer 6, block 22 -.word 142637881 // Layer 6, block 23 -.word 25929180 // Layer 7, block 40 -.word 23508428 // Layer 7, block 42 -.word 22560727 // Layer 7, block 44 -.word 29457393 // Layer 7, block 46 -.word 1659340873 // Layer 7, block 40 -.word 1504424569 // Layer 7, block 42 -.word 1443776334 // Layer 7, block 44 -.word 1885129272 // Layer 7, block 46 -.word 17371159 // Layer 7, block 41 -.word 11558208 // Layer 7, block 43 -.word 15755637 // Layer 7, block 45 -.word 20740787 // Layer 7, block 47 -.word 1111669329 // Layer 7, block 41 -.word 739668858 // Layer 7, block 43 -.word 1008283812 // Layer 7, block 45 -.word 1327309063 // Layer 7, block 47 -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 13624329 // Layer 6, block 24 -.word 9838349 // Layer 6, block 25 -.word 6934560 // Layer 6, block 26 -.word 11310234 // Layer 6, block 27 -.word 871890510 // Layer 6, block 24 -.word 629606282 // Layer 6, block 25 -.word 443777969 // Layer 6, block 26 -.word 723799733 // Layer 6, block 27 -.word 3153984 // Layer 7, block 48 -.word 15599806 // Layer 7, block 50 -.word 23484790 // Layer 7, block 52 -.word 30174454 // Layer 7, block 54 -.word 201839571 // Layer 7, block 48 -.word 998311389 // Layer 7, block 50 -.word 1502911852 // Layer 7, block 52 -.word 1931017673 // Layer 7, block 54 -.word 13598070 // Layer 7, block 49 -.word 31454003 // Layer 7, block 51 -.word 20506260 // Layer 7, block 53 -.word 5928435 // Layer 7, block 55 -.word 870210062 // Layer 7, block 49 -.word 2012902560 // Layer 7, block 51 -.word 1312300480 // Layer 7, block 53 -.word 379390883 // Layer 7, block 55 -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 32798516 // Layer 6, block 28 -.word 9911360 // Layer 6, block 29 -.word 32443170 // Layer 6, block 30 -.word 31293482 // Layer 6, block 31 -.word 2098944825 // Layer 6, block 28 -.word 634278629 // Layer 6, block 29 -.word 2076204416 // Layer 6, block 30 -.word 2002630000 // Layer 6, block 31 -.word 26013877 // Layer 7, block 56 -.word 22928950 // Layer 7, block 58 -.word 24547058 // Layer 7, block 60 -.word 21082546 // Layer 7, block 62 -.word 1664761067 // Layer 7, block 56 -.word 1467340807 // Layer 7, block 58 -.word 1570891816 // Layer 7, block 60 -.word 1349179970 // Layer 7, block 62 -.word 21864746 // Layer 7, block 57 -.word 27678266 // Layer 7, block 59 -.word 30695887 // Layer 7, block 61 -.word 31772478 // Layer 7, block 63 -.word 1399236949 // Layer 7, block 57 -.word 1771273834 // Layer 7, block 59 -.word 1964386839 // Layer 7, block 61 -.word 2033283404 // Layer 7, block 63 -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 2853776 // Layer 6, block 32 -.word 31645959 // Layer 6, block 33 -.word 29723614 // Layer 6, block 34 -.word 31813171 // Layer 6, block 35 -.word 182627725 // Layer 6, block 32 -.word 2025186806 // Layer 6, block 33 -.word 1902166116 // Layer 6, block 34 -.word 2035887557 // Layer 6, block 35 -.word 30377953 // Layer 7, block 64 -.word 4924837 // Layer 7, block 66 -.word 11362575 // Layer 7, block 68 -.word 31398766 // Layer 7, block 70 -.word 1944040616 // Layer 7, block 64 -.word 315165513 // Layer 7, block 66 -.word 727149301 // Layer 7, block 68 -.word 2009367662 // Layer 7, block 70 -.word 27689101 // Layer 7, block 65 -.word 31229525 // Layer 7, block 67 -.word 6544948 // Layer 7, block 69 -.word 13728247 // Layer 7, block 71 -.word 1771967221 // Layer 7, block 65 -.word 1998537064 // Layer 7, block 67 -.word 418844704 // Layer 7, block 69 -.word 878540754 // Layer 7, block 71 -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9116920 // Layer 6, block 36 -.word 26449800 // Layer 6, block 37 -.word 27173300 // Layer 6, block 38 -.word 1574249 // Layer 6, block 39 -.word 583438350 // Layer 6, block 36 -.word 1692658010 // Layer 6, block 37 -.word 1738958476 // Layer 6, block 38 -.word 100744247 // Layer 6, block 39 -.word 6510145 // Layer 7, block 72 -.word 760999 // Layer 7, block 74 -.word 1634503 // Layer 7, block 76 -.word 29546109 // Layer 7, block 78 -.word 416617482 // Layer 7, block 72 -.word 48700219 // Layer 7, block 74 -.word 104600209 // Layer 7, block 76 -.word 1890806663 // Layer 7, block 78 -.word 2195232 // Layer 7, block 73 -.word 4465852 // Layer 7, block 75 -.word 31203102 // Layer 7, block 77 -.word 29916743 // Layer 7, block 79 -.word 140484126 // Layer 7, block 73 -.word 285792715 // Layer 7, block 75 -.word 1996846121 // Layer 7, block 77 -.word 1914525428 // Layer 7, block 79 -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29172999 // Layer 6, block 40 -.word 16825951 // Layer 6, block 41 -.word 11592382 // Layer 6, block 42 -.word 2671395 // Layer 6, block 43 -.word 1866929445 // Layer 6, block 40 -.word 1076778680 // Layer 6, block 41 -.word 741855827 // Layer 6, block 42 -.word 170956232 // Layer 6, block 43 -.word 14579779 // Layer 7, block 80 -.word 24263513 // Layer 7, block 82 -.word 4646776 // Layer 7, block 84 -.word 69049 // Layer 7, block 86 -.word 933034643 // Layer 7, block 80 -.word 1552746321 // Layer 7, block 82 -.word 297370968 // Layer 7, block 84 -.word 4418799 // Layer 7, block 86 -.word 33263488 // Layer 7, block 81 -.word 22493246 // Layer 7, block 83 -.word 22009979 // Layer 7, block 85 -.word 12021234 // Layer 7, block 87 -.word 2128700762 // Layer 7, block 81 -.word 1439457879 // Layer 7, block 83 -.word 1408531152 // Layer 7, block 85 -.word 769300260 // Layer 7, block 87 -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 15720958 // Layer 6, block 44 -.word 4876619 // Layer 6, block 45 -.word 9370171 // Layer 6, block 46 -.word 2197027 // Layer 6, block 47 -.word 1006064525 // Layer 6, block 44 -.word 312079797 // Layer 6, block 45 -.word 599645177 // Layer 6, block 46 -.word 140598997 // Layer 6, block 47 -.word 16117282 // Layer 7, block 88 -.word 9635661 // Layer 7, block 90 -.word 9117520 // Layer 7, block 92 -.word 3506913 // Layer 7, block 94 -.word 1031427326 // Layer 7, block 88 -.word 616635240 // Layer 7, block 90 -.word 583476747 // Layer 7, block 92 -.word 224425303 // Layer 7, block 94 -.word 20014407 // Layer 7, block 89 -.word 25893988 // Layer 7, block 91 -.word 10257619 // Layer 7, block 93 -.word 24501669 // Layer 7, block 95 -.word 1280824291 // Layer 7, block 89 -.word 1657088757 // Layer 7, block 91 -.word 656437514 // Layer 7, block 93 -.word 1567987141 // Layer 7, block 95 -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 23467272 // Layer 6, block 48 -.word 11944835 // Layer 6, block 49 -.word 29768154 // Layer 6, block 50 -.word 3189790 // Layer 6, block 51 -.word 1501790786 // Layer 6, block 48 -.word 764411097 // Layer 6, block 49 -.word 1905016458 // Layer 6, block 50 -.word 204130980 // Layer 6, block 51 -.word 28559032 // Layer 7, block 96 -.word 20151609 // Layer 7, block 98 -.word 11645481 // Layer 7, block 100 -.word 16402437 // Layer 7, block 102 -.word 1827638556 // Layer 7, block 96 -.word 1289604549 // Layer 7, block 98 -.word 745253903 // Layer 7, block 100 -.word 1049675853 // Layer 7, block 102 -.word 1005359 // Layer 7, block 97 -.word 19130139 // Layer 7, block 99 -.word 11690281 // Layer 7, block 101 -.word 5461508 // Layer 7, block 103 -.word 64338065 // Layer 7, block 97 -.word 1224235458 // Layer 7, block 99 -.word 748120885 // Layer 7, block 101 -.word 349509836 // Layer 7, block 103 -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 4898455 // Layer 6, block 52 -.word 22059944 // Layer 6, block 53 -.word 20315246 // Layer 6, block 54 -.word 28615767 // Layer 6, block 55 -.word 313477194 // Layer 6, block 52 -.word 1411728668 // Layer 6, block 53 -.word 1300076517 // Layer 6, block 54 -.word 1831269319 // Layer 6, block 55 -.word 6226096 // Layer 7, block 104 -.word 14029790 // Layer 7, block 106 -.word 7729000 // Layer 7, block 108 -.word 13958531 // Layer 7, block 110 -.word 398439734 // Layer 7, block 104 -.word 897838034 // Layer 7, block 106 -.word 494618249 // Layer 7, block 108 -.word 893277806 // Layer 7, block 110 -.word 31755058 // Layer 7, block 105 -.word 26102744 // Layer 7, block 107 -.word 19175904 // Layer 7, block 109 -.word 19472238 // Layer 7, block 111 -.word 2032168609 // Layer 7, block 105 -.word 1670448121 // Layer 7, block 107 -.word 1227164194 // Layer 7, block 109 -.word 1246128123 // Layer 7, block 111 -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 17302560 // Layer 6, block 56 -.word 8630188 // Layer 6, block 57 -.word 13744680 // Layer 6, block 58 -.word 31890906 // Layer 6, block 59 -.word 1107279328 // Layer 6, block 56 -.word 552289879 // Layer 6, block 57 -.word 879592386 // Layer 6, block 58 -.word 2040862218 // Layer 6, block 59 -.word 4735938 // Layer 7, block 112 -.word 26671657 // Layer 7, block 114 -.word 25810971 // Layer 7, block 116 -.word 25578690 // Layer 7, block 118 -.word 303076900 // Layer 7, block 112 -.word 1706855774 // Layer 7, block 114 -.word 1651776074 // Layer 7, block 116 -.word 1636911225 // Layer 7, block 118 -.word 6957373 // Layer 7, block 113 -.word 25381712 // Layer 7, block 115 -.word 27780827 // Layer 7, block 117 -.word 28062311 // Layer 7, block 119 -.word 445237890 // Layer 7, block 113 -.word 1624305595 // Layer 7, block 115 -.word 1777837237 // Layer 7, block 117 -.word 1795850838 // Layer 7, block 119 -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 26150922 // Layer 6, block 60 -.word 29525906 // Layer 6, block 61 -.word 23080870 // Layer 6, block 62 -.word 1636987 // Layer 6, block 63 -.word 1673531278 // Layer 6, block 60 -.word 1889513769 // Layer 6, block 61 -.word 1477062945 // Layer 6, block 62 -.word 104759172 // Layer 6, block 63 -.word 10674616 // Layer 7, block 120 -.word 9508293 // Layer 7, block 122 -.word 4274200 // Layer 7, block 124 -.word 10066304 // Layer 7, block 126 -.word 683123285 // Layer 7, block 120 -.word 608484310 // Layer 7, block 122 -.word 273527923 // Layer 7, block 124 -.word 644194289 // Layer 7, block 126 -.word 26473446 // Layer 7, block 121 -.word 14853570 // Layer 7, block 123 -.word 32427548 // Layer 7, block 125 -.word 16598340 // Layer 7, block 127 -.word 1694171239 // Layer 7, block 121 -.word 950555930 // Layer 7, block 123 -.word 2075204685 // Layer 7, block 125 -.word 1062212688 // Layer 7, block 127 -.text -.global ntt_u32_full_neon_asm_var_4_4_9_0 -.global _ntt_u32_full_neon_asm_var_4_4_9_0 -ntt_u32_full_neon_asm_var_4_4_9_0: -_ntt_u32_full_neon_asm_var_4_4_9_0: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #928] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #992] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q18, [x0, #800] -sqrdmulh v17.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -ldr q16, [x0, #864] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -mla v22.4S, v21.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -mla v18.4S, v17.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -ldr q3, [x0, #544] -sqrdmulh v17.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -ldr q19, [x0, #608] -sqrdmulh v21.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q2, [x0, #672] -ldr q1, [x0, #416] -sqrdmulh v0.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v15.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -ldr q22, [x0, #736] -ldr q14, [x0, #480] -sqrdmulh v13.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v12.4s, v14.4s, v20.4s -add v14.4s, v14.4s, v20.4s -ldr q20, [x0, #288] -mla v3.4S, v17.4S, v31.s[0] -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v20.4s, v18.4s -mla v2.4S, v0.4S, v31.s[0] -mla v22.4S, v13.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -ldr q18, [x0, #352] -sqrdmulh v13.4S, v1.4S, v29.s[1] -mul v1.4S, v1.4S,v30.s[1] -sub v0.4s, v18.4s, v16.4s -sqrdmulh v17.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -add v18.4s, v18.4s, v16.4s -ldr q16, [x0, #32] -sqrdmulh v11.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v10.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -ldr q3, [x0, #96] -sqrdmulh v9.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v8.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -ldr q19, [x0, #160] -mla v1.4S, v13.4S, v31.s[0] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v19.4s, v2.4s -mla v20.4S, v11.4S, v31.s[0] -mla v18.4S, v9.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -ldr q2, [x0, #224] -sqrdmulh v9.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v11.4s, v2.4s, v22.4s -sqrdmulh v13.4S, v12.4S, v29.s[2] -mul v12.4S, v12.4S,v30.s[2] -add v2.4s, v2.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v7.4s, v19.4s, v1.4s -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v29.s[2] -mul v0.4S, v0.4S,v30.s[2] -sub v6.4s, v2.4s, v14.4s -add v2.4s, v2.4s, v14.4s -mla v15.4S, v9.4S, v31.s[0] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v16.4s, v20.4s -mla v21.4S, v22.4S, v31.s[0] -mla v0.4S, v1.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -sub v1.4s, v3.4s, v18.4s -sqrdmulh v22.4S, v6.4S, v27.s[1] -mul v6.4S, v6.4S,v28.s[1] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v19.4S, v27.s[0] -mul v19.4S, v19.4S,v28.s[0] -sub v9.4s, v17.4s, v15.4s -add v17.4s, v17.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v14.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -mla v7.4S, v20.4S, v31.s[0] -mla v6.4S, v22.4S, v31.s[0] -sub v22.4s, v10.4s, v21.4s -mla v19.4S, v18.4S, v31.s[0] -mla v2.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v15.4s, v8.4s, v0.4s -sqrdmulh v18.4S, v11.4S, v27.s[2] -mul v11.4S, v11.4S,v28.s[2] -add v8.4s, v8.4s, v0.4s -sqrdmulh v0.4S, v9.4S, v27.s[3] -mul v9.4S, v9.4S,v28.s[3] -sub v20.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[3] -mul v14.4S, v14.4S,v28.s[3] -sub v12.4s, v1.4s, v6.4s -add v1.4s, v1.4s, v6.4s -mla v17.4S, v21.4S, v31.s[0] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v16.4s, v19.4s -mla v9.4S, v0.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v16.4s, v16.4s, v19.4s -sqrdmulh v19.4S, v1.4S, v25.s[2] -mul v1.4S, v1.4S,v26.s[2] -sub v7.4s, v3.4s, v2.4s -sqrdmulh v0.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v7.4S, v25.s[1] -mul v7.4S, v7.4S,v26.s[1] -sub v21.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -sqrdmulh v17.4S, v3.4S, v25.s[0] -mul v3.4S, v3.4S,v26.s[0] -sub v6.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -mla v1.4S, v19.4S, v31.s[0] -mla v12.4S, v0.4S, v31.s[0] -sub v0.4s, v22.4s, v9.4s -mla v7.4S, v2.4S, v31.s[0] -mla v3.4S, v17.4S, v31.s[0] -add v22.4s, v22.4s, v9.4s -sqrdmulh v9.4S, v8.4S, v23.s[0] -mul v8.4S, v8.4S,v24.s[0] -sub v17.4s, v15.4s, v14.4s -sqrdmulh v2.4S, v6.4S, v23.s[1] -mul v6.4S, v6.4S,v24.s[1] -add v15.4s, v15.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v23.s[2] -mul v15.4S, v15.4S,v24.s[2] -sub v19.4s, v13.4s, v1.4s -add v13.4s, v13.4s, v1.4s -sqrdmulh v1.4S, v17.4S, v23.s[3] -mul v17.4S, v17.4S,v24.s[3] -sub v11.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -mla v8.4S, v9.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -sub v2.4s, v18.4s, v7.4s -str q13, [x0, #288] -mla v15.4S, v14.4S, v31.s[0] -mla v17.4S, v1.4S, v31.s[0] -add v18.4s, v18.4s, v7.4s -str q19, [x0, #352] -ldr q19, [x0, #944] -sqrdmulh v7.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -str q20, [x0, #416] -sub v20.4s, v16.4s, v3.4s -ldr q1, [x0, #1008] -sqrdmulh v14.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -str q11, [x0, #480] -add v16.4s, v16.4s, v3.4s -ldr q3, [x0, #816] -sqrdmulh v11.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -sub v13.4s, v10.4s, v8.4s -add v10.4s, v10.4s, v8.4s -ldr q8, [x0, #880] -sqrdmulh v9.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v12.4s, v21.4s, v6.4s -add v21.4s, v21.4s, v6.4s -mla v19.4S, v7.4S, v31.s[0] -mla v1.4S, v14.4S, v31.s[0] -str q18, [x0, #160] -sub v18.4s, v22.4s, v15.4s -mla v3.4S, v11.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -str q2, [x0, #224] -add v22.4s, v22.4s, v15.4s -ldr q15, [x0, #560] -sqrdmulh v2.4S, v15.4S, v29.s[0] -mul v15.4S, v15.4S,v30.s[0] -str q16, [x0, #32] -sub v16.4s, v0.4s, v17.4s -ldr q9, [x0, #624] -sqrdmulh v11.4S, v9.4S, v29.s[0] -mul v9.4S, v9.4S,v30.s[0] -str q20, [x0, #96] -add v0.4s, v0.4s, v17.4s -ldr q17, [x0, #688] -ldr q20, [x0, #432] -sqrdmulh v14.4S, v17.4S, v29.s[0] -mul v17.4S, v17.4S,v30.s[0] -sub v7.4s, v20.4s, v19.4s -add v20.4s, v20.4s, v19.4s -ldr q19, [x0, #752] -ldr q6, [x0, #496] -sqrdmulh v5.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v4.4s, v6.4s, v1.4s -add v6.4s, v6.4s, v1.4s -ldr q1, [x0, #304] -mla v15.4S, v2.4S, v31.s[0] -mla v9.4S, v11.4S, v31.s[0] -str q10, [x0, #544] -sub v10.4s, v1.4s, v3.4s -mla v17.4S, v14.4S, v31.s[0] -mla v19.4S, v5.4S, v31.s[0] -str q13, [x0, #608] -add v1.4s, v1.4s, v3.4s -ldr q3, [x0, #368] -sqrdmulh v13.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -str q21, [x0, #672] -sub v21.4s, v3.4s, v8.4s -sqrdmulh v5.4S, v6.4S, v29.s[1] -mul v6.4S, v6.4S,v30.s[1] -str q12, [x0, #736] -add v3.4s, v3.4s, v8.4s -ldr q8, [x0, #48] -sqrdmulh v12.4S, v1.4S, v29.s[1] -mul v1.4S, v1.4S,v30.s[1] -sub v14.4s, v8.4s, v15.4s -add v8.4s, v8.4s, v15.4s -ldr q15, [x0, #112] -sqrdmulh v11.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v2.4s, v15.4s, v9.4s -add v15.4s, v15.4s, v9.4s -ldr q9, [x0, #176] -mla v20.4S, v13.4S, v31.s[0] -mla v6.4S, v5.4S, v31.s[0] -str q22, [x0, #800] -sub v22.4s, v9.4s, v17.4s -mla v1.4S, v12.4S, v31.s[0] -mla v3.4S, v11.4S, v31.s[0] -str q18, [x0, #864] -add v9.4s, v9.4s, v17.4s -ldr q17, [x0, #240] -sqrdmulh v18.4S, v7.4S, v29.s[2] -mul v7.4S, v7.4S,v30.s[2] -str q0, [x0, #928] -sub v0.4s, v17.4s, v19.4s -sqrdmulh v11.4S, v4.4S, v29.s[2] -mul v4.4S, v4.4S,v30.s[2] -str q16, [x0, #992] -add v17.4s, v17.4s, v19.4s -sqrdmulh v19.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v16.4s, v9.4s, v20.4s -add v9.4s, v9.4s, v20.4s -sqrdmulh v20.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v12.4s, v17.4s, v6.4s -add v17.4s, v17.4s, v6.4s -mla v7.4S, v18.4S, v31.s[0] -mla v4.4S, v11.4S, v31.s[0] -sub v11.4s, v8.4s, v1.4s -mla v10.4S, v19.4S, v31.s[0] -mla v21.4S, v20.4S, v31.s[0] -add v8.4s, v8.4s, v1.4s -sqrdmulh v1.4S, v16.4S, v27.s[1] -mul v16.4S, v16.4S,v28.s[1] -sub v20.4s, v15.4s, v3.4s -sqrdmulh v19.4S, v12.4S, v27.s[1] -mul v12.4S, v12.4S,v28.s[1] -add v15.4s, v15.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v27.s[0] -mul v9.4S, v9.4S,v28.s[0] -sub v18.4s, v22.4s, v7.4s -add v22.4s, v22.4s, v7.4s -sqrdmulh v7.4S, v17.4S, v27.s[0] -mul v17.4S, v17.4S,v28.s[0] -sub v6.4s, v0.4s, v4.4s -add v0.4s, v0.4s, v4.4s -mla v16.4S, v1.4S, v31.s[0] -mla v12.4S, v19.4S, v31.s[0] -sub v19.4s, v14.4s, v10.4s -mla v9.4S, v3.4S, v31.s[0] -mla v17.4S, v7.4S, v31.s[0] -add v14.4s, v14.4s, v10.4s -sqrdmulh v10.4S, v22.4S, v27.s[2] -mul v22.4S, v22.4S,v28.s[2] -sub v7.4s, v2.4s, v21.4s -sqrdmulh v3.4S, v0.4S, v27.s[2] -mul v0.4S, v0.4S,v28.s[2] -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v1.4s, v11.4s, v16.4s -add v11.4s, v11.4s, v16.4s -sqrdmulh v16.4S, v6.4S, v27.s[3] -mul v6.4S, v6.4S,v28.s[3] -sub v4.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -mla v22.4S, v10.4S, v31.s[0] -mla v0.4S, v3.4S, v31.s[0] -sub v3.4s, v8.4s, v9.4s -mla v18.4S, v21.4S, v31.s[0] -mla v6.4S, v16.4S, v31.s[0] -add v8.4s, v8.4s, v9.4s -sqrdmulh v9.4S, v20.4S, v25.s[2] -mul v20.4S, v20.4S,v26.s[2] -sub v16.4s, v15.4s, v17.4s -sqrdmulh v21.4S, v4.4S, v25.s[3] -mul v4.4S, v4.4S,v26.s[3] -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v16.4S, v25.s[1] -mul v16.4S, v16.4S,v26.s[1] -sub v10.4s, v14.4s, v22.4s -add v14.4s, v14.4s, v22.4s -sqrdmulh v22.4S, v15.4S, v25.s[0] -mul v15.4S, v15.4S,v26.s[0] -sub v12.4s, v2.4s, v0.4s -add v2.4s, v2.4s, v0.4s -mla v20.4S, v9.4S, v31.s[0] -mla v4.4S, v21.4S, v31.s[0] -sub v21.4s, v19.4s, v18.4s -mla v16.4S, v17.4S, v31.s[0] -mla v15.4S, v22.4S, v31.s[0] -add v19.4s, v19.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v23.s[0] -mul v2.4S, v2.4S,v24.s[0] -sub v22.4s, v7.4s, v6.4s -sqrdmulh v17.4S, v12.4S, v23.s[1] -mul v12.4S, v12.4S,v24.s[1] -add v7.4s, v7.4s, v6.4s -sqrdmulh v6.4S, v7.4S, v23.s[2] -mul v7.4S, v7.4S,v24.s[2] -sub v9.4s, v11.4s, v20.4s -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v22.4S, v23.s[3] -mul v22.4S, v22.4S,v24.s[3] -sub v0.4s, v1.4s, v4.4s -add v1.4s, v1.4s, v4.4s -mla v2.4S, v18.4S, v31.s[0] -mla v12.4S, v17.4S, v31.s[0] -sub v17.4s, v3.4s, v16.4s -str q11, [x0, #304] -mla v7.4S, v6.4S, v31.s[0] -mla v22.4S, v20.4S, v31.s[0] -add v3.4s, v3.4s, v16.4s -str q9, [x0, #368] -ldr q9, [x0, #896] -sqrdmulh v16.4S, v9.4S, v29.s[0] -mul v9.4S, v9.4S,v30.s[0] -str q1, [x0, #432] -sub v1.4s, v8.4s, v15.4s -ldr q20, [x0, #960] -sqrdmulh v6.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -str q0, [x0, #496] -add v8.4s, v8.4s, v15.4s -ldr q15, [x0, #768] -sqrdmulh v0.4S, v15.4S, v29.s[0] -mul v15.4S, v15.4S,v30.s[0] -sub v11.4s, v14.4s, v2.4s -add v14.4s, v14.4s, v2.4s -ldr q2, [x0, #832] -sqrdmulh v18.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v4.4s, v10.4s, v12.4s -add v10.4s, v10.4s, v12.4s -mla v9.4S, v16.4S, v31.s[0] -mla v20.4S, v6.4S, v31.s[0] -str q3, [x0, #176] -sub v3.4s, v19.4s, v7.4s -mla v15.4S, v0.4S, v31.s[0] -mla v2.4S, v18.4S, v31.s[0] -str q17, [x0, #240] -add v19.4s, v19.4s, v7.4s -ldr q7, [x0, #512] -sqrdmulh v17.4S, v7.4S, v29.s[0] -mul v7.4S, v7.4S,v30.s[0] -str q8, [x0, #48] -sub v8.4s, v21.4s, v22.4s -ldr q18, [x0, #576] -sqrdmulh v0.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -str q1, [x0, #112] -add v21.4s, v21.4s, v22.4s -ldr q22, [x0, #640] -ldr q1, [x0, #384] -sqrdmulh v6.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v16.4s, v1.4s, v9.4s -add v1.4s, v1.4s, v9.4s -ldr q9, [x0, #704] -ldr q12, [x0, #448] -sqrdmulh v5.4S, v9.4S, v29.s[0] -mul v9.4S, v9.4S,v30.s[0] -sub v13.4s, v12.4s, v20.4s -add v12.4s, v12.4s, v20.4s -ldr q20, [x0, #256] -mla v7.4S, v17.4S, v31.s[0] -mla v18.4S, v0.4S, v31.s[0] -str q14, [x0, #560] -sub v14.4s, v20.4s, v15.4s -mla v22.4S, v6.4S, v31.s[0] -mla v9.4S, v5.4S, v31.s[0] -str q11, [x0, #624] -add v20.4s, v20.4s, v15.4s -ldr q15, [x0, #320] -sqrdmulh v11.4S, v1.4S, v29.s[1] -mul v1.4S, v1.4S,v30.s[1] -str q10, [x0, #688] -sub v10.4s, v15.4s, v2.4s -sqrdmulh v5.4S, v12.4S, v29.s[1] -mul v12.4S, v12.4S,v30.s[1] -str q4, [x0, #752] -add v15.4s, v15.4s, v2.4s -ldr q2, [x0, #0] -sqrdmulh v4.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v6.4s, v2.4s, v7.4s -add v2.4s, v2.4s, v7.4s -ldr q7, [x0, #64] -sqrdmulh v0.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -sub v17.4s, v7.4s, v18.4s -add v7.4s, v7.4s, v18.4s -ldr q18, [x0, #128] -mla v1.4S, v11.4S, v31.s[0] -mla v12.4S, v5.4S, v31.s[0] -str q19, [x0, #816] -sub v19.4s, v18.4s, v22.4s -mla v20.4S, v4.4S, v31.s[0] -mla v15.4S, v0.4S, v31.s[0] -str q3, [x0, #880] -add v18.4s, v18.4s, v22.4s -ldr q22, [x0, #192] -sqrdmulh v3.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -str q21, [x0, #944] -sub v21.4s, v22.4s, v9.4s -sqrdmulh v0.4S, v13.4S, v29.s[2] -mul v13.4S, v13.4S,v30.s[2] -str q8, [x0, #1008] -add v22.4s, v22.4s, v9.4s -sqrdmulh v9.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v8.4s, v18.4s, v1.4s -add v18.4s, v18.4s, v1.4s -sqrdmulh v1.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v4.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -mla v16.4S, v3.4S, v31.s[0] -mla v13.4S, v0.4S, v31.s[0] -sub v0.4s, v2.4s, v20.4s -mla v14.4S, v9.4S, v31.s[0] -mla v10.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -sub v1.4s, v7.4s, v15.4s -sqrdmulh v9.4S, v4.4S, v27.s[1] -mul v4.4S, v4.4S,v28.s[1] -add v7.4s, v7.4s, v15.4s -sqrdmulh v15.4S, v18.4S, v27.s[0] -mul v18.4S, v18.4S,v28.s[0] -sub v3.4s, v19.4s, v16.4s -add v19.4s, v19.4s, v16.4s -sqrdmulh v16.4S, v22.4S, v27.s[0] -mul v22.4S, v22.4S,v28.s[0] -sub v12.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -mla v8.4S, v20.4S, v31.s[0] -mla v4.4S, v9.4S, v31.s[0] -sub v9.4s, v6.4s, v14.4s -mla v18.4S, v15.4S, v31.s[0] -mla v22.4S, v16.4S, v31.s[0] -add v6.4s, v6.4s, v14.4s -sqrdmulh v14.4S, v19.4S, v27.s[2] -mul v19.4S, v19.4S,v28.s[2] -sub v16.4s, v17.4s, v10.4s -sqrdmulh v15.4S, v21.4S, v27.s[2] -mul v21.4S, v21.4S,v28.s[2] -add v17.4s, v17.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v27.s[3] -mul v3.4S, v3.4S,v28.s[3] -sub v20.4s, v0.4s, v8.4s -add v0.4s, v0.4s, v8.4s -sqrdmulh v8.4S, v12.4S, v27.s[3] -mul v12.4S, v12.4S,v28.s[3] -sub v13.4s, v1.4s, v4.4s -add v1.4s, v1.4s, v4.4s -mla v19.4S, v14.4S, v31.s[0] -mla v21.4S, v15.4S, v31.s[0] -sub v15.4s, v2.4s, v18.4s -mla v3.4S, v10.4S, v31.s[0] -mla v12.4S, v8.4S, v31.s[0] -add v2.4s, v2.4s, v18.4s -sqrdmulh v18.4S, v1.4S, v25.s[2] -mul v1.4S, v1.4S,v26.s[2] -sub v8.4s, v7.4s, v22.4s -sqrdmulh v10.4S, v13.4S, v25.s[3] -mul v13.4S, v13.4S,v26.s[3] -add v7.4s, v7.4s, v22.4s -sqrdmulh v22.4S, v8.4S, v25.s[1] -mul v8.4S, v8.4S,v26.s[1] -sub v14.4s, v6.4s, v19.4s -add v6.4s, v6.4s, v19.4s -sqrdmulh v19.4S, v7.4S, v25.s[0] -mul v7.4S, v7.4S,v26.s[0] -sub v4.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -mla v1.4S, v18.4S, v31.s[0] -mla v13.4S, v10.4S, v31.s[0] -sub v10.4s, v9.4s, v3.4s -mla v8.4S, v22.4S, v31.s[0] -mla v7.4S, v19.4S, v31.s[0] -add v9.4s, v9.4s, v3.4s -sqrdmulh v3.4S, v17.4S, v23.s[0] -mul v17.4S, v17.4S,v24.s[0] -sub v19.4s, v16.4s, v12.4s -sqrdmulh v22.4S, v4.4S, v23.s[1] -mul v4.4S, v4.4S,v24.s[1] -add v16.4s, v16.4s, v12.4s -sqrdmulh v12.4S, v16.4S, v23.s[2] -mul v16.4S, v16.4S,v24.s[2] -sub v18.4s, v0.4s, v1.4s -add v0.4s, v0.4s, v1.4s -sqrdmulh v1.4S, v19.4S, v23.s[3] -mul v19.4S, v19.4S,v24.s[3] -sub v21.4s, v20.4s, v13.4s -add v20.4s, v20.4s, v13.4s -mla v17.4S, v3.4S, v31.s[0] -mla v4.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v8.4s -str q0, [x0, #256] -mla v16.4S, v12.4S, v31.s[0] -mla v19.4S, v1.4S, v31.s[0] -add v15.4s, v15.4s, v8.4s -str q18, [x0, #320] -ldr q18, [x0, #912] -sqrdmulh v8.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -str q20, [x0, #384] -sub v20.4s, v2.4s, v7.4s -ldr q1, [x0, #976] -sqrdmulh v12.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -str q21, [x0, #448] -add v2.4s, v2.4s, v7.4s -ldr q7, [x0, #784] -sqrdmulh v21.4S, v7.4S, v29.s[0] -mul v7.4S, v7.4S,v30.s[0] -sub v0.4s, v6.4s, v17.4s -add v6.4s, v6.4s, v17.4s -ldr q17, [x0, #848] -sqrdmulh v3.4S, v17.4S, v29.s[0] -mul v17.4S, v17.4S,v30.s[0] -sub v13.4s, v14.4s, v4.4s -add v14.4s, v14.4s, v4.4s -mla v18.4S, v8.4S, v31.s[0] -mla v1.4S, v12.4S, v31.s[0] -str q15, [x0, #128] -sub v15.4s, v9.4s, v16.4s -mla v7.4S, v21.4S, v31.s[0] -mla v17.4S, v3.4S, v31.s[0] -str q22, [x0, #192] -add v9.4s, v9.4s, v16.4s -ldr q16, [x0, #528] -sqrdmulh v22.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -str q2, [x0, #0] -sub v2.4s, v10.4s, v19.4s -ldr q3, [x0, #592] -sqrdmulh v21.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -str q20, [x0, #64] -add v10.4s, v10.4s, v19.4s -ldr q19, [x0, #656] -ldr q20, [x0, #400] -sqrdmulh v12.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v8.4s, v20.4s, v18.4s -add v20.4s, v20.4s, v18.4s -ldr q18, [x0, #720] -ldr q4, [x0, #464] -sqrdmulh v5.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -sub v11.4s, v4.4s, v1.4s -add v4.4s, v4.4s, v1.4s -ldr q1, [x0, #272] -mla v16.4S, v22.4S, v31.s[0] -mla v3.4S, v21.4S, v31.s[0] -str q6, [x0, #512] -sub v6.4s, v1.4s, v7.4s -mla v19.4S, v12.4S, v31.s[0] -mla v18.4S, v5.4S, v31.s[0] -str q0, [x0, #576] -add v1.4s, v1.4s, v7.4s -ldr q7, [x0, #336] -sqrdmulh v0.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -str q14, [x0, #640] -sub v14.4s, v7.4s, v17.4s -sqrdmulh v5.4S, v4.4S, v29.s[1] -mul v4.4S, v4.4S,v30.s[1] -str q13, [x0, #704] -add v7.4s, v7.4s, v17.4s -ldr q17, [x0, #16] -sqrdmulh v13.4S, v1.4S, v29.s[1] -mul v1.4S, v1.4S,v30.s[1] -sub v12.4s, v17.4s, v16.4s -add v17.4s, v17.4s, v16.4s -ldr q16, [x0, #80] -sqrdmulh v21.4S, v7.4S, v29.s[1] -mul v7.4S, v7.4S,v30.s[1] -sub v22.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -ldr q3, [x0, #144] -mla v20.4S, v0.4S, v31.s[0] -mla v4.4S, v5.4S, v31.s[0] -str q9, [x0, #768] -sub v9.4s, v3.4s, v19.4s -mla v1.4S, v13.4S, v31.s[0] -mla v7.4S, v21.4S, v31.s[0] -str q15, [x0, #832] -add v3.4s, v3.4s, v19.4s -ldr q19, [x0, #208] -sqrdmulh v15.4S, v8.4S, v29.s[2] -mul v8.4S, v8.4S,v30.s[2] -str q10, [x0, #896] -sub v10.4s, v19.4s, v18.4s -sqrdmulh v21.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -str q2, [x0, #960] -add v19.4s, v19.4s, v18.4s -sqrdmulh v18.4S, v6.4S, v29.s[2] -mul v6.4S, v6.4S,v30.s[2] -sub v2.4s, v3.4s, v20.4s -add v3.4s, v3.4s, v20.4s -sqrdmulh v20.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v13.4s, v19.4s, v4.4s -add v19.4s, v19.4s, v4.4s -mla v8.4S, v15.4S, v31.s[0] -mla v11.4S, v21.4S, v31.s[0] -sub v21.4s, v17.4s, v1.4s -mla v6.4S, v18.4S, v31.s[0] -mla v14.4S, v20.4S, v31.s[0] -add v17.4s, v17.4s, v1.4s -sqrdmulh v1.4S, v2.4S, v27.s[1] -mul v2.4S, v2.4S,v28.s[1] -sub v20.4s, v16.4s, v7.4s -sqrdmulh v18.4S, v13.4S, v27.s[1] -mul v13.4S, v13.4S,v28.s[1] -add v16.4s, v16.4s, v7.4s -sqrdmulh v7.4S, v3.4S, v27.s[0] -mul v3.4S, v3.4S,v28.s[0] -sub v15.4s, v9.4s, v8.4s -add v9.4s, v9.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v27.s[0] -mul v19.4S, v19.4S,v28.s[0] -sub v4.4s, v10.4s, v11.4s -add v10.4s, v10.4s, v11.4s -mla v2.4S, v1.4S, v31.s[0] -mla v13.4S, v18.4S, v31.s[0] -sub v18.4s, v12.4s, v6.4s -mla v3.4S, v7.4S, v31.s[0] -mla v19.4S, v8.4S, v31.s[0] -add v12.4s, v12.4s, v6.4s -sqrdmulh v6.4S, v9.4S, v27.s[2] -mul v9.4S, v9.4S,v28.s[2] -sub v8.4s, v22.4s, v14.4s -sqrdmulh v7.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -add v22.4s, v22.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v1.4s, v21.4s, v2.4s -add v21.4s, v21.4s, v2.4s -sqrdmulh v2.4S, v4.4S, v27.s[3] -mul v4.4S, v4.4S,v28.s[3] -sub v11.4s, v20.4s, v13.4s -add v20.4s, v20.4s, v13.4s -mla v9.4S, v6.4S, v31.s[0] -mla v10.4S, v7.4S, v31.s[0] -sub v7.4s, v17.4s, v3.4s -mla v15.4S, v14.4S, v31.s[0] -mla v4.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v20.4S, v25.s[2] -mul v20.4S, v20.4S,v26.s[2] -sub v2.4s, v16.4s, v19.4s -sqrdmulh v14.4S, v11.4S, v25.s[3] -mul v11.4S, v11.4S,v26.s[3] -add v16.4s, v16.4s, v19.4s -sqrdmulh v19.4S, v2.4S, v25.s[1] -mul v2.4S, v2.4S,v26.s[1] -sub v6.4s, v12.4s, v9.4s -add v12.4s, v12.4s, v9.4s -sqrdmulh v9.4S, v16.4S, v25.s[0] -mul v16.4S, v16.4S,v26.s[0] -sub v13.4s, v22.4s, v10.4s -add v22.4s, v22.4s, v10.4s -mla v20.4S, v3.4S, v31.s[0] -mla v11.4S, v14.4S, v31.s[0] -sub v14.4s, v18.4s, v15.4s -mla v2.4S, v19.4S, v31.s[0] -mla v16.4S, v9.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v22.4S, v23.s[0] -mul v22.4S, v22.4S,v24.s[0] -sub v9.4s, v8.4s, v4.4s -sqrdmulh v19.4S, v13.4S, v23.s[1] -mul v13.4S, v13.4S,v24.s[1] -add v8.4s, v8.4s, v4.4s -sqrdmulh v4.4S, v8.4S, v23.s[2] -mul v8.4S, v8.4S,v24.s[2] -sub v3.4s, v21.4s, v20.4s -add v21.4s, v21.4s, v20.4s -sqrdmulh v20.4S, v9.4S, v23.s[3] -mul v9.4S, v9.4S,v24.s[3] -sub v10.4s, v1.4s, v11.4s -add v1.4s, v1.4s, v11.4s -mla v22.4S, v15.4S, v31.s[0] -mla v13.4S, v19.4S, v31.s[0] -sub v19.4s, v7.4s, v2.4s -str q21, [x0, #272] -mla v8.4S, v4.4S, v31.s[0] -mla v9.4S, v20.4S, v31.s[0] -add v7.4s, v7.4s, v2.4s -str q3, [x0, #336] -str q1, [x0, #400] -sub v1.4s, v17.4s, v16.4s -str q10, [x0, #464] -add v17.4s, v17.4s, v16.4s -sub v16.4s, v12.4s, v22.4s -add v12.4s, v12.4s, v22.4s -sub v22.4s, v6.4s, v13.4s -add v6.4s, v6.4s, v13.4s -str q7, [x0, #144] -sub v7.4s, v18.4s, v8.4s -str q19, [x0, #208] -add v18.4s, v18.4s, v8.4s -str q17, [x0, #16] -sub v17.4s, v14.4s, v9.4s -str q1, [x0, #80] -add v14.4s, v14.4s, v9.4s -str q12, [x0, #528] -str q16, [x0, #592] -str q6, [x0, #656] -str q22, [x0, #720] -str q18, [x0, #784] -str q7, [x0, #848] -str q14, [x0, #912] -str q17, [x0, #976] -ldr q0, [x17, #+128] -ldr q5, [x17, #+144] -ldr q11, [x17, #+160] -ldr q15, [x17, #+176] -ldr q21, [x17, #+192] -ldr q4, [x17, #+208] -ldr q20, [x17, #+224] -ldr q2, [x17, #+240] -ldr q3, [x0, #32] -ldr q30, [x0, #48] -ldr q29, [x0, #0] -ldr q28, [x0, #16] -sqrdmulh v27.4S, v3.4S, v5.s[0] -mul v3.4S, v3.4S,v0.s[0] -mla v3.4S, v27.4S, v31.s[0] -sub v27.4s, v29.4s, v3.4s -add v29.4s, v29.4s, v3.4s -sqrdmulh v3.4S, v30.4S, v5.s[0] -mul v30.4S, v30.4S,v0.s[0] -mla v30.4S, v3.4S, v31.s[0] -sub v3.4s, v28.4s, v30.4s -add v28.4s, v28.4s, v30.4s -sqrdmulh v30.4S, v28.4S, v5.s[1] -mul v28.4S, v28.4S,v0.s[1] -mla v28.4S, v30.4S, v31.s[0] -sub v30.4s, v29.4s, v28.4s -add v29.4s, v29.4s, v28.4s -sqrdmulh v28.4S, v3.4S, v5.s[2] -mul v3.4S, v3.4S,v0.s[2] -mla v3.4S, v28.4S, v31.s[0] -sub v28.4s, v27.4s, v3.4s -add v27.4s, v27.4s, v3.4s -trn1 v3.4S, v29.4S, v30.4S -trn2 v26.4S, v29.4S, v30.4S -trn1 v25.4S, v27.4S, v28.4S -trn2 v24.4S, v27.4S, v28.4S -trn2 v27.2D, v3.2D, v25.2D -trn2 v28.2D, v26.2D, v24.2D -trn1 v29.2D, v3.2D, v25.2D -trn1 v30.2D, v26.2D, v24.2D -sqrdmulh v24.4S, v27.4S, v15.4S -mul v27.4S, v27.4S,v11.4S -mla v27.4S, v24.4S, v31.s[0] -sub v24.4s, v29.4s, v27.4s -add v29.4s, v29.4s, v27.4s -sqrdmulh v27.4S, v28.4S, v15.4S -mul v28.4S, v28.4S,v11.4S -mla v28.4S, v27.4S, v31.s[0] -sub v27.4s, v30.4s, v28.4s -add v30.4s, v30.4s, v28.4s -sqrdmulh v28.4S, v30.4S, v4.4S -mul v30.4S, v30.4S,v21.4S -mla v30.4S, v28.4S, v31.s[0] -sub v28.4s, v29.4s, v30.4s -add v29.4s, v29.4s, v30.4s -sqrdmulh v30.4S, v27.4S, v2.4S -mul v27.4S, v27.4S,v20.4S -mla v27.4S, v30.4S, v31.s[0] -sub v30.4s, v24.4s, v27.4s -add v24.4s, v24.4s, v27.4s -str q29, [x0, #0] -str q28, [x0, #16] -str q24, [x0, #32] -str q30, [x0, #48] -ldr q30, [x17, #+256] -ldr q24, [x17, #+272] -ldr q28, [x17, #+288] -ldr q29, [x17, #+304] -ldr q27, [x17, #+320] -ldr q26, [x17, #+336] -ldr q25, [x17, #+352] -ldr q3, [x17, #+368] -ldr q2, [x0, #96] -ldr q20, [x0, #112] -ldr q4, [x0, #64] -ldr q21, [x0, #80] -sqrdmulh v15.4S, v2.4S, v24.s[0] -mul v2.4S, v2.4S,v30.s[0] -mla v2.4S, v15.4S, v31.s[0] -sub v15.4s, v4.4s, v2.4s -add v4.4s, v4.4s, v2.4s -sqrdmulh v2.4S, v20.4S, v24.s[0] -mul v20.4S, v20.4S,v30.s[0] -mla v20.4S, v2.4S, v31.s[0] -sub v2.4s, v21.4s, v20.4s -add v21.4s, v21.4s, v20.4s -sqrdmulh v20.4S, v21.4S, v24.s[1] -mul v21.4S, v21.4S,v30.s[1] -mla v21.4S, v20.4S, v31.s[0] -sub v20.4s, v4.4s, v21.4s -add v4.4s, v4.4s, v21.4s -sqrdmulh v21.4S, v2.4S, v24.s[2] -mul v2.4S, v2.4S,v30.s[2] -mla v2.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v2.4s -add v15.4s, v15.4s, v2.4s -trn1 v2.4S, v4.4S, v20.4S -trn2 v11.4S, v4.4S, v20.4S -trn1 v5.4S, v15.4S, v21.4S -trn2 v0.4S, v15.4S, v21.4S -trn2 v15.2D, v2.2D, v5.2D -trn2 v21.2D, v11.2D, v0.2D -trn1 v4.2D, v2.2D, v5.2D -trn1 v20.2D, v11.2D, v0.2D -sqrdmulh v0.4S, v15.4S, v29.4S -mul v15.4S, v15.4S,v28.4S -mla v15.4S, v0.4S, v31.s[0] -sub v0.4s, v4.4s, v15.4s -add v4.4s, v4.4s, v15.4s -sqrdmulh v15.4S, v21.4S, v29.4S -mul v21.4S, v21.4S,v28.4S -mla v21.4S, v15.4S, v31.s[0] -sub v15.4s, v20.4s, v21.4s -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v20.4S, v26.4S -mul v20.4S, v20.4S,v27.4S -mla v20.4S, v21.4S, v31.s[0] -sub v21.4s, v4.4s, v20.4s -add v4.4s, v4.4s, v20.4s -sqrdmulh v20.4S, v15.4S, v3.4S -mul v15.4S, v15.4S,v25.4S -mla v15.4S, v20.4S, v31.s[0] -sub v20.4s, v0.4s, v15.4s -add v0.4s, v0.4s, v15.4s -str q4, [x0, #64] -str q21, [x0, #80] -str q0, [x0, #96] -str q20, [x0, #112] -ldr q20, [x17, #+384] -ldr q0, [x17, #+400] -ldr q21, [x17, #+416] -ldr q4, [x17, #+432] -ldr q15, [x17, #+448] -ldr q11, [x17, #+464] -ldr q5, [x17, #+480] -ldr q2, [x17, #+496] -ldr q3, [x0, #160] -ldr q25, [x0, #176] -ldr q26, [x0, #128] -ldr q27, [x0, #144] -sqrdmulh v29.4S, v3.4S, v0.s[0] -mul v3.4S, v3.4S,v20.s[0] -mla v3.4S, v29.4S, v31.s[0] -sub v29.4s, v26.4s, v3.4s -add v26.4s, v26.4s, v3.4s -sqrdmulh v3.4S, v25.4S, v0.s[0] -mul v25.4S, v25.4S,v20.s[0] -mla v25.4S, v3.4S, v31.s[0] -sub v3.4s, v27.4s, v25.4s -add v27.4s, v27.4s, v25.4s -sqrdmulh v25.4S, v27.4S, v0.s[1] -mul v27.4S, v27.4S,v20.s[1] -mla v27.4S, v25.4S, v31.s[0] -sub v25.4s, v26.4s, v27.4s -add v26.4s, v26.4s, v27.4s -sqrdmulh v27.4S, v3.4S, v0.s[2] -mul v3.4S, v3.4S,v20.s[2] -mla v3.4S, v27.4S, v31.s[0] -sub v27.4s, v29.4s, v3.4s -add v29.4s, v29.4s, v3.4s -trn1 v3.4S, v26.4S, v25.4S -trn2 v28.4S, v26.4S, v25.4S -trn1 v24.4S, v29.4S, v27.4S -trn2 v30.4S, v29.4S, v27.4S -trn2 v29.2D, v3.2D, v24.2D -trn2 v27.2D, v28.2D, v30.2D -trn1 v26.2D, v3.2D, v24.2D -trn1 v25.2D, v28.2D, v30.2D -sqrdmulh v30.4S, v29.4S, v4.4S -mul v29.4S, v29.4S,v21.4S -mla v29.4S, v30.4S, v31.s[0] -sub v30.4s, v26.4s, v29.4s -add v26.4s, v26.4s, v29.4s -sqrdmulh v29.4S, v27.4S, v4.4S -mul v27.4S, v27.4S,v21.4S -mla v27.4S, v29.4S, v31.s[0] -sub v29.4s, v25.4s, v27.4s -add v25.4s, v25.4s, v27.4s -sqrdmulh v27.4S, v25.4S, v11.4S -mul v25.4S, v25.4S,v15.4S -mla v25.4S, v27.4S, v31.s[0] -sub v27.4s, v26.4s, v25.4s -add v26.4s, v26.4s, v25.4s -sqrdmulh v25.4S, v29.4S, v2.4S -mul v29.4S, v29.4S,v5.4S -mla v29.4S, v25.4S, v31.s[0] -sub v25.4s, v30.4s, v29.4s -add v30.4s, v30.4s, v29.4s -str q26, [x0, #128] -str q27, [x0, #144] -str q30, [x0, #160] -str q25, [x0, #176] -ldr q25, [x17, #+512] -ldr q30, [x17, #+528] -ldr q27, [x17, #+544] -ldr q26, [x17, #+560] -ldr q29, [x17, #+576] -ldr q28, [x17, #+592] -ldr q24, [x17, #+608] -ldr q3, [x17, #+624] -ldr q2, [x0, #224] -ldr q5, [x0, #240] -ldr q11, [x0, #192] -ldr q15, [x0, #208] -sqrdmulh v4.4S, v2.4S, v30.s[0] -mul v2.4S, v2.4S,v25.s[0] -mla v2.4S, v4.4S, v31.s[0] -sub v4.4s, v11.4s, v2.4s -add v11.4s, v11.4s, v2.4s -sqrdmulh v2.4S, v5.4S, v30.s[0] -mul v5.4S, v5.4S,v25.s[0] -mla v5.4S, v2.4S, v31.s[0] -sub v2.4s, v15.4s, v5.4s -add v15.4s, v15.4s, v5.4s -sqrdmulh v5.4S, v15.4S, v30.s[1] -mul v15.4S, v15.4S,v25.s[1] -mla v15.4S, v5.4S, v31.s[0] -sub v5.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v30.s[2] -mul v2.4S, v2.4S,v25.s[2] -mla v2.4S, v15.4S, v31.s[0] -sub v15.4s, v4.4s, v2.4s -add v4.4s, v4.4s, v2.4s -trn1 v2.4S, v11.4S, v5.4S -trn2 v21.4S, v11.4S, v5.4S -trn1 v0.4S, v4.4S, v15.4S -trn2 v20.4S, v4.4S, v15.4S -trn2 v4.2D, v2.2D, v0.2D -trn2 v15.2D, v21.2D, v20.2D -trn1 v11.2D, v2.2D, v0.2D -trn1 v5.2D, v21.2D, v20.2D -sqrdmulh v20.4S, v4.4S, v26.4S -mul v4.4S, v4.4S,v27.4S -mla v4.4S, v20.4S, v31.s[0] -sub v20.4s, v11.4s, v4.4s -add v11.4s, v11.4s, v4.4s -sqrdmulh v4.4S, v15.4S, v26.4S -mul v15.4S, v15.4S,v27.4S -mla v15.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v15.4s -add v5.4s, v5.4s, v15.4s -sqrdmulh v15.4S, v5.4S, v28.4S -mul v5.4S, v5.4S,v29.4S -mla v5.4S, v15.4S, v31.s[0] -sub v15.4s, v11.4s, v5.4s -add v11.4s, v11.4s, v5.4s -sqrdmulh v5.4S, v4.4S, v3.4S -mul v4.4S, v4.4S,v24.4S -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v20.4s, v4.4s -add v20.4s, v20.4s, v4.4s -str q11, [x0, #192] -str q15, [x0, #208] -str q20, [x0, #224] -str q5, [x0, #240] -ldr q5, [x17, #+640] -ldr q20, [x17, #+656] -ldr q15, [x17, #+672] -ldr q11, [x17, #+688] -ldr q4, [x17, #+704] -ldr q21, [x17, #+720] -ldr q0, [x17, #+736] -ldr q2, [x17, #+752] -ldr q3, [x0, #288] -ldr q24, [x0, #304] -ldr q28, [x0, #256] -ldr q29, [x0, #272] -sqrdmulh v26.4S, v3.4S, v20.s[0] -mul v3.4S, v3.4S,v5.s[0] -mla v3.4S, v26.4S, v31.s[0] -sub v26.4s, v28.4s, v3.4s -add v28.4s, v28.4s, v3.4s -sqrdmulh v3.4S, v24.4S, v20.s[0] -mul v24.4S, v24.4S,v5.s[0] -mla v24.4S, v3.4S, v31.s[0] -sub v3.4s, v29.4s, v24.4s -add v29.4s, v29.4s, v24.4s -sqrdmulh v24.4S, v29.4S, v20.s[1] -mul v29.4S, v29.4S,v5.s[1] -mla v29.4S, v24.4S, v31.s[0] -sub v24.4s, v28.4s, v29.4s -add v28.4s, v28.4s, v29.4s -sqrdmulh v29.4S, v3.4S, v20.s[2] -mul v3.4S, v3.4S,v5.s[2] -mla v3.4S, v29.4S, v31.s[0] -sub v29.4s, v26.4s, v3.4s -add v26.4s, v26.4s, v3.4s -trn1 v3.4S, v28.4S, v24.4S -trn2 v27.4S, v28.4S, v24.4S -trn1 v30.4S, v26.4S, v29.4S -trn2 v25.4S, v26.4S, v29.4S -trn2 v26.2D, v3.2D, v30.2D -trn2 v29.2D, v27.2D, v25.2D -trn1 v28.2D, v3.2D, v30.2D -trn1 v24.2D, v27.2D, v25.2D -sqrdmulh v25.4S, v26.4S, v11.4S -mul v26.4S, v26.4S,v15.4S -mla v26.4S, v25.4S, v31.s[0] -sub v25.4s, v28.4s, v26.4s -add v28.4s, v28.4s, v26.4s -sqrdmulh v26.4S, v29.4S, v11.4S -mul v29.4S, v29.4S,v15.4S -mla v29.4S, v26.4S, v31.s[0] -sub v26.4s, v24.4s, v29.4s -add v24.4s, v24.4s, v29.4s -sqrdmulh v29.4S, v24.4S, v21.4S -mul v24.4S, v24.4S,v4.4S -mla v24.4S, v29.4S, v31.s[0] -sub v29.4s, v28.4s, v24.4s -add v28.4s, v28.4s, v24.4s -sqrdmulh v24.4S, v26.4S, v2.4S -mul v26.4S, v26.4S,v0.4S -mla v26.4S, v24.4S, v31.s[0] -sub v24.4s, v25.4s, v26.4s -add v25.4s, v25.4s, v26.4s -str q28, [x0, #256] -str q29, [x0, #272] -str q25, [x0, #288] -str q24, [x0, #304] -ldr q24, [x17, #+768] -ldr q25, [x17, #+784] -ldr q29, [x17, #+800] -ldr q28, [x17, #+816] -ldr q26, [x17, #+832] -ldr q27, [x17, #+848] -ldr q30, [x17, #+864] -ldr q3, [x17, #+880] -ldr q2, [x0, #352] -ldr q0, [x0, #368] -ldr q21, [x0, #320] -ldr q4, [x0, #336] -sqrdmulh v11.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v24.s[0] -mla v2.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v2.4s -add v21.4s, v21.4s, v2.4s -sqrdmulh v2.4S, v0.4S, v25.s[0] -mul v0.4S, v0.4S,v24.s[0] -mla v0.4S, v2.4S, v31.s[0] -sub v2.4s, v4.4s, v0.4s -add v4.4s, v4.4s, v0.4s -sqrdmulh v0.4S, v4.4S, v25.s[1] -mul v4.4S, v4.4S,v24.s[1] -mla v4.4S, v0.4S, v31.s[0] -sub v0.4s, v21.4s, v4.4s -add v21.4s, v21.4s, v4.4s -sqrdmulh v4.4S, v2.4S, v25.s[2] -mul v2.4S, v2.4S,v24.s[2] -mla v2.4S, v4.4S, v31.s[0] -sub v4.4s, v11.4s, v2.4s -add v11.4s, v11.4s, v2.4s -trn1 v2.4S, v21.4S, v0.4S -trn2 v15.4S, v21.4S, v0.4S -trn1 v20.4S, v11.4S, v4.4S -trn2 v5.4S, v11.4S, v4.4S -trn2 v11.2D, v2.2D, v20.2D -trn2 v4.2D, v15.2D, v5.2D -trn1 v21.2D, v2.2D, v20.2D -trn1 v0.2D, v15.2D, v5.2D -sqrdmulh v5.4S, v11.4S, v28.4S -mul v11.4S, v11.4S,v29.4S -mla v11.4S, v5.4S, v31.s[0] -sub v5.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v4.4S, v28.4S -mul v4.4S, v4.4S,v29.4S -mla v4.4S, v11.4S, v31.s[0] -sub v11.4s, v0.4s, v4.4s -add v0.4s, v0.4s, v4.4s -sqrdmulh v4.4S, v0.4S, v27.4S -mul v0.4S, v0.4S,v26.4S -mla v0.4S, v4.4S, v31.s[0] -sub v4.4s, v21.4s, v0.4s -add v21.4s, v21.4s, v0.4s -sqrdmulh v0.4S, v11.4S, v3.4S -mul v11.4S, v11.4S,v30.4S -mla v11.4S, v0.4S, v31.s[0] -sub v0.4s, v5.4s, v11.4s -add v5.4s, v5.4s, v11.4s -str q21, [x0, #320] -str q4, [x0, #336] -str q5, [x0, #352] -str q0, [x0, #368] -ldr q0, [x17, #+896] -ldr q5, [x17, #+912] -ldr q4, [x17, #+928] -ldr q21, [x17, #+944] -ldr q11, [x17, #+960] -ldr q15, [x17, #+976] -ldr q20, [x17, #+992] -ldr q2, [x17, #+1008] -ldr q3, [x0, #416] -ldr q30, [x0, #432] -ldr q27, [x0, #384] -ldr q26, [x0, #400] -sqrdmulh v28.4S, v3.4S, v5.s[0] -mul v3.4S, v3.4S,v0.s[0] -mla v3.4S, v28.4S, v31.s[0] -sub v28.4s, v27.4s, v3.4s -add v27.4s, v27.4s, v3.4s -sqrdmulh v3.4S, v30.4S, v5.s[0] -mul v30.4S, v30.4S,v0.s[0] -mla v30.4S, v3.4S, v31.s[0] -sub v3.4s, v26.4s, v30.4s -add v26.4s, v26.4s, v30.4s -sqrdmulh v30.4S, v26.4S, v5.s[1] -mul v26.4S, v26.4S,v0.s[1] -mla v26.4S, v30.4S, v31.s[0] -sub v30.4s, v27.4s, v26.4s -add v27.4s, v27.4s, v26.4s -sqrdmulh v26.4S, v3.4S, v5.s[2] -mul v3.4S, v3.4S,v0.s[2] -mla v3.4S, v26.4S, v31.s[0] -sub v26.4s, v28.4s, v3.4s -add v28.4s, v28.4s, v3.4s -trn1 v3.4S, v27.4S, v30.4S -trn2 v29.4S, v27.4S, v30.4S -trn1 v25.4S, v28.4S, v26.4S -trn2 v24.4S, v28.4S, v26.4S -trn2 v28.2D, v3.2D, v25.2D -trn2 v26.2D, v29.2D, v24.2D -trn1 v27.2D, v3.2D, v25.2D -trn1 v30.2D, v29.2D, v24.2D -sqrdmulh v24.4S, v28.4S, v21.4S -mul v28.4S, v28.4S,v4.4S -mla v28.4S, v24.4S, v31.s[0] -sub v24.4s, v27.4s, v28.4s -add v27.4s, v27.4s, v28.4s -sqrdmulh v28.4S, v26.4S, v21.4S -mul v26.4S, v26.4S,v4.4S -mla v26.4S, v28.4S, v31.s[0] -sub v28.4s, v30.4s, v26.4s -add v30.4s, v30.4s, v26.4s -sqrdmulh v26.4S, v30.4S, v15.4S -mul v30.4S, v30.4S,v11.4S -mla v30.4S, v26.4S, v31.s[0] -sub v26.4s, v27.4s, v30.4s -add v27.4s, v27.4s, v30.4s -sqrdmulh v30.4S, v28.4S, v2.4S -mul v28.4S, v28.4S,v20.4S -mla v28.4S, v30.4S, v31.s[0] -sub v30.4s, v24.4s, v28.4s -add v24.4s, v24.4s, v28.4s -str q27, [x0, #384] -str q26, [x0, #400] -str q24, [x0, #416] -str q30, [x0, #432] -ldr q30, [x17, #+1024] -ldr q24, [x17, #+1040] -ldr q26, [x17, #+1056] -ldr q27, [x17, #+1072] -ldr q28, [x17, #+1088] -ldr q29, [x17, #+1104] -ldr q25, [x17, #+1120] -ldr q3, [x17, #+1136] -ldr q2, [x0, #480] -ldr q20, [x0, #496] -ldr q15, [x0, #448] -ldr q11, [x0, #464] -sqrdmulh v21.4S, v2.4S, v24.s[0] -mul v2.4S, v2.4S,v30.s[0] -mla v2.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v2.4s -add v15.4s, v15.4s, v2.4s -sqrdmulh v2.4S, v20.4S, v24.s[0] -mul v20.4S, v20.4S,v30.s[0] -mla v20.4S, v2.4S, v31.s[0] -sub v2.4s, v11.4s, v20.4s -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v11.4S, v24.s[1] -mul v11.4S, v11.4S,v30.s[1] -mla v11.4S, v20.4S, v31.s[0] -sub v20.4s, v15.4s, v11.4s -add v15.4s, v15.4s, v11.4s -sqrdmulh v11.4S, v2.4S, v24.s[2] -mul v2.4S, v2.4S,v30.s[2] -mla v2.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v2.4s -add v21.4s, v21.4s, v2.4s -trn1 v2.4S, v15.4S, v20.4S -trn2 v4.4S, v15.4S, v20.4S -trn1 v5.4S, v21.4S, v11.4S -trn2 v0.4S, v21.4S, v11.4S -trn2 v21.2D, v2.2D, v5.2D -trn2 v11.2D, v4.2D, v0.2D -trn1 v15.2D, v2.2D, v5.2D -trn1 v20.2D, v4.2D, v0.2D -sqrdmulh v0.4S, v21.4S, v27.4S -mul v21.4S, v21.4S,v26.4S -mla v21.4S, v0.4S, v31.s[0] -sub v0.4s, v15.4s, v21.4s -add v15.4s, v15.4s, v21.4s -sqrdmulh v21.4S, v11.4S, v27.4S -mul v11.4S, v11.4S,v26.4S -mla v11.4S, v21.4S, v31.s[0] -sub v21.4s, v20.4s, v11.4s -add v20.4s, v20.4s, v11.4s -sqrdmulh v11.4S, v20.4S, v29.4S -mul v20.4S, v20.4S,v28.4S -mla v20.4S, v11.4S, v31.s[0] -sub v11.4s, v15.4s, v20.4s -add v15.4s, v15.4s, v20.4s -sqrdmulh v20.4S, v21.4S, v3.4S -mul v21.4S, v21.4S,v25.4S -mla v21.4S, v20.4S, v31.s[0] -sub v20.4s, v0.4s, v21.4s -add v0.4s, v0.4s, v21.4s -str q15, [x0, #448] -str q11, [x0, #464] -str q0, [x0, #480] -str q20, [x0, #496] -ldr q20, [x17, #+1152] -ldr q0, [x17, #+1168] -ldr q11, [x17, #+1184] -ldr q15, [x17, #+1200] -ldr q21, [x17, #+1216] -ldr q4, [x17, #+1232] -ldr q5, [x17, #+1248] -ldr q2, [x17, #+1264] -ldr q3, [x0, #544] -ldr q25, [x0, #560] -ldr q29, [x0, #512] -ldr q28, [x0, #528] -sqrdmulh v27.4S, v3.4S, v0.s[0] -mul v3.4S, v3.4S,v20.s[0] -mla v3.4S, v27.4S, v31.s[0] -sub v27.4s, v29.4s, v3.4s -add v29.4s, v29.4s, v3.4s -sqrdmulh v3.4S, v25.4S, v0.s[0] -mul v25.4S, v25.4S,v20.s[0] -mla v25.4S, v3.4S, v31.s[0] -sub v3.4s, v28.4s, v25.4s -add v28.4s, v28.4s, v25.4s -sqrdmulh v25.4S, v28.4S, v0.s[1] -mul v28.4S, v28.4S,v20.s[1] -mla v28.4S, v25.4S, v31.s[0] -sub v25.4s, v29.4s, v28.4s -add v29.4s, v29.4s, v28.4s -sqrdmulh v28.4S, v3.4S, v0.s[2] -mul v3.4S, v3.4S,v20.s[2] -mla v3.4S, v28.4S, v31.s[0] -sub v28.4s, v27.4s, v3.4s -add v27.4s, v27.4s, v3.4s -trn1 v3.4S, v29.4S, v25.4S -trn2 v26.4S, v29.4S, v25.4S -trn1 v24.4S, v27.4S, v28.4S -trn2 v30.4S, v27.4S, v28.4S -trn2 v27.2D, v3.2D, v24.2D -trn2 v28.2D, v26.2D, v30.2D -trn1 v29.2D, v3.2D, v24.2D -trn1 v25.2D, v26.2D, v30.2D -sqrdmulh v30.4S, v27.4S, v15.4S -mul v27.4S, v27.4S,v11.4S -mla v27.4S, v30.4S, v31.s[0] -sub v30.4s, v29.4s, v27.4s -add v29.4s, v29.4s, v27.4s -sqrdmulh v27.4S, v28.4S, v15.4S -mul v28.4S, v28.4S,v11.4S -mla v28.4S, v27.4S, v31.s[0] -sub v27.4s, v25.4s, v28.4s -add v25.4s, v25.4s, v28.4s -sqrdmulh v28.4S, v25.4S, v4.4S -mul v25.4S, v25.4S,v21.4S -mla v25.4S, v28.4S, v31.s[0] -sub v28.4s, v29.4s, v25.4s -add v29.4s, v29.4s, v25.4s -sqrdmulh v25.4S, v27.4S, v2.4S -mul v27.4S, v27.4S,v5.4S -mla v27.4S, v25.4S, v31.s[0] -sub v25.4s, v30.4s, v27.4s -add v30.4s, v30.4s, v27.4s -str q29, [x0, #512] -str q28, [x0, #528] -str q30, [x0, #544] -str q25, [x0, #560] -ldr q25, [x17, #+1280] -ldr q30, [x17, #+1296] -ldr q28, [x17, #+1312] -ldr q29, [x17, #+1328] -ldr q27, [x17, #+1344] -ldr q26, [x17, #+1360] -ldr q24, [x17, #+1376] -ldr q3, [x17, #+1392] -ldr q2, [x0, #608] -ldr q5, [x0, #624] -ldr q4, [x0, #576] -ldr q21, [x0, #592] -sqrdmulh v15.4S, v2.4S, v30.s[0] -mul v2.4S, v2.4S,v25.s[0] -mla v2.4S, v15.4S, v31.s[0] -sub v15.4s, v4.4s, v2.4s -add v4.4s, v4.4s, v2.4s -sqrdmulh v2.4S, v5.4S, v30.s[0] -mul v5.4S, v5.4S,v25.s[0] -mla v5.4S, v2.4S, v31.s[0] -sub v2.4s, v21.4s, v5.4s -add v21.4s, v21.4s, v5.4s -sqrdmulh v5.4S, v21.4S, v30.s[1] -mul v21.4S, v21.4S,v25.s[1] -mla v21.4S, v5.4S, v31.s[0] -sub v5.4s, v4.4s, v21.4s -add v4.4s, v4.4s, v21.4s -sqrdmulh v21.4S, v2.4S, v30.s[2] -mul v2.4S, v2.4S,v25.s[2] -mla v2.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v2.4s -add v15.4s, v15.4s, v2.4s -trn1 v2.4S, v4.4S, v5.4S -trn2 v11.4S, v4.4S, v5.4S -trn1 v0.4S, v15.4S, v21.4S -trn2 v20.4S, v15.4S, v21.4S -trn2 v15.2D, v2.2D, v0.2D -trn2 v21.2D, v11.2D, v20.2D -trn1 v4.2D, v2.2D, v0.2D -trn1 v5.2D, v11.2D, v20.2D -sqrdmulh v20.4S, v15.4S, v29.4S -mul v15.4S, v15.4S,v28.4S -mla v15.4S, v20.4S, v31.s[0] -sub v20.4s, v4.4s, v15.4s -add v4.4s, v4.4s, v15.4s -sqrdmulh v15.4S, v21.4S, v29.4S -mul v21.4S, v21.4S,v28.4S -mla v21.4S, v15.4S, v31.s[0] -sub v15.4s, v5.4s, v21.4s -add v5.4s, v5.4s, v21.4s -sqrdmulh v21.4S, v5.4S, v26.4S -mul v5.4S, v5.4S,v27.4S -mla v5.4S, v21.4S, v31.s[0] -sub v21.4s, v4.4s, v5.4s -add v4.4s, v4.4s, v5.4s -sqrdmulh v5.4S, v15.4S, v3.4S -mul v15.4S, v15.4S,v24.4S -mla v15.4S, v5.4S, v31.s[0] -sub v5.4s, v20.4s, v15.4s -add v20.4s, v20.4s, v15.4s -str q4, [x0, #576] -str q21, [x0, #592] -str q20, [x0, #608] -str q5, [x0, #624] -ldr q5, [x17, #+1408] -ldr q20, [x17, #+1424] -ldr q21, [x17, #+1440] -ldr q4, [x17, #+1456] -ldr q15, [x17, #+1472] -ldr q11, [x17, #+1488] -ldr q0, [x17, #+1504] -ldr q2, [x17, #+1520] -ldr q3, [x0, #672] -ldr q24, [x0, #688] -ldr q26, [x0, #640] -ldr q27, [x0, #656] -sqrdmulh v29.4S, v3.4S, v20.s[0] -mul v3.4S, v3.4S,v5.s[0] -mla v3.4S, v29.4S, v31.s[0] -sub v29.4s, v26.4s, v3.4s -add v26.4s, v26.4s, v3.4s -sqrdmulh v3.4S, v24.4S, v20.s[0] -mul v24.4S, v24.4S,v5.s[0] -mla v24.4S, v3.4S, v31.s[0] -sub v3.4s, v27.4s, v24.4s -add v27.4s, v27.4s, v24.4s -sqrdmulh v24.4S, v27.4S, v20.s[1] -mul v27.4S, v27.4S,v5.s[1] -mla v27.4S, v24.4S, v31.s[0] -sub v24.4s, v26.4s, v27.4s -add v26.4s, v26.4s, v27.4s -sqrdmulh v27.4S, v3.4S, v20.s[2] -mul v3.4S, v3.4S,v5.s[2] -mla v3.4S, v27.4S, v31.s[0] -sub v27.4s, v29.4s, v3.4s -add v29.4s, v29.4s, v3.4s -trn1 v3.4S, v26.4S, v24.4S -trn2 v28.4S, v26.4S, v24.4S -trn1 v30.4S, v29.4S, v27.4S -trn2 v25.4S, v29.4S, v27.4S -trn2 v29.2D, v3.2D, v30.2D -trn2 v27.2D, v28.2D, v25.2D -trn1 v26.2D, v3.2D, v30.2D -trn1 v24.2D, v28.2D, v25.2D -sqrdmulh v25.4S, v29.4S, v4.4S -mul v29.4S, v29.4S,v21.4S -mla v29.4S, v25.4S, v31.s[0] -sub v25.4s, v26.4s, v29.4s -add v26.4s, v26.4s, v29.4s -sqrdmulh v29.4S, v27.4S, v4.4S -mul v27.4S, v27.4S,v21.4S -mla v27.4S, v29.4S, v31.s[0] -sub v29.4s, v24.4s, v27.4s -add v24.4s, v24.4s, v27.4s -sqrdmulh v27.4S, v24.4S, v11.4S -mul v24.4S, v24.4S,v15.4S -mla v24.4S, v27.4S, v31.s[0] -sub v27.4s, v26.4s, v24.4s -add v26.4s, v26.4s, v24.4s -sqrdmulh v24.4S, v29.4S, v2.4S -mul v29.4S, v29.4S,v0.4S -mla v29.4S, v24.4S, v31.s[0] -sub v24.4s, v25.4s, v29.4s -add v25.4s, v25.4s, v29.4s -str q26, [x0, #640] -str q27, [x0, #656] -str q25, [x0, #672] -str q24, [x0, #688] -ldr q24, [x17, #+1536] -ldr q25, [x17, #+1552] -ldr q27, [x17, #+1568] -ldr q26, [x17, #+1584] -ldr q29, [x17, #+1600] -ldr q28, [x17, #+1616] -ldr q30, [x17, #+1632] -ldr q3, [x17, #+1648] -ldr q2, [x0, #736] -ldr q0, [x0, #752] -ldr q11, [x0, #704] -ldr q15, [x0, #720] -sqrdmulh v4.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v24.s[0] -mla v2.4S, v4.4S, v31.s[0] -sub v4.4s, v11.4s, v2.4s -add v11.4s, v11.4s, v2.4s -sqrdmulh v2.4S, v0.4S, v25.s[0] -mul v0.4S, v0.4S,v24.s[0] -mla v0.4S, v2.4S, v31.s[0] -sub v2.4s, v15.4s, v0.4s -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v15.4S, v25.s[1] -mul v15.4S, v15.4S,v24.s[1] -mla v15.4S, v0.4S, v31.s[0] -sub v0.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v25.s[2] -mul v2.4S, v2.4S,v24.s[2] -mla v2.4S, v15.4S, v31.s[0] -sub v15.4s, v4.4s, v2.4s -add v4.4s, v4.4s, v2.4s -trn1 v2.4S, v11.4S, v0.4S -trn2 v21.4S, v11.4S, v0.4S -trn1 v20.4S, v4.4S, v15.4S -trn2 v5.4S, v4.4S, v15.4S -trn2 v4.2D, v2.2D, v20.2D -trn2 v15.2D, v21.2D, v5.2D -trn1 v11.2D, v2.2D, v20.2D -trn1 v0.2D, v21.2D, v5.2D -sqrdmulh v5.4S, v4.4S, v26.4S -mul v4.4S, v4.4S,v27.4S -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v11.4s, v4.4s -add v11.4s, v11.4s, v4.4s -sqrdmulh v4.4S, v15.4S, v26.4S -mul v15.4S, v15.4S,v27.4S -mla v15.4S, v4.4S, v31.s[0] -sub v4.4s, v0.4s, v15.4s -add v0.4s, v0.4s, v15.4s -sqrdmulh v15.4S, v0.4S, v28.4S -mul v0.4S, v0.4S,v29.4S -mla v0.4S, v15.4S, v31.s[0] -sub v15.4s, v11.4s, v0.4s -add v11.4s, v11.4s, v0.4s -sqrdmulh v0.4S, v4.4S, v3.4S -mul v4.4S, v4.4S,v30.4S -mla v4.4S, v0.4S, v31.s[0] -sub v0.4s, v5.4s, v4.4s -add v5.4s, v5.4s, v4.4s -str q11, [x0, #704] -str q15, [x0, #720] -str q5, [x0, #736] -str q0, [x0, #752] -ldr q0, [x17, #+1664] -ldr q5, [x17, #+1680] -ldr q15, [x17, #+1696] -ldr q11, [x17, #+1712] -ldr q4, [x17, #+1728] -ldr q21, [x17, #+1744] -ldr q20, [x17, #+1760] -ldr q2, [x17, #+1776] -ldr q3, [x0, #800] -ldr q30, [x0, #816] -ldr q28, [x0, #768] -ldr q29, [x0, #784] -sqrdmulh v26.4S, v3.4S, v5.s[0] -mul v3.4S, v3.4S,v0.s[0] -mla v3.4S, v26.4S, v31.s[0] -sub v26.4s, v28.4s, v3.4s -add v28.4s, v28.4s, v3.4s -sqrdmulh v3.4S, v30.4S, v5.s[0] -mul v30.4S, v30.4S,v0.s[0] -mla v30.4S, v3.4S, v31.s[0] -sub v3.4s, v29.4s, v30.4s -add v29.4s, v29.4s, v30.4s -sqrdmulh v30.4S, v29.4S, v5.s[1] -mul v29.4S, v29.4S,v0.s[1] -mla v29.4S, v30.4S, v31.s[0] -sub v30.4s, v28.4s, v29.4s -add v28.4s, v28.4s, v29.4s -sqrdmulh v29.4S, v3.4S, v5.s[2] -mul v3.4S, v3.4S,v0.s[2] -mla v3.4S, v29.4S, v31.s[0] -sub v29.4s, v26.4s, v3.4s -add v26.4s, v26.4s, v3.4s -trn1 v3.4S, v28.4S, v30.4S -trn2 v27.4S, v28.4S, v30.4S -trn1 v25.4S, v26.4S, v29.4S -trn2 v24.4S, v26.4S, v29.4S -trn2 v26.2D, v3.2D, v25.2D -trn2 v29.2D, v27.2D, v24.2D -trn1 v28.2D, v3.2D, v25.2D -trn1 v30.2D, v27.2D, v24.2D -sqrdmulh v24.4S, v26.4S, v11.4S -mul v26.4S, v26.4S,v15.4S -mla v26.4S, v24.4S, v31.s[0] -sub v24.4s, v28.4s, v26.4s -add v28.4s, v28.4s, v26.4s -sqrdmulh v26.4S, v29.4S, v11.4S -mul v29.4S, v29.4S,v15.4S -mla v29.4S, v26.4S, v31.s[0] -sub v26.4s, v30.4s, v29.4s -add v30.4s, v30.4s, v29.4s -sqrdmulh v29.4S, v30.4S, v21.4S -mul v30.4S, v30.4S,v4.4S -mla v30.4S, v29.4S, v31.s[0] -sub v29.4s, v28.4s, v30.4s -add v28.4s, v28.4s, v30.4s -sqrdmulh v30.4S, v26.4S, v2.4S -mul v26.4S, v26.4S,v20.4S -mla v26.4S, v30.4S, v31.s[0] -sub v30.4s, v24.4s, v26.4s -add v24.4s, v24.4s, v26.4s -str q28, [x0, #768] -str q29, [x0, #784] -str q24, [x0, #800] -str q30, [x0, #816] -ldr q30, [x17, #+1792] -ldr q24, [x17, #+1808] -ldr q29, [x17, #+1824] -ldr q28, [x17, #+1840] -ldr q26, [x17, #+1856] -ldr q27, [x17, #+1872] -ldr q25, [x17, #+1888] -ldr q3, [x17, #+1904] -ldr q2, [x0, #864] -ldr q20, [x0, #880] -ldr q21, [x0, #832] -ldr q4, [x0, #848] -sqrdmulh v11.4S, v2.4S, v24.s[0] -mul v2.4S, v2.4S,v30.s[0] -mla v2.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v2.4s -add v21.4s, v21.4s, v2.4s -sqrdmulh v2.4S, v20.4S, v24.s[0] -mul v20.4S, v20.4S,v30.s[0] -mla v20.4S, v2.4S, v31.s[0] -sub v2.4s, v4.4s, v20.4s -add v4.4s, v4.4s, v20.4s -sqrdmulh v20.4S, v4.4S, v24.s[1] -mul v4.4S, v4.4S,v30.s[1] -mla v4.4S, v20.4S, v31.s[0] -sub v20.4s, v21.4s, v4.4s -add v21.4s, v21.4s, v4.4s -sqrdmulh v4.4S, v2.4S, v24.s[2] -mul v2.4S, v2.4S,v30.s[2] -mla v2.4S, v4.4S, v31.s[0] -sub v4.4s, v11.4s, v2.4s -add v11.4s, v11.4s, v2.4s -trn1 v2.4S, v21.4S, v20.4S -trn2 v15.4S, v21.4S, v20.4S -trn1 v5.4S, v11.4S, v4.4S -trn2 v0.4S, v11.4S, v4.4S -trn2 v11.2D, v2.2D, v5.2D -trn2 v4.2D, v15.2D, v0.2D -trn1 v21.2D, v2.2D, v5.2D -trn1 v20.2D, v15.2D, v0.2D -sqrdmulh v0.4S, v11.4S, v28.4S -mul v11.4S, v11.4S,v29.4S -mla v11.4S, v0.4S, v31.s[0] -sub v0.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v4.4S, v28.4S -mul v4.4S, v4.4S,v29.4S -mla v4.4S, v11.4S, v31.s[0] -sub v11.4s, v20.4s, v4.4s -add v20.4s, v20.4s, v4.4s -sqrdmulh v4.4S, v20.4S, v27.4S -mul v20.4S, v20.4S,v26.4S -mla v20.4S, v4.4S, v31.s[0] -sub v4.4s, v21.4s, v20.4s -add v21.4s, v21.4s, v20.4s -sqrdmulh v20.4S, v11.4S, v3.4S -mul v11.4S, v11.4S,v25.4S -mla v11.4S, v20.4S, v31.s[0] -sub v20.4s, v0.4s, v11.4s -add v0.4s, v0.4s, v11.4s -str q21, [x0, #832] -str q4, [x0, #848] -str q0, [x0, #864] -str q20, [x0, #880] -ldr q20, [x17, #+1920] -ldr q0, [x17, #+1936] -ldr q4, [x17, #+1952] -ldr q21, [x17, #+1968] -ldr q11, [x17, #+1984] -ldr q15, [x17, #+2000] -ldr q5, [x17, #+2016] -ldr q2, [x17, #+2032] -ldr q3, [x0, #928] -ldr q25, [x0, #944] -ldr q27, [x0, #896] -ldr q26, [x0, #912] -sqrdmulh v28.4S, v3.4S, v0.s[0] -mul v3.4S, v3.4S,v20.s[0] -mla v3.4S, v28.4S, v31.s[0] -sub v28.4s, v27.4s, v3.4s -add v27.4s, v27.4s, v3.4s -sqrdmulh v3.4S, v25.4S, v0.s[0] -mul v25.4S, v25.4S,v20.s[0] -mla v25.4S, v3.4S, v31.s[0] -sub v3.4s, v26.4s, v25.4s -add v26.4s, v26.4s, v25.4s -sqrdmulh v25.4S, v26.4S, v0.s[1] -mul v26.4S, v26.4S,v20.s[1] -mla v26.4S, v25.4S, v31.s[0] -sub v25.4s, v27.4s, v26.4s -add v27.4s, v27.4s, v26.4s -sqrdmulh v26.4S, v3.4S, v0.s[2] -mul v3.4S, v3.4S,v20.s[2] -mla v3.4S, v26.4S, v31.s[0] -sub v26.4s, v28.4s, v3.4s -add v28.4s, v28.4s, v3.4s -trn1 v3.4S, v27.4S, v25.4S -trn2 v29.4S, v27.4S, v25.4S -trn1 v24.4S, v28.4S, v26.4S -trn2 v30.4S, v28.4S, v26.4S -trn2 v28.2D, v3.2D, v24.2D -trn2 v26.2D, v29.2D, v30.2D -trn1 v27.2D, v3.2D, v24.2D -trn1 v25.2D, v29.2D, v30.2D -sqrdmulh v30.4S, v28.4S, v21.4S -mul v28.4S, v28.4S,v4.4S -mla v28.4S, v30.4S, v31.s[0] -sub v30.4s, v27.4s, v28.4s -add v27.4s, v27.4s, v28.4s -sqrdmulh v28.4S, v26.4S, v21.4S -mul v26.4S, v26.4S,v4.4S -mla v26.4S, v28.4S, v31.s[0] -sub v28.4s, v25.4s, v26.4s -add v25.4s, v25.4s, v26.4s -sqrdmulh v26.4S, v25.4S, v15.4S -mul v25.4S, v25.4S,v11.4S -mla v25.4S, v26.4S, v31.s[0] -sub v26.4s, v27.4s, v25.4s -add v27.4s, v27.4s, v25.4s -sqrdmulh v25.4S, v28.4S, v2.4S -mul v28.4S, v28.4S,v5.4S -mla v28.4S, v25.4S, v31.s[0] -sub v25.4s, v30.4s, v28.4s -add v30.4s, v30.4s, v28.4s -str q27, [x0, #896] -str q26, [x0, #912] -str q30, [x0, #928] -str q25, [x0, #944] -ldr q25, [x17, #+2048] -ldr q30, [x17, #+2064] -ldr q26, [x17, #+2080] -ldr q27, [x17, #+2096] -ldr q28, [x17, #+2112] -ldr q29, [x17, #+2128] -ldr q24, [x17, #+2144] -ldr q3, [x17, #+2160] -ldr q2, [x0, #992] -ldr q5, [x0, #1008] -ldr q15, [x0, #960] -ldr q11, [x0, #976] -sqrdmulh v21.4S, v2.4S, v30.s[0] -mul v2.4S, v2.4S,v25.s[0] -mla v2.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v2.4s -add v15.4s, v15.4s, v2.4s -sqrdmulh v2.4S, v5.4S, v30.s[0] -mul v5.4S, v5.4S,v25.s[0] -mla v5.4S, v2.4S, v31.s[0] -sub v2.4s, v11.4s, v5.4s -add v11.4s, v11.4s, v5.4s -sqrdmulh v5.4S, v11.4S, v30.s[1] -mul v11.4S, v11.4S,v25.s[1] -mla v11.4S, v5.4S, v31.s[0] -sub v5.4s, v15.4s, v11.4s -add v15.4s, v15.4s, v11.4s -sqrdmulh v11.4S, v2.4S, v30.s[2] -mul v2.4S, v2.4S,v25.s[2] -mla v2.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v2.4s -add v21.4s, v21.4s, v2.4s -trn1 v2.4S, v15.4S, v5.4S -trn2 v4.4S, v15.4S, v5.4S -trn1 v0.4S, v21.4S, v11.4S -trn2 v20.4S, v21.4S, v11.4S -trn2 v21.2D, v2.2D, v0.2D -trn2 v11.2D, v4.2D, v20.2D -trn1 v15.2D, v2.2D, v0.2D -trn1 v5.2D, v4.2D, v20.2D -sqrdmulh v20.4S, v21.4S, v27.4S -mul v21.4S, v21.4S,v26.4S -mla v21.4S, v20.4S, v31.s[0] -sub v20.4s, v15.4s, v21.4s -add v15.4s, v15.4s, v21.4s -sqrdmulh v21.4S, v11.4S, v27.4S -mul v11.4S, v11.4S,v26.4S -mla v11.4S, v21.4S, v31.s[0] -sub v21.4s, v5.4s, v11.4s -add v5.4s, v5.4s, v11.4s -sqrdmulh v11.4S, v5.4S, v29.4S -mul v5.4S, v5.4S,v28.4S -mla v5.4S, v11.4S, v31.s[0] -sub v11.4s, v15.4s, v5.4s -add v15.4s, v15.4s, v5.4s -sqrdmulh v5.4S, v21.4S, v3.4S -mul v21.4S, v21.4S,v24.4S -mla v21.4S, v5.4S, v31.s[0] -sub v5.4s, v20.4s, v21.4s -add v20.4s, v20.4s, v21.4s -str q15, [x0, #960] -str q11, [x0, #976] -str q20, [x0, #992] -str q5, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 2392 -// Instruction count: 2388 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_0.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_0.s deleted file mode 100644 index 10f383d..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_0.s +++ /dev/null @@ -1,1474 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 23825509 // Layer 4, block 0 -.word 27028662 // Layer 4, block 1 -.word 0 // Layer None, block None -.word 1307297022 // Layer 3, block 0 -.word 1524716204 // Layer 4, block 0 -.word 1729702351 // Layer 4, block 1 -.word 0 // Layer None, block None -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 14626653 // Layer 3, block 1 -.word 14833295 // Layer 4, block 2 -.word 2138810 // Layer 4, block 3 -.word 0 // Layer None, block None -.word 936034350 // Layer 3, block 1 -.word 949258429 // Layer 4, block 2 -.word 136873393 // Layer 4, block 3 -.word 0 // Layer None, block None -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 29737761 // Layer 3, block 2 -.word 6490403 // Layer 4, block 4 -.word 19648405 // Layer 4, block 5 -.word 0 // Layer None, block None -.word 1903071454 // Layer 3, block 2 -.word 415354091 // Layer 4, block 4 -.word 1257401950 // Layer 4, block 5 -.word 0 // Layer None, block None -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 30285189 // Layer 3, block 3 -.word 31254932 // Layer 4, block 6 -.word 26362414 // Layer 4, block 7 -.word 0 // Layer None, block None -.word 1938104173 // Layer 3, block 3 -.word 2000162988 // Layer 4, block 6 -.word 1687065733 // Layer 4, block 7 -.word 0 // Layer None, block None -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 21289485 // Layer 3, block 4 -.word 572895 // Layer 4, block 8 -.word 26691971 // Layer 4, block 9 -.word 0 // Layer None, block None -.word 1362423055 // Layer 3, block 4 -.word 36662482 // Layer 4, block 8 -.word 1708155771 // Layer 4, block 9 -.word 0 // Layer None, block None -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 9914896 // Layer 3, block 5 -.word 9249292 // Layer 4, block 10 -.word 29292862 // Layer 4, block 11 -.word 0 // Layer None, block None -.word 634504916 // Layer 3, block 5 -.word 591909511 // Layer 4, block 10 -.word 1874600091 // Layer 4, block 11 -.word 0 // Layer None, block None -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 22603682 // Layer 3, block 6 -.word 8247799 // Layer 4, block 12 -.word 5086187 // Layer 4, block 13 -.word 0 // Layer None, block None -.word 1446525244 // Layer 3, block 6 -.word 527818851 // Layer 4, block 12 -.word 325491125 // Layer 4, block 13 -.word 0 // Layer None, block None -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 16204162 // Layer 3, block 7 -.word 28113639 // Layer 4, block 14 -.word 8471290 // Layer 4, block 15 -.word 0 // Layer None, block None -.word 1036987221 // Layer 3, block 7 -.word 1799135579 // Layer 4, block 14 -.word 542121183 // Layer 4, block 15 -.word 0 // Layer None, block None -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.text -.global ntt_u32_incomplete_neon_asm_var_3_3_0 -.global _ntt_u32_incomplete_neon_asm_var_3_3_0 -ntt_u32_incomplete_neon_asm_var_3_3_0: -_ntt_u32_incomplete_neon_asm_var_3_3_0: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x0, #960] -ldr q25, [x0, #832] -ldr q24, [x0, #576] -ldr q23, [x0, #704] -ldr q22, [x0, #448] -ldr q21, [x0, #320] -ldr q20, [x0, #64] -ldr q19, [x0, #192] -sqrdmulh v18.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -mla v26.4S, v18.4S, v31.s[0] -sub v18.4s, v22.4s, v26.4s -add v22.4s, v22.4s, v26.4s -sqrdmulh v26.4S, v25.4S, v29.s[0] -mul v25.4S, v25.4S,v30.s[0] -mla v25.4S, v26.4S, v31.s[0] -sub v26.4s, v21.4s, v25.4s -add v21.4s, v21.4s, v25.4s -sqrdmulh v25.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -mla v24.4S, v25.4S, v31.s[0] -sub v25.4s, v20.4s, v24.4s -add v20.4s, v20.4s, v24.4s -sqrdmulh v24.4S, v23.4S, v29.s[0] -mul v23.4S, v23.4S,v30.s[0] -mla v23.4S, v24.4S, v31.s[0] -sub v24.4s, v19.4s, v23.4s -add v19.4s, v19.4s, v23.4s -sqrdmulh v23.4S, v22.4S, v29.s[1] -mul v22.4S, v22.4S,v30.s[1] -mla v22.4S, v23.4S, v31.s[0] -sub v23.4s, v19.4s, v22.4s -add v19.4s, v19.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v20.4s, v21.4s -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v18.4S, v29.s[2] -mul v18.4S, v18.4S,v30.s[2] -mla v18.4S, v21.4S, v31.s[0] -sub v21.4s, v24.4s, v18.4s -add v24.4s, v24.4s, v18.4s -sqrdmulh v18.4S, v26.4S, v29.s[2] -mul v26.4S, v26.4S,v30.s[2] -mla v26.4S, v18.4S, v31.s[0] -sub v18.4s, v25.4s, v26.4s -add v25.4s, v25.4s, v26.4s -sqrdmulh v26.4S, v19.4S, v27.s[0] -mul v19.4S, v19.4S,v28.s[0] -mla v19.4S, v26.4S, v31.s[0] -sub v26.4s, v20.4s, v19.4s -add v20.4s, v20.4s, v19.4s -str q20, [x0, #64] -str q26, [x0, #192] -sqrdmulh v26.4S, v23.4S, v27.s[1] -mul v23.4S, v23.4S,v28.s[1] -mla v23.4S, v26.4S, v31.s[0] -sub v26.4s, v22.4s, v23.4s -add v22.4s, v22.4s, v23.4s -str q22, [x0, #320] -str q26, [x0, #448] -sqrdmulh v26.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -mla v21.4S, v26.4S, v31.s[0] -sub v26.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -str q18, [x0, #832] -str q26, [x0, #960] -sqrdmulh v26.4S, v24.4S, v27.s[2] -mul v24.4S, v24.4S,v28.s[2] -mla v24.4S, v26.4S, v31.s[0] -sub v26.4s, v25.4s, v24.4s -add v25.4s, v25.4s, v24.4s -str q25, [x0, #576] -str q26, [x0, #704] -ldr q26, [x0, #976] -ldr q25, [x0, #848] -ldr q24, [x0, #592] -ldr q18, [x0, #720] -ldr q21, [x0, #464] -ldr q22, [x0, #336] -ldr q23, [x0, #80] -ldr q20, [x0, #208] -sqrdmulh v19.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -mla v26.4S, v19.4S, v31.s[0] -sub v19.4s, v21.4s, v26.4s -add v21.4s, v21.4s, v26.4s -sqrdmulh v26.4S, v25.4S, v29.s[0] -mul v25.4S, v25.4S,v30.s[0] -mla v25.4S, v26.4S, v31.s[0] -sub v26.4s, v22.4s, v25.4s -add v22.4s, v22.4s, v25.4s -sqrdmulh v25.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -mla v24.4S, v25.4S, v31.s[0] -sub v25.4s, v23.4s, v24.4s -add v23.4s, v23.4s, v24.4s -sqrdmulh v24.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -mla v18.4S, v24.4S, v31.s[0] -sub v24.4s, v20.4s, v18.4s -add v20.4s, v20.4s, v18.4s -sqrdmulh v18.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -mla v21.4S, v18.4S, v31.s[0] -sub v18.4s, v20.4s, v21.4s -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v29.s[1] -mul v22.4S, v22.4S,v30.s[1] -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v23.4s, v22.4s -add v23.4s, v23.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -mla v19.4S, v22.4S, v31.s[0] -sub v22.4s, v24.4s, v19.4s -add v24.4s, v24.4s, v19.4s -sqrdmulh v19.4S, v26.4S, v29.s[2] -mul v26.4S, v26.4S,v30.s[2] -mla v26.4S, v19.4S, v31.s[0] -sub v19.4s, v25.4s, v26.4s -add v25.4s, v25.4s, v26.4s -sqrdmulh v26.4S, v20.4S, v27.s[0] -mul v20.4S, v20.4S,v28.s[0] -mla v20.4S, v26.4S, v31.s[0] -sub v26.4s, v23.4s, v20.4s -add v23.4s, v23.4s, v20.4s -str q23, [x0, #80] -str q26, [x0, #208] -sqrdmulh v26.4S, v18.4S, v27.s[1] -mul v18.4S, v18.4S,v28.s[1] -mla v18.4S, v26.4S, v31.s[0] -sub v26.4s, v21.4s, v18.4s -add v21.4s, v21.4s, v18.4s -str q21, [x0, #336] -str q26, [x0, #464] -sqrdmulh v26.4S, v22.4S, v27.s[3] -mul v22.4S, v22.4S,v28.s[3] -mla v22.4S, v26.4S, v31.s[0] -sub v26.4s, v19.4s, v22.4s -add v19.4s, v19.4s, v22.4s -str q19, [x0, #848] -str q26, [x0, #976] -sqrdmulh v26.4S, v24.4S, v27.s[2] -mul v24.4S, v24.4S,v28.s[2] -mla v24.4S, v26.4S, v31.s[0] -sub v26.4s, v25.4s, v24.4s -add v25.4s, v25.4s, v24.4s -str q25, [x0, #592] -str q26, [x0, #720] -ldr q26, [x0, #992] -ldr q25, [x0, #864] -ldr q24, [x0, #608] -ldr q19, [x0, #736] -ldr q22, [x0, #480] -ldr q21, [x0, #352] -ldr q18, [x0, #96] -ldr q23, [x0, #224] -sqrdmulh v20.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -mla v26.4S, v20.4S, v31.s[0] -sub v20.4s, v22.4s, v26.4s -add v22.4s, v22.4s, v26.4s -sqrdmulh v26.4S, v25.4S, v29.s[0] -mul v25.4S, v25.4S,v30.s[0] -mla v25.4S, v26.4S, v31.s[0] -sub v26.4s, v21.4s, v25.4s -add v21.4s, v21.4s, v25.4s -sqrdmulh v25.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -mla v24.4S, v25.4S, v31.s[0] -sub v25.4s, v18.4s, v24.4s -add v18.4s, v18.4s, v24.4s -sqrdmulh v24.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -mla v19.4S, v24.4S, v31.s[0] -sub v24.4s, v23.4s, v19.4s -add v23.4s, v23.4s, v19.4s -sqrdmulh v19.4S, v22.4S, v29.s[1] -mul v22.4S, v22.4S,v30.s[1] -mla v22.4S, v19.4S, v31.s[0] -sub v19.4s, v23.4s, v22.4s -add v23.4s, v23.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -mla v20.4S, v21.4S, v31.s[0] -sub v21.4s, v24.4s, v20.4s -add v24.4s, v24.4s, v20.4s -sqrdmulh v20.4S, v26.4S, v29.s[2] -mul v26.4S, v26.4S,v30.s[2] -mla v26.4S, v20.4S, v31.s[0] -sub v20.4s, v25.4s, v26.4s -add v25.4s, v25.4s, v26.4s -sqrdmulh v26.4S, v23.4S, v27.s[0] -mul v23.4S, v23.4S,v28.s[0] -mla v23.4S, v26.4S, v31.s[0] -sub v26.4s, v18.4s, v23.4s -add v18.4s, v18.4s, v23.4s -str q18, [x0, #96] -str q26, [x0, #224] -sqrdmulh v26.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -mla v19.4S, v26.4S, v31.s[0] -sub v26.4s, v22.4s, v19.4s -add v22.4s, v22.4s, v19.4s -str q22, [x0, #352] -str q26, [x0, #480] -sqrdmulh v26.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -mla v21.4S, v26.4S, v31.s[0] -sub v26.4s, v20.4s, v21.4s -add v20.4s, v20.4s, v21.4s -str q20, [x0, #864] -str q26, [x0, #992] -sqrdmulh v26.4S, v24.4S, v27.s[2] -mul v24.4S, v24.4S,v28.s[2] -mla v24.4S, v26.4S, v31.s[0] -sub v26.4s, v25.4s, v24.4s -add v25.4s, v25.4s, v24.4s -str q25, [x0, #608] -str q26, [x0, #736] -ldr q26, [x0, #1008] -ldr q25, [x0, #880] -ldr q24, [x0, #624] -ldr q20, [x0, #752] -ldr q21, [x0, #496] -ldr q22, [x0, #368] -ldr q19, [x0, #112] -ldr q18, [x0, #240] -sqrdmulh v23.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -mla v26.4S, v23.4S, v31.s[0] -sub v23.4s, v21.4s, v26.4s -add v21.4s, v21.4s, v26.4s -sqrdmulh v26.4S, v25.4S, v29.s[0] -mul v25.4S, v25.4S,v30.s[0] -mla v25.4S, v26.4S, v31.s[0] -sub v26.4s, v22.4s, v25.4s -add v22.4s, v22.4s, v25.4s -sqrdmulh v25.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -mla v24.4S, v25.4S, v31.s[0] -sub v25.4s, v19.4s, v24.4s -add v19.4s, v19.4s, v24.4s -sqrdmulh v24.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -mla v20.4S, v24.4S, v31.s[0] -sub v24.4s, v18.4s, v20.4s -add v18.4s, v18.4s, v20.4s -sqrdmulh v20.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -mla v21.4S, v20.4S, v31.s[0] -sub v20.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v29.s[1] -mul v22.4S, v22.4S,v30.s[1] -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v19.4s, v22.4s -add v19.4s, v19.4s, v22.4s -sqrdmulh v22.4S, v23.4S, v29.s[2] -mul v23.4S, v23.4S,v30.s[2] -mla v23.4S, v22.4S, v31.s[0] -sub v22.4s, v24.4s, v23.4s -add v24.4s, v24.4s, v23.4s -sqrdmulh v23.4S, v26.4S, v29.s[2] -mul v26.4S, v26.4S,v30.s[2] -mla v26.4S, v23.4S, v31.s[0] -sub v23.4s, v25.4s, v26.4s -add v25.4s, v25.4s, v26.4s -sqrdmulh v26.4S, v18.4S, v27.s[0] -mul v18.4S, v18.4S,v28.s[0] -mla v18.4S, v26.4S, v31.s[0] -sub v26.4s, v19.4s, v18.4s -add v19.4s, v19.4s, v18.4s -str q19, [x0, #112] -str q26, [x0, #240] -sqrdmulh v26.4S, v20.4S, v27.s[1] -mul v20.4S, v20.4S,v28.s[1] -mla v20.4S, v26.4S, v31.s[0] -sub v26.4s, v21.4s, v20.4s -add v21.4s, v21.4s, v20.4s -str q21, [x0, #368] -str q26, [x0, #496] -sqrdmulh v26.4S, v22.4S, v27.s[3] -mul v22.4S, v22.4S,v28.s[3] -mla v22.4S, v26.4S, v31.s[0] -sub v26.4s, v23.4s, v22.4s -add v23.4s, v23.4s, v22.4s -str q23, [x0, #880] -str q26, [x0, #1008] -sqrdmulh v26.4S, v24.4S, v27.s[2] -mul v24.4S, v24.4S,v28.s[2] -mla v24.4S, v26.4S, v31.s[0] -sub v26.4s, v25.4s, v24.4s -add v25.4s, v25.4s, v24.4s -str q25, [x0, #624] -str q26, [x0, #752] -ldr q26, [x0, #896] -ldr q25, [x0, #768] -ldr q24, [x0, #512] -ldr q23, [x0, #640] -ldr q22, [x0, #384] -ldr q21, [x0, #256] -ldr q20, [x0, #0] -ldr q19, [x0, #128] -sqrdmulh v18.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -mla v26.4S, v18.4S, v31.s[0] -sub v18.4s, v22.4s, v26.4s -add v22.4s, v22.4s, v26.4s -sqrdmulh v26.4S, v25.4S, v29.s[0] -mul v25.4S, v25.4S,v30.s[0] -mla v25.4S, v26.4S, v31.s[0] -sub v26.4s, v21.4s, v25.4s -add v21.4s, v21.4s, v25.4s -sqrdmulh v25.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -mla v24.4S, v25.4S, v31.s[0] -sub v25.4s, v20.4s, v24.4s -add v20.4s, v20.4s, v24.4s -sqrdmulh v24.4S, v23.4S, v29.s[0] -mul v23.4S, v23.4S,v30.s[0] -mla v23.4S, v24.4S, v31.s[0] -sub v24.4s, v19.4s, v23.4s -add v19.4s, v19.4s, v23.4s -sqrdmulh v23.4S, v22.4S, v29.s[1] -mul v22.4S, v22.4S,v30.s[1] -mla v22.4S, v23.4S, v31.s[0] -sub v23.4s, v19.4s, v22.4s -add v19.4s, v19.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v20.4s, v21.4s -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v18.4S, v29.s[2] -mul v18.4S, v18.4S,v30.s[2] -mla v18.4S, v21.4S, v31.s[0] -sub v21.4s, v24.4s, v18.4s -add v24.4s, v24.4s, v18.4s -sqrdmulh v18.4S, v26.4S, v29.s[2] -mul v26.4S, v26.4S,v30.s[2] -mla v26.4S, v18.4S, v31.s[0] -sub v18.4s, v25.4s, v26.4s -add v25.4s, v25.4s, v26.4s -sqrdmulh v26.4S, v19.4S, v27.s[0] -mul v19.4S, v19.4S,v28.s[0] -mla v19.4S, v26.4S, v31.s[0] -sub v26.4s, v20.4s, v19.4s -add v20.4s, v20.4s, v19.4s -str q20, [x0, #0] -str q26, [x0, #128] -sqrdmulh v26.4S, v23.4S, v27.s[1] -mul v23.4S, v23.4S,v28.s[1] -mla v23.4S, v26.4S, v31.s[0] -sub v26.4s, v22.4s, v23.4s -add v22.4s, v22.4s, v23.4s -str q22, [x0, #256] -str q26, [x0, #384] -sqrdmulh v26.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -mla v21.4S, v26.4S, v31.s[0] -sub v26.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -str q18, [x0, #768] -str q26, [x0, #896] -sqrdmulh v26.4S, v24.4S, v27.s[2] -mul v24.4S, v24.4S,v28.s[2] -mla v24.4S, v26.4S, v31.s[0] -sub v26.4s, v25.4s, v24.4s -add v25.4s, v25.4s, v24.4s -str q25, [x0, #512] -str q26, [x0, #640] -ldr q26, [x0, #912] -ldr q25, [x0, #784] -ldr q24, [x0, #528] -ldr q18, [x0, #656] -ldr q21, [x0, #400] -ldr q22, [x0, #272] -ldr q23, [x0, #16] -ldr q20, [x0, #144] -sqrdmulh v19.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -mla v26.4S, v19.4S, v31.s[0] -sub v19.4s, v21.4s, v26.4s -add v21.4s, v21.4s, v26.4s -sqrdmulh v26.4S, v25.4S, v29.s[0] -mul v25.4S, v25.4S,v30.s[0] -mla v25.4S, v26.4S, v31.s[0] -sub v26.4s, v22.4s, v25.4s -add v22.4s, v22.4s, v25.4s -sqrdmulh v25.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -mla v24.4S, v25.4S, v31.s[0] -sub v25.4s, v23.4s, v24.4s -add v23.4s, v23.4s, v24.4s -sqrdmulh v24.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -mla v18.4S, v24.4S, v31.s[0] -sub v24.4s, v20.4s, v18.4s -add v20.4s, v20.4s, v18.4s -sqrdmulh v18.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -mla v21.4S, v18.4S, v31.s[0] -sub v18.4s, v20.4s, v21.4s -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v29.s[1] -mul v22.4S, v22.4S,v30.s[1] -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v23.4s, v22.4s -add v23.4s, v23.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -mla v19.4S, v22.4S, v31.s[0] -sub v22.4s, v24.4s, v19.4s -add v24.4s, v24.4s, v19.4s -sqrdmulh v19.4S, v26.4S, v29.s[2] -mul v26.4S, v26.4S,v30.s[2] -mla v26.4S, v19.4S, v31.s[0] -sub v19.4s, v25.4s, v26.4s -add v25.4s, v25.4s, v26.4s -sqrdmulh v26.4S, v20.4S, v27.s[0] -mul v20.4S, v20.4S,v28.s[0] -mla v20.4S, v26.4S, v31.s[0] -sub v26.4s, v23.4s, v20.4s -add v23.4s, v23.4s, v20.4s -str q23, [x0, #16] -str q26, [x0, #144] -sqrdmulh v26.4S, v18.4S, v27.s[1] -mul v18.4S, v18.4S,v28.s[1] -mla v18.4S, v26.4S, v31.s[0] -sub v26.4s, v21.4s, v18.4s -add v21.4s, v21.4s, v18.4s -str q21, [x0, #272] -str q26, [x0, #400] -sqrdmulh v26.4S, v22.4S, v27.s[3] -mul v22.4S, v22.4S,v28.s[3] -mla v22.4S, v26.4S, v31.s[0] -sub v26.4s, v19.4s, v22.4s -add v19.4s, v19.4s, v22.4s -str q19, [x0, #784] -str q26, [x0, #912] -sqrdmulh v26.4S, v24.4S, v27.s[2] -mul v24.4S, v24.4S,v28.s[2] -mla v24.4S, v26.4S, v31.s[0] -sub v26.4s, v25.4s, v24.4s -add v25.4s, v25.4s, v24.4s -str q25, [x0, #528] -str q26, [x0, #656] -ldr q26, [x0, #928] -ldr q25, [x0, #800] -ldr q24, [x0, #544] -ldr q19, [x0, #672] -ldr q22, [x0, #416] -ldr q21, [x0, #288] -ldr q18, [x0, #32] -ldr q23, [x0, #160] -sqrdmulh v20.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -mla v26.4S, v20.4S, v31.s[0] -sub v20.4s, v22.4s, v26.4s -add v22.4s, v22.4s, v26.4s -sqrdmulh v26.4S, v25.4S, v29.s[0] -mul v25.4S, v25.4S,v30.s[0] -mla v25.4S, v26.4S, v31.s[0] -sub v26.4s, v21.4s, v25.4s -add v21.4s, v21.4s, v25.4s -sqrdmulh v25.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -mla v24.4S, v25.4S, v31.s[0] -sub v25.4s, v18.4s, v24.4s -add v18.4s, v18.4s, v24.4s -sqrdmulh v24.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -mla v19.4S, v24.4S, v31.s[0] -sub v24.4s, v23.4s, v19.4s -add v23.4s, v23.4s, v19.4s -sqrdmulh v19.4S, v22.4S, v29.s[1] -mul v22.4S, v22.4S,v30.s[1] -mla v22.4S, v19.4S, v31.s[0] -sub v19.4s, v23.4s, v22.4s -add v23.4s, v23.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -mla v20.4S, v21.4S, v31.s[0] -sub v21.4s, v24.4s, v20.4s -add v24.4s, v24.4s, v20.4s -sqrdmulh v20.4S, v26.4S, v29.s[2] -mul v26.4S, v26.4S,v30.s[2] -mla v26.4S, v20.4S, v31.s[0] -sub v20.4s, v25.4s, v26.4s -add v25.4s, v25.4s, v26.4s -sqrdmulh v26.4S, v23.4S, v27.s[0] -mul v23.4S, v23.4S,v28.s[0] -mla v23.4S, v26.4S, v31.s[0] -sub v26.4s, v18.4s, v23.4s -add v18.4s, v18.4s, v23.4s -str q18, [x0, #32] -str q26, [x0, #160] -sqrdmulh v26.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -mla v19.4S, v26.4S, v31.s[0] -sub v26.4s, v22.4s, v19.4s -add v22.4s, v22.4s, v19.4s -str q22, [x0, #288] -str q26, [x0, #416] -sqrdmulh v26.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -mla v21.4S, v26.4S, v31.s[0] -sub v26.4s, v20.4s, v21.4s -add v20.4s, v20.4s, v21.4s -str q20, [x0, #800] -str q26, [x0, #928] -sqrdmulh v26.4S, v24.4S, v27.s[2] -mul v24.4S, v24.4S,v28.s[2] -mla v24.4S, v26.4S, v31.s[0] -sub v26.4s, v25.4s, v24.4s -add v25.4s, v25.4s, v24.4s -str q25, [x0, #544] -str q26, [x0, #672] -ldr q26, [x0, #944] -ldr q25, [x0, #816] -ldr q24, [x0, #560] -ldr q20, [x0, #688] -ldr q21, [x0, #432] -ldr q22, [x0, #304] -ldr q19, [x0, #48] -ldr q18, [x0, #176] -sqrdmulh v23.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -mla v26.4S, v23.4S, v31.s[0] -sub v23.4s, v21.4s, v26.4s -add v21.4s, v21.4s, v26.4s -sqrdmulh v26.4S, v25.4S, v29.s[0] -mul v25.4S, v25.4S,v30.s[0] -mla v25.4S, v26.4S, v31.s[0] -sub v26.4s, v22.4s, v25.4s -add v22.4s, v22.4s, v25.4s -sqrdmulh v25.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -mla v24.4S, v25.4S, v31.s[0] -sub v25.4s, v19.4s, v24.4s -add v19.4s, v19.4s, v24.4s -sqrdmulh v24.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -mla v20.4S, v24.4S, v31.s[0] -sub v24.4s, v18.4s, v20.4s -add v18.4s, v18.4s, v20.4s -sqrdmulh v20.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -mla v21.4S, v20.4S, v31.s[0] -sub v20.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v29.s[1] -mul v22.4S, v22.4S,v30.s[1] -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v19.4s, v22.4s -add v19.4s, v19.4s, v22.4s -sqrdmulh v22.4S, v23.4S, v29.s[2] -mul v23.4S, v23.4S,v30.s[2] -mla v23.4S, v22.4S, v31.s[0] -sub v22.4s, v24.4s, v23.4s -add v24.4s, v24.4s, v23.4s -sqrdmulh v23.4S, v26.4S, v29.s[2] -mul v26.4S, v26.4S,v30.s[2] -mla v26.4S, v23.4S, v31.s[0] -sub v23.4s, v25.4s, v26.4s -add v25.4s, v25.4s, v26.4s -sqrdmulh v26.4S, v18.4S, v27.s[0] -mul v18.4S, v18.4S,v28.s[0] -mla v18.4S, v26.4S, v31.s[0] -sub v26.4s, v19.4s, v18.4s -add v19.4s, v19.4s, v18.4s -str q19, [x0, #48] -str q26, [x0, #176] -sqrdmulh v26.4S, v20.4S, v27.s[1] -mul v20.4S, v20.4S,v28.s[1] -mla v20.4S, v26.4S, v31.s[0] -sub v26.4s, v21.4s, v20.4s -add v21.4s, v21.4s, v20.4s -str q21, [x0, #304] -str q26, [x0, #432] -sqrdmulh v26.4S, v22.4S, v27.s[3] -mul v22.4S, v22.4S,v28.s[3] -mla v22.4S, v26.4S, v31.s[0] -sub v26.4s, v23.4s, v22.4s -add v23.4s, v23.4s, v22.4s -str q23, [x0, #816] -str q26, [x0, #944] -sqrdmulh v26.4S, v24.4S, v27.s[2] -mul v24.4S, v24.4S,v28.s[2] -mla v24.4S, v26.4S, v31.s[0] -sub v26.4s, v25.4s, v24.4s -add v25.4s, v25.4s, v24.4s -str q25, [x0, #560] -str q26, [x0, #688] -ldr q4, [x17, #+64] -ldr q5, [x17, #+80] -ldr q6, [x17, #+96] -ldr q7, [x17, #+112] -ldr q8, [x0, #112] -ldr q9, [x0, #96] -ldr q10, [x0, #64] -ldr q11, [x0, #80] -ldr q12, [x0, #48] -ldr q13, [x0, #32] -ldr q14, [x0, #0] -ldr q15, [x0, #16] -sqrdmulh v0.4S, v8.4S, v5.s[0] -mul v8.4S, v8.4S,v4.s[0] -mla v8.4S, v0.4S, v31.s[0] -sub v0.4s, v12.4s, v8.4s -add v12.4s, v12.4s, v8.4s -sqrdmulh v8.4S, v9.4S, v5.s[0] -mul v9.4S, v9.4S,v4.s[0] -mla v9.4S, v8.4S, v31.s[0] -sub v8.4s, v13.4s, v9.4s -add v13.4s, v13.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v5.s[0] -mul v10.4S, v10.4S,v4.s[0] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v14.4s, v10.4s -add v14.4s, v14.4s, v10.4s -sqrdmulh v10.4S, v11.4S, v5.s[0] -mul v11.4S, v11.4S,v4.s[0] -mla v11.4S, v10.4S, v31.s[0] -sub v10.4s, v15.4s, v11.4s -add v15.4s, v15.4s, v11.4s -sqrdmulh v11.4S, v12.4S, v5.s[1] -mul v12.4S, v12.4S,v4.s[1] -mla v12.4S, v11.4S, v31.s[0] -sub v11.4s, v15.4s, v12.4s -add v15.4s, v15.4s, v12.4s -ldr q12, [x17, #+128] -ldr q1, [x17, #+144] -ldr q2, [x17, #+160] -ldr q3, [x17, #+176] -sqrdmulh v16.4S, v13.4S, v5.s[1] -mul v13.4S, v13.4S,v4.s[1] -mla v13.4S, v16.4S, v31.s[0] -sub v16.4s, v14.4s, v13.4s -add v14.4s, v14.4s, v13.4s -sqrdmulh v13.4S, v0.4S, v5.s[2] -mul v0.4S, v0.4S,v4.s[2] -mla v0.4S, v13.4S, v31.s[0] -sub v13.4s, v10.4s, v0.4s -add v10.4s, v10.4s, v0.4s -sqrdmulh v0.4S, v8.4S, v5.s[2] -mul v8.4S, v8.4S,v4.s[2] -mla v8.4S, v0.4S, v31.s[0] -sub v0.4s, v9.4s, v8.4s -add v9.4s, v9.4s, v8.4s -sqrdmulh v8.4S, v15.4S, v7.s[0] -mul v15.4S, v15.4S,v6.s[0] -mla v15.4S, v8.4S, v31.s[0] -sub v8.4s, v14.4s, v15.4s -add v14.4s, v14.4s, v15.4s -str q14, [x0, #0] -str q8, [x0, #16] -sqrdmulh v8.4S, v11.4S, v7.s[1] -mul v11.4S, v11.4S,v6.s[1] -mla v11.4S, v8.4S, v31.s[0] -sub v8.4s, v16.4s, v11.4s -add v16.4s, v16.4s, v11.4s -str q16, [x0, #32] -str q8, [x0, #48] -sqrdmulh v8.4S, v13.4S, v7.s[3] -mul v13.4S, v13.4S,v6.s[3] -mla v13.4S, v8.4S, v31.s[0] -sub v8.4s, v0.4s, v13.4s -add v0.4s, v0.4s, v13.4s -str q0, [x0, #96] -str q8, [x0, #112] -sqrdmulh v8.4S, v10.4S, v7.s[2] -mul v10.4S, v10.4S,v6.s[2] -mla v10.4S, v8.4S, v31.s[0] -sub v8.4s, v9.4s, v10.4s -add v9.4s, v9.4s, v10.4s -str q9, [x0, #64] -str q8, [x0, #80] -ldr q8, [x0, #240] -ldr q9, [x0, #224] -ldr q10, [x0, #192] -ldr q0, [x0, #208] -ldr q13, [x0, #176] -ldr q16, [x0, #160] -ldr q11, [x0, #128] -ldr q14, [x0, #144] -sqrdmulh v15.4S, v8.4S, v1.s[0] -mul v8.4S, v8.4S,v12.s[0] -mla v8.4S, v15.4S, v31.s[0] -sub v15.4s, v13.4s, v8.4s -add v13.4s, v13.4s, v8.4s -sqrdmulh v8.4S, v9.4S, v1.s[0] -mul v9.4S, v9.4S,v12.s[0] -mla v9.4S, v8.4S, v31.s[0] -sub v8.4s, v16.4s, v9.4s -add v16.4s, v16.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v1.s[0] -mul v10.4S, v10.4S,v12.s[0] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v11.4s, v10.4s -add v11.4s, v11.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v1.s[0] -mul v0.4S, v0.4S,v12.s[0] -mla v0.4S, v10.4S, v31.s[0] -sub v10.4s, v14.4s, v0.4s -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v13.4S, v1.s[1] -mul v13.4S, v13.4S,v12.s[1] -mla v13.4S, v0.4S, v31.s[0] -sub v0.4s, v14.4s, v13.4s -add v14.4s, v14.4s, v13.4s -ldr q13, [x17, #+192] -ldr q17, [x17, #+208] -ldr q18, [x17, #+224] -ldr q19, [x17, #+240] -sqrdmulh v20.4S, v16.4S, v1.s[1] -mul v16.4S, v16.4S,v12.s[1] -mla v16.4S, v20.4S, v31.s[0] -sub v20.4s, v11.4s, v16.4s -add v11.4s, v11.4s, v16.4s -sqrdmulh v16.4S, v15.4S, v1.s[2] -mul v15.4S, v15.4S,v12.s[2] -mla v15.4S, v16.4S, v31.s[0] -sub v16.4s, v10.4s, v15.4s -add v10.4s, v10.4s, v15.4s -sqrdmulh v15.4S, v8.4S, v1.s[2] -mul v8.4S, v8.4S,v12.s[2] -mla v8.4S, v15.4S, v31.s[0] -sub v15.4s, v9.4s, v8.4s -add v9.4s, v9.4s, v8.4s -sqrdmulh v8.4S, v14.4S, v3.s[0] -mul v14.4S, v14.4S,v2.s[0] -mla v14.4S, v8.4S, v31.s[0] -sub v8.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -str q11, [x0, #128] -str q8, [x0, #144] -sqrdmulh v8.4S, v0.4S, v3.s[1] -mul v0.4S, v0.4S,v2.s[1] -mla v0.4S, v8.4S, v31.s[0] -sub v8.4s, v20.4s, v0.4s -add v20.4s, v20.4s, v0.4s -str q20, [x0, #160] -str q8, [x0, #176] -sqrdmulh v8.4S, v16.4S, v3.s[3] -mul v16.4S, v16.4S,v2.s[3] -mla v16.4S, v8.4S, v31.s[0] -sub v8.4s, v15.4s, v16.4s -add v15.4s, v15.4s, v16.4s -str q15, [x0, #224] -str q8, [x0, #240] -sqrdmulh v8.4S, v10.4S, v3.s[2] -mul v10.4S, v10.4S,v2.s[2] -mla v10.4S, v8.4S, v31.s[0] -sub v8.4s, v9.4s, v10.4s -add v9.4s, v9.4s, v10.4s -str q9, [x0, #192] -str q8, [x0, #208] -ldr q7, [x0, #368] -ldr q6, [x0, #352] -ldr q5, [x0, #320] -ldr q4, [x0, #336] -ldr q8, [x0, #304] -ldr q9, [x0, #288] -ldr q10, [x0, #256] -ldr q15, [x0, #272] -sqrdmulh v16.4S, v7.4S, v17.s[0] -mul v7.4S, v7.4S,v13.s[0] -mla v7.4S, v16.4S, v31.s[0] -sub v16.4s, v8.4s, v7.4s -add v8.4s, v8.4s, v7.4s -sqrdmulh v7.4S, v6.4S, v17.s[0] -mul v6.4S, v6.4S,v13.s[0] -mla v6.4S, v7.4S, v31.s[0] -sub v7.4s, v9.4s, v6.4s -add v9.4s, v9.4s, v6.4s -sqrdmulh v6.4S, v5.4S, v17.s[0] -mul v5.4S, v5.4S,v13.s[0] -mla v5.4S, v6.4S, v31.s[0] -sub v6.4s, v10.4s, v5.4s -add v10.4s, v10.4s, v5.4s -sqrdmulh v5.4S, v4.4S, v17.s[0] -mul v4.4S, v4.4S,v13.s[0] -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v15.4s, v4.4s -add v15.4s, v15.4s, v4.4s -sqrdmulh v4.4S, v8.4S, v17.s[1] -mul v8.4S, v8.4S,v13.s[1] -mla v8.4S, v4.4S, v31.s[0] -sub v4.4s, v15.4s, v8.4s -add v15.4s, v15.4s, v8.4s -ldr q8, [x17, #+256] -ldr q20, [x17, #+272] -ldr q0, [x17, #+288] -ldr q11, [x17, #+304] -sqrdmulh v14.4S, v9.4S, v17.s[1] -mul v9.4S, v9.4S,v13.s[1] -mla v9.4S, v14.4S, v31.s[0] -sub v14.4s, v10.4s, v9.4s -add v10.4s, v10.4s, v9.4s -sqrdmulh v9.4S, v16.4S, v17.s[2] -mul v16.4S, v16.4S,v13.s[2] -mla v16.4S, v9.4S, v31.s[0] -sub v9.4s, v5.4s, v16.4s -add v5.4s, v5.4s, v16.4s -sqrdmulh v16.4S, v7.4S, v17.s[2] -mul v7.4S, v7.4S,v13.s[2] -mla v7.4S, v16.4S, v31.s[0] -sub v16.4s, v6.4s, v7.4s -add v6.4s, v6.4s, v7.4s -sqrdmulh v7.4S, v15.4S, v19.s[0] -mul v15.4S, v15.4S,v18.s[0] -mla v15.4S, v7.4S, v31.s[0] -sub v7.4s, v10.4s, v15.4s -add v10.4s, v10.4s, v15.4s -str q10, [x0, #256] -str q7, [x0, #272] -sqrdmulh v7.4S, v4.4S, v19.s[1] -mul v4.4S, v4.4S,v18.s[1] -mla v4.4S, v7.4S, v31.s[0] -sub v7.4s, v14.4s, v4.4s -add v14.4s, v14.4s, v4.4s -str q14, [x0, #288] -str q7, [x0, #304] -sqrdmulh v7.4S, v9.4S, v19.s[3] -mul v9.4S, v9.4S,v18.s[3] -mla v9.4S, v7.4S, v31.s[0] -sub v7.4s, v16.4s, v9.4s -add v16.4s, v16.4s, v9.4s -str q16, [x0, #352] -str q7, [x0, #368] -sqrdmulh v7.4S, v5.4S, v19.s[2] -mul v5.4S, v5.4S,v18.s[2] -mla v5.4S, v7.4S, v31.s[0] -sub v7.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -str q6, [x0, #320] -str q7, [x0, #336] -ldr q3, [x0, #496] -ldr q2, [x0, #480] -ldr q1, [x0, #448] -ldr q12, [x0, #464] -ldr q7, [x0, #432] -ldr q6, [x0, #416] -ldr q5, [x0, #384] -ldr q16, [x0, #400] -sqrdmulh v9.4S, v3.4S, v20.s[0] -mul v3.4S, v3.4S,v8.s[0] -mla v3.4S, v9.4S, v31.s[0] -sub v9.4s, v7.4s, v3.4s -add v7.4s, v7.4s, v3.4s -sqrdmulh v3.4S, v2.4S, v20.s[0] -mul v2.4S, v2.4S,v8.s[0] -mla v2.4S, v3.4S, v31.s[0] -sub v3.4s, v6.4s, v2.4s -add v6.4s, v6.4s, v2.4s -sqrdmulh v2.4S, v1.4S, v20.s[0] -mul v1.4S, v1.4S,v8.s[0] -mla v1.4S, v2.4S, v31.s[0] -sub v2.4s, v5.4s, v1.4s -add v5.4s, v5.4s, v1.4s -sqrdmulh v1.4S, v12.4S, v20.s[0] -mul v12.4S, v12.4S,v8.s[0] -mla v12.4S, v1.4S, v31.s[0] -sub v1.4s, v16.4s, v12.4s -add v16.4s, v16.4s, v12.4s -sqrdmulh v12.4S, v7.4S, v20.s[1] -mul v7.4S, v7.4S,v8.s[1] -mla v7.4S, v12.4S, v31.s[0] -sub v12.4s, v16.4s, v7.4s -add v16.4s, v16.4s, v7.4s -ldr q7, [x17, #+320] -ldr q14, [x17, #+336] -ldr q4, [x17, #+352] -ldr q10, [x17, #+368] -sqrdmulh v15.4S, v6.4S, v20.s[1] -mul v6.4S, v6.4S,v8.s[1] -mla v6.4S, v15.4S, v31.s[0] -sub v15.4s, v5.4s, v6.4s -add v5.4s, v5.4s, v6.4s -sqrdmulh v6.4S, v9.4S, v20.s[2] -mul v9.4S, v9.4S,v8.s[2] -mla v9.4S, v6.4S, v31.s[0] -sub v6.4s, v1.4s, v9.4s -add v1.4s, v1.4s, v9.4s -sqrdmulh v9.4S, v3.4S, v20.s[2] -mul v3.4S, v3.4S,v8.s[2] -mla v3.4S, v9.4S, v31.s[0] -sub v9.4s, v2.4s, v3.4s -add v2.4s, v2.4s, v3.4s -sqrdmulh v3.4S, v16.4S, v11.s[0] -mul v16.4S, v16.4S,v0.s[0] -mla v16.4S, v3.4S, v31.s[0] -sub v3.4s, v5.4s, v16.4s -add v5.4s, v5.4s, v16.4s -str q5, [x0, #384] -str q3, [x0, #400] -sqrdmulh v3.4S, v12.4S, v11.s[1] -mul v12.4S, v12.4S,v0.s[1] -mla v12.4S, v3.4S, v31.s[0] -sub v3.4s, v15.4s, v12.4s -add v15.4s, v15.4s, v12.4s -str q15, [x0, #416] -str q3, [x0, #432] -sqrdmulh v3.4S, v6.4S, v11.s[3] -mul v6.4S, v6.4S,v0.s[3] -mla v6.4S, v3.4S, v31.s[0] -sub v3.4s, v9.4s, v6.4s -add v9.4s, v9.4s, v6.4s -str q9, [x0, #480] -str q3, [x0, #496] -sqrdmulh v3.4S, v1.4S, v11.s[2] -mul v1.4S, v1.4S,v0.s[2] -mla v1.4S, v3.4S, v31.s[0] -sub v3.4s, v2.4s, v1.4s -add v2.4s, v2.4s, v1.4s -str q2, [x0, #448] -str q3, [x0, #464] -ldr q19, [x0, #624] -ldr q18, [x0, #608] -ldr q17, [x0, #576] -ldr q13, [x0, #592] -ldr q3, [x0, #560] -ldr q2, [x0, #544] -ldr q1, [x0, #512] -ldr q9, [x0, #528] -sqrdmulh v6.4S, v19.4S, v14.s[0] -mul v19.4S, v19.4S,v7.s[0] -mla v19.4S, v6.4S, v31.s[0] -sub v6.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v14.s[0] -mul v18.4S, v18.4S,v7.s[0] -mla v18.4S, v19.4S, v31.s[0] -sub v19.4s, v2.4s, v18.4s -add v2.4s, v2.4s, v18.4s -sqrdmulh v18.4S, v17.4S, v14.s[0] -mul v17.4S, v17.4S,v7.s[0] -mla v17.4S, v18.4S, v31.s[0] -sub v18.4s, v1.4s, v17.4s -add v1.4s, v1.4s, v17.4s -sqrdmulh v17.4S, v13.4S, v14.s[0] -mul v13.4S, v13.4S,v7.s[0] -mla v13.4S, v17.4S, v31.s[0] -sub v17.4s, v9.4s, v13.4s -add v9.4s, v9.4s, v13.4s -sqrdmulh v13.4S, v3.4S, v14.s[1] -mul v3.4S, v3.4S,v7.s[1] -mla v3.4S, v13.4S, v31.s[0] -sub v13.4s, v9.4s, v3.4s -add v9.4s, v9.4s, v3.4s -ldr q3, [x17, #+384] -ldr q15, [x17, #+400] -ldr q12, [x17, #+416] -ldr q5, [x17, #+432] -sqrdmulh v16.4S, v2.4S, v14.s[1] -mul v2.4S, v2.4S,v7.s[1] -mla v2.4S, v16.4S, v31.s[0] -sub v16.4s, v1.4s, v2.4s -add v1.4s, v1.4s, v2.4s -sqrdmulh v2.4S, v6.4S, v14.s[2] -mul v6.4S, v6.4S,v7.s[2] -mla v6.4S, v2.4S, v31.s[0] -sub v2.4s, v17.4s, v6.4s -add v17.4s, v17.4s, v6.4s -sqrdmulh v6.4S, v19.4S, v14.s[2] -mul v19.4S, v19.4S,v7.s[2] -mla v19.4S, v6.4S, v31.s[0] -sub v6.4s, v18.4s, v19.4s -add v18.4s, v18.4s, v19.4s -sqrdmulh v19.4S, v9.4S, v10.s[0] -mul v9.4S, v9.4S,v4.s[0] -mla v9.4S, v19.4S, v31.s[0] -sub v19.4s, v1.4s, v9.4s -add v1.4s, v1.4s, v9.4s -str q1, [x0, #512] -str q19, [x0, #528] -sqrdmulh v19.4S, v13.4S, v10.s[1] -mul v13.4S, v13.4S,v4.s[1] -mla v13.4S, v19.4S, v31.s[0] -sub v19.4s, v16.4s, v13.4s -add v16.4s, v16.4s, v13.4s -str q16, [x0, #544] -str q19, [x0, #560] -sqrdmulh v19.4S, v2.4S, v10.s[3] -mul v2.4S, v2.4S,v4.s[3] -mla v2.4S, v19.4S, v31.s[0] -sub v19.4s, v6.4s, v2.4s -add v6.4s, v6.4s, v2.4s -str q6, [x0, #608] -str q19, [x0, #624] -sqrdmulh v19.4S, v17.4S, v10.s[2] -mul v17.4S, v17.4S,v4.s[2] -mla v17.4S, v19.4S, v31.s[0] -sub v19.4s, v18.4s, v17.4s -add v18.4s, v18.4s, v17.4s -str q18, [x0, #576] -str q19, [x0, #592] -ldr q11, [x0, #752] -ldr q0, [x0, #736] -ldr q20, [x0, #704] -ldr q8, [x0, #720] -ldr q19, [x0, #688] -ldr q18, [x0, #672] -ldr q17, [x0, #640] -ldr q6, [x0, #656] -sqrdmulh v2.4S, v11.4S, v15.s[0] -mul v11.4S, v11.4S,v3.s[0] -mla v11.4S, v2.4S, v31.s[0] -sub v2.4s, v19.4s, v11.4s -add v19.4s, v19.4s, v11.4s -sqrdmulh v11.4S, v0.4S, v15.s[0] -mul v0.4S, v0.4S,v3.s[0] -mla v0.4S, v11.4S, v31.s[0] -sub v11.4s, v18.4s, v0.4s -add v18.4s, v18.4s, v0.4s -sqrdmulh v0.4S, v20.4S, v15.s[0] -mul v20.4S, v20.4S,v3.s[0] -mla v20.4S, v0.4S, v31.s[0] -sub v0.4s, v17.4s, v20.4s -add v17.4s, v17.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v15.s[0] -mul v8.4S, v8.4S,v3.s[0] -mla v8.4S, v20.4S, v31.s[0] -sub v20.4s, v6.4s, v8.4s -add v6.4s, v6.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v15.s[1] -mul v19.4S, v19.4S,v3.s[1] -mla v19.4S, v8.4S, v31.s[0] -sub v8.4s, v6.4s, v19.4s -add v6.4s, v6.4s, v19.4s -ldr q19, [x17, #+448] -ldr q16, [x17, #+464] -ldr q13, [x17, #+480] -ldr q1, [x17, #+496] -sqrdmulh v9.4S, v18.4S, v15.s[1] -mul v18.4S, v18.4S,v3.s[1] -mla v18.4S, v9.4S, v31.s[0] -sub v9.4s, v17.4s, v18.4s -add v17.4s, v17.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v15.s[2] -mul v2.4S, v2.4S,v3.s[2] -mla v2.4S, v18.4S, v31.s[0] -sub v18.4s, v20.4s, v2.4s -add v20.4s, v20.4s, v2.4s -sqrdmulh v2.4S, v11.4S, v15.s[2] -mul v11.4S, v11.4S,v3.s[2] -mla v11.4S, v2.4S, v31.s[0] -sub v2.4s, v0.4s, v11.4s -add v0.4s, v0.4s, v11.4s -sqrdmulh v11.4S, v6.4S, v5.s[0] -mul v6.4S, v6.4S,v12.s[0] -mla v6.4S, v11.4S, v31.s[0] -sub v11.4s, v17.4s, v6.4s -add v17.4s, v17.4s, v6.4s -str q17, [x0, #640] -str q11, [x0, #656] -sqrdmulh v11.4S, v8.4S, v5.s[1] -mul v8.4S, v8.4S,v12.s[1] -mla v8.4S, v11.4S, v31.s[0] -sub v11.4s, v9.4s, v8.4s -add v9.4s, v9.4s, v8.4s -str q9, [x0, #672] -str q11, [x0, #688] -sqrdmulh v11.4S, v18.4S, v5.s[3] -mul v18.4S, v18.4S,v12.s[3] -mla v18.4S, v11.4S, v31.s[0] -sub v11.4s, v2.4s, v18.4s -add v2.4s, v2.4s, v18.4s -str q2, [x0, #736] -str q11, [x0, #752] -sqrdmulh v11.4S, v20.4S, v5.s[2] -mul v20.4S, v20.4S,v12.s[2] -mla v20.4S, v11.4S, v31.s[0] -sub v11.4s, v0.4s, v20.4s -add v0.4s, v0.4s, v20.4s -str q0, [x0, #704] -str q11, [x0, #720] -ldr q10, [x0, #880] -ldr q4, [x0, #864] -ldr q14, [x0, #832] -ldr q7, [x0, #848] -ldr q11, [x0, #816] -ldr q0, [x0, #800] -ldr q20, [x0, #768] -ldr q2, [x0, #784] -sqrdmulh v18.4S, v10.4S, v16.s[0] -mul v10.4S, v10.4S,v19.s[0] -mla v10.4S, v18.4S, v31.s[0] -sub v18.4s, v11.4s, v10.4s -add v11.4s, v11.4s, v10.4s -sqrdmulh v10.4S, v4.4S, v16.s[0] -mul v4.4S, v4.4S,v19.s[0] -mla v4.4S, v10.4S, v31.s[0] -sub v10.4s, v0.4s, v4.4s -add v0.4s, v0.4s, v4.4s -sqrdmulh v4.4S, v14.4S, v16.s[0] -mul v14.4S, v14.4S,v19.s[0] -mla v14.4S, v4.4S, v31.s[0] -sub v4.4s, v20.4s, v14.4s -add v20.4s, v20.4s, v14.4s -sqrdmulh v14.4S, v7.4S, v16.s[0] -mul v7.4S, v7.4S,v19.s[0] -mla v7.4S, v14.4S, v31.s[0] -sub v14.4s, v2.4s, v7.4s -add v2.4s, v2.4s, v7.4s -sqrdmulh v7.4S, v11.4S, v16.s[1] -mul v11.4S, v11.4S,v19.s[1] -mla v11.4S, v7.4S, v31.s[0] -sub v7.4s, v2.4s, v11.4s -add v2.4s, v2.4s, v11.4s -ldr q11, [x17, #+512] -ldr q9, [x17, #+528] -ldr q8, [x17, #+544] -ldr q17, [x17, #+560] -sqrdmulh v6.4S, v0.4S, v16.s[1] -mul v0.4S, v0.4S,v19.s[1] -mla v0.4S, v6.4S, v31.s[0] -sub v6.4s, v20.4s, v0.4s -add v20.4s, v20.4s, v0.4s -sqrdmulh v0.4S, v18.4S, v16.s[2] -mul v18.4S, v18.4S,v19.s[2] -mla v18.4S, v0.4S, v31.s[0] -sub v0.4s, v14.4s, v18.4s -add v14.4s, v14.4s, v18.4s -sqrdmulh v18.4S, v10.4S, v16.s[2] -mul v10.4S, v10.4S,v19.s[2] -mla v10.4S, v18.4S, v31.s[0] -sub v18.4s, v4.4s, v10.4s -add v4.4s, v4.4s, v10.4s -sqrdmulh v10.4S, v2.4S, v1.s[0] -mul v2.4S, v2.4S,v13.s[0] -mla v2.4S, v10.4S, v31.s[0] -sub v10.4s, v20.4s, v2.4s -add v20.4s, v20.4s, v2.4s -str q20, [x0, #768] -str q10, [x0, #784] -sqrdmulh v10.4S, v7.4S, v1.s[1] -mul v7.4S, v7.4S,v13.s[1] -mla v7.4S, v10.4S, v31.s[0] -sub v10.4s, v6.4s, v7.4s -add v6.4s, v6.4s, v7.4s -str q6, [x0, #800] -str q10, [x0, #816] -sqrdmulh v10.4S, v0.4S, v1.s[3] -mul v0.4S, v0.4S,v13.s[3] -mla v0.4S, v10.4S, v31.s[0] -sub v10.4s, v18.4s, v0.4s -add v18.4s, v18.4s, v0.4s -str q18, [x0, #864] -str q10, [x0, #880] -sqrdmulh v10.4S, v14.4S, v1.s[2] -mul v14.4S, v14.4S,v13.s[2] -mla v14.4S, v10.4S, v31.s[0] -sub v10.4s, v4.4s, v14.4s -add v4.4s, v4.4s, v14.4s -str q4, [x0, #832] -str q10, [x0, #848] -ldr q5, [x0, #1008] -ldr q12, [x0, #992] -ldr q15, [x0, #960] -ldr q3, [x0, #976] -ldr q10, [x0, #944] -ldr q4, [x0, #928] -ldr q14, [x0, #896] -ldr q18, [x0, #912] -sqrdmulh v0.4S, v5.4S, v9.s[0] -mul v5.4S, v5.4S,v11.s[0] -mla v5.4S, v0.4S, v31.s[0] -sub v0.4s, v10.4s, v5.4s -add v10.4s, v10.4s, v5.4s -sqrdmulh v5.4S, v12.4S, v9.s[0] -mul v12.4S, v12.4S,v11.s[0] -mla v12.4S, v5.4S, v31.s[0] -sub v5.4s, v4.4s, v12.4s -add v4.4s, v4.4s, v12.4s -sqrdmulh v12.4S, v15.4S, v9.s[0] -mul v15.4S, v15.4S,v11.s[0] -mla v15.4S, v12.4S, v31.s[0] -sub v12.4s, v14.4s, v15.4s -add v14.4s, v14.4s, v15.4s -sqrdmulh v15.4S, v3.4S, v9.s[0] -mul v3.4S, v3.4S,v11.s[0] -mla v3.4S, v15.4S, v31.s[0] -sub v15.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v10.4S, v9.s[1] -mul v10.4S, v10.4S,v11.s[1] -mla v10.4S, v3.4S, v31.s[0] -sub v3.4s, v18.4s, v10.4s -add v18.4s, v18.4s, v10.4s -sqrdmulh v10.4S, v4.4S, v9.s[1] -mul v4.4S, v4.4S,v11.s[1] -mla v4.4S, v10.4S, v31.s[0] -sub v10.4s, v14.4s, v4.4s -add v14.4s, v14.4s, v4.4s -sqrdmulh v4.4S, v0.4S, v9.s[2] -mul v0.4S, v0.4S,v11.s[2] -mla v0.4S, v4.4S, v31.s[0] -sub v4.4s, v15.4s, v0.4s -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v5.4S, v9.s[2] -mul v5.4S, v5.4S,v11.s[2] -mla v5.4S, v0.4S, v31.s[0] -sub v0.4s, v12.4s, v5.4s -add v12.4s, v12.4s, v5.4s -sqrdmulh v5.4S, v18.4S, v17.s[0] -mul v18.4S, v18.4S,v8.s[0] -mla v18.4S, v5.4S, v31.s[0] -sub v5.4s, v14.4s, v18.4s -add v14.4s, v14.4s, v18.4s -str q14, [x0, #896] -str q5, [x0, #912] -sqrdmulh v5.4S, v3.4S, v17.s[1] -mul v3.4S, v3.4S,v8.s[1] -mla v3.4S, v5.4S, v31.s[0] -sub v5.4s, v10.4s, v3.4s -add v10.4s, v10.4s, v3.4s -str q10, [x0, #928] -str q5, [x0, #944] -sqrdmulh v5.4S, v4.4S, v17.s[3] -mul v4.4S, v4.4S,v8.s[3] -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v0.4s, v4.4s -add v0.4s, v0.4s, v4.4s -str q0, [x0, #992] -str q5, [x0, #1008] -sqrdmulh v5.4S, v15.4S, v17.s[2] -mul v15.4S, v15.4S,v8.s[2] -mla v15.4S, v5.4S, v31.s[0] -sub v5.4s, v12.4s, v15.4s -add v12.4s, v12.4s, v15.4s -str q12, [x0, #960] -str q5, [x0, #976] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1444 -// Instruction count: 1440 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_1.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_1.s deleted file mode 100644 index bef74a7..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_1.s +++ /dev/null @@ -1,1474 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 23825509 // Layer 4, block 0 -.word 27028662 // Layer 4, block 1 -.word 0 // Layer None, block None -.word 1307297022 // Layer 3, block 0 -.word 1524716204 // Layer 4, block 0 -.word 1729702351 // Layer 4, block 1 -.word 0 // Layer None, block None -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 14626653 // Layer 3, block 1 -.word 14833295 // Layer 4, block 2 -.word 2138810 // Layer 4, block 3 -.word 0 // Layer None, block None -.word 936034350 // Layer 3, block 1 -.word 949258429 // Layer 4, block 2 -.word 136873393 // Layer 4, block 3 -.word 0 // Layer None, block None -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 29737761 // Layer 3, block 2 -.word 6490403 // Layer 4, block 4 -.word 19648405 // Layer 4, block 5 -.word 0 // Layer None, block None -.word 1903071454 // Layer 3, block 2 -.word 415354091 // Layer 4, block 4 -.word 1257401950 // Layer 4, block 5 -.word 0 // Layer None, block None -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 30285189 // Layer 3, block 3 -.word 31254932 // Layer 4, block 6 -.word 26362414 // Layer 4, block 7 -.word 0 // Layer None, block None -.word 1938104173 // Layer 3, block 3 -.word 2000162988 // Layer 4, block 6 -.word 1687065733 // Layer 4, block 7 -.word 0 // Layer None, block None -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 21289485 // Layer 3, block 4 -.word 572895 // Layer 4, block 8 -.word 26691971 // Layer 4, block 9 -.word 0 // Layer None, block None -.word 1362423055 // Layer 3, block 4 -.word 36662482 // Layer 4, block 8 -.word 1708155771 // Layer 4, block 9 -.word 0 // Layer None, block None -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 9914896 // Layer 3, block 5 -.word 9249292 // Layer 4, block 10 -.word 29292862 // Layer 4, block 11 -.word 0 // Layer None, block None -.word 634504916 // Layer 3, block 5 -.word 591909511 // Layer 4, block 10 -.word 1874600091 // Layer 4, block 11 -.word 0 // Layer None, block None -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 22603682 // Layer 3, block 6 -.word 8247799 // Layer 4, block 12 -.word 5086187 // Layer 4, block 13 -.word 0 // Layer None, block None -.word 1446525244 // Layer 3, block 6 -.word 527818851 // Layer 4, block 12 -.word 325491125 // Layer 4, block 13 -.word 0 // Layer None, block None -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 16204162 // Layer 3, block 7 -.word 28113639 // Layer 4, block 14 -.word 8471290 // Layer 4, block 15 -.word 0 // Layer None, block None -.word 1036987221 // Layer 3, block 7 -.word 1799135579 // Layer 4, block 14 -.word 542121183 // Layer 4, block 15 -.word 0 // Layer None, block None -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.text -.global ntt_u32_incomplete_neon_asm_var_3_3_1 -.global _ntt_u32_incomplete_neon_asm_var_3_3_1 -ntt_u32_incomplete_neon_asm_var_3_3_1: -_ntt_u32_incomplete_neon_asm_var_3_3_1: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x0, #960] -ldr q25, [x0, #832] -sqrdmulh v24.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -ldr q23, [x0, #576] -ldr q22, [x0, #704] -sqrdmulh v21.4S, v25.4S, v29.s[0] -mla v26.4S, v24.4S, v31.s[0] -mul v25.4S, v25.4S,v30.s[0] -ldr q24, [x0, #448] -ldr q20, [x0, #320] -sqrdmulh v19.4S, v23.4S, v29.s[0] -sub v18.4s, v24.4s, v26.4s -mla v25.4S, v21.4S, v31.s[0] -mul v23.4S, v23.4S,v30.s[0] -add v24.4s, v24.4s, v26.4s -ldr q26, [x0, #64] -ldr q21, [x0, #192] -sqrdmulh v17.4S, v22.4S, v29.s[0] -sub v16.4s, v20.4s, v25.4s -mla v23.4S, v19.4S, v31.s[0] -mul v22.4S, v22.4S,v30.s[0] -add v20.4s, v20.4s, v25.4s -sqrdmulh v25.4S, v24.4S, v29.s[1] -sub v19.4s, v26.4s, v23.4s -mla v22.4S, v17.4S, v31.s[0] -mul v24.4S, v24.4S,v30.s[1] -add v26.4s, v26.4s, v23.4s -sqrdmulh v23.4S, v20.4S, v29.s[1] -sub v17.4s, v21.4s, v22.4s -mla v24.4S, v25.4S, v31.s[0] -mul v20.4S, v20.4S,v30.s[1] -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v18.4S, v29.s[2] -sub v25.4s, v21.4s, v24.4s -mla v20.4S, v23.4S, v31.s[0] -mul v18.4S, v18.4S,v30.s[2] -add v21.4s, v21.4s, v24.4s -sqrdmulh v24.4S, v16.4S, v29.s[2] -sub v23.4s, v26.4s, v20.4s -mla v18.4S, v22.4S, v31.s[0] -mul v16.4S, v16.4S,v30.s[2] -add v26.4s, v26.4s, v20.4s -sqrdmulh v20.4S, v21.4S, v27.s[0] -sub v22.4s, v17.4s, v18.4s -mla v16.4S, v24.4S, v31.s[0] -mul v21.4S, v21.4S,v28.s[0] -add v17.4s, v17.4s, v18.4s -sqrdmulh v18.4S, v25.4S, v27.s[1] -sub v24.4s, v19.4s, v16.4s -mla v21.4S, v20.4S, v31.s[0] -mul v25.4S, v25.4S,v28.s[1] -add v19.4s, v19.4s, v16.4s -sqrdmulh v16.4S, v22.4S, v27.s[3] -sub v20.4s, v26.4s, v21.4s -mla v25.4S, v18.4S, v31.s[0] -mul v22.4S, v22.4S,v28.s[3] -add v26.4s, v26.4s, v21.4s -str q26, [x0, #64] -str q20, [x0, #192] -sqrdmulh v20.4S, v17.4S, v27.s[2] -sub v26.4s, v23.4s, v25.4s -mla v22.4S, v16.4S, v31.s[0] -mul v17.4S, v17.4S,v28.s[2] -add v23.4s, v23.4s, v25.4s -str q23, [x0, #320] -str q26, [x0, #448] -ldr q26, [x0, #976] -ldr q23, [x0, #848] -sqrdmulh v25.4S, v26.4S, v29.s[0] -sub v16.4s, v24.4s, v22.4s -mla v17.4S, v20.4S, v31.s[0] -mul v26.4S, v26.4S,v30.s[0] -add v24.4s, v24.4s, v22.4s -str q24, [x0, #832] -str q16, [x0, #960] -ldr q16, [x0, #592] -ldr q24, [x0, #720] -sqrdmulh v22.4S, v23.4S, v29.s[0] -sub v20.4s, v19.4s, v17.4s -mla v26.4S, v25.4S, v31.s[0] -mul v23.4S, v23.4S,v30.s[0] -add v19.4s, v19.4s, v17.4s -str q19, [x0, #576] -str q20, [x0, #704] -ldr q20, [x0, #464] -ldr q19, [x0, #336] -sqrdmulh v17.4S, v16.4S, v29.s[0] -sub v25.4s, v20.4s, v26.4s -mla v23.4S, v22.4S, v31.s[0] -mul v16.4S, v16.4S,v30.s[0] -add v20.4s, v20.4s, v26.4s -ldr q26, [x0, #80] -ldr q22, [x0, #208] -sqrdmulh v21.4S, v24.4S, v29.s[0] -sub v18.4s, v19.4s, v23.4s -mla v16.4S, v17.4S, v31.s[0] -mul v24.4S, v24.4S,v30.s[0] -add v19.4s, v19.4s, v23.4s -sqrdmulh v23.4S, v20.4S, v29.s[1] -sub v17.4s, v26.4s, v16.4s -mla v24.4S, v21.4S, v31.s[0] -mul v20.4S, v20.4S,v30.s[1] -add v26.4s, v26.4s, v16.4s -sqrdmulh v16.4S, v19.4S, v29.s[1] -sub v21.4s, v22.4s, v24.4s -mla v20.4S, v23.4S, v31.s[0] -mul v19.4S, v19.4S,v30.s[1] -add v22.4s, v22.4s, v24.4s -sqrdmulh v24.4S, v25.4S, v29.s[2] -sub v23.4s, v22.4s, v20.4s -mla v19.4S, v16.4S, v31.s[0] -mul v25.4S, v25.4S,v30.s[2] -add v22.4s, v22.4s, v20.4s -sqrdmulh v20.4S, v18.4S, v29.s[2] -sub v16.4s, v26.4s, v19.4s -mla v25.4S, v24.4S, v31.s[0] -mul v18.4S, v18.4S,v30.s[2] -add v26.4s, v26.4s, v19.4s -sqrdmulh v19.4S, v22.4S, v27.s[0] -sub v24.4s, v21.4s, v25.4s -mla v18.4S, v20.4S, v31.s[0] -mul v22.4S, v22.4S,v28.s[0] -add v21.4s, v21.4s, v25.4s -sqrdmulh v25.4S, v23.4S, v27.s[1] -sub v20.4s, v17.4s, v18.4s -mla v22.4S, v19.4S, v31.s[0] -mul v23.4S, v23.4S,v28.s[1] -add v17.4s, v17.4s, v18.4s -sqrdmulh v18.4S, v24.4S, v27.s[3] -sub v19.4s, v26.4s, v22.4s -mla v23.4S, v25.4S, v31.s[0] -mul v24.4S, v24.4S,v28.s[3] -add v26.4s, v26.4s, v22.4s -str q26, [x0, #80] -str q19, [x0, #208] -sqrdmulh v19.4S, v21.4S, v27.s[2] -sub v26.4s, v16.4s, v23.4s -mla v24.4S, v18.4S, v31.s[0] -mul v21.4S, v21.4S,v28.s[2] -add v16.4s, v16.4s, v23.4s -str q16, [x0, #336] -str q26, [x0, #464] -ldr q26, [x0, #992] -ldr q16, [x0, #864] -sqrdmulh v23.4S, v26.4S, v29.s[0] -sub v18.4s, v20.4s, v24.4s -mla v21.4S, v19.4S, v31.s[0] -mul v26.4S, v26.4S,v30.s[0] -add v20.4s, v20.4s, v24.4s -str q20, [x0, #848] -str q18, [x0, #976] -ldr q18, [x0, #608] -ldr q20, [x0, #736] -sqrdmulh v24.4S, v16.4S, v29.s[0] -sub v19.4s, v17.4s, v21.4s -mla v26.4S, v23.4S, v31.s[0] -mul v16.4S, v16.4S,v30.s[0] -add v17.4s, v17.4s, v21.4s -str q17, [x0, #592] -str q19, [x0, #720] -ldr q19, [x0, #480] -ldr q17, [x0, #352] -sqrdmulh v21.4S, v18.4S, v29.s[0] -sub v23.4s, v19.4s, v26.4s -mla v16.4S, v24.4S, v31.s[0] -mul v18.4S, v18.4S,v30.s[0] -add v19.4s, v19.4s, v26.4s -ldr q26, [x0, #96] -ldr q24, [x0, #224] -sqrdmulh v22.4S, v20.4S, v29.s[0] -sub v25.4s, v17.4s, v16.4s -mla v18.4S, v21.4S, v31.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v17.4s, v17.4s, v16.4s -sqrdmulh v16.4S, v19.4S, v29.s[1] -sub v21.4s, v26.4s, v18.4s -mla v20.4S, v22.4S, v31.s[0] -mul v19.4S, v19.4S,v30.s[1] -add v26.4s, v26.4s, v18.4s -sqrdmulh v18.4S, v17.4S, v29.s[1] -sub v22.4s, v24.4s, v20.4s -mla v19.4S, v16.4S, v31.s[0] -mul v17.4S, v17.4S,v30.s[1] -add v24.4s, v24.4s, v20.4s -sqrdmulh v20.4S, v23.4S, v29.s[2] -sub v16.4s, v24.4s, v19.4s -mla v17.4S, v18.4S, v31.s[0] -mul v23.4S, v23.4S,v30.s[2] -add v24.4s, v24.4s, v19.4s -sqrdmulh v19.4S, v25.4S, v29.s[2] -sub v18.4s, v26.4s, v17.4s -mla v23.4S, v20.4S, v31.s[0] -mul v25.4S, v25.4S,v30.s[2] -add v26.4s, v26.4s, v17.4s -sqrdmulh v17.4S, v24.4S, v27.s[0] -sub v20.4s, v22.4s, v23.4s -mla v25.4S, v19.4S, v31.s[0] -mul v24.4S, v24.4S,v28.s[0] -add v22.4s, v22.4s, v23.4s -sqrdmulh v23.4S, v16.4S, v27.s[1] -sub v19.4s, v21.4s, v25.4s -mla v24.4S, v17.4S, v31.s[0] -mul v16.4S, v16.4S,v28.s[1] -add v21.4s, v21.4s, v25.4s -sqrdmulh v25.4S, v20.4S, v27.s[3] -sub v17.4s, v26.4s, v24.4s -mla v16.4S, v23.4S, v31.s[0] -mul v20.4S, v20.4S,v28.s[3] -add v26.4s, v26.4s, v24.4s -str q26, [x0, #96] -str q17, [x0, #224] -sqrdmulh v17.4S, v22.4S, v27.s[2] -sub v26.4s, v18.4s, v16.4s -mla v20.4S, v25.4S, v31.s[0] -mul v22.4S, v22.4S,v28.s[2] -add v18.4s, v18.4s, v16.4s -str q18, [x0, #352] -str q26, [x0, #480] -ldr q26, [x0, #1008] -ldr q18, [x0, #880] -sqrdmulh v16.4S, v26.4S, v29.s[0] -sub v25.4s, v19.4s, v20.4s -mla v22.4S, v17.4S, v31.s[0] -mul v26.4S, v26.4S,v30.s[0] -add v19.4s, v19.4s, v20.4s -str q19, [x0, #864] -str q25, [x0, #992] -ldr q25, [x0, #624] -ldr q19, [x0, #752] -sqrdmulh v20.4S, v18.4S, v29.s[0] -sub v17.4s, v21.4s, v22.4s -mla v26.4S, v16.4S, v31.s[0] -mul v18.4S, v18.4S,v30.s[0] -add v21.4s, v21.4s, v22.4s -str q21, [x0, #608] -str q17, [x0, #736] -ldr q17, [x0, #496] -ldr q21, [x0, #368] -sqrdmulh v22.4S, v25.4S, v29.s[0] -sub v16.4s, v17.4s, v26.4s -mla v18.4S, v20.4S, v31.s[0] -mul v25.4S, v25.4S,v30.s[0] -add v17.4s, v17.4s, v26.4s -ldr q26, [x0, #112] -ldr q20, [x0, #240] -sqrdmulh v24.4S, v19.4S, v29.s[0] -sub v23.4s, v21.4s, v18.4s -mla v25.4S, v22.4S, v31.s[0] -mul v19.4S, v19.4S,v30.s[0] -add v21.4s, v21.4s, v18.4s -sqrdmulh v18.4S, v17.4S, v29.s[1] -sub v22.4s, v26.4s, v25.4s -mla v19.4S, v24.4S, v31.s[0] -mul v17.4S, v17.4S,v30.s[1] -add v26.4s, v26.4s, v25.4s -sqrdmulh v25.4S, v21.4S, v29.s[1] -sub v24.4s, v20.4s, v19.4s -mla v17.4S, v18.4S, v31.s[0] -mul v21.4S, v21.4S,v30.s[1] -add v20.4s, v20.4s, v19.4s -sqrdmulh v19.4S, v16.4S, v29.s[2] -sub v18.4s, v20.4s, v17.4s -mla v21.4S, v25.4S, v31.s[0] -mul v16.4S, v16.4S,v30.s[2] -add v20.4s, v20.4s, v17.4s -sqrdmulh v17.4S, v23.4S, v29.s[2] -sub v25.4s, v26.4s, v21.4s -mla v16.4S, v19.4S, v31.s[0] -mul v23.4S, v23.4S,v30.s[2] -add v26.4s, v26.4s, v21.4s -sqrdmulh v21.4S, v20.4S, v27.s[0] -sub v19.4s, v24.4s, v16.4s -mla v23.4S, v17.4S, v31.s[0] -mul v20.4S, v20.4S,v28.s[0] -add v24.4s, v24.4s, v16.4s -sqrdmulh v16.4S, v18.4S, v27.s[1] -sub v17.4s, v22.4s, v23.4s -mla v20.4S, v21.4S, v31.s[0] -mul v18.4S, v18.4S,v28.s[1] -add v22.4s, v22.4s, v23.4s -sqrdmulh v23.4S, v19.4S, v27.s[3] -sub v21.4s, v26.4s, v20.4s -mla v18.4S, v16.4S, v31.s[0] -mul v19.4S, v19.4S,v28.s[3] -add v26.4s, v26.4s, v20.4s -str q26, [x0, #112] -str q21, [x0, #240] -sqrdmulh v21.4S, v24.4S, v27.s[2] -sub v26.4s, v25.4s, v18.4s -mla v19.4S, v23.4S, v31.s[0] -mul v24.4S, v24.4S,v28.s[2] -add v25.4s, v25.4s, v18.4s -str q25, [x0, #368] -str q26, [x0, #496] -ldr q26, [x0, #896] -ldr q25, [x0, #768] -sqrdmulh v18.4S, v26.4S, v29.s[0] -sub v23.4s, v17.4s, v19.4s -mla v24.4S, v21.4S, v31.s[0] -mul v26.4S, v26.4S,v30.s[0] -add v17.4s, v17.4s, v19.4s -str q17, [x0, #880] -str q23, [x0, #1008] -ldr q23, [x0, #512] -ldr q17, [x0, #640] -sqrdmulh v19.4S, v25.4S, v29.s[0] -sub v21.4s, v22.4s, v24.4s -mla v26.4S, v18.4S, v31.s[0] -mul v25.4S, v25.4S,v30.s[0] -add v22.4s, v22.4s, v24.4s -str q22, [x0, #624] -str q21, [x0, #752] -ldr q21, [x0, #384] -ldr q22, [x0, #256] -sqrdmulh v24.4S, v23.4S, v29.s[0] -sub v18.4s, v21.4s, v26.4s -mla v25.4S, v19.4S, v31.s[0] -mul v23.4S, v23.4S,v30.s[0] -add v21.4s, v21.4s, v26.4s -ldr q26, [x0, #0] -ldr q19, [x0, #128] -sqrdmulh v20.4S, v17.4S, v29.s[0] -sub v16.4s, v22.4s, v25.4s -mla v23.4S, v24.4S, v31.s[0] -mul v17.4S, v17.4S,v30.s[0] -add v22.4s, v22.4s, v25.4s -sqrdmulh v25.4S, v21.4S, v29.s[1] -sub v24.4s, v26.4s, v23.4s -mla v17.4S, v20.4S, v31.s[0] -mul v21.4S, v21.4S,v30.s[1] -add v26.4s, v26.4s, v23.4s -sqrdmulh v23.4S, v22.4S, v29.s[1] -sub v20.4s, v19.4s, v17.4s -mla v21.4S, v25.4S, v31.s[0] -mul v22.4S, v22.4S,v30.s[1] -add v19.4s, v19.4s, v17.4s -sqrdmulh v17.4S, v18.4S, v29.s[2] -sub v25.4s, v19.4s, v21.4s -mla v22.4S, v23.4S, v31.s[0] -mul v18.4S, v18.4S,v30.s[2] -add v19.4s, v19.4s, v21.4s -sqrdmulh v21.4S, v16.4S, v29.s[2] -sub v23.4s, v26.4s, v22.4s -mla v18.4S, v17.4S, v31.s[0] -mul v16.4S, v16.4S,v30.s[2] -add v26.4s, v26.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v27.s[0] -sub v17.4s, v20.4s, v18.4s -mla v16.4S, v21.4S, v31.s[0] -mul v19.4S, v19.4S,v28.s[0] -add v20.4s, v20.4s, v18.4s -sqrdmulh v18.4S, v25.4S, v27.s[1] -sub v21.4s, v24.4s, v16.4s -mla v19.4S, v22.4S, v31.s[0] -mul v25.4S, v25.4S,v28.s[1] -add v24.4s, v24.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v27.s[3] -sub v22.4s, v26.4s, v19.4s -mla v25.4S, v18.4S, v31.s[0] -mul v17.4S, v17.4S,v28.s[3] -add v26.4s, v26.4s, v19.4s -str q26, [x0, #0] -str q22, [x0, #128] -sqrdmulh v22.4S, v20.4S, v27.s[2] -sub v26.4s, v23.4s, v25.4s -mla v17.4S, v16.4S, v31.s[0] -mul v20.4S, v20.4S,v28.s[2] -add v23.4s, v23.4s, v25.4s -str q23, [x0, #256] -str q26, [x0, #384] -ldr q26, [x0, #912] -ldr q23, [x0, #784] -sqrdmulh v25.4S, v26.4S, v29.s[0] -sub v16.4s, v21.4s, v17.4s -mla v20.4S, v22.4S, v31.s[0] -mul v26.4S, v26.4S,v30.s[0] -add v21.4s, v21.4s, v17.4s -str q21, [x0, #768] -str q16, [x0, #896] -ldr q16, [x0, #528] -ldr q21, [x0, #656] -sqrdmulh v17.4S, v23.4S, v29.s[0] -sub v22.4s, v24.4s, v20.4s -mla v26.4S, v25.4S, v31.s[0] -mul v23.4S, v23.4S,v30.s[0] -add v24.4s, v24.4s, v20.4s -str q24, [x0, #512] -str q22, [x0, #640] -ldr q22, [x0, #400] -ldr q24, [x0, #272] -sqrdmulh v20.4S, v16.4S, v29.s[0] -sub v25.4s, v22.4s, v26.4s -mla v23.4S, v17.4S, v31.s[0] -mul v16.4S, v16.4S,v30.s[0] -add v22.4s, v22.4s, v26.4s -ldr q26, [x0, #16] -ldr q17, [x0, #144] -sqrdmulh v19.4S, v21.4S, v29.s[0] -sub v18.4s, v24.4s, v23.4s -mla v16.4S, v20.4S, v31.s[0] -mul v21.4S, v21.4S,v30.s[0] -add v24.4s, v24.4s, v23.4s -sqrdmulh v23.4S, v22.4S, v29.s[1] -sub v20.4s, v26.4s, v16.4s -mla v21.4S, v19.4S, v31.s[0] -mul v22.4S, v22.4S,v30.s[1] -add v26.4s, v26.4s, v16.4s -sqrdmulh v16.4S, v24.4S, v29.s[1] -sub v19.4s, v17.4s, v21.4s -mla v22.4S, v23.4S, v31.s[0] -mul v24.4S, v24.4S,v30.s[1] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v25.4S, v29.s[2] -sub v23.4s, v17.4s, v22.4s -mla v24.4S, v16.4S, v31.s[0] -mul v25.4S, v25.4S,v30.s[2] -add v17.4s, v17.4s, v22.4s -sqrdmulh v22.4S, v18.4S, v29.s[2] -sub v16.4s, v26.4s, v24.4s -mla v25.4S, v21.4S, v31.s[0] -mul v18.4S, v18.4S,v30.s[2] -add v26.4s, v26.4s, v24.4s -sqrdmulh v24.4S, v17.4S, v27.s[0] -sub v21.4s, v19.4s, v25.4s -mla v18.4S, v22.4S, v31.s[0] -mul v17.4S, v17.4S,v28.s[0] -add v19.4s, v19.4s, v25.4s -sqrdmulh v25.4S, v23.4S, v27.s[1] -sub v22.4s, v20.4s, v18.4s -mla v17.4S, v24.4S, v31.s[0] -mul v23.4S, v23.4S,v28.s[1] -add v20.4s, v20.4s, v18.4s -sqrdmulh v18.4S, v21.4S, v27.s[3] -sub v24.4s, v26.4s, v17.4s -mla v23.4S, v25.4S, v31.s[0] -mul v21.4S, v21.4S,v28.s[3] -add v26.4s, v26.4s, v17.4s -str q26, [x0, #16] -str q24, [x0, #144] -sqrdmulh v24.4S, v19.4S, v27.s[2] -sub v26.4s, v16.4s, v23.4s -mla v21.4S, v18.4S, v31.s[0] -mul v19.4S, v19.4S,v28.s[2] -add v16.4s, v16.4s, v23.4s -str q16, [x0, #272] -str q26, [x0, #400] -ldr q26, [x0, #928] -ldr q16, [x0, #800] -sqrdmulh v23.4S, v26.4S, v29.s[0] -sub v18.4s, v22.4s, v21.4s -mla v19.4S, v24.4S, v31.s[0] -mul v26.4S, v26.4S,v30.s[0] -add v22.4s, v22.4s, v21.4s -str q22, [x0, #784] -str q18, [x0, #912] -ldr q18, [x0, #544] -ldr q22, [x0, #672] -sqrdmulh v21.4S, v16.4S, v29.s[0] -sub v24.4s, v20.4s, v19.4s -mla v26.4S, v23.4S, v31.s[0] -mul v16.4S, v16.4S,v30.s[0] -add v20.4s, v20.4s, v19.4s -str q20, [x0, #528] -str q24, [x0, #656] -ldr q24, [x0, #416] -ldr q20, [x0, #288] -sqrdmulh v19.4S, v18.4S, v29.s[0] -sub v23.4s, v24.4s, v26.4s -mla v16.4S, v21.4S, v31.s[0] -mul v18.4S, v18.4S,v30.s[0] -add v24.4s, v24.4s, v26.4s -ldr q26, [x0, #32] -ldr q21, [x0, #160] -sqrdmulh v17.4S, v22.4S, v29.s[0] -sub v25.4s, v20.4s, v16.4s -mla v18.4S, v19.4S, v31.s[0] -mul v22.4S, v22.4S,v30.s[0] -add v20.4s, v20.4s, v16.4s -sqrdmulh v16.4S, v24.4S, v29.s[1] -sub v19.4s, v26.4s, v18.4s -mla v22.4S, v17.4S, v31.s[0] -mul v24.4S, v24.4S,v30.s[1] -add v26.4s, v26.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v29.s[1] -sub v17.4s, v21.4s, v22.4s -mla v24.4S, v16.4S, v31.s[0] -mul v20.4S, v20.4S,v30.s[1] -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v23.4S, v29.s[2] -sub v16.4s, v21.4s, v24.4s -mla v20.4S, v18.4S, v31.s[0] -mul v23.4S, v23.4S,v30.s[2] -add v21.4s, v21.4s, v24.4s -sqrdmulh v24.4S, v25.4S, v29.s[2] -sub v18.4s, v26.4s, v20.4s -mla v23.4S, v22.4S, v31.s[0] -mul v25.4S, v25.4S,v30.s[2] -add v26.4s, v26.4s, v20.4s -sqrdmulh v20.4S, v21.4S, v27.s[0] -sub v22.4s, v17.4s, v23.4s -mla v25.4S, v24.4S, v31.s[0] -mul v21.4S, v21.4S,v28.s[0] -add v17.4s, v17.4s, v23.4s -sqrdmulh v23.4S, v16.4S, v27.s[1] -sub v24.4s, v19.4s, v25.4s -mla v21.4S, v20.4S, v31.s[0] -mul v16.4S, v16.4S,v28.s[1] -add v19.4s, v19.4s, v25.4s -sqrdmulh v25.4S, v22.4S, v27.s[3] -sub v20.4s, v26.4s, v21.4s -mla v16.4S, v23.4S, v31.s[0] -mul v22.4S, v22.4S,v28.s[3] -add v26.4s, v26.4s, v21.4s -str q26, [x0, #32] -str q20, [x0, #160] -sqrdmulh v20.4S, v17.4S, v27.s[2] -sub v26.4s, v18.4s, v16.4s -mla v22.4S, v25.4S, v31.s[0] -mul v17.4S, v17.4S,v28.s[2] -add v18.4s, v18.4s, v16.4s -str q18, [x0, #288] -str q26, [x0, #416] -ldr q26, [x0, #944] -ldr q18, [x0, #816] -sqrdmulh v16.4S, v26.4S, v29.s[0] -sub v25.4s, v24.4s, v22.4s -mla v17.4S, v20.4S, v31.s[0] -mul v26.4S, v26.4S,v30.s[0] -add v24.4s, v24.4s, v22.4s -str q24, [x0, #800] -str q25, [x0, #928] -ldr q25, [x0, #560] -ldr q24, [x0, #688] -sqrdmulh v22.4S, v18.4S, v29.s[0] -sub v20.4s, v19.4s, v17.4s -mla v26.4S, v16.4S, v31.s[0] -mul v18.4S, v18.4S,v30.s[0] -add v19.4s, v19.4s, v17.4s -str q19, [x0, #544] -str q20, [x0, #672] -ldr q20, [x0, #432] -ldr q19, [x0, #304] -sqrdmulh v17.4S, v25.4S, v29.s[0] -sub v16.4s, v20.4s, v26.4s -mla v18.4S, v22.4S, v31.s[0] -mul v25.4S, v25.4S,v30.s[0] -add v20.4s, v20.4s, v26.4s -ldr q26, [x0, #48] -ldr q22, [x0, #176] -sqrdmulh v21.4S, v24.4S, v29.s[0] -sub v23.4s, v19.4s, v18.4s -mla v25.4S, v17.4S, v31.s[0] -mul v24.4S, v24.4S,v30.s[0] -add v19.4s, v19.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v29.s[1] -sub v17.4s, v26.4s, v25.4s -mla v24.4S, v21.4S, v31.s[0] -mul v20.4S, v20.4S,v30.s[1] -add v26.4s, v26.4s, v25.4s -sqrdmulh v25.4S, v19.4S, v29.s[1] -sub v21.4s, v22.4s, v24.4s -mla v20.4S, v18.4S, v31.s[0] -mul v19.4S, v19.4S,v30.s[1] -add v22.4s, v22.4s, v24.4s -sqrdmulh v24.4S, v16.4S, v29.s[2] -sub v18.4s, v22.4s, v20.4s -mla v19.4S, v25.4S, v31.s[0] -mul v16.4S, v16.4S,v30.s[2] -add v22.4s, v22.4s, v20.4s -sqrdmulh v20.4S, v23.4S, v29.s[2] -sub v25.4s, v26.4s, v19.4s -mla v16.4S, v24.4S, v31.s[0] -mul v23.4S, v23.4S,v30.s[2] -add v26.4s, v26.4s, v19.4s -sqrdmulh v19.4S, v22.4S, v27.s[0] -sub v24.4s, v21.4s, v16.4s -mla v23.4S, v20.4S, v31.s[0] -mul v22.4S, v22.4S,v28.s[0] -add v21.4s, v21.4s, v16.4s -sqrdmulh v16.4S, v18.4S, v27.s[1] -sub v20.4s, v17.4s, v23.4s -mla v22.4S, v19.4S, v31.s[0] -mul v18.4S, v18.4S,v28.s[1] -add v17.4s, v17.4s, v23.4s -sqrdmulh v23.4S, v24.4S, v27.s[3] -sub v19.4s, v26.4s, v22.4s -mla v18.4S, v16.4S, v31.s[0] -mul v24.4S, v24.4S,v28.s[3] -add v26.4s, v26.4s, v22.4s -str q26, [x0, #48] -str q19, [x0, #176] -sqrdmulh v19.4S, v21.4S, v27.s[2] -sub v26.4s, v25.4s, v18.4s -mla v24.4S, v23.4S, v31.s[0] -mul v21.4S, v21.4S,v28.s[2] -add v25.4s, v25.4s, v18.4s -str q25, [x0, #304] -str q26, [x0, #432] -sub v26.4s, v20.4s, v24.4s -mla v21.4S, v19.4S, v31.s[0] -add v20.4s, v20.4s, v24.4s -str q20, [x0, #816] -str q26, [x0, #944] -sub v26.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -str q17, [x0, #560] -str q26, [x0, #688] -ldr q4, [x17, #+64] -ldr q5, [x17, #+80] -ldr q6, [x17, #+96] -ldr q7, [x17, #+112] -ldr q8, [x0, #112] -ldr q9, [x0, #96] -sqrdmulh v10.4S, v8.4S, v5.s[0] -mul v8.4S, v8.4S,v4.s[0] -ldr q11, [x0, #64] -ldr q12, [x0, #80] -sqrdmulh v13.4S, v9.4S, v5.s[0] -mla v8.4S, v10.4S, v31.s[0] -mul v9.4S, v9.4S,v4.s[0] -ldr q10, [x0, #48] -ldr q14, [x0, #32] -sqrdmulh v15.4S, v11.4S, v5.s[0] -sub v0.4s, v10.4s, v8.4s -mla v9.4S, v13.4S, v31.s[0] -mul v11.4S, v11.4S,v4.s[0] -add v10.4s, v10.4s, v8.4s -ldr q8, [x0, #0] -ldr q13, [x0, #16] -sqrdmulh v1.4S, v12.4S, v5.s[0] -sub v2.4s, v14.4s, v9.4s -mla v11.4S, v15.4S, v31.s[0] -mul v12.4S, v12.4S,v4.s[0] -add v14.4s, v14.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v5.s[1] -sub v15.4s, v8.4s, v11.4s -mla v12.4S, v1.4S, v31.s[0] -mul v10.4S, v10.4S,v4.s[1] -add v8.4s, v8.4s, v11.4s -sqrdmulh v11.4S, v14.4S, v5.s[1] -sub v1.4s, v13.4s, v12.4s -mla v10.4S, v9.4S, v31.s[0] -mul v14.4S, v14.4S,v4.s[1] -add v13.4s, v13.4s, v12.4s -sqrdmulh v12.4S, v0.4S, v5.s[2] -sub v9.4s, v13.4s, v10.4s -mla v14.4S, v11.4S, v31.s[0] -mul v0.4S, v0.4S,v4.s[2] -add v13.4s, v13.4s, v10.4s -ldr q10, [x17, #+128] -ldr q11, [x17, #+144] -ldr q3, [x17, #+160] -ldr q16, [x17, #+176] -sqrdmulh v22.4S, v2.4S, v5.s[2] -sub v23.4s, v8.4s, v14.4s -mla v0.4S, v12.4S, v31.s[0] -mul v2.4S, v2.4S,v4.s[2] -add v8.4s, v8.4s, v14.4s -sqrdmulh v14.4S, v13.4S, v7.s[0] -sub v12.4s, v1.4s, v0.4s -mla v2.4S, v22.4S, v31.s[0] -mul v13.4S, v13.4S,v6.s[0] -add v1.4s, v1.4s, v0.4s -sqrdmulh v0.4S, v9.4S, v7.s[1] -sub v22.4s, v15.4s, v2.4s -mla v13.4S, v14.4S, v31.s[0] -mul v9.4S, v9.4S,v6.s[1] -add v15.4s, v15.4s, v2.4s -sqrdmulh v2.4S, v12.4S, v7.s[3] -sub v14.4s, v8.4s, v13.4s -mla v9.4S, v0.4S, v31.s[0] -mul v12.4S, v12.4S,v6.s[3] -add v8.4s, v8.4s, v13.4s -str q8, [x0, #0] -str q14, [x0, #16] -sqrdmulh v14.4S, v1.4S, v7.s[2] -sub v8.4s, v23.4s, v9.4s -mla v12.4S, v2.4S, v31.s[0] -mul v1.4S, v1.4S,v6.s[2] -add v23.4s, v23.4s, v9.4s -str q23, [x0, #32] -str q8, [x0, #48] -ldr q8, [x0, #240] -ldr q23, [x0, #224] -sqrdmulh v9.4S, v8.4S, v11.s[0] -sub v2.4s, v22.4s, v12.4s -mla v1.4S, v14.4S, v31.s[0] -mul v8.4S, v8.4S,v10.s[0] -add v22.4s, v22.4s, v12.4s -str q22, [x0, #96] -str q2, [x0, #112] -ldr q2, [x0, #192] -ldr q22, [x0, #208] -sqrdmulh v12.4S, v23.4S, v11.s[0] -sub v14.4s, v15.4s, v1.4s -mla v8.4S, v9.4S, v31.s[0] -mul v23.4S, v23.4S,v10.s[0] -add v15.4s, v15.4s, v1.4s -str q15, [x0, #64] -str q14, [x0, #80] -ldr q7, [x0, #176] -ldr q6, [x0, #160] -sqrdmulh v5.4S, v2.4S, v11.s[0] -sub v4.4s, v7.4s, v8.4s -mla v23.4S, v12.4S, v31.s[0] -mul v2.4S, v2.4S,v10.s[0] -add v7.4s, v7.4s, v8.4s -ldr q8, [x0, #128] -ldr q12, [x0, #144] -sqrdmulh v14.4S, v22.4S, v11.s[0] -sub v15.4s, v6.4s, v23.4s -mla v2.4S, v5.4S, v31.s[0] -mul v22.4S, v22.4S,v10.s[0] -add v6.4s, v6.4s, v23.4s -sqrdmulh v23.4S, v7.4S, v11.s[1] -sub v5.4s, v8.4s, v2.4s -mla v22.4S, v14.4S, v31.s[0] -mul v7.4S, v7.4S,v10.s[1] -add v8.4s, v8.4s, v2.4s -sqrdmulh v2.4S, v6.4S, v11.s[1] -sub v14.4s, v12.4s, v22.4s -mla v7.4S, v23.4S, v31.s[0] -mul v6.4S, v6.4S,v10.s[1] -add v12.4s, v12.4s, v22.4s -sqrdmulh v22.4S, v4.4S, v11.s[2] -sub v23.4s, v12.4s, v7.4s -mla v6.4S, v2.4S, v31.s[0] -mul v4.4S, v4.4S,v10.s[2] -add v12.4s, v12.4s, v7.4s -ldr q7, [x17, #+192] -ldr q2, [x17, #+208] -ldr q1, [x17, #+224] -ldr q9, [x17, #+240] -sqrdmulh v13.4S, v15.4S, v11.s[2] -sub v0.4s, v8.4s, v6.4s -mla v4.4S, v22.4S, v31.s[0] -mul v15.4S, v15.4S,v10.s[2] -add v8.4s, v8.4s, v6.4s -sqrdmulh v6.4S, v12.4S, v16.s[0] -sub v22.4s, v14.4s, v4.4s -mla v15.4S, v13.4S, v31.s[0] -mul v12.4S, v12.4S,v3.s[0] -add v14.4s, v14.4s, v4.4s -sqrdmulh v4.4S, v23.4S, v16.s[1] -sub v13.4s, v5.4s, v15.4s -mla v12.4S, v6.4S, v31.s[0] -mul v23.4S, v23.4S,v3.s[1] -add v5.4s, v5.4s, v15.4s -sqrdmulh v15.4S, v22.4S, v16.s[3] -sub v6.4s, v8.4s, v12.4s -mla v23.4S, v4.4S, v31.s[0] -mul v22.4S, v22.4S,v3.s[3] -add v8.4s, v8.4s, v12.4s -str q8, [x0, #128] -str q6, [x0, #144] -sqrdmulh v6.4S, v14.4S, v16.s[2] -sub v8.4s, v0.4s, v23.4s -mla v22.4S, v15.4S, v31.s[0] -mul v14.4S, v14.4S,v3.s[2] -add v0.4s, v0.4s, v23.4s -str q0, [x0, #160] -str q8, [x0, #176] -ldr q8, [x0, #368] -ldr q0, [x0, #352] -sqrdmulh v23.4S, v8.4S, v2.s[0] -sub v15.4s, v13.4s, v22.4s -mla v14.4S, v6.4S, v31.s[0] -mul v8.4S, v8.4S,v7.s[0] -add v13.4s, v13.4s, v22.4s -str q13, [x0, #224] -str q15, [x0, #240] -ldr q15, [x0, #320] -ldr q13, [x0, #336] -sqrdmulh v22.4S, v0.4S, v2.s[0] -sub v6.4s, v5.4s, v14.4s -mla v8.4S, v23.4S, v31.s[0] -mul v0.4S, v0.4S,v7.s[0] -add v5.4s, v5.4s, v14.4s -str q5, [x0, #192] -str q6, [x0, #208] -ldr q16, [x0, #304] -ldr q3, [x0, #288] -sqrdmulh v11.4S, v15.4S, v2.s[0] -sub v10.4s, v16.4s, v8.4s -mla v0.4S, v22.4S, v31.s[0] -mul v15.4S, v15.4S,v7.s[0] -add v16.4s, v16.4s, v8.4s -ldr q8, [x0, #256] -ldr q22, [x0, #272] -sqrdmulh v6.4S, v13.4S, v2.s[0] -sub v5.4s, v3.4s, v0.4s -mla v15.4S, v11.4S, v31.s[0] -mul v13.4S, v13.4S,v7.s[0] -add v3.4s, v3.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v2.s[1] -sub v11.4s, v8.4s, v15.4s -mla v13.4S, v6.4S, v31.s[0] -mul v16.4S, v16.4S,v7.s[1] -add v8.4s, v8.4s, v15.4s -sqrdmulh v15.4S, v3.4S, v2.s[1] -sub v6.4s, v22.4s, v13.4s -mla v16.4S, v0.4S, v31.s[0] -mul v3.4S, v3.4S,v7.s[1] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v10.4S, v2.s[2] -sub v0.4s, v22.4s, v16.4s -mla v3.4S, v15.4S, v31.s[0] -mul v10.4S, v10.4S,v7.s[2] -add v22.4s, v22.4s, v16.4s -ldr q16, [x17, #+256] -ldr q15, [x17, #+272] -ldr q14, [x17, #+288] -ldr q23, [x17, #+304] -sqrdmulh v12.4S, v5.4S, v2.s[2] -sub v4.4s, v8.4s, v3.4s -mla v10.4S, v13.4S, v31.s[0] -mul v5.4S, v5.4S,v7.s[2] -add v8.4s, v8.4s, v3.4s -sqrdmulh v3.4S, v22.4S, v9.s[0] -sub v13.4s, v6.4s, v10.4s -mla v5.4S, v12.4S, v31.s[0] -mul v22.4S, v22.4S,v1.s[0] -add v6.4s, v6.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v9.s[1] -sub v12.4s, v11.4s, v5.4s -mla v22.4S, v3.4S, v31.s[0] -mul v0.4S, v0.4S,v1.s[1] -add v11.4s, v11.4s, v5.4s -sqrdmulh v5.4S, v13.4S, v9.s[3] -sub v3.4s, v8.4s, v22.4s -mla v0.4S, v10.4S, v31.s[0] -mul v13.4S, v13.4S,v1.s[3] -add v8.4s, v8.4s, v22.4s -str q8, [x0, #256] -str q3, [x0, #272] -sqrdmulh v3.4S, v6.4S, v9.s[2] -sub v8.4s, v4.4s, v0.4s -mla v13.4S, v5.4S, v31.s[0] -mul v6.4S, v6.4S,v1.s[2] -add v4.4s, v4.4s, v0.4s -str q4, [x0, #288] -str q8, [x0, #304] -ldr q8, [x0, #496] -ldr q4, [x0, #480] -sqrdmulh v0.4S, v8.4S, v15.s[0] -sub v5.4s, v12.4s, v13.4s -mla v6.4S, v3.4S, v31.s[0] -mul v8.4S, v8.4S,v16.s[0] -add v12.4s, v12.4s, v13.4s -str q12, [x0, #352] -str q5, [x0, #368] -ldr q5, [x0, #448] -ldr q12, [x0, #464] -sqrdmulh v13.4S, v4.4S, v15.s[0] -sub v3.4s, v11.4s, v6.4s -mla v8.4S, v0.4S, v31.s[0] -mul v4.4S, v4.4S,v16.s[0] -add v11.4s, v11.4s, v6.4s -str q11, [x0, #320] -str q3, [x0, #336] -ldr q9, [x0, #432] -ldr q1, [x0, #416] -sqrdmulh v2.4S, v5.4S, v15.s[0] -sub v7.4s, v9.4s, v8.4s -mla v4.4S, v13.4S, v31.s[0] -mul v5.4S, v5.4S,v16.s[0] -add v9.4s, v9.4s, v8.4s -ldr q8, [x0, #384] -ldr q13, [x0, #400] -sqrdmulh v3.4S, v12.4S, v15.s[0] -sub v11.4s, v1.4s, v4.4s -mla v5.4S, v2.4S, v31.s[0] -mul v12.4S, v12.4S,v16.s[0] -add v1.4s, v1.4s, v4.4s -sqrdmulh v4.4S, v9.4S, v15.s[1] -sub v2.4s, v8.4s, v5.4s -mla v12.4S, v3.4S, v31.s[0] -mul v9.4S, v9.4S,v16.s[1] -add v8.4s, v8.4s, v5.4s -sqrdmulh v5.4S, v1.4S, v15.s[1] -sub v3.4s, v13.4s, v12.4s -mla v9.4S, v4.4S, v31.s[0] -mul v1.4S, v1.4S,v16.s[1] -add v13.4s, v13.4s, v12.4s -sqrdmulh v12.4S, v7.4S, v15.s[2] -sub v4.4s, v13.4s, v9.4s -mla v1.4S, v5.4S, v31.s[0] -mul v7.4S, v7.4S,v16.s[2] -add v13.4s, v13.4s, v9.4s -ldr q9, [x17, #+320] -ldr q5, [x17, #+336] -ldr q6, [x17, #+352] -ldr q0, [x17, #+368] -sqrdmulh v22.4S, v11.4S, v15.s[2] -sub v10.4s, v8.4s, v1.4s -mla v7.4S, v12.4S, v31.s[0] -mul v11.4S, v11.4S,v16.s[2] -add v8.4s, v8.4s, v1.4s -sqrdmulh v1.4S, v13.4S, v23.s[0] -sub v12.4s, v3.4s, v7.4s -mla v11.4S, v22.4S, v31.s[0] -mul v13.4S, v13.4S,v14.s[0] -add v3.4s, v3.4s, v7.4s -sqrdmulh v7.4S, v4.4S, v23.s[1] -sub v22.4s, v2.4s, v11.4s -mla v13.4S, v1.4S, v31.s[0] -mul v4.4S, v4.4S,v14.s[1] -add v2.4s, v2.4s, v11.4s -sqrdmulh v11.4S, v12.4S, v23.s[3] -sub v1.4s, v8.4s, v13.4s -mla v4.4S, v7.4S, v31.s[0] -mul v12.4S, v12.4S,v14.s[3] -add v8.4s, v8.4s, v13.4s -str q8, [x0, #384] -str q1, [x0, #400] -sqrdmulh v1.4S, v3.4S, v23.s[2] -sub v8.4s, v10.4s, v4.4s -mla v12.4S, v11.4S, v31.s[0] -mul v3.4S, v3.4S,v14.s[2] -add v10.4s, v10.4s, v4.4s -str q10, [x0, #416] -str q8, [x0, #432] -ldr q8, [x0, #624] -ldr q10, [x0, #608] -sqrdmulh v4.4S, v8.4S, v5.s[0] -sub v11.4s, v22.4s, v12.4s -mla v3.4S, v1.4S, v31.s[0] -mul v8.4S, v8.4S,v9.s[0] -add v22.4s, v22.4s, v12.4s -str q22, [x0, #480] -str q11, [x0, #496] -ldr q11, [x0, #576] -ldr q22, [x0, #592] -sqrdmulh v12.4S, v10.4S, v5.s[0] -sub v1.4s, v2.4s, v3.4s -mla v8.4S, v4.4S, v31.s[0] -mul v10.4S, v10.4S,v9.s[0] -add v2.4s, v2.4s, v3.4s -str q2, [x0, #448] -str q1, [x0, #464] -ldr q23, [x0, #560] -ldr q14, [x0, #544] -sqrdmulh v15.4S, v11.4S, v5.s[0] -sub v16.4s, v23.4s, v8.4s -mla v10.4S, v12.4S, v31.s[0] -mul v11.4S, v11.4S,v9.s[0] -add v23.4s, v23.4s, v8.4s -ldr q8, [x0, #512] -ldr q12, [x0, #528] -sqrdmulh v1.4S, v22.4S, v5.s[0] -sub v2.4s, v14.4s, v10.4s -mla v11.4S, v15.4S, v31.s[0] -mul v22.4S, v22.4S,v9.s[0] -add v14.4s, v14.4s, v10.4s -sqrdmulh v10.4S, v23.4S, v5.s[1] -sub v15.4s, v8.4s, v11.4s -mla v22.4S, v1.4S, v31.s[0] -mul v23.4S, v23.4S,v9.s[1] -add v8.4s, v8.4s, v11.4s -sqrdmulh v11.4S, v14.4S, v5.s[1] -sub v1.4s, v12.4s, v22.4s -mla v23.4S, v10.4S, v31.s[0] -mul v14.4S, v14.4S,v9.s[1] -add v12.4s, v12.4s, v22.4s -sqrdmulh v22.4S, v16.4S, v5.s[2] -sub v10.4s, v12.4s, v23.4s -mla v14.4S, v11.4S, v31.s[0] -mul v16.4S, v16.4S,v9.s[2] -add v12.4s, v12.4s, v23.4s -ldr q23, [x17, #+384] -ldr q11, [x17, #+400] -ldr q3, [x17, #+416] -ldr q4, [x17, #+432] -sqrdmulh v13.4S, v2.4S, v5.s[2] -sub v7.4s, v8.4s, v14.4s -mla v16.4S, v22.4S, v31.s[0] -mul v2.4S, v2.4S,v9.s[2] -add v8.4s, v8.4s, v14.4s -sqrdmulh v14.4S, v12.4S, v0.s[0] -sub v22.4s, v1.4s, v16.4s -mla v2.4S, v13.4S, v31.s[0] -mul v12.4S, v12.4S,v6.s[0] -add v1.4s, v1.4s, v16.4s -sqrdmulh v16.4S, v10.4S, v0.s[1] -sub v13.4s, v15.4s, v2.4s -mla v12.4S, v14.4S, v31.s[0] -mul v10.4S, v10.4S,v6.s[1] -add v15.4s, v15.4s, v2.4s -sqrdmulh v2.4S, v22.4S, v0.s[3] -sub v14.4s, v8.4s, v12.4s -mla v10.4S, v16.4S, v31.s[0] -mul v22.4S, v22.4S,v6.s[3] -add v8.4s, v8.4s, v12.4s -str q8, [x0, #512] -str q14, [x0, #528] -sqrdmulh v14.4S, v1.4S, v0.s[2] -sub v8.4s, v7.4s, v10.4s -mla v22.4S, v2.4S, v31.s[0] -mul v1.4S, v1.4S,v6.s[2] -add v7.4s, v7.4s, v10.4s -str q7, [x0, #544] -str q8, [x0, #560] -ldr q8, [x0, #752] -ldr q7, [x0, #736] -sqrdmulh v10.4S, v8.4S, v11.s[0] -sub v2.4s, v13.4s, v22.4s -mla v1.4S, v14.4S, v31.s[0] -mul v8.4S, v8.4S,v23.s[0] -add v13.4s, v13.4s, v22.4s -str q13, [x0, #608] -str q2, [x0, #624] -ldr q2, [x0, #704] -ldr q13, [x0, #720] -sqrdmulh v22.4S, v7.4S, v11.s[0] -sub v14.4s, v15.4s, v1.4s -mla v8.4S, v10.4S, v31.s[0] -mul v7.4S, v7.4S,v23.s[0] -add v15.4s, v15.4s, v1.4s -str q15, [x0, #576] -str q14, [x0, #592] -ldr q0, [x0, #688] -ldr q6, [x0, #672] -sqrdmulh v5.4S, v2.4S, v11.s[0] -sub v9.4s, v0.4s, v8.4s -mla v7.4S, v22.4S, v31.s[0] -mul v2.4S, v2.4S,v23.s[0] -add v0.4s, v0.4s, v8.4s -ldr q8, [x0, #640] -ldr q22, [x0, #656] -sqrdmulh v14.4S, v13.4S, v11.s[0] -sub v15.4s, v6.4s, v7.4s -mla v2.4S, v5.4S, v31.s[0] -mul v13.4S, v13.4S,v23.s[0] -add v6.4s, v6.4s, v7.4s -sqrdmulh v7.4S, v0.4S, v11.s[1] -sub v5.4s, v8.4s, v2.4s -mla v13.4S, v14.4S, v31.s[0] -mul v0.4S, v0.4S,v23.s[1] -add v8.4s, v8.4s, v2.4s -sqrdmulh v2.4S, v6.4S, v11.s[1] -sub v14.4s, v22.4s, v13.4s -mla v0.4S, v7.4S, v31.s[0] -mul v6.4S, v6.4S,v23.s[1] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v9.4S, v11.s[2] -sub v7.4s, v22.4s, v0.4s -mla v6.4S, v2.4S, v31.s[0] -mul v9.4S, v9.4S,v23.s[2] -add v22.4s, v22.4s, v0.4s -ldr q0, [x17, #+448] -ldr q2, [x17, #+464] -ldr q1, [x17, #+480] -ldr q10, [x17, #+496] -sqrdmulh v12.4S, v15.4S, v11.s[2] -sub v16.4s, v8.4s, v6.4s -mla v9.4S, v13.4S, v31.s[0] -mul v15.4S, v15.4S,v23.s[2] -add v8.4s, v8.4s, v6.4s -sqrdmulh v6.4S, v22.4S, v4.s[0] -sub v13.4s, v14.4s, v9.4s -mla v15.4S, v12.4S, v31.s[0] -mul v22.4S, v22.4S,v3.s[0] -add v14.4s, v14.4s, v9.4s -sqrdmulh v9.4S, v7.4S, v4.s[1] -sub v12.4s, v5.4s, v15.4s -mla v22.4S, v6.4S, v31.s[0] -mul v7.4S, v7.4S,v3.s[1] -add v5.4s, v5.4s, v15.4s -sqrdmulh v15.4S, v13.4S, v4.s[3] -sub v6.4s, v8.4s, v22.4s -mla v7.4S, v9.4S, v31.s[0] -mul v13.4S, v13.4S,v3.s[3] -add v8.4s, v8.4s, v22.4s -str q8, [x0, #640] -str q6, [x0, #656] -sqrdmulh v6.4S, v14.4S, v4.s[2] -sub v8.4s, v16.4s, v7.4s -mla v13.4S, v15.4S, v31.s[0] -mul v14.4S, v14.4S,v3.s[2] -add v16.4s, v16.4s, v7.4s -str q16, [x0, #672] -str q8, [x0, #688] -ldr q8, [x0, #880] -ldr q16, [x0, #864] -sqrdmulh v7.4S, v8.4S, v2.s[0] -sub v15.4s, v12.4s, v13.4s -mla v14.4S, v6.4S, v31.s[0] -mul v8.4S, v8.4S,v0.s[0] -add v12.4s, v12.4s, v13.4s -str q12, [x0, #736] -str q15, [x0, #752] -ldr q15, [x0, #832] -ldr q12, [x0, #848] -sqrdmulh v13.4S, v16.4S, v2.s[0] -sub v6.4s, v5.4s, v14.4s -mla v8.4S, v7.4S, v31.s[0] -mul v16.4S, v16.4S,v0.s[0] -add v5.4s, v5.4s, v14.4s -str q5, [x0, #704] -str q6, [x0, #720] -ldr q4, [x0, #816] -ldr q3, [x0, #800] -sqrdmulh v11.4S, v15.4S, v2.s[0] -sub v23.4s, v4.4s, v8.4s -mla v16.4S, v13.4S, v31.s[0] -mul v15.4S, v15.4S,v0.s[0] -add v4.4s, v4.4s, v8.4s -ldr q8, [x0, #768] -ldr q13, [x0, #784] -sqrdmulh v6.4S, v12.4S, v2.s[0] -sub v5.4s, v3.4s, v16.4s -mla v15.4S, v11.4S, v31.s[0] -mul v12.4S, v12.4S,v0.s[0] -add v3.4s, v3.4s, v16.4s -sqrdmulh v16.4S, v4.4S, v2.s[1] -sub v11.4s, v8.4s, v15.4s -mla v12.4S, v6.4S, v31.s[0] -mul v4.4S, v4.4S,v0.s[1] -add v8.4s, v8.4s, v15.4s -sqrdmulh v15.4S, v3.4S, v2.s[1] -sub v6.4s, v13.4s, v12.4s -mla v4.4S, v16.4S, v31.s[0] -mul v3.4S, v3.4S,v0.s[1] -add v13.4s, v13.4s, v12.4s -sqrdmulh v12.4S, v23.4S, v2.s[2] -sub v16.4s, v13.4s, v4.4s -mla v3.4S, v15.4S, v31.s[0] -mul v23.4S, v23.4S,v0.s[2] -add v13.4s, v13.4s, v4.4s -ldr q4, [x17, #+512] -ldr q15, [x17, #+528] -ldr q14, [x17, #+544] -ldr q7, [x17, #+560] -sqrdmulh v22.4S, v5.4S, v2.s[2] -sub v9.4s, v8.4s, v3.4s -mla v23.4S, v12.4S, v31.s[0] -mul v5.4S, v5.4S,v0.s[2] -add v8.4s, v8.4s, v3.4s -sqrdmulh v3.4S, v13.4S, v10.s[0] -sub v12.4s, v6.4s, v23.4s -mla v5.4S, v22.4S, v31.s[0] -mul v13.4S, v13.4S,v1.s[0] -add v6.4s, v6.4s, v23.4s -sqrdmulh v23.4S, v16.4S, v10.s[1] -sub v22.4s, v11.4s, v5.4s -mla v13.4S, v3.4S, v31.s[0] -mul v16.4S, v16.4S,v1.s[1] -add v11.4s, v11.4s, v5.4s -sqrdmulh v5.4S, v12.4S, v10.s[3] -sub v3.4s, v8.4s, v13.4s -mla v16.4S, v23.4S, v31.s[0] -mul v12.4S, v12.4S,v1.s[3] -add v8.4s, v8.4s, v13.4s -str q8, [x0, #768] -str q3, [x0, #784] -sqrdmulh v3.4S, v6.4S, v10.s[2] -sub v8.4s, v9.4s, v16.4s -mla v12.4S, v5.4S, v31.s[0] -mul v6.4S, v6.4S,v1.s[2] -add v9.4s, v9.4s, v16.4s -str q9, [x0, #800] -str q8, [x0, #816] -ldr q8, [x0, #1008] -ldr q9, [x0, #992] -sqrdmulh v16.4S, v8.4S, v15.s[0] -sub v5.4s, v22.4s, v12.4s -mla v6.4S, v3.4S, v31.s[0] -mul v8.4S, v8.4S,v4.s[0] -add v22.4s, v22.4s, v12.4s -str q22, [x0, #864] -str q5, [x0, #880] -ldr q5, [x0, #960] -ldr q22, [x0, #976] -sqrdmulh v12.4S, v9.4S, v15.s[0] -sub v3.4s, v11.4s, v6.4s -mla v8.4S, v16.4S, v31.s[0] -mul v9.4S, v9.4S,v4.s[0] -add v11.4s, v11.4s, v6.4s -str q11, [x0, #832] -str q3, [x0, #848] -ldr q10, [x0, #944] -ldr q1, [x0, #928] -sqrdmulh v2.4S, v5.4S, v15.s[0] -sub v0.4s, v10.4s, v8.4s -mla v9.4S, v12.4S, v31.s[0] -mul v5.4S, v5.4S,v4.s[0] -add v10.4s, v10.4s, v8.4s -ldr q8, [x0, #896] -ldr q12, [x0, #912] -sqrdmulh v3.4S, v22.4S, v15.s[0] -sub v11.4s, v1.4s, v9.4s -mla v5.4S, v2.4S, v31.s[0] -mul v22.4S, v22.4S,v4.s[0] -add v1.4s, v1.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v15.s[1] -sub v2.4s, v8.4s, v5.4s -mla v22.4S, v3.4S, v31.s[0] -mul v10.4S, v10.4S,v4.s[1] -add v8.4s, v8.4s, v5.4s -sqrdmulh v5.4S, v1.4S, v15.s[1] -sub v3.4s, v12.4s, v22.4s -mla v10.4S, v9.4S, v31.s[0] -mul v1.4S, v1.4S,v4.s[1] -add v12.4s, v12.4s, v22.4s -sqrdmulh v22.4S, v0.4S, v15.s[2] -sub v9.4s, v12.4s, v10.4s -mla v1.4S, v5.4S, v31.s[0] -mul v0.4S, v0.4S,v4.s[2] -add v12.4s, v12.4s, v10.4s -sqrdmulh v10.4S, v11.4S, v15.s[2] -sub v5.4s, v8.4s, v1.4s -mla v0.4S, v22.4S, v31.s[0] -mul v11.4S, v11.4S,v4.s[2] -add v8.4s, v8.4s, v1.4s -sqrdmulh v1.4S, v12.4S, v7.s[0] -sub v22.4s, v3.4s, v0.4s -mla v11.4S, v10.4S, v31.s[0] -mul v12.4S, v12.4S,v14.s[0] -add v3.4s, v3.4s, v0.4s -sqrdmulh v0.4S, v9.4S, v7.s[1] -sub v10.4s, v2.4s, v11.4s -mla v12.4S, v1.4S, v31.s[0] -mul v9.4S, v9.4S,v14.s[1] -add v2.4s, v2.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v7.s[3] -sub v1.4s, v8.4s, v12.4s -mla v9.4S, v0.4S, v31.s[0] -mul v22.4S, v22.4S,v14.s[3] -add v8.4s, v8.4s, v12.4s -str q8, [x0, #896] -str q1, [x0, #912] -sqrdmulh v1.4S, v3.4S, v7.s[2] -sub v8.4s, v5.4s, v9.4s -mla v22.4S, v11.4S, v31.s[0] -mul v3.4S, v3.4S,v14.s[2] -add v5.4s, v5.4s, v9.4s -str q5, [x0, #928] -str q8, [x0, #944] -sub v8.4s, v10.4s, v22.4s -mla v3.4S, v1.4S, v31.s[0] -add v10.4s, v10.4s, v22.4s -str q10, [x0, #992] -str q8, [x0, #1008] -sub v8.4s, v2.4s, v3.4s -add v2.4s, v2.4s, v3.4s -str q2, [x0, #960] -str q8, [x0, #976] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1444 -// Instruction count: 1440 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_2.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_2.s deleted file mode 100644 index 622ba55..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_2.s +++ /dev/null @@ -1,1474 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 23825509 // Layer 4, block 0 -.word 27028662 // Layer 4, block 1 -.word 0 // Layer None, block None -.word 1307297022 // Layer 3, block 0 -.word 1524716204 // Layer 4, block 0 -.word 1729702351 // Layer 4, block 1 -.word 0 // Layer None, block None -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 14626653 // Layer 3, block 1 -.word 14833295 // Layer 4, block 2 -.word 2138810 // Layer 4, block 3 -.word 0 // Layer None, block None -.word 936034350 // Layer 3, block 1 -.word 949258429 // Layer 4, block 2 -.word 136873393 // Layer 4, block 3 -.word 0 // Layer None, block None -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 29737761 // Layer 3, block 2 -.word 6490403 // Layer 4, block 4 -.word 19648405 // Layer 4, block 5 -.word 0 // Layer None, block None -.word 1903071454 // Layer 3, block 2 -.word 415354091 // Layer 4, block 4 -.word 1257401950 // Layer 4, block 5 -.word 0 // Layer None, block None -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 30285189 // Layer 3, block 3 -.word 31254932 // Layer 4, block 6 -.word 26362414 // Layer 4, block 7 -.word 0 // Layer None, block None -.word 1938104173 // Layer 3, block 3 -.word 2000162988 // Layer 4, block 6 -.word 1687065733 // Layer 4, block 7 -.word 0 // Layer None, block None -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 21289485 // Layer 3, block 4 -.word 572895 // Layer 4, block 8 -.word 26691971 // Layer 4, block 9 -.word 0 // Layer None, block None -.word 1362423055 // Layer 3, block 4 -.word 36662482 // Layer 4, block 8 -.word 1708155771 // Layer 4, block 9 -.word 0 // Layer None, block None -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 9914896 // Layer 3, block 5 -.word 9249292 // Layer 4, block 10 -.word 29292862 // Layer 4, block 11 -.word 0 // Layer None, block None -.word 634504916 // Layer 3, block 5 -.word 591909511 // Layer 4, block 10 -.word 1874600091 // Layer 4, block 11 -.word 0 // Layer None, block None -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 22603682 // Layer 3, block 6 -.word 8247799 // Layer 4, block 12 -.word 5086187 // Layer 4, block 13 -.word 0 // Layer None, block None -.word 1446525244 // Layer 3, block 6 -.word 527818851 // Layer 4, block 12 -.word 325491125 // Layer 4, block 13 -.word 0 // Layer None, block None -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 16204162 // Layer 3, block 7 -.word 28113639 // Layer 4, block 14 -.word 8471290 // Layer 4, block 15 -.word 0 // Layer None, block None -.word 1036987221 // Layer 3, block 7 -.word 1799135579 // Layer 4, block 14 -.word 542121183 // Layer 4, block 15 -.word 0 // Layer None, block None -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.text -.global ntt_u32_incomplete_neon_asm_var_3_3_2 -.global _ntt_u32_incomplete_neon_asm_var_3_3_2 -ntt_u32_incomplete_neon_asm_var_3_3_2: -_ntt_u32_incomplete_neon_asm_var_3_3_2: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x0, #960] -ldr q25, [x0, #832] -sqrdmulh v24.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -ldr q23, [x0, #576] -sqrdmulh v22.4S, v25.4S, v29.s[0] -mul v25.4S, v25.4S,v30.s[0] -ldr q21, [x0, #704] -mla v26.4S, v24.4S, v31.s[0] -sqrdmulh v24.4S, v23.4S, v29.s[0] -mul v23.4S, v23.4S,v30.s[0] -ldr q20, [x0, #448] -mla v25.4S, v22.4S, v31.s[0] -sub v22.4s, v20.4s, v26.4s -add v20.4s, v20.4s, v26.4s -sqrdmulh v26.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -ldr q19, [x0, #320] -mla v23.4S, v24.4S, v31.s[0] -sub v24.4s, v19.4s, v25.4s -add v19.4s, v19.4s, v25.4s -sqrdmulh v25.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -ldr q18, [x0, #64] -mla v21.4S, v26.4S, v31.s[0] -sub v26.4s, v18.4s, v23.4s -add v18.4s, v18.4s, v23.4s -sqrdmulh v23.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -ldr q17, [x0, #192] -mla v20.4S, v25.4S, v31.s[0] -sub v25.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -mla v19.4S, v23.4S, v31.s[0] -sub v23.4s, v17.4s, v20.4s -add v17.4s, v17.4s, v20.4s -sqrdmulh v20.4S, v24.4S, v29.s[2] -mul v24.4S, v24.4S,v30.s[2] -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v18.4s, v19.4s -add v18.4s, v18.4s, v19.4s -sqrdmulh v19.4S, v17.4S, v27.s[0] -mul v17.4S, v17.4S,v28.s[0] -mla v24.4S, v20.4S, v31.s[0] -sub v20.4s, v25.4s, v22.4s -add v25.4s, v25.4s, v22.4s -sqrdmulh v22.4S, v23.4S, v27.s[1] -mul v23.4S, v23.4S,v28.s[1] -mla v17.4S, v19.4S, v31.s[0] -sub v19.4s, v26.4s, v24.4s -add v26.4s, v26.4s, v24.4s -sqrdmulh v24.4S, v20.4S, v27.s[3] -mul v20.4S, v20.4S,v28.s[3] -ldr q16, [x0, #976] -mla v23.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v17.4s -add v18.4s, v18.4s, v17.4s -sqrdmulh v17.4S, v25.4S, v27.s[2] -mul v25.4S, v25.4S,v28.s[2] -ldr q3, [x0, #848] -mla v20.4S, v24.4S, v31.s[0] -sub v24.4s, v21.4s, v23.4s -add v21.4s, v21.4s, v23.4s -sqrdmulh v23.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -ldr q2, [x0, #592] -mla v25.4S, v17.4S, v31.s[0] -sub v17.4s, v19.4s, v20.4s -add v19.4s, v19.4s, v20.4s -sqrdmulh v20.4S, v3.4S, v29.s[0] -str q18, [x0, #64] -mul v3.4S, v3.4S,v30.s[0] -ldr q18, [x0, #720] -mla v16.4S, v23.4S, v31.s[0] -sub v23.4s, v26.4s, v25.4s -add v26.4s, v26.4s, v25.4s -sqrdmulh v25.4S, v2.4S, v29.s[0] -str q22, [x0, #192] -mul v2.4S, v2.4S,v30.s[0] -ldr q22, [x0, #464] -mla v3.4S, v20.4S, v31.s[0] -sub v20.4s, v22.4s, v16.4s -add v22.4s, v22.4s, v16.4s -sqrdmulh v16.4S, v18.4S, v29.s[0] -str q21, [x0, #320] -mul v18.4S, v18.4S,v30.s[0] -ldr q21, [x0, #336] -mla v2.4S, v25.4S, v31.s[0] -sub v25.4s, v21.4s, v3.4s -add v21.4s, v21.4s, v3.4s -sqrdmulh v3.4S, v22.4S, v29.s[1] -str q24, [x0, #448] -mul v22.4S, v22.4S,v30.s[1] -ldr q24, [x0, #80] -mla v18.4S, v16.4S, v31.s[0] -sub v16.4s, v24.4s, v2.4s -add v24.4s, v24.4s, v2.4s -sqrdmulh v2.4S, v21.4S, v29.s[1] -str q19, [x0, #832] -mul v21.4S, v21.4S,v30.s[1] -ldr q19, [x0, #208] -mla v22.4S, v3.4S, v31.s[0] -sub v3.4s, v19.4s, v18.4s -add v19.4s, v19.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v29.s[2] -str q17, [x0, #960] -mul v20.4S, v20.4S,v30.s[2] -mla v21.4S, v2.4S, v31.s[0] -sub v2.4s, v19.4s, v22.4s -add v19.4s, v19.4s, v22.4s -sqrdmulh v22.4S, v25.4S, v29.s[2] -str q26, [x0, #576] -mul v25.4S, v25.4S,v30.s[2] -mla v20.4S, v18.4S, v31.s[0] -sub v18.4s, v24.4s, v21.4s -add v24.4s, v24.4s, v21.4s -sqrdmulh v21.4S, v19.4S, v27.s[0] -str q23, [x0, #704] -mul v19.4S, v19.4S,v28.s[0] -mla v25.4S, v22.4S, v31.s[0] -sub v22.4s, v3.4s, v20.4s -add v3.4s, v3.4s, v20.4s -sqrdmulh v20.4S, v2.4S, v27.s[1] -mul v2.4S, v2.4S,v28.s[1] -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v25.4s -add v16.4s, v16.4s, v25.4s -sqrdmulh v25.4S, v22.4S, v27.s[3] -mul v22.4S, v22.4S,v28.s[3] -ldr q23, [x0, #992] -mla v2.4S, v20.4S, v31.s[0] -sub v20.4s, v24.4s, v19.4s -add v24.4s, v24.4s, v19.4s -sqrdmulh v19.4S, v3.4S, v27.s[2] -mul v3.4S, v3.4S,v28.s[2] -ldr q26, [x0, #864] -mla v22.4S, v25.4S, v31.s[0] -sub v25.4s, v18.4s, v2.4s -add v18.4s, v18.4s, v2.4s -sqrdmulh v2.4S, v23.4S, v29.s[0] -mul v23.4S, v23.4S,v30.s[0] -ldr q17, [x0, #608] -mla v3.4S, v19.4S, v31.s[0] -sub v19.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v26.4S, v29.s[0] -str q24, [x0, #80] -mul v26.4S, v26.4S,v30.s[0] -ldr q24, [x0, #736] -mla v23.4S, v2.4S, v31.s[0] -sub v2.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v17.4S, v29.s[0] -str q20, [x0, #208] -mul v17.4S, v17.4S,v30.s[0] -ldr q20, [x0, #480] -mla v26.4S, v22.4S, v31.s[0] -sub v22.4s, v20.4s, v23.4s -add v20.4s, v20.4s, v23.4s -sqrdmulh v23.4S, v24.4S, v29.s[0] -str q18, [x0, #336] -mul v24.4S, v24.4S,v30.s[0] -ldr q18, [x0, #352] -mla v17.4S, v3.4S, v31.s[0] -sub v3.4s, v18.4s, v26.4s -add v18.4s, v18.4s, v26.4s -sqrdmulh v26.4S, v20.4S, v29.s[1] -str q25, [x0, #464] -mul v20.4S, v20.4S,v30.s[1] -ldr q25, [x0, #96] -mla v24.4S, v23.4S, v31.s[0] -sub v23.4s, v25.4s, v17.4s -add v25.4s, v25.4s, v17.4s -sqrdmulh v17.4S, v18.4S, v29.s[1] -str q21, [x0, #848] -mul v18.4S, v18.4S,v30.s[1] -ldr q21, [x0, #224] -mla v20.4S, v26.4S, v31.s[0] -sub v26.4s, v21.4s, v24.4s -add v21.4s, v21.4s, v24.4s -sqrdmulh v24.4S, v22.4S, v29.s[2] -str q19, [x0, #976] -mul v22.4S, v22.4S,v30.s[2] -mla v18.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v20.4s -add v21.4s, v21.4s, v20.4s -sqrdmulh v20.4S, v3.4S, v29.s[2] -str q16, [x0, #592] -mul v3.4S, v3.4S,v30.s[2] -mla v22.4S, v24.4S, v31.s[0] -sub v24.4s, v25.4s, v18.4s -add v25.4s, v25.4s, v18.4s -sqrdmulh v18.4S, v21.4S, v27.s[0] -str q2, [x0, #720] -mul v21.4S, v21.4S,v28.s[0] -mla v3.4S, v20.4S, v31.s[0] -sub v20.4s, v26.4s, v22.4s -add v26.4s, v26.4s, v22.4s -sqrdmulh v22.4S, v17.4S, v27.s[1] -mul v17.4S, v17.4S,v28.s[1] -mla v21.4S, v18.4S, v31.s[0] -sub v18.4s, v23.4s, v3.4s -add v23.4s, v23.4s, v3.4s -sqrdmulh v3.4S, v20.4S, v27.s[3] -mul v20.4S, v20.4S,v28.s[3] -ldr q2, [x0, #1008] -mla v17.4S, v22.4S, v31.s[0] -sub v22.4s, v25.4s, v21.4s -add v25.4s, v25.4s, v21.4s -sqrdmulh v21.4S, v26.4S, v27.s[2] -mul v26.4S, v26.4S,v28.s[2] -ldr q16, [x0, #880] -mla v20.4S, v3.4S, v31.s[0] -sub v3.4s, v24.4s, v17.4s -add v24.4s, v24.4s, v17.4s -sqrdmulh v17.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -ldr q19, [x0, #624] -mla v26.4S, v21.4S, v31.s[0] -sub v21.4s, v18.4s, v20.4s -add v18.4s, v18.4s, v20.4s -sqrdmulh v20.4S, v16.4S, v29.s[0] -str q25, [x0, #96] -mul v16.4S, v16.4S,v30.s[0] -ldr q25, [x0, #752] -mla v2.4S, v17.4S, v31.s[0] -sub v17.4s, v23.4s, v26.4s -add v23.4s, v23.4s, v26.4s -sqrdmulh v26.4S, v19.4S, v29.s[0] -str q22, [x0, #224] -mul v19.4S, v19.4S,v30.s[0] -ldr q22, [x0, #496] -mla v16.4S, v20.4S, v31.s[0] -sub v20.4s, v22.4s, v2.4s -add v22.4s, v22.4s, v2.4s -sqrdmulh v2.4S, v25.4S, v29.s[0] -str q24, [x0, #352] -mul v25.4S, v25.4S,v30.s[0] -ldr q24, [x0, #368] -mla v19.4S, v26.4S, v31.s[0] -sub v26.4s, v24.4s, v16.4s -add v24.4s, v24.4s, v16.4s -sqrdmulh v16.4S, v22.4S, v29.s[1] -str q3, [x0, #480] -mul v22.4S, v22.4S,v30.s[1] -ldr q3, [x0, #112] -mla v25.4S, v2.4S, v31.s[0] -sub v2.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v24.4S, v29.s[1] -str q18, [x0, #864] -mul v24.4S, v24.4S,v30.s[1] -ldr q18, [x0, #240] -mla v22.4S, v16.4S, v31.s[0] -sub v16.4s, v18.4s, v25.4s -add v18.4s, v18.4s, v25.4s -sqrdmulh v25.4S, v20.4S, v29.s[2] -str q21, [x0, #992] -mul v20.4S, v20.4S,v30.s[2] -mla v24.4S, v19.4S, v31.s[0] -sub v19.4s, v18.4s, v22.4s -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v26.4S, v29.s[2] -str q23, [x0, #608] -mul v26.4S, v26.4S,v30.s[2] -mla v20.4S, v25.4S, v31.s[0] -sub v25.4s, v3.4s, v24.4s -add v3.4s, v3.4s, v24.4s -sqrdmulh v24.4S, v18.4S, v27.s[0] -str q17, [x0, #736] -mul v18.4S, v18.4S,v28.s[0] -mla v26.4S, v22.4S, v31.s[0] -sub v22.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -mla v18.4S, v24.4S, v31.s[0] -sub v24.4s, v2.4s, v26.4s -add v2.4s, v2.4s, v26.4s -sqrdmulh v26.4S, v22.4S, v27.s[3] -mul v22.4S, v22.4S,v28.s[3] -ldr q17, [x0, #896] -mla v19.4S, v20.4S, v31.s[0] -sub v20.4s, v3.4s, v18.4s -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v27.s[2] -mul v16.4S, v16.4S,v28.s[2] -ldr q23, [x0, #768] -mla v22.4S, v26.4S, v31.s[0] -sub v26.4s, v25.4s, v19.4s -add v25.4s, v25.4s, v19.4s -sqrdmulh v19.4S, v17.4S, v29.s[0] -mul v17.4S, v17.4S,v30.s[0] -ldr q21, [x0, #512] -mla v16.4S, v18.4S, v31.s[0] -sub v18.4s, v24.4s, v22.4s -add v24.4s, v24.4s, v22.4s -sqrdmulh v22.4S, v23.4S, v29.s[0] -str q3, [x0, #112] -mul v23.4S, v23.4S,v30.s[0] -ldr q3, [x0, #640] -mla v17.4S, v19.4S, v31.s[0] -sub v19.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v21.4S, v29.s[0] -str q20, [x0, #240] -mul v21.4S, v21.4S,v30.s[0] -ldr q20, [x0, #384] -mla v23.4S, v22.4S, v31.s[0] -sub v22.4s, v20.4s, v17.4s -add v20.4s, v20.4s, v17.4s -sqrdmulh v17.4S, v3.4S, v29.s[0] -str q25, [x0, #368] -mul v3.4S, v3.4S,v30.s[0] -ldr q25, [x0, #256] -mla v21.4S, v16.4S, v31.s[0] -sub v16.4s, v25.4s, v23.4s -add v25.4s, v25.4s, v23.4s -sqrdmulh v23.4S, v20.4S, v29.s[1] -str q26, [x0, #496] -mul v20.4S, v20.4S,v30.s[1] -ldr q26, [x0, #0] -mla v3.4S, v17.4S, v31.s[0] -sub v17.4s, v26.4s, v21.4s -add v26.4s, v26.4s, v21.4s -sqrdmulh v21.4S, v25.4S, v29.s[1] -str q24, [x0, #880] -mul v25.4S, v25.4S,v30.s[1] -ldr q24, [x0, #128] -mla v20.4S, v23.4S, v31.s[0] -sub v23.4s, v24.4s, v3.4s -add v24.4s, v24.4s, v3.4s -sqrdmulh v3.4S, v22.4S, v29.s[2] -str q18, [x0, #1008] -mul v22.4S, v22.4S,v30.s[2] -mla v25.4S, v21.4S, v31.s[0] -sub v21.4s, v24.4s, v20.4s -add v24.4s, v24.4s, v20.4s -sqrdmulh v20.4S, v16.4S, v29.s[2] -str q2, [x0, #624] -mul v16.4S, v16.4S,v30.s[2] -mla v22.4S, v3.4S, v31.s[0] -sub v3.4s, v26.4s, v25.4s -add v26.4s, v26.4s, v25.4s -sqrdmulh v25.4S, v24.4S, v27.s[0] -str q19, [x0, #752] -mul v24.4S, v24.4S,v28.s[0] -mla v16.4S, v20.4S, v31.s[0] -sub v20.4s, v23.4s, v22.4s -add v23.4s, v23.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v27.s[1] -mul v21.4S, v21.4S,v28.s[1] -mla v24.4S, v25.4S, v31.s[0] -sub v25.4s, v17.4s, v16.4s -add v17.4s, v17.4s, v16.4s -sqrdmulh v16.4S, v20.4S, v27.s[3] -mul v20.4S, v20.4S,v28.s[3] -ldr q19, [x0, #912] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v26.4s, v24.4s -add v26.4s, v26.4s, v24.4s -sqrdmulh v24.4S, v23.4S, v27.s[2] -mul v23.4S, v23.4S,v28.s[2] -ldr q2, [x0, #784] -mla v20.4S, v16.4S, v31.s[0] -sub v16.4s, v3.4s, v21.4s -add v3.4s, v3.4s, v21.4s -sqrdmulh v21.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q18, [x0, #528] -mla v23.4S, v24.4S, v31.s[0] -sub v24.4s, v25.4s, v20.4s -add v25.4s, v25.4s, v20.4s -sqrdmulh v20.4S, v2.4S, v29.s[0] -str q26, [x0, #0] -mul v2.4S, v2.4S,v30.s[0] -ldr q26, [x0, #656] -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v17.4s, v23.4s -add v17.4s, v17.4s, v23.4s -sqrdmulh v23.4S, v18.4S, v29.s[0] -str q22, [x0, #128] -mul v18.4S, v18.4S,v30.s[0] -ldr q22, [x0, #400] -mla v2.4S, v20.4S, v31.s[0] -sub v20.4s, v22.4s, v19.4s -add v22.4s, v22.4s, v19.4s -sqrdmulh v19.4S, v26.4S, v29.s[0] -str q3, [x0, #256] -mul v26.4S, v26.4S,v30.s[0] -ldr q3, [x0, #272] -mla v18.4S, v23.4S, v31.s[0] -sub v23.4s, v3.4s, v2.4s -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v22.4S, v29.s[1] -str q16, [x0, #384] -mul v22.4S, v22.4S,v30.s[1] -ldr q16, [x0, #16] -mla v26.4S, v19.4S, v31.s[0] -sub v19.4s, v16.4s, v18.4s -add v16.4s, v16.4s, v18.4s -sqrdmulh v18.4S, v3.4S, v29.s[1] -str q25, [x0, #768] -mul v3.4S, v3.4S,v30.s[1] -ldr q25, [x0, #144] -mla v22.4S, v2.4S, v31.s[0] -sub v2.4s, v25.4s, v26.4s -add v25.4s, v25.4s, v26.4s -sqrdmulh v26.4S, v20.4S, v29.s[2] -str q24, [x0, #896] -mul v20.4S, v20.4S,v30.s[2] -mla v3.4S, v18.4S, v31.s[0] -sub v18.4s, v25.4s, v22.4s -add v25.4s, v25.4s, v22.4s -sqrdmulh v22.4S, v23.4S, v29.s[2] -str q17, [x0, #512] -mul v23.4S, v23.4S,v30.s[2] -mla v20.4S, v26.4S, v31.s[0] -sub v26.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v25.4S, v27.s[0] -str q21, [x0, #640] -mul v25.4S, v25.4S,v28.s[0] -mla v23.4S, v22.4S, v31.s[0] -sub v22.4s, v2.4s, v20.4s -add v2.4s, v2.4s, v20.4s -sqrdmulh v20.4S, v18.4S, v27.s[1] -mul v18.4S, v18.4S,v28.s[1] -mla v25.4S, v3.4S, v31.s[0] -sub v3.4s, v19.4s, v23.4s -add v19.4s, v19.4s, v23.4s -sqrdmulh v23.4S, v22.4S, v27.s[3] -mul v22.4S, v22.4S,v28.s[3] -ldr q21, [x0, #928] -mla v18.4S, v20.4S, v31.s[0] -sub v20.4s, v16.4s, v25.4s -add v16.4s, v16.4s, v25.4s -sqrdmulh v25.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -ldr q17, [x0, #800] -mla v22.4S, v23.4S, v31.s[0] -sub v23.4s, v26.4s, v18.4s -add v26.4s, v26.4s, v18.4s -sqrdmulh v18.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -ldr q24, [x0, #544] -mla v2.4S, v25.4S, v31.s[0] -sub v25.4s, v3.4s, v22.4s -add v3.4s, v3.4s, v22.4s -sqrdmulh v22.4S, v17.4S, v29.s[0] -str q16, [x0, #16] -mul v17.4S, v17.4S,v30.s[0] -ldr q16, [x0, #672] -mla v21.4S, v18.4S, v31.s[0] -sub v18.4s, v19.4s, v2.4s -add v19.4s, v19.4s, v2.4s -sqrdmulh v2.4S, v24.4S, v29.s[0] -str q20, [x0, #144] -mul v24.4S, v24.4S,v30.s[0] -ldr q20, [x0, #416] -mla v17.4S, v22.4S, v31.s[0] -sub v22.4s, v20.4s, v21.4s -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v16.4S, v29.s[0] -str q26, [x0, #272] -mul v16.4S, v16.4S,v30.s[0] -ldr q26, [x0, #288] -mla v24.4S, v2.4S, v31.s[0] -sub v2.4s, v26.4s, v17.4s -add v26.4s, v26.4s, v17.4s -sqrdmulh v17.4S, v20.4S, v29.s[1] -str q23, [x0, #400] -mul v20.4S, v20.4S,v30.s[1] -ldr q23, [x0, #32] -mla v16.4S, v21.4S, v31.s[0] -sub v21.4s, v23.4s, v24.4s -add v23.4s, v23.4s, v24.4s -sqrdmulh v24.4S, v26.4S, v29.s[1] -str q3, [x0, #784] -mul v26.4S, v26.4S,v30.s[1] -ldr q3, [x0, #160] -mla v20.4S, v17.4S, v31.s[0] -sub v17.4s, v3.4s, v16.4s -add v3.4s, v3.4s, v16.4s -sqrdmulh v16.4S, v22.4S, v29.s[2] -str q25, [x0, #912] -mul v22.4S, v22.4S,v30.s[2] -mla v26.4S, v24.4S, v31.s[0] -sub v24.4s, v3.4s, v20.4s -add v3.4s, v3.4s, v20.4s -sqrdmulh v20.4S, v2.4S, v29.s[2] -str q19, [x0, #528] -mul v2.4S, v2.4S,v30.s[2] -mla v22.4S, v16.4S, v31.s[0] -sub v16.4s, v23.4s, v26.4s -add v23.4s, v23.4s, v26.4s -sqrdmulh v26.4S, v3.4S, v27.s[0] -str q18, [x0, #656] -mul v3.4S, v3.4S,v28.s[0] -mla v2.4S, v20.4S, v31.s[0] -sub v20.4s, v17.4s, v22.4s -add v17.4s, v17.4s, v22.4s -sqrdmulh v22.4S, v24.4S, v27.s[1] -mul v24.4S, v24.4S,v28.s[1] -mla v3.4S, v26.4S, v31.s[0] -sub v26.4s, v21.4s, v2.4s -add v21.4s, v21.4s, v2.4s -sqrdmulh v2.4S, v20.4S, v27.s[3] -mul v20.4S, v20.4S,v28.s[3] -ldr q18, [x0, #944] -mla v24.4S, v22.4S, v31.s[0] -sub v22.4s, v23.4s, v3.4s -add v23.4s, v23.4s, v3.4s -sqrdmulh v3.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -ldr q19, [x0, #816] -mla v20.4S, v2.4S, v31.s[0] -sub v2.4s, v16.4s, v24.4s -add v16.4s, v16.4s, v24.4s -sqrdmulh v24.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -ldr q25, [x0, #560] -mla v17.4S, v3.4S, v31.s[0] -sub v3.4s, v26.4s, v20.4s -add v26.4s, v26.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v29.s[0] -str q23, [x0, #32] -mul v19.4S, v19.4S,v30.s[0] -ldr q23, [x0, #688] -mla v18.4S, v24.4S, v31.s[0] -sub v24.4s, v21.4s, v17.4s -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v25.4S, v29.s[0] -str q22, [x0, #160] -mul v25.4S, v25.4S,v30.s[0] -ldr q22, [x0, #432] -mla v19.4S, v20.4S, v31.s[0] -sub v20.4s, v22.4s, v18.4s -add v22.4s, v22.4s, v18.4s -sqrdmulh v18.4S, v23.4S, v29.s[0] -str q16, [x0, #288] -mul v23.4S, v23.4S,v30.s[0] -ldr q16, [x0, #304] -mla v25.4S, v17.4S, v31.s[0] -sub v17.4s, v16.4s, v19.4s -add v16.4s, v16.4s, v19.4s -sqrdmulh v19.4S, v22.4S, v29.s[1] -str q2, [x0, #416] -mul v22.4S, v22.4S,v30.s[1] -ldr q2, [x0, #48] -mla v23.4S, v18.4S, v31.s[0] -sub v18.4s, v2.4s, v25.4s -add v2.4s, v2.4s, v25.4s -sqrdmulh v25.4S, v16.4S, v29.s[1] -str q26, [x0, #800] -mul v16.4S, v16.4S,v30.s[1] -ldr q26, [x0, #176] -mla v22.4S, v19.4S, v31.s[0] -sub v19.4s, v26.4s, v23.4s -add v26.4s, v26.4s, v23.4s -sqrdmulh v23.4S, v20.4S, v29.s[2] -str q3, [x0, #928] -mul v20.4S, v20.4S,v30.s[2] -mla v16.4S, v25.4S, v31.s[0] -sub v25.4s, v26.4s, v22.4s -add v26.4s, v26.4s, v22.4s -sqrdmulh v22.4S, v17.4S, v29.s[2] -str q21, [x0, #544] -mul v17.4S, v17.4S,v30.s[2] -mla v20.4S, v23.4S, v31.s[0] -sub v23.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v26.4S, v27.s[0] -str q24, [x0, #672] -mul v26.4S, v26.4S,v28.s[0] -mla v17.4S, v22.4S, v31.s[0] -sub v22.4s, v19.4s, v20.4s -add v19.4s, v19.4s, v20.4s -sqrdmulh v20.4S, v25.4S, v27.s[1] -mul v25.4S, v25.4S,v28.s[1] -mla v26.4S, v16.4S, v31.s[0] -sub v16.4s, v18.4s, v17.4s -add v18.4s, v18.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v27.s[3] -mul v22.4S, v22.4S,v28.s[3] -mla v25.4S, v20.4S, v31.s[0] -sub v20.4s, v2.4s, v26.4s -add v2.4s, v2.4s, v26.4s -sqrdmulh v26.4S, v19.4S, v27.s[2] -mul v19.4S, v19.4S,v28.s[2] -mla v22.4S, v17.4S, v31.s[0] -sub v17.4s, v23.4s, v25.4s -add v23.4s, v23.4s, v25.4s -mla v19.4S, v26.4S, v31.s[0] -sub v26.4s, v16.4s, v22.4s -add v16.4s, v16.4s, v22.4s -str q2, [x0, #48] -sub v2.4s, v18.4s, v19.4s -add v18.4s, v18.4s, v19.4s -str q20, [x0, #176] -str q23, [x0, #304] -str q17, [x0, #432] -str q16, [x0, #816] -str q26, [x0, #944] -str q18, [x0, #560] -str q2, [x0, #688] -ldr q4, [x17, #+64] -ldr q5, [x17, #+80] -ldr q6, [x17, #+96] -ldr q7, [x17, #+112] -ldr q8, [x0, #112] -ldr q9, [x0, #96] -sqrdmulh v10.4S, v8.4S, v5.s[0] -mul v8.4S, v8.4S,v4.s[0] -ldr q11, [x0, #64] -sqrdmulh v12.4S, v9.4S, v5.s[0] -mul v9.4S, v9.4S,v4.s[0] -ldr q13, [x0, #80] -mla v8.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v11.4S, v5.s[0] -mul v11.4S, v11.4S,v4.s[0] -ldr q14, [x0, #48] -mla v9.4S, v12.4S, v31.s[0] -sub v12.4s, v14.4s, v8.4s -add v14.4s, v14.4s, v8.4s -sqrdmulh v8.4S, v13.4S, v5.s[0] -mul v13.4S, v13.4S,v4.s[0] -ldr q15, [x0, #32] -mla v11.4S, v10.4S, v31.s[0] -sub v10.4s, v15.4s, v9.4s -add v15.4s, v15.4s, v9.4s -sqrdmulh v9.4S, v14.4S, v5.s[1] -mul v14.4S, v14.4S,v4.s[1] -ldr q0, [x0, #0] -mla v13.4S, v8.4S, v31.s[0] -sub v8.4s, v0.4s, v11.4s -add v0.4s, v0.4s, v11.4s -sqrdmulh v11.4S, v15.4S, v5.s[1] -mul v15.4S, v15.4S,v4.s[1] -ldr q1, [x0, #16] -mla v14.4S, v9.4S, v31.s[0] -sub v9.4s, v1.4s, v13.4s -add v1.4s, v1.4s, v13.4s -sqrdmulh v13.4S, v12.4S, v5.s[2] -mul v12.4S, v12.4S,v4.s[2] -mla v15.4S, v11.4S, v31.s[0] -sub v11.4s, v1.4s, v14.4s -add v1.4s, v1.4s, v14.4s -sqrdmulh v14.4S, v10.4S, v5.s[2] -mul v10.4S, v10.4S,v4.s[2] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v0.4s, v15.4s -add v0.4s, v0.4s, v15.4s -ldr q15, [x17, #+128] -sqrdmulh v3.4S, v1.4S, v7.s[0] -mul v1.4S, v1.4S,v6.s[0] -mla v10.4S, v14.4S, v31.s[0] -sub v14.4s, v9.4s, v12.4s -add v9.4s, v9.4s, v12.4s -ldr q12, [x17, #+144] -sqrdmulh v21.4S, v11.4S, v7.s[1] -mul v11.4S, v11.4S,v6.s[1] -mla v1.4S, v3.4S, v31.s[0] -sub v3.4s, v8.4s, v10.4s -add v8.4s, v8.4s, v10.4s -ldr q10, [x17, #+160] -ldr q24, [x17, #+176] -sqrdmulh v25.4S, v14.4S, v7.s[3] -mul v14.4S, v14.4S,v6.s[3] -ldr q22, [x0, #240] -mla v11.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v1.4s -add v0.4s, v0.4s, v1.4s -sqrdmulh v1.4S, v9.4S, v7.s[2] -mul v9.4S, v9.4S,v6.s[2] -ldr q19, [x0, #224] -mla v14.4S, v25.4S, v31.s[0] -sub v25.4s, v13.4s, v11.4s -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v12.s[0] -mul v22.4S, v22.4S,v15.s[0] -ldr q30, [x0, #192] -mla v9.4S, v1.4S, v31.s[0] -sub v1.4s, v3.4s, v14.4s -add v3.4s, v3.4s, v14.4s -sqrdmulh v14.4S, v19.4S, v12.s[0] -str q0, [x0, #0] -mul v19.4S, v19.4S,v15.s[0] -ldr q0, [x0, #208] -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v8.4s, v9.4s -add v8.4s, v8.4s, v9.4s -sqrdmulh v7.4S, v30.4S, v12.s[0] -str q21, [x0, #16] -mul v30.4S, v30.4S,v15.s[0] -ldr q21, [x0, #176] -mla v19.4S, v14.4S, v31.s[0] -sub v14.4s, v21.4s, v22.4s -add v21.4s, v21.4s, v22.4s -sqrdmulh v22.4S, v0.4S, v12.s[0] -str q13, [x0, #32] -mul v0.4S, v0.4S,v15.s[0] -ldr q13, [x0, #160] -mla v30.4S, v7.4S, v31.s[0] -sub v7.4s, v13.4s, v19.4s -add v13.4s, v13.4s, v19.4s -sqrdmulh v19.4S, v21.4S, v12.s[1] -str q25, [x0, #48] -mul v21.4S, v21.4S,v15.s[1] -ldr q25, [x0, #128] -mla v0.4S, v22.4S, v31.s[0] -sub v22.4s, v25.4s, v30.4s -add v25.4s, v25.4s, v30.4s -sqrdmulh v30.4S, v13.4S, v12.s[1] -str q3, [x0, #96] -mul v13.4S, v13.4S,v15.s[1] -ldr q3, [x0, #144] -mla v21.4S, v19.4S, v31.s[0] -sub v19.4s, v3.4s, v0.4s -add v3.4s, v3.4s, v0.4s -sqrdmulh v0.4S, v14.4S, v12.s[2] -str q1, [x0, #112] -mul v14.4S, v14.4S,v15.s[2] -mla v13.4S, v30.4S, v31.s[0] -sub v30.4s, v3.4s, v21.4s -add v3.4s, v3.4s, v21.4s -sqrdmulh v21.4S, v7.4S, v12.s[2] -str q8, [x0, #64] -mul v7.4S, v7.4S,v15.s[2] -mla v14.4S, v0.4S, v31.s[0] -sub v0.4s, v25.4s, v13.4s -add v25.4s, v25.4s, v13.4s -ldr q13, [x17, #+192] -sqrdmulh v8.4S, v3.4S, v24.s[0] -str q11, [x0, #80] -mul v3.4S, v3.4S,v10.s[0] -mla v7.4S, v21.4S, v31.s[0] -sub v21.4s, v19.4s, v14.4s -add v19.4s, v19.4s, v14.4s -ldr q14, [x17, #+208] -sqrdmulh v11.4S, v30.4S, v24.s[1] -mul v30.4S, v30.4S,v10.s[1] -mla v3.4S, v8.4S, v31.s[0] -sub v8.4s, v22.4s, v7.4s -add v22.4s, v22.4s, v7.4s -ldr q7, [x17, #+224] -ldr q1, [x17, #+240] -sqrdmulh v6.4S, v21.4S, v24.s[3] -mul v21.4S, v21.4S,v10.s[3] -ldr q5, [x0, #368] -mla v30.4S, v11.4S, v31.s[0] -sub v11.4s, v25.4s, v3.4s -add v25.4s, v25.4s, v3.4s -sqrdmulh v3.4S, v19.4S, v24.s[2] -mul v19.4S, v19.4S,v10.s[2] -ldr q4, [x0, #352] -mla v21.4S, v6.4S, v31.s[0] -sub v6.4s, v0.4s, v30.4s -add v0.4s, v0.4s, v30.4s -sqrdmulh v30.4S, v5.4S, v14.s[0] -mul v5.4S, v5.4S,v13.s[0] -ldr q9, [x0, #320] -mla v19.4S, v3.4S, v31.s[0] -sub v3.4s, v8.4s, v21.4s -add v8.4s, v8.4s, v21.4s -sqrdmulh v21.4S, v4.4S, v14.s[0] -str q25, [x0, #128] -mul v4.4S, v4.4S,v13.s[0] -ldr q25, [x0, #336] -mla v5.4S, v30.4S, v31.s[0] -sub v30.4s, v22.4s, v19.4s -add v22.4s, v22.4s, v19.4s -sqrdmulh v24.4S, v9.4S, v14.s[0] -str q11, [x0, #144] -mul v9.4S, v9.4S,v13.s[0] -ldr q11, [x0, #304] -mla v4.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v5.4s -add v11.4s, v11.4s, v5.4s -sqrdmulh v5.4S, v25.4S, v14.s[0] -str q0, [x0, #160] -mul v25.4S, v25.4S,v13.s[0] -ldr q0, [x0, #288] -mla v9.4S, v24.4S, v31.s[0] -sub v24.4s, v0.4s, v4.4s -add v0.4s, v0.4s, v4.4s -sqrdmulh v4.4S, v11.4S, v14.s[1] -str q6, [x0, #176] -mul v11.4S, v11.4S,v13.s[1] -ldr q6, [x0, #256] -mla v25.4S, v5.4S, v31.s[0] -sub v5.4s, v6.4s, v9.4s -add v6.4s, v6.4s, v9.4s -sqrdmulh v9.4S, v0.4S, v14.s[1] -str q8, [x0, #224] -mul v0.4S, v0.4S,v13.s[1] -ldr q8, [x0, #272] -mla v11.4S, v4.4S, v31.s[0] -sub v4.4s, v8.4s, v25.4s -add v8.4s, v8.4s, v25.4s -sqrdmulh v25.4S, v21.4S, v14.s[2] -str q3, [x0, #240] -mul v21.4S, v21.4S,v13.s[2] -mla v0.4S, v9.4S, v31.s[0] -sub v9.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -sqrdmulh v11.4S, v24.4S, v14.s[2] -str q22, [x0, #192] -mul v24.4S, v24.4S,v13.s[2] -mla v21.4S, v25.4S, v31.s[0] -sub v25.4s, v6.4s, v0.4s -add v6.4s, v6.4s, v0.4s -ldr q0, [x17, #+256] -sqrdmulh v22.4S, v8.4S, v1.s[0] -str q30, [x0, #208] -mul v8.4S, v8.4S,v7.s[0] -mla v24.4S, v11.4S, v31.s[0] -sub v11.4s, v4.4s, v21.4s -add v4.4s, v4.4s, v21.4s -ldr q21, [x17, #+272] -sqrdmulh v30.4S, v9.4S, v1.s[1] -mul v9.4S, v9.4S,v7.s[1] -mla v8.4S, v22.4S, v31.s[0] -sub v22.4s, v5.4s, v24.4s -add v5.4s, v5.4s, v24.4s -ldr q24, [x17, #+288] -ldr q3, [x17, #+304] -sqrdmulh v10.4S, v11.4S, v1.s[3] -mul v11.4S, v11.4S,v7.s[3] -ldr q12, [x0, #496] -mla v9.4S, v30.4S, v31.s[0] -sub v30.4s, v6.4s, v8.4s -add v6.4s, v6.4s, v8.4s -sqrdmulh v8.4S, v4.4S, v1.s[2] -mul v4.4S, v4.4S,v7.s[2] -ldr q15, [x0, #480] -mla v11.4S, v10.4S, v31.s[0] -sub v10.4s, v25.4s, v9.4s -add v25.4s, v25.4s, v9.4s -sqrdmulh v9.4S, v12.4S, v21.s[0] -mul v12.4S, v12.4S,v0.s[0] -ldr q19, [x0, #448] -mla v4.4S, v8.4S, v31.s[0] -sub v8.4s, v22.4s, v11.4s -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v15.4S, v21.s[0] -str q6, [x0, #256] -mul v15.4S, v15.4S,v0.s[0] -ldr q6, [x0, #464] -mla v12.4S, v9.4S, v31.s[0] -sub v9.4s, v5.4s, v4.4s -add v5.4s, v5.4s, v4.4s -sqrdmulh v1.4S, v19.4S, v21.s[0] -str q30, [x0, #272] -mul v19.4S, v19.4S,v0.s[0] -ldr q30, [x0, #432] -mla v15.4S, v11.4S, v31.s[0] -sub v11.4s, v30.4s, v12.4s -add v30.4s, v30.4s, v12.4s -sqrdmulh v12.4S, v6.4S, v21.s[0] -str q25, [x0, #288] -mul v6.4S, v6.4S,v0.s[0] -ldr q25, [x0, #416] -mla v19.4S, v1.4S, v31.s[0] -sub v1.4s, v25.4s, v15.4s -add v25.4s, v25.4s, v15.4s -sqrdmulh v15.4S, v30.4S, v21.s[1] -str q10, [x0, #304] -mul v30.4S, v30.4S,v0.s[1] -ldr q10, [x0, #384] -mla v6.4S, v12.4S, v31.s[0] -sub v12.4s, v10.4s, v19.4s -add v10.4s, v10.4s, v19.4s -sqrdmulh v19.4S, v25.4S, v21.s[1] -str q22, [x0, #352] -mul v25.4S, v25.4S,v0.s[1] -ldr q22, [x0, #400] -mla v30.4S, v15.4S, v31.s[0] -sub v15.4s, v22.4s, v6.4s -add v22.4s, v22.4s, v6.4s -sqrdmulh v6.4S, v11.4S, v21.s[2] -str q8, [x0, #368] -mul v11.4S, v11.4S,v0.s[2] -mla v25.4S, v19.4S, v31.s[0] -sub v19.4s, v22.4s, v30.4s -add v22.4s, v22.4s, v30.4s -sqrdmulh v30.4S, v1.4S, v21.s[2] -str q5, [x0, #320] -mul v1.4S, v1.4S,v0.s[2] -mla v11.4S, v6.4S, v31.s[0] -sub v6.4s, v10.4s, v25.4s -add v10.4s, v10.4s, v25.4s -ldr q25, [x17, #+320] -sqrdmulh v5.4S, v22.4S, v3.s[0] -str q9, [x0, #336] -mul v22.4S, v22.4S,v24.s[0] -mla v1.4S, v30.4S, v31.s[0] -sub v30.4s, v15.4s, v11.4s -add v15.4s, v15.4s, v11.4s -ldr q11, [x17, #+336] -sqrdmulh v9.4S, v19.4S, v3.s[1] -mul v19.4S, v19.4S,v24.s[1] -mla v22.4S, v5.4S, v31.s[0] -sub v5.4s, v12.4s, v1.4s -add v12.4s, v12.4s, v1.4s -ldr q1, [x17, #+352] -ldr q8, [x17, #+368] -sqrdmulh v7.4S, v30.4S, v3.s[3] -mul v30.4S, v30.4S,v24.s[3] -ldr q14, [x0, #624] -mla v19.4S, v9.4S, v31.s[0] -sub v9.4s, v10.4s, v22.4s -add v10.4s, v10.4s, v22.4s -sqrdmulh v22.4S, v15.4S, v3.s[2] -mul v15.4S, v15.4S,v24.s[2] -ldr q13, [x0, #608] -mla v30.4S, v7.4S, v31.s[0] -sub v7.4s, v6.4s, v19.4s -add v6.4s, v6.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v11.s[0] -mul v14.4S, v14.4S,v25.s[0] -ldr q4, [x0, #576] -mla v15.4S, v22.4S, v31.s[0] -sub v22.4s, v5.4s, v30.4s -add v5.4s, v5.4s, v30.4s -sqrdmulh v30.4S, v13.4S, v11.s[0] -str q10, [x0, #384] -mul v13.4S, v13.4S,v25.s[0] -ldr q10, [x0, #592] -mla v14.4S, v19.4S, v31.s[0] -sub v19.4s, v12.4s, v15.4s -add v12.4s, v12.4s, v15.4s -sqrdmulh v3.4S, v4.4S, v11.s[0] -str q9, [x0, #400] -mul v4.4S, v4.4S,v25.s[0] -ldr q9, [x0, #560] -mla v13.4S, v30.4S, v31.s[0] -sub v30.4s, v9.4s, v14.4s -add v9.4s, v9.4s, v14.4s -sqrdmulh v14.4S, v10.4S, v11.s[0] -str q6, [x0, #416] -mul v10.4S, v10.4S,v25.s[0] -ldr q6, [x0, #544] -mla v4.4S, v3.4S, v31.s[0] -sub v3.4s, v6.4s, v13.4s -add v6.4s, v6.4s, v13.4s -sqrdmulh v13.4S, v9.4S, v11.s[1] -str q7, [x0, #432] -mul v9.4S, v9.4S,v25.s[1] -ldr q7, [x0, #512] -mla v10.4S, v14.4S, v31.s[0] -sub v14.4s, v7.4s, v4.4s -add v7.4s, v7.4s, v4.4s -sqrdmulh v4.4S, v6.4S, v11.s[1] -str q5, [x0, #480] -mul v6.4S, v6.4S,v25.s[1] -ldr q5, [x0, #528] -mla v9.4S, v13.4S, v31.s[0] -sub v13.4s, v5.4s, v10.4s -add v5.4s, v5.4s, v10.4s -sqrdmulh v10.4S, v30.4S, v11.s[2] -str q22, [x0, #496] -mul v30.4S, v30.4S,v25.s[2] -mla v6.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v9.4s -add v5.4s, v5.4s, v9.4s -sqrdmulh v9.4S, v3.4S, v11.s[2] -str q12, [x0, #448] -mul v3.4S, v3.4S,v25.s[2] -mla v30.4S, v10.4S, v31.s[0] -sub v10.4s, v7.4s, v6.4s -add v7.4s, v7.4s, v6.4s -ldr q6, [x17, #+384] -sqrdmulh v12.4S, v5.4S, v8.s[0] -str q19, [x0, #464] -mul v5.4S, v5.4S,v1.s[0] -mla v3.4S, v9.4S, v31.s[0] -sub v9.4s, v13.4s, v30.4s -add v13.4s, v13.4s, v30.4s -ldr q30, [x17, #+400] -sqrdmulh v19.4S, v4.4S, v8.s[1] -mul v4.4S, v4.4S,v1.s[1] -mla v5.4S, v12.4S, v31.s[0] -sub v12.4s, v14.4s, v3.4s -add v14.4s, v14.4s, v3.4s -ldr q3, [x17, #+416] -ldr q22, [x17, #+432] -sqrdmulh v24.4S, v9.4S, v8.s[3] -mul v9.4S, v9.4S,v1.s[3] -ldr q21, [x0, #752] -mla v4.4S, v19.4S, v31.s[0] -sub v19.4s, v7.4s, v5.4s -add v7.4s, v7.4s, v5.4s -sqrdmulh v5.4S, v13.4S, v8.s[2] -mul v13.4S, v13.4S,v1.s[2] -ldr q0, [x0, #736] -mla v9.4S, v24.4S, v31.s[0] -sub v24.4s, v10.4s, v4.4s -add v10.4s, v10.4s, v4.4s -sqrdmulh v4.4S, v21.4S, v30.s[0] -mul v21.4S, v21.4S,v6.s[0] -ldr q15, [x0, #704] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v12.4s, v9.4s -add v12.4s, v12.4s, v9.4s -sqrdmulh v9.4S, v0.4S, v30.s[0] -str q7, [x0, #512] -mul v0.4S, v0.4S,v6.s[0] -ldr q7, [x0, #720] -mla v21.4S, v4.4S, v31.s[0] -sub v4.4s, v14.4s, v13.4s -add v14.4s, v14.4s, v13.4s -sqrdmulh v8.4S, v15.4S, v30.s[0] -str q19, [x0, #528] -mul v15.4S, v15.4S,v6.s[0] -ldr q19, [x0, #688] -mla v0.4S, v9.4S, v31.s[0] -sub v9.4s, v19.4s, v21.4s -add v19.4s, v19.4s, v21.4s -sqrdmulh v21.4S, v7.4S, v30.s[0] -str q10, [x0, #544] -mul v7.4S, v7.4S,v6.s[0] -ldr q10, [x0, #672] -mla v15.4S, v8.4S, v31.s[0] -sub v8.4s, v10.4s, v0.4s -add v10.4s, v10.4s, v0.4s -sqrdmulh v0.4S, v19.4S, v30.s[1] -str q24, [x0, #560] -mul v19.4S, v19.4S,v6.s[1] -ldr q24, [x0, #640] -mla v7.4S, v21.4S, v31.s[0] -sub v21.4s, v24.4s, v15.4s -add v24.4s, v24.4s, v15.4s -sqrdmulh v15.4S, v10.4S, v30.s[1] -str q12, [x0, #608] -mul v10.4S, v10.4S,v6.s[1] -ldr q12, [x0, #656] -mla v19.4S, v0.4S, v31.s[0] -sub v0.4s, v12.4s, v7.4s -add v12.4s, v12.4s, v7.4s -sqrdmulh v7.4S, v9.4S, v30.s[2] -str q5, [x0, #624] -mul v9.4S, v9.4S,v6.s[2] -mla v10.4S, v15.4S, v31.s[0] -sub v15.4s, v12.4s, v19.4s -add v12.4s, v12.4s, v19.4s -sqrdmulh v19.4S, v8.4S, v30.s[2] -str q14, [x0, #576] -mul v8.4S, v8.4S,v6.s[2] -mla v9.4S, v7.4S, v31.s[0] -sub v7.4s, v24.4s, v10.4s -add v24.4s, v24.4s, v10.4s -ldr q10, [x17, #+448] -sqrdmulh v14.4S, v12.4S, v22.s[0] -str q4, [x0, #592] -mul v12.4S, v12.4S,v3.s[0] -mla v8.4S, v19.4S, v31.s[0] -sub v19.4s, v0.4s, v9.4s -add v0.4s, v0.4s, v9.4s -ldr q9, [x17, #+464] -sqrdmulh v4.4S, v15.4S, v22.s[1] -mul v15.4S, v15.4S,v3.s[1] -mla v12.4S, v14.4S, v31.s[0] -sub v14.4s, v21.4s, v8.4s -add v21.4s, v21.4s, v8.4s -ldr q8, [x17, #+480] -ldr q5, [x17, #+496] -sqrdmulh v1.4S, v19.4S, v22.s[3] -mul v19.4S, v19.4S,v3.s[3] -ldr q11, [x0, #880] -mla v15.4S, v4.4S, v31.s[0] -sub v4.4s, v24.4s, v12.4s -add v24.4s, v24.4s, v12.4s -sqrdmulh v12.4S, v0.4S, v22.s[2] -mul v0.4S, v0.4S,v3.s[2] -ldr q25, [x0, #864] -mla v19.4S, v1.4S, v31.s[0] -sub v1.4s, v7.4s, v15.4s -add v7.4s, v7.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v9.s[0] -mul v11.4S, v11.4S,v10.s[0] -ldr q13, [x0, #832] -mla v0.4S, v12.4S, v31.s[0] -sub v12.4s, v14.4s, v19.4s -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v25.4S, v9.s[0] -str q24, [x0, #640] -mul v25.4S, v25.4S,v10.s[0] -ldr q24, [x0, #848] -mla v11.4S, v15.4S, v31.s[0] -sub v15.4s, v21.4s, v0.4s -add v21.4s, v21.4s, v0.4s -sqrdmulh v22.4S, v13.4S, v9.s[0] -str q4, [x0, #656] -mul v13.4S, v13.4S,v10.s[0] -ldr q4, [x0, #816] -mla v25.4S, v19.4S, v31.s[0] -sub v19.4s, v4.4s, v11.4s -add v4.4s, v4.4s, v11.4s -sqrdmulh v11.4S, v24.4S, v9.s[0] -str q7, [x0, #672] -mul v24.4S, v24.4S,v10.s[0] -ldr q7, [x0, #800] -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v7.4s, v25.4s -add v7.4s, v7.4s, v25.4s -sqrdmulh v25.4S, v4.4S, v9.s[1] -str q1, [x0, #688] -mul v4.4S, v4.4S,v10.s[1] -ldr q1, [x0, #768] -mla v24.4S, v11.4S, v31.s[0] -sub v11.4s, v1.4s, v13.4s -add v1.4s, v1.4s, v13.4s -sqrdmulh v13.4S, v7.4S, v9.s[1] -str q14, [x0, #736] -mul v7.4S, v7.4S,v10.s[1] -ldr q14, [x0, #784] -mla v4.4S, v25.4S, v31.s[0] -sub v25.4s, v14.4s, v24.4s -add v14.4s, v14.4s, v24.4s -sqrdmulh v24.4S, v19.4S, v9.s[2] -str q12, [x0, #752] -mul v19.4S, v19.4S,v10.s[2] -mla v7.4S, v13.4S, v31.s[0] -sub v13.4s, v14.4s, v4.4s -add v14.4s, v14.4s, v4.4s -sqrdmulh v4.4S, v22.4S, v9.s[2] -str q21, [x0, #704] -mul v22.4S, v22.4S,v10.s[2] -mla v19.4S, v24.4S, v31.s[0] -sub v24.4s, v1.4s, v7.4s -add v1.4s, v1.4s, v7.4s -ldr q7, [x17, #+512] -sqrdmulh v21.4S, v14.4S, v5.s[0] -str q15, [x0, #720] -mul v14.4S, v14.4S,v8.s[0] -mla v22.4S, v4.4S, v31.s[0] -sub v4.4s, v25.4s, v19.4s -add v25.4s, v25.4s, v19.4s -ldr q19, [x17, #+528] -sqrdmulh v15.4S, v13.4S, v5.s[1] -mul v13.4S, v13.4S,v8.s[1] -mla v14.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -ldr q22, [x17, #+544] -ldr q12, [x17, #+560] -sqrdmulh v3.4S, v4.4S, v5.s[3] -mul v4.4S, v4.4S,v8.s[3] -ldr q30, [x0, #1008] -mla v13.4S, v15.4S, v31.s[0] -sub v15.4s, v1.4s, v14.4s -add v1.4s, v1.4s, v14.4s -sqrdmulh v14.4S, v25.4S, v5.s[2] -mul v25.4S, v25.4S,v8.s[2] -ldr q6, [x0, #992] -mla v4.4S, v3.4S, v31.s[0] -sub v3.4s, v24.4s, v13.4s -add v24.4s, v24.4s, v13.4s -sqrdmulh v13.4S, v30.4S, v19.s[0] -mul v30.4S, v30.4S,v7.s[0] -ldr q0, [x0, #960] -mla v25.4S, v14.4S, v31.s[0] -sub v14.4s, v21.4s, v4.4s -add v21.4s, v21.4s, v4.4s -sqrdmulh v4.4S, v6.4S, v19.s[0] -str q1, [x0, #768] -mul v6.4S, v6.4S,v7.s[0] -ldr q1, [x0, #976] -mla v30.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v25.4s -add v11.4s, v11.4s, v25.4s -sqrdmulh v5.4S, v0.4S, v19.s[0] -str q15, [x0, #784] -mul v0.4S, v0.4S,v7.s[0] -ldr q15, [x0, #944] -mla v6.4S, v4.4S, v31.s[0] -sub v4.4s, v15.4s, v30.4s -add v15.4s, v15.4s, v30.4s -sqrdmulh v30.4S, v1.4S, v19.s[0] -str q24, [x0, #800] -mul v1.4S, v1.4S,v7.s[0] -ldr q24, [x0, #928] -mla v0.4S, v5.4S, v31.s[0] -sub v5.4s, v24.4s, v6.4s -add v24.4s, v24.4s, v6.4s -sqrdmulh v6.4S, v15.4S, v19.s[1] -str q3, [x0, #816] -mul v15.4S, v15.4S,v7.s[1] -ldr q3, [x0, #896] -mla v1.4S, v30.4S, v31.s[0] -sub v30.4s, v3.4s, v0.4s -add v3.4s, v3.4s, v0.4s -sqrdmulh v0.4S, v24.4S, v19.s[1] -str q21, [x0, #864] -mul v24.4S, v24.4S,v7.s[1] -ldr q21, [x0, #912] -mla v15.4S, v6.4S, v31.s[0] -sub v6.4s, v21.4s, v1.4s -add v21.4s, v21.4s, v1.4s -sqrdmulh v1.4S, v4.4S, v19.s[2] -str q14, [x0, #880] -mul v4.4S, v4.4S,v7.s[2] -mla v24.4S, v0.4S, v31.s[0] -sub v0.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -sqrdmulh v15.4S, v5.4S, v19.s[2] -str q11, [x0, #832] -mul v5.4S, v5.4S,v7.s[2] -mla v4.4S, v1.4S, v31.s[0] -sub v1.4s, v3.4s, v24.4s -add v3.4s, v3.4s, v24.4s -sqrdmulh v24.4S, v21.4S, v12.s[0] -str q13, [x0, #848] -mul v21.4S, v21.4S,v22.s[0] -mla v5.4S, v15.4S, v31.s[0] -sub v15.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -sqrdmulh v4.4S, v0.4S, v12.s[1] -mul v0.4S, v0.4S,v22.s[1] -mla v21.4S, v24.4S, v31.s[0] -sub v24.4s, v30.4s, v5.4s -add v30.4s, v30.4s, v5.4s -sqrdmulh v5.4S, v15.4S, v12.s[3] -mul v15.4S, v15.4S,v22.s[3] -mla v0.4S, v4.4S, v31.s[0] -sub v4.4s, v3.4s, v21.4s -add v3.4s, v3.4s, v21.4s -sqrdmulh v21.4S, v6.4S, v12.s[2] -mul v6.4S, v6.4S,v22.s[2] -mla v15.4S, v5.4S, v31.s[0] -sub v5.4s, v1.4s, v0.4s -add v1.4s, v1.4s, v0.4s -mla v6.4S, v21.4S, v31.s[0] -sub v21.4s, v24.4s, v15.4s -add v24.4s, v24.4s, v15.4s -str q3, [x0, #896] -sub v3.4s, v30.4s, v6.4s -add v30.4s, v30.4s, v6.4s -str q4, [x0, #912] -str q1, [x0, #928] -str q5, [x0, #944] -str q24, [x0, #992] -str q21, [x0, #1008] -str q30, [x0, #960] -str q3, [x0, #976] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1444 -// Instruction count: 1440 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_3.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_3.s deleted file mode 100644 index eebf2a2..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_3.s +++ /dev/null @@ -1,1474 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 23825509 // Layer 4, block 0 -.word 27028662 // Layer 4, block 1 -.word 0 // Layer None, block None -.word 1307297022 // Layer 3, block 0 -.word 1524716204 // Layer 4, block 0 -.word 1729702351 // Layer 4, block 1 -.word 0 // Layer None, block None -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 14626653 // Layer 3, block 1 -.word 14833295 // Layer 4, block 2 -.word 2138810 // Layer 4, block 3 -.word 0 // Layer None, block None -.word 936034350 // Layer 3, block 1 -.word 949258429 // Layer 4, block 2 -.word 136873393 // Layer 4, block 3 -.word 0 // Layer None, block None -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 29737761 // Layer 3, block 2 -.word 6490403 // Layer 4, block 4 -.word 19648405 // Layer 4, block 5 -.word 0 // Layer None, block None -.word 1903071454 // Layer 3, block 2 -.word 415354091 // Layer 4, block 4 -.word 1257401950 // Layer 4, block 5 -.word 0 // Layer None, block None -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 30285189 // Layer 3, block 3 -.word 31254932 // Layer 4, block 6 -.word 26362414 // Layer 4, block 7 -.word 0 // Layer None, block None -.word 1938104173 // Layer 3, block 3 -.word 2000162988 // Layer 4, block 6 -.word 1687065733 // Layer 4, block 7 -.word 0 // Layer None, block None -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 21289485 // Layer 3, block 4 -.word 572895 // Layer 4, block 8 -.word 26691971 // Layer 4, block 9 -.word 0 // Layer None, block None -.word 1362423055 // Layer 3, block 4 -.word 36662482 // Layer 4, block 8 -.word 1708155771 // Layer 4, block 9 -.word 0 // Layer None, block None -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 9914896 // Layer 3, block 5 -.word 9249292 // Layer 4, block 10 -.word 29292862 // Layer 4, block 11 -.word 0 // Layer None, block None -.word 634504916 // Layer 3, block 5 -.word 591909511 // Layer 4, block 10 -.word 1874600091 // Layer 4, block 11 -.word 0 // Layer None, block None -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 22603682 // Layer 3, block 6 -.word 8247799 // Layer 4, block 12 -.word 5086187 // Layer 4, block 13 -.word 0 // Layer None, block None -.word 1446525244 // Layer 3, block 6 -.word 527818851 // Layer 4, block 12 -.word 325491125 // Layer 4, block 13 -.word 0 // Layer None, block None -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 16204162 // Layer 3, block 7 -.word 28113639 // Layer 4, block 14 -.word 8471290 // Layer 4, block 15 -.word 0 // Layer None, block None -.word 1036987221 // Layer 3, block 7 -.word 1799135579 // Layer 4, block 14 -.word 542121183 // Layer 4, block 15 -.word 0 // Layer None, block None -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.text -.global ntt_u32_incomplete_neon_asm_var_3_3_3 -.global _ntt_u32_incomplete_neon_asm_var_3_3_3 -ntt_u32_incomplete_neon_asm_var_3_3_3: -_ntt_u32_incomplete_neon_asm_var_3_3_3: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x0, #960] -ldr q29, [x0, #832] -ldr q28, [x0, #576] -ldr q27, [x0, #704] -ldr q26, [x0, #448] -ldr q25, [x17, #+0] -ldr q24, [x17, #+16] -ldr q23, [x17, #+32] -ldr q22, [x17, #+48] -ldr q21, [x0, #320] -ldr q20, [x0, #64] -ldr q19, [x0, #192] -sqrdmulh v18.4S, v30.4S, v24.s[0] -mul v30.4S, v30.4S,v25.s[0] -sqrdmulh v17.4S, v29.4S, v24.s[0] -mul v29.4S, v29.4S,v25.s[0] -mla v30.4S, v18.4S, v31.s[0] -sqrdmulh v18.4S, v28.4S, v24.s[0] -mul v28.4S, v28.4S,v25.s[0] -ldr q16, [x0, #976] -mla v29.4S, v17.4S, v31.s[0] -sub v17.4s, v26.4s, v30.4s -add v26.4s, v26.4s, v30.4s -sqrdmulh v30.4S, v27.4S, v24.s[0] -mul v27.4S, v27.4S,v25.s[0] -ldr q3, [x0, #848] -mla v28.4S, v18.4S, v31.s[0] -sub v18.4s, v21.4s, v29.4s -add v21.4s, v21.4s, v29.4s -sqrdmulh v29.4S, v26.4S, v24.s[1] -mul v26.4S, v26.4S,v25.s[1] -ldr q2, [x0, #592] -mla v27.4S, v30.4S, v31.s[0] -sub v30.4s, v20.4s, v28.4s -add v20.4s, v20.4s, v28.4s -sqrdmulh v28.4S, v21.4S, v24.s[1] -mul v21.4S, v21.4S,v25.s[1] -ldr q1, [x0, #720] -mla v26.4S, v29.4S, v31.s[0] -sub v29.4s, v19.4s, v27.4s -add v19.4s, v19.4s, v27.4s -sqrdmulh v27.4S, v17.4S, v24.s[2] -mul v17.4S, v17.4S,v25.s[2] -ldr q0, [x0, #464] -mla v21.4S, v28.4S, v31.s[0] -sub v28.4s, v19.4s, v26.4s -add v19.4s, v19.4s, v26.4s -sqrdmulh v26.4S, v18.4S, v24.s[2] -mul v18.4S, v18.4S,v25.s[2] -ldr q15, [x0, #336] -mla v17.4S, v27.4S, v31.s[0] -sub v27.4s, v20.4s, v21.4s -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -ldr q14, [x0, #80] -mla v18.4S, v26.4S, v31.s[0] -sub v26.4s, v29.4s, v17.4s -add v29.4s, v29.4s, v17.4s -sqrdmulh v17.4S, v28.4S, v22.s[1] -mul v28.4S, v28.4S,v23.s[1] -ldr q13, [x0, #208] -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v30.4s, v18.4s -add v30.4s, v30.4s, v18.4s -sqrdmulh v18.4S, v26.4S, v22.s[3] -mul v26.4S, v26.4S,v23.s[3] -mla v28.4S, v17.4S, v31.s[0] -sub v17.4s, v20.4s, v19.4s -add v20.4s, v20.4s, v19.4s -sqrdmulh v19.4S, v29.4S, v22.s[2] -mul v29.4S, v29.4S,v23.s[2] -mla v26.4S, v18.4S, v31.s[0] -sub v18.4s, v27.4s, v28.4s -add v27.4s, v27.4s, v28.4s -sqrdmulh v28.4S, v16.4S, v24.s[0] -mul v16.4S, v16.4S,v25.s[0] -mla v29.4S, v19.4S, v31.s[0] -sub v19.4s, v21.4s, v26.4s -add v21.4s, v21.4s, v26.4s -sqrdmulh v26.4S, v3.4S, v24.s[0] -mul v3.4S, v3.4S,v25.s[0] -mla v16.4S, v28.4S, v31.s[0] -sub v28.4s, v30.4s, v29.4s -add v30.4s, v30.4s, v29.4s -sqrdmulh v29.4S, v2.4S, v24.s[0] -mul v2.4S, v2.4S,v25.s[0] -ldr q12, [x0, #992] -mla v3.4S, v26.4S, v31.s[0] -sub v26.4s, v0.4s, v16.4s -add v0.4s, v0.4s, v16.4s -sqrdmulh v16.4S, v1.4S, v24.s[0] -mul v1.4S, v1.4S,v25.s[0] -ldr q11, [x0, #864] -mla v2.4S, v29.4S, v31.s[0] -sub v29.4s, v15.4s, v3.4s -add v15.4s, v15.4s, v3.4s -sqrdmulh v3.4S, v0.4S, v24.s[1] -str q20, [x0, #64] -mul v0.4S, v0.4S,v25.s[1] -ldr q20, [x0, #608] -mla v1.4S, v16.4S, v31.s[0] -sub v16.4s, v14.4s, v2.4s -add v14.4s, v14.4s, v2.4s -sqrdmulh v2.4S, v15.4S, v24.s[1] -str q17, [x0, #192] -mul v15.4S, v15.4S,v25.s[1] -ldr q17, [x0, #736] -mla v0.4S, v3.4S, v31.s[0] -sub v3.4s, v13.4s, v1.4s -add v13.4s, v13.4s, v1.4s -sqrdmulh v1.4S, v26.4S, v24.s[2] -str q27, [x0, #320] -mul v26.4S, v26.4S,v25.s[2] -ldr q27, [x0, #480] -mla v15.4S, v2.4S, v31.s[0] -sub v2.4s, v13.4s, v0.4s -add v13.4s, v13.4s, v0.4s -sqrdmulh v0.4S, v29.4S, v24.s[2] -str q18, [x0, #448] -mul v29.4S, v29.4S,v25.s[2] -ldr q18, [x0, #352] -mla v26.4S, v1.4S, v31.s[0] -sub v1.4s, v14.4s, v15.4s -add v14.4s, v14.4s, v15.4s -sqrdmulh v15.4S, v13.4S, v22.s[0] -str q21, [x0, #832] -mul v13.4S, v13.4S,v23.s[0] -ldr q21, [x0, #96] -mla v29.4S, v0.4S, v31.s[0] -sub v0.4s, v3.4s, v26.4s -add v3.4s, v3.4s, v26.4s -sqrdmulh v26.4S, v2.4S, v22.s[1] -str q19, [x0, #960] -mul v2.4S, v2.4S,v23.s[1] -ldr q19, [x0, #224] -mla v13.4S, v15.4S, v31.s[0] -sub v15.4s, v16.4s, v29.4s -add v16.4s, v16.4s, v29.4s -sqrdmulh v29.4S, v0.4S, v22.s[3] -str q30, [x0, #576] -mul v0.4S, v0.4S,v23.s[3] -mla v2.4S, v26.4S, v31.s[0] -sub v26.4s, v14.4s, v13.4s -add v14.4s, v14.4s, v13.4s -sqrdmulh v13.4S, v3.4S, v22.s[2] -str q28, [x0, #704] -mul v3.4S, v3.4S,v23.s[2] -mla v0.4S, v29.4S, v31.s[0] -sub v29.4s, v1.4s, v2.4s -add v1.4s, v1.4s, v2.4s -sqrdmulh v2.4S, v12.4S, v24.s[0] -mul v12.4S, v12.4S,v25.s[0] -mla v3.4S, v13.4S, v31.s[0] -sub v13.4s, v15.4s, v0.4s -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v11.4S, v24.s[0] -mul v11.4S, v11.4S,v25.s[0] -mla v12.4S, v2.4S, v31.s[0] -sub v2.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v20.4S, v24.s[0] -mul v20.4S, v20.4S,v25.s[0] -ldr q28, [x0, #1008] -mla v11.4S, v0.4S, v31.s[0] -sub v0.4s, v27.4s, v12.4s -add v27.4s, v27.4s, v12.4s -sqrdmulh v12.4S, v17.4S, v24.s[0] -mul v17.4S, v17.4S,v25.s[0] -ldr q30, [x0, #880] -mla v20.4S, v3.4S, v31.s[0] -sub v3.4s, v18.4s, v11.4s -add v18.4s, v18.4s, v11.4s -sqrdmulh v11.4S, v27.4S, v24.s[1] -str q14, [x0, #80] -mul v27.4S, v27.4S,v25.s[1] -ldr q14, [x0, #624] -mla v17.4S, v12.4S, v31.s[0] -sub v12.4s, v21.4s, v20.4s -add v21.4s, v21.4s, v20.4s -sqrdmulh v20.4S, v18.4S, v24.s[1] -str q26, [x0, #208] -mul v18.4S, v18.4S,v25.s[1] -ldr q26, [x0, #752] -mla v27.4S, v11.4S, v31.s[0] -sub v11.4s, v19.4s, v17.4s -add v19.4s, v19.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v24.s[2] -str q1, [x0, #336] -mul v0.4S, v0.4S,v25.s[2] -ldr q1, [x0, #496] -mla v18.4S, v20.4S, v31.s[0] -sub v20.4s, v19.4s, v27.4s -add v19.4s, v19.4s, v27.4s -sqrdmulh v27.4S, v3.4S, v24.s[2] -str q29, [x0, #464] -mul v3.4S, v3.4S,v25.s[2] -ldr q29, [x0, #368] -mla v0.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v18.4s -add v21.4s, v21.4s, v18.4s -sqrdmulh v18.4S, v19.4S, v22.s[0] -str q15, [x0, #848] -mul v19.4S, v19.4S,v23.s[0] -ldr q15, [x0, #112] -mla v3.4S, v27.4S, v31.s[0] -sub v27.4s, v11.4s, v0.4s -add v11.4s, v11.4s, v0.4s -sqrdmulh v0.4S, v20.4S, v22.s[1] -str q13, [x0, #976] -mul v20.4S, v20.4S,v23.s[1] -ldr q13, [x0, #240] -mla v19.4S, v18.4S, v31.s[0] -sub v18.4s, v12.4s, v3.4s -add v12.4s, v12.4s, v3.4s -sqrdmulh v3.4S, v27.4S, v22.s[3] -str q16, [x0, #592] -mul v27.4S, v27.4S,v23.s[3] -mla v20.4S, v0.4S, v31.s[0] -sub v0.4s, v21.4s, v19.4s -add v21.4s, v21.4s, v19.4s -sqrdmulh v19.4S, v11.4S, v22.s[2] -str q2, [x0, #720] -mul v11.4S, v11.4S,v23.s[2] -mla v27.4S, v3.4S, v31.s[0] -sub v3.4s, v17.4s, v20.4s -add v17.4s, v17.4s, v20.4s -sqrdmulh v20.4S, v28.4S, v24.s[0] -mul v28.4S, v28.4S,v25.s[0] -mla v11.4S, v19.4S, v31.s[0] -sub v19.4s, v18.4s, v27.4s -add v18.4s, v18.4s, v27.4s -sqrdmulh v27.4S, v30.4S, v24.s[0] -mul v30.4S, v30.4S,v25.s[0] -mla v28.4S, v20.4S, v31.s[0] -sub v20.4s, v12.4s, v11.4s -add v12.4s, v12.4s, v11.4s -sqrdmulh v11.4S, v14.4S, v24.s[0] -mul v14.4S, v14.4S,v25.s[0] -ldr q2, [x0, #896] -mla v30.4S, v27.4S, v31.s[0] -sub v27.4s, v1.4s, v28.4s -add v1.4s, v1.4s, v28.4s -sqrdmulh v28.4S, v26.4S, v24.s[0] -mul v26.4S, v26.4S,v25.s[0] -ldr q16, [x0, #768] -mla v14.4S, v11.4S, v31.s[0] -sub v11.4s, v29.4s, v30.4s -add v29.4s, v29.4s, v30.4s -sqrdmulh v30.4S, v1.4S, v24.s[1] -str q21, [x0, #96] -mul v1.4S, v1.4S,v25.s[1] -ldr q21, [x0, #512] -mla v26.4S, v28.4S, v31.s[0] -sub v28.4s, v15.4s, v14.4s -add v15.4s, v15.4s, v14.4s -sqrdmulh v14.4S, v29.4S, v24.s[1] -str q0, [x0, #224] -mul v29.4S, v29.4S,v25.s[1] -ldr q0, [x0, #640] -mla v1.4S, v30.4S, v31.s[0] -sub v30.4s, v13.4s, v26.4s -add v13.4s, v13.4s, v26.4s -sqrdmulh v26.4S, v27.4S, v24.s[2] -str q17, [x0, #352] -mul v27.4S, v27.4S,v25.s[2] -ldr q17, [x0, #384] -mla v29.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v1.4s -add v13.4s, v13.4s, v1.4s -sqrdmulh v1.4S, v11.4S, v24.s[2] -str q3, [x0, #480] -mul v11.4S, v11.4S,v25.s[2] -ldr q3, [x0, #256] -mla v27.4S, v26.4S, v31.s[0] -sub v26.4s, v15.4s, v29.4s -add v15.4s, v15.4s, v29.4s -sqrdmulh v29.4S, v13.4S, v22.s[0] -str q18, [x0, #864] -mul v13.4S, v13.4S,v23.s[0] -ldr q18, [x0, #0] -mla v11.4S, v1.4S, v31.s[0] -sub v1.4s, v30.4s, v27.4s -add v30.4s, v30.4s, v27.4s -sqrdmulh v27.4S, v14.4S, v22.s[1] -str q19, [x0, #992] -mul v14.4S, v14.4S,v23.s[1] -ldr q19, [x0, #128] -mla v13.4S, v29.4S, v31.s[0] -sub v29.4s, v28.4s, v11.4s -add v28.4s, v28.4s, v11.4s -sqrdmulh v11.4S, v1.4S, v22.s[3] -str q12, [x0, #608] -mul v1.4S, v1.4S,v23.s[3] -mla v14.4S, v27.4S, v31.s[0] -sub v27.4s, v15.4s, v13.4s -add v15.4s, v15.4s, v13.4s -sqrdmulh v13.4S, v30.4S, v22.s[2] -str q20, [x0, #736] -mul v30.4S, v30.4S,v23.s[2] -mla v1.4S, v11.4S, v31.s[0] -sub v11.4s, v26.4s, v14.4s -add v26.4s, v26.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v24.s[0] -mul v2.4S, v2.4S,v25.s[0] -mla v30.4S, v13.4S, v31.s[0] -sub v13.4s, v29.4s, v1.4s -add v29.4s, v29.4s, v1.4s -sqrdmulh v1.4S, v16.4S, v24.s[0] -mul v16.4S, v16.4S,v25.s[0] -mla v2.4S, v14.4S, v31.s[0] -sub v14.4s, v28.4s, v30.4s -add v28.4s, v28.4s, v30.4s -sqrdmulh v30.4S, v21.4S, v24.s[0] -mul v21.4S, v21.4S,v25.s[0] -ldr q20, [x0, #912] -mla v16.4S, v1.4S, v31.s[0] -sub v1.4s, v17.4s, v2.4s -add v17.4s, v17.4s, v2.4s -sqrdmulh v2.4S, v0.4S, v24.s[0] -mul v0.4S, v0.4S,v25.s[0] -ldr q12, [x0, #784] -mla v21.4S, v30.4S, v31.s[0] -sub v30.4s, v3.4s, v16.4s -add v3.4s, v3.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v24.s[1] -str q15, [x0, #112] -mul v17.4S, v17.4S,v25.s[1] -ldr q15, [x0, #528] -mla v0.4S, v2.4S, v31.s[0] -sub v2.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v3.4S, v24.s[1] -str q27, [x0, #240] -mul v3.4S, v3.4S,v25.s[1] -ldr q27, [x0, #656] -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v19.4s, v0.4s -add v19.4s, v19.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v24.s[2] -str q26, [x0, #368] -mul v1.4S, v1.4S,v25.s[2] -ldr q26, [x0, #400] -mla v3.4S, v21.4S, v31.s[0] -sub v21.4s, v19.4s, v17.4s -add v19.4s, v19.4s, v17.4s -sqrdmulh v17.4S, v30.4S, v24.s[2] -str q11, [x0, #496] -mul v30.4S, v30.4S,v25.s[2] -ldr q11, [x0, #272] -mla v1.4S, v0.4S, v31.s[0] -sub v0.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v19.4S, v22.s[0] -str q29, [x0, #880] -mul v19.4S, v19.4S,v23.s[0] -ldr q29, [x0, #16] -mla v30.4S, v17.4S, v31.s[0] -sub v17.4s, v16.4s, v1.4s -add v16.4s, v16.4s, v1.4s -sqrdmulh v1.4S, v21.4S, v22.s[1] -str q13, [x0, #1008] -mul v21.4S, v21.4S,v23.s[1] -ldr q13, [x0, #144] -mla v19.4S, v3.4S, v31.s[0] -sub v3.4s, v2.4s, v30.4s -add v2.4s, v2.4s, v30.4s -sqrdmulh v30.4S, v17.4S, v22.s[3] -str q28, [x0, #624] -mul v17.4S, v17.4S,v23.s[3] -mla v21.4S, v1.4S, v31.s[0] -sub v1.4s, v18.4s, v19.4s -add v18.4s, v18.4s, v19.4s -sqrdmulh v19.4S, v16.4S, v22.s[2] -str q14, [x0, #752] -mul v16.4S, v16.4S,v23.s[2] -mla v17.4S, v30.4S, v31.s[0] -sub v30.4s, v0.4s, v21.4s -add v0.4s, v0.4s, v21.4s -sqrdmulh v21.4S, v20.4S, v24.s[0] -mul v20.4S, v20.4S,v25.s[0] -mla v16.4S, v19.4S, v31.s[0] -sub v19.4s, v3.4s, v17.4s -add v3.4s, v3.4s, v17.4s -sqrdmulh v17.4S, v12.4S, v24.s[0] -mul v12.4S, v12.4S,v25.s[0] -mla v20.4S, v21.4S, v31.s[0] -sub v21.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v15.4S, v24.s[0] -mul v15.4S, v15.4S,v25.s[0] -ldr q14, [x0, #928] -mla v12.4S, v17.4S, v31.s[0] -sub v17.4s, v26.4s, v20.4s -add v26.4s, v26.4s, v20.4s -sqrdmulh v20.4S, v27.4S, v24.s[0] -mul v27.4S, v27.4S,v25.s[0] -ldr q28, [x0, #800] -mla v15.4S, v16.4S, v31.s[0] -sub v16.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -sqrdmulh v12.4S, v26.4S, v24.s[1] -str q18, [x0, #0] -mul v26.4S, v26.4S,v25.s[1] -ldr q18, [x0, #544] -mla v27.4S, v20.4S, v31.s[0] -sub v20.4s, v29.4s, v15.4s -add v29.4s, v29.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v24.s[1] -str q1, [x0, #128] -mul v11.4S, v11.4S,v25.s[1] -ldr q1, [x0, #672] -mla v26.4S, v12.4S, v31.s[0] -sub v12.4s, v13.4s, v27.4s -add v13.4s, v13.4s, v27.4s -sqrdmulh v27.4S, v17.4S, v24.s[2] -str q0, [x0, #256] -mul v17.4S, v17.4S,v25.s[2] -ldr q0, [x0, #416] -mla v11.4S, v15.4S, v31.s[0] -sub v15.4s, v13.4s, v26.4s -add v13.4s, v13.4s, v26.4s -sqrdmulh v26.4S, v16.4S, v24.s[2] -str q30, [x0, #384] -mul v16.4S, v16.4S,v25.s[2] -ldr q30, [x0, #288] -mla v17.4S, v27.4S, v31.s[0] -sub v27.4s, v29.4s, v11.4s -add v29.4s, v29.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v22.s[0] -str q3, [x0, #768] -mul v13.4S, v13.4S,v23.s[0] -ldr q3, [x0, #32] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v12.4s, v17.4s -add v12.4s, v12.4s, v17.4s -sqrdmulh v17.4S, v15.4S, v22.s[1] -str q19, [x0, #896] -mul v15.4S, v15.4S,v23.s[1] -ldr q19, [x0, #160] -mla v13.4S, v11.4S, v31.s[0] -sub v11.4s, v20.4s, v16.4s -add v20.4s, v20.4s, v16.4s -sqrdmulh v16.4S, v26.4S, v22.s[3] -str q2, [x0, #512] -mul v26.4S, v26.4S,v23.s[3] -mla v15.4S, v17.4S, v31.s[0] -sub v17.4s, v29.4s, v13.4s -add v29.4s, v29.4s, v13.4s -sqrdmulh v13.4S, v12.4S, v22.s[2] -str q21, [x0, #640] -mul v12.4S, v12.4S,v23.s[2] -mla v26.4S, v16.4S, v31.s[0] -sub v16.4s, v27.4s, v15.4s -add v27.4s, v27.4s, v15.4s -sqrdmulh v15.4S, v14.4S, v24.s[0] -mul v14.4S, v14.4S,v25.s[0] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v26.4s -add v11.4s, v11.4s, v26.4s -sqrdmulh v26.4S, v28.4S, v24.s[0] -mul v28.4S, v28.4S,v25.s[0] -mla v14.4S, v15.4S, v31.s[0] -sub v15.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -sqrdmulh v12.4S, v18.4S, v24.s[0] -mul v18.4S, v18.4S,v25.s[0] -ldr q21, [x0, #944] -mla v28.4S, v26.4S, v31.s[0] -sub v26.4s, v0.4s, v14.4s -add v0.4s, v0.4s, v14.4s -sqrdmulh v14.4S, v1.4S, v24.s[0] -mul v1.4S, v1.4S,v25.s[0] -ldr q2, [x0, #816] -mla v18.4S, v12.4S, v31.s[0] -sub v12.4s, v30.4s, v28.4s -add v30.4s, v30.4s, v28.4s -sqrdmulh v28.4S, v0.4S, v24.s[1] -str q29, [x0, #16] -mul v0.4S, v0.4S,v25.s[1] -ldr q29, [x0, #560] -mla v1.4S, v14.4S, v31.s[0] -sub v14.4s, v3.4s, v18.4s -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v30.4S, v24.s[1] -str q17, [x0, #144] -mul v30.4S, v30.4S,v25.s[1] -ldr q17, [x0, #688] -mla v0.4S, v28.4S, v31.s[0] -sub v28.4s, v19.4s, v1.4s -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v26.4S, v24.s[2] -str q27, [x0, #272] -mul v26.4S, v26.4S,v25.s[2] -ldr q27, [x0, #432] -mla v30.4S, v18.4S, v31.s[0] -sub v18.4s, v19.4s, v0.4s -add v19.4s, v19.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v24.s[2] -str q16, [x0, #400] -mul v12.4S, v12.4S,v25.s[2] -ldr q16, [x0, #304] -mla v26.4S, v1.4S, v31.s[0] -sub v1.4s, v3.4s, v30.4s -add v3.4s, v3.4s, v30.4s -sqrdmulh v30.4S, v19.4S, v22.s[0] -str q11, [x0, #784] -mul v19.4S, v19.4S,v23.s[0] -ldr q11, [x0, #48] -mla v12.4S, v0.4S, v31.s[0] -sub v0.4s, v28.4s, v26.4s -add v28.4s, v28.4s, v26.4s -sqrdmulh v26.4S, v18.4S, v22.s[1] -str q13, [x0, #912] -mul v18.4S, v18.4S,v23.s[1] -ldr q13, [x0, #176] -mla v19.4S, v30.4S, v31.s[0] -sub v30.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -sqrdmulh v12.4S, v0.4S, v22.s[3] -str q20, [x0, #528] -mul v0.4S, v0.4S,v23.s[3] -mla v18.4S, v26.4S, v31.s[0] -sub v26.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v28.4S, v22.s[2] -str q15, [x0, #656] -mul v28.4S, v28.4S,v23.s[2] -mla v0.4S, v12.4S, v31.s[0] -sub v12.4s, v1.4s, v18.4s -add v1.4s, v1.4s, v18.4s -sqrdmulh v18.4S, v21.4S, v24.s[0] -mul v21.4S, v21.4S,v25.s[0] -mla v28.4S, v19.4S, v31.s[0] -sub v19.4s, v30.4s, v0.4s -add v30.4s, v30.4s, v0.4s -sqrdmulh v0.4S, v2.4S, v24.s[0] -mul v2.4S, v2.4S,v25.s[0] -mla v21.4S, v18.4S, v31.s[0] -sub v18.4s, v14.4s, v28.4s -add v14.4s, v14.4s, v28.4s -sqrdmulh v28.4S, v29.4S, v24.s[0] -mul v29.4S, v29.4S,v25.s[0] -mla v2.4S, v0.4S, v31.s[0] -sub v0.4s, v27.4s, v21.4s -add v27.4s, v27.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v24.s[0] -mul v17.4S, v17.4S,v25.s[0] -mla v29.4S, v28.4S, v31.s[0] -sub v28.4s, v16.4s, v2.4s -add v16.4s, v16.4s, v2.4s -sqrdmulh v2.4S, v27.4S, v24.s[1] -str q3, [x0, #32] -mul v27.4S, v27.4S,v25.s[1] -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v29.4s -add v11.4s, v11.4s, v29.4s -sqrdmulh v29.4S, v16.4S, v24.s[1] -str q26, [x0, #160] -mul v16.4S, v16.4S,v25.s[1] -mla v27.4S, v2.4S, v31.s[0] -sub v2.4s, v13.4s, v17.4s -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v24.s[2] -str q1, [x0, #288] -mul v0.4S, v0.4S,v25.s[2] -mla v16.4S, v29.4S, v31.s[0] -sub v29.4s, v13.4s, v27.4s -add v13.4s, v13.4s, v27.4s -sqrdmulh v27.4S, v28.4S, v24.s[2] -str q12, [x0, #416] -mul v28.4S, v28.4S,v25.s[2] -mla v0.4S, v17.4S, v31.s[0] -sub v17.4s, v11.4s, v16.4s -add v11.4s, v11.4s, v16.4s -sqrdmulh v16.4S, v13.4S, v22.s[0] -str q30, [x0, #800] -mul v13.4S, v13.4S,v23.s[0] -mla v28.4S, v27.4S, v31.s[0] -sub v27.4s, v2.4s, v0.4s -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v29.4S, v22.s[1] -str q19, [x0, #928] -mul v29.4S, v29.4S,v23.s[1] -mla v13.4S, v16.4S, v31.s[0] -sub v16.4s, v21.4s, v28.4s -add v21.4s, v21.4s, v28.4s -sqrdmulh v28.4S, v27.4S, v22.s[3] -str q14, [x0, #544] -mul v27.4S, v27.4S,v23.s[3] -mla v29.4S, v0.4S, v31.s[0] -sub v0.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v2.4S, v22.s[2] -str q18, [x0, #672] -mul v2.4S, v2.4S,v23.s[2] -mla v27.4S, v28.4S, v31.s[0] -sub v28.4s, v17.4s, v29.4s -add v17.4s, v17.4s, v29.4s -mla v2.4S, v13.4S, v31.s[0] -sub v13.4s, v16.4s, v27.4s -add v16.4s, v16.4s, v27.4s -sub v27.4s, v21.4s, v2.4s -add v21.4s, v21.4s, v2.4s -str q11, [x0, #48] -str q0, [x0, #176] -str q17, [x0, #304] -str q28, [x0, #432] -str q16, [x0, #816] -str q13, [x0, #944] -str q21, [x0, #560] -str q27, [x0, #688] -ldr q4, [x0, #112] -ldr q5, [x0, #96] -ldr q6, [x0, #64] -ldr q7, [x0, #80] -ldr q8, [x0, #48] -ldr q9, [x17, #+64] -ldr q10, [x17, #+80] -ldr q20, [x17, #+96] -ldr q15, [x17, #+112] -ldr q3, [x0, #32] -ldr q26, [x0, #0] -ldr q1, [x0, #16] -sqrdmulh v12.4S, v4.4S, v10.s[0] -mul v4.4S, v4.4S,v9.s[0] -sqrdmulh v30.4S, v5.4S, v10.s[0] -mul v5.4S, v5.4S,v9.s[0] -mla v4.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v6.4S, v10.s[0] -mul v6.4S, v6.4S,v9.s[0] -ldr q19, [x0, #240] -mla v5.4S, v30.4S, v31.s[0] -sub v30.4s, v8.4s, v4.4s -add v8.4s, v8.4s, v4.4s -sqrdmulh v4.4S, v7.4S, v10.s[0] -mul v7.4S, v7.4S,v9.s[0] -ldr q14, [x0, #224] -mla v6.4S, v12.4S, v31.s[0] -sub v12.4s, v3.4s, v5.4s -add v3.4s, v3.4s, v5.4s -sqrdmulh v5.4S, v8.4S, v10.s[1] -mul v8.4S, v8.4S,v9.s[1] -ldr q18, [x0, #192] -mla v7.4S, v4.4S, v31.s[0] -sub v4.4s, v26.4s, v6.4s -add v26.4s, v26.4s, v6.4s -sqrdmulh v6.4S, v3.4S, v10.s[1] -mul v3.4S, v3.4S,v9.s[1] -ldr q29, [x0, #208] -mla v8.4S, v5.4S, v31.s[0] -sub v5.4s, v1.4s, v7.4s -add v1.4s, v1.4s, v7.4s -sqrdmulh v7.4S, v30.4S, v10.s[2] -mul v30.4S, v30.4S,v9.s[2] -ldr q2, [x0, #176] -mla v3.4S, v6.4S, v31.s[0] -sub v6.4s, v1.4s, v8.4s -add v1.4s, v1.4s, v8.4s -ldr q8, [x17, #+128] -ldr q25, [x17, #+144] -ldr q24, [x17, #+160] -ldr q23, [x17, #+176] -sqrdmulh v22.4S, v12.4S, v10.s[2] -mul v12.4S, v12.4S,v9.s[2] -ldr q11, [x0, #160] -mla v30.4S, v7.4S, v31.s[0] -sub v7.4s, v26.4s, v3.4s -add v26.4s, v26.4s, v3.4s -sqrdmulh v3.4S, v1.4S, v15.s[0] -mul v1.4S, v1.4S,v20.s[0] -ldr q0, [x0, #128] -mla v12.4S, v22.4S, v31.s[0] -sub v22.4s, v5.4s, v30.4s -add v5.4s, v5.4s, v30.4s -sqrdmulh v30.4S, v6.4S, v15.s[1] -mul v6.4S, v6.4S,v20.s[1] -ldr q17, [x0, #144] -mla v1.4S, v3.4S, v31.s[0] -sub v3.4s, v4.4s, v12.4s -add v4.4s, v4.4s, v12.4s -sqrdmulh v12.4S, v22.4S, v15.s[3] -mul v22.4S, v22.4S,v20.s[3] -mla v6.4S, v30.4S, v31.s[0] -sub v30.4s, v26.4s, v1.4s -add v26.4s, v26.4s, v1.4s -sqrdmulh v1.4S, v5.4S, v15.s[2] -mul v5.4S, v5.4S,v20.s[2] -mla v22.4S, v12.4S, v31.s[0] -sub v12.4s, v7.4s, v6.4s -add v7.4s, v7.4s, v6.4s -sqrdmulh v6.4S, v19.4S, v25.s[0] -mul v19.4S, v19.4S,v8.s[0] -mla v5.4S, v1.4S, v31.s[0] -sub v1.4s, v3.4s, v22.4s -add v3.4s, v3.4s, v22.4s -sqrdmulh v22.4S, v14.4S, v25.s[0] -mul v14.4S, v14.4S,v8.s[0] -mla v19.4S, v6.4S, v31.s[0] -sub v6.4s, v4.4s, v5.4s -add v4.4s, v4.4s, v5.4s -sqrdmulh v15.4S, v18.4S, v25.s[0] -mul v18.4S, v18.4S,v8.s[0] -ldr q20, [x0, #368] -mla v14.4S, v22.4S, v31.s[0] -sub v22.4s, v2.4s, v19.4s -add v2.4s, v2.4s, v19.4s -sqrdmulh v19.4S, v29.4S, v25.s[0] -mul v29.4S, v29.4S,v8.s[0] -ldr q10, [x0, #352] -mla v18.4S, v15.4S, v31.s[0] -sub v15.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v25.s[1] -str q26, [x0, #0] -mul v2.4S, v2.4S,v8.s[1] -ldr q26, [x0, #320] -mla v29.4S, v19.4S, v31.s[0] -sub v19.4s, v0.4s, v18.4s -add v0.4s, v0.4s, v18.4s -sqrdmulh v18.4S, v11.4S, v25.s[1] -str q30, [x0, #16] -mul v11.4S, v11.4S,v8.s[1] -ldr q30, [x0, #336] -mla v2.4S, v14.4S, v31.s[0] -sub v14.4s, v17.4s, v29.4s -add v17.4s, v17.4s, v29.4s -sqrdmulh v29.4S, v22.4S, v25.s[2] -str q7, [x0, #32] -mul v22.4S, v22.4S,v8.s[2] -ldr q7, [x0, #304] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v17.4s, v2.4s -add v17.4s, v17.4s, v2.4s -ldr q2, [x17, #+192] -ldr q9, [x17, #+208] -ldr q5, [x17, #+224] -ldr q28, [x17, #+240] -sqrdmulh v16.4S, v15.4S, v25.s[2] -str q12, [x0, #48] -mul v15.4S, v15.4S,v8.s[2] -ldr q12, [x0, #288] -mla v22.4S, v29.4S, v31.s[0] -sub v29.4s, v0.4s, v11.4s -add v0.4s, v0.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v23.s[0] -str q3, [x0, #96] -mul v17.4S, v17.4S,v24.s[0] -ldr q3, [x0, #256] -mla v15.4S, v16.4S, v31.s[0] -sub v16.4s, v14.4s, v22.4s -add v14.4s, v14.4s, v22.4s -sqrdmulh v22.4S, v18.4S, v23.s[1] -str q1, [x0, #112] -mul v18.4S, v18.4S,v24.s[1] -ldr q1, [x0, #272] -mla v17.4S, v11.4S, v31.s[0] -sub v11.4s, v19.4s, v15.4s -add v19.4s, v19.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v23.s[3] -str q4, [x0, #64] -mul v16.4S, v16.4S,v24.s[3] -mla v18.4S, v22.4S, v31.s[0] -sub v22.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v14.4S, v23.s[2] -str q6, [x0, #80] -mul v14.4S, v14.4S,v24.s[2] -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v29.4s, v18.4s -add v29.4s, v29.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v9.s[0] -mul v20.4S, v20.4S,v2.s[0] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v11.4s, v16.4s -add v11.4s, v11.4s, v16.4s -sqrdmulh v16.4S, v10.4S, v9.s[0] -mul v10.4S, v10.4S,v2.s[0] -mla v20.4S, v18.4S, v31.s[0] -sub v18.4s, v19.4s, v14.4s -add v19.4s, v19.4s, v14.4s -sqrdmulh v23.4S, v26.4S, v9.s[0] -mul v26.4S, v26.4S,v2.s[0] -ldr q24, [x0, #496] -mla v10.4S, v16.4S, v31.s[0] -sub v16.4s, v7.4s, v20.4s -add v7.4s, v7.4s, v20.4s -sqrdmulh v20.4S, v30.4S, v9.s[0] -mul v30.4S, v30.4S,v2.s[0] -ldr q25, [x0, #480] -mla v26.4S, v23.4S, v31.s[0] -sub v23.4s, v12.4s, v10.4s -add v12.4s, v12.4s, v10.4s -sqrdmulh v10.4S, v7.4S, v9.s[1] -str q0, [x0, #128] -mul v7.4S, v7.4S,v2.s[1] -ldr q0, [x0, #448] -mla v30.4S, v20.4S, v31.s[0] -sub v20.4s, v3.4s, v26.4s -add v3.4s, v3.4s, v26.4s -sqrdmulh v26.4S, v12.4S, v9.s[1] -str q22, [x0, #144] -mul v12.4S, v12.4S,v2.s[1] -ldr q22, [x0, #464] -mla v7.4S, v10.4S, v31.s[0] -sub v10.4s, v1.4s, v30.4s -add v1.4s, v1.4s, v30.4s -sqrdmulh v30.4S, v16.4S, v9.s[2] -str q29, [x0, #160] -mul v16.4S, v16.4S,v2.s[2] -ldr q29, [x0, #432] -mla v12.4S, v26.4S, v31.s[0] -sub v26.4s, v1.4s, v7.4s -add v1.4s, v1.4s, v7.4s -ldr q7, [x17, #+256] -ldr q8, [x17, #+272] -ldr q14, [x17, #+288] -ldr q6, [x17, #+304] -sqrdmulh v4.4S, v23.4S, v9.s[2] -str q15, [x0, #176] -mul v23.4S, v23.4S,v2.s[2] -ldr q15, [x0, #416] -mla v16.4S, v30.4S, v31.s[0] -sub v30.4s, v3.4s, v12.4s -add v3.4s, v3.4s, v12.4s -sqrdmulh v12.4S, v1.4S, v28.s[0] -str q11, [x0, #224] -mul v1.4S, v1.4S,v5.s[0] -ldr q11, [x0, #384] -mla v23.4S, v4.4S, v31.s[0] -sub v4.4s, v10.4s, v16.4s -add v10.4s, v10.4s, v16.4s -sqrdmulh v16.4S, v26.4S, v28.s[1] -str q17, [x0, #240] -mul v26.4S, v26.4S,v5.s[1] -ldr q17, [x0, #400] -mla v1.4S, v12.4S, v31.s[0] -sub v12.4s, v20.4s, v23.4s -add v20.4s, v20.4s, v23.4s -sqrdmulh v23.4S, v4.4S, v28.s[3] -str q19, [x0, #192] -mul v4.4S, v4.4S,v5.s[3] -mla v26.4S, v16.4S, v31.s[0] -sub v16.4s, v3.4s, v1.4s -add v3.4s, v3.4s, v1.4s -sqrdmulh v1.4S, v10.4S, v28.s[2] -str q18, [x0, #208] -mul v10.4S, v10.4S,v5.s[2] -mla v4.4S, v23.4S, v31.s[0] -sub v23.4s, v30.4s, v26.4s -add v30.4s, v30.4s, v26.4s -sqrdmulh v26.4S, v24.4S, v8.s[0] -mul v24.4S, v24.4S,v7.s[0] -mla v10.4S, v1.4S, v31.s[0] -sub v1.4s, v12.4s, v4.4s -add v12.4s, v12.4s, v4.4s -sqrdmulh v4.4S, v25.4S, v8.s[0] -mul v25.4S, v25.4S,v7.s[0] -mla v24.4S, v26.4S, v31.s[0] -sub v26.4s, v20.4s, v10.4s -add v20.4s, v20.4s, v10.4s -sqrdmulh v28.4S, v0.4S, v8.s[0] -mul v0.4S, v0.4S,v7.s[0] -ldr q5, [x0, #624] -mla v25.4S, v4.4S, v31.s[0] -sub v4.4s, v29.4s, v24.4s -add v29.4s, v29.4s, v24.4s -sqrdmulh v24.4S, v22.4S, v8.s[0] -mul v22.4S, v22.4S,v7.s[0] -ldr q9, [x0, #608] -mla v0.4S, v28.4S, v31.s[0] -sub v28.4s, v15.4s, v25.4s -add v15.4s, v15.4s, v25.4s -sqrdmulh v25.4S, v29.4S, v8.s[1] -str q3, [x0, #256] -mul v29.4S, v29.4S,v7.s[1] -ldr q3, [x0, #576] -mla v22.4S, v24.4S, v31.s[0] -sub v24.4s, v11.4s, v0.4s -add v11.4s, v11.4s, v0.4s -sqrdmulh v0.4S, v15.4S, v8.s[1] -str q16, [x0, #272] -mul v15.4S, v15.4S,v7.s[1] -ldr q16, [x0, #592] -mla v29.4S, v25.4S, v31.s[0] -sub v25.4s, v17.4s, v22.4s -add v17.4s, v17.4s, v22.4s -sqrdmulh v22.4S, v4.4S, v8.s[2] -str q30, [x0, #288] -mul v4.4S, v4.4S,v7.s[2] -ldr q30, [x0, #560] -mla v15.4S, v0.4S, v31.s[0] -sub v0.4s, v17.4s, v29.4s -add v17.4s, v17.4s, v29.4s -ldr q29, [x17, #+320] -ldr q2, [x17, #+336] -ldr q10, [x17, #+352] -ldr q18, [x17, #+368] -sqrdmulh v19.4S, v28.4S, v8.s[2] -str q23, [x0, #304] -mul v28.4S, v28.4S,v7.s[2] -ldr q23, [x0, #544] -mla v4.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v17.4S, v6.s[0] -str q12, [x0, #352] -mul v17.4S, v17.4S,v14.s[0] -ldr q12, [x0, #512] -mla v28.4S, v19.4S, v31.s[0] -sub v19.4s, v25.4s, v4.4s -add v25.4s, v25.4s, v4.4s -sqrdmulh v4.4S, v0.4S, v6.s[1] -str q1, [x0, #368] -mul v0.4S, v0.4S,v14.s[1] -ldr q1, [x0, #528] -mla v17.4S, v15.4S, v31.s[0] -sub v15.4s, v24.4s, v28.4s -add v24.4s, v24.4s, v28.4s -sqrdmulh v28.4S, v19.4S, v6.s[3] -str q20, [x0, #320] -mul v19.4S, v19.4S,v14.s[3] -mla v0.4S, v4.4S, v31.s[0] -sub v4.4s, v11.4s, v17.4s -add v11.4s, v11.4s, v17.4s -sqrdmulh v17.4S, v25.4S, v6.s[2] -str q26, [x0, #336] -mul v25.4S, v25.4S,v14.s[2] -mla v19.4S, v28.4S, v31.s[0] -sub v28.4s, v22.4s, v0.4s -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v5.4S, v2.s[0] -mul v5.4S, v5.4S,v29.s[0] -mla v25.4S, v17.4S, v31.s[0] -sub v17.4s, v15.4s, v19.4s -add v15.4s, v15.4s, v19.4s -sqrdmulh v19.4S, v9.4S, v2.s[0] -mul v9.4S, v9.4S,v29.s[0] -mla v5.4S, v0.4S, v31.s[0] -sub v0.4s, v24.4s, v25.4s -add v24.4s, v24.4s, v25.4s -sqrdmulh v6.4S, v3.4S, v2.s[0] -mul v3.4S, v3.4S,v29.s[0] -ldr q14, [x0, #752] -mla v9.4S, v19.4S, v31.s[0] -sub v19.4s, v30.4s, v5.4s -add v30.4s, v30.4s, v5.4s -sqrdmulh v5.4S, v16.4S, v2.s[0] -mul v16.4S, v16.4S,v29.s[0] -ldr q8, [x0, #736] -mla v3.4S, v6.4S, v31.s[0] -sub v6.4s, v23.4s, v9.4s -add v23.4s, v23.4s, v9.4s -sqrdmulh v9.4S, v30.4S, v2.s[1] -str q11, [x0, #384] -mul v30.4S, v30.4S,v29.s[1] -ldr q11, [x0, #704] -mla v16.4S, v5.4S, v31.s[0] -sub v5.4s, v12.4s, v3.4s -add v12.4s, v12.4s, v3.4s -sqrdmulh v3.4S, v23.4S, v2.s[1] -str q4, [x0, #400] -mul v23.4S, v23.4S,v29.s[1] -ldr q4, [x0, #720] -mla v30.4S, v9.4S, v31.s[0] -sub v9.4s, v1.4s, v16.4s -add v1.4s, v1.4s, v16.4s -sqrdmulh v16.4S, v19.4S, v2.s[2] -str q22, [x0, #416] -mul v19.4S, v19.4S,v29.s[2] -ldr q22, [x0, #688] -mla v23.4S, v3.4S, v31.s[0] -sub v3.4s, v1.4s, v30.4s -add v1.4s, v1.4s, v30.4s -ldr q30, [x17, #+384] -ldr q7, [x17, #+400] -ldr q25, [x17, #+416] -ldr q26, [x17, #+432] -sqrdmulh v20.4S, v6.4S, v2.s[2] -str q28, [x0, #432] -mul v6.4S, v6.4S,v29.s[2] -ldr q28, [x0, #672] -mla v19.4S, v16.4S, v31.s[0] -sub v16.4s, v12.4s, v23.4s -add v12.4s, v12.4s, v23.4s -sqrdmulh v23.4S, v1.4S, v18.s[0] -str q15, [x0, #480] -mul v1.4S, v1.4S,v10.s[0] -ldr q15, [x0, #640] -mla v6.4S, v20.4S, v31.s[0] -sub v20.4s, v9.4s, v19.4s -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v3.4S, v18.s[1] -str q17, [x0, #496] -mul v3.4S, v3.4S,v10.s[1] -ldr q17, [x0, #656] -mla v1.4S, v23.4S, v31.s[0] -sub v23.4s, v5.4s, v6.4s -add v5.4s, v5.4s, v6.4s -sqrdmulh v6.4S, v20.4S, v18.s[3] -str q24, [x0, #448] -mul v20.4S, v20.4S,v10.s[3] -mla v3.4S, v19.4S, v31.s[0] -sub v19.4s, v12.4s, v1.4s -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v9.4S, v18.s[2] -str q0, [x0, #464] -mul v9.4S, v9.4S,v10.s[2] -mla v20.4S, v6.4S, v31.s[0] -sub v6.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v7.s[0] -mul v14.4S, v14.4S,v30.s[0] -mla v9.4S, v1.4S, v31.s[0] -sub v1.4s, v23.4s, v20.4s -add v23.4s, v23.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v7.s[0] -mul v8.4S, v8.4S,v30.s[0] -mla v14.4S, v3.4S, v31.s[0] -sub v3.4s, v5.4s, v9.4s -add v5.4s, v5.4s, v9.4s -sqrdmulh v18.4S, v11.4S, v7.s[0] -mul v11.4S, v11.4S,v30.s[0] -ldr q10, [x0, #880] -mla v8.4S, v20.4S, v31.s[0] -sub v20.4s, v22.4s, v14.4s -add v22.4s, v22.4s, v14.4s -sqrdmulh v14.4S, v4.4S, v7.s[0] -mul v4.4S, v4.4S,v30.s[0] -ldr q2, [x0, #864] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v28.4s, v8.4s -add v28.4s, v28.4s, v8.4s -sqrdmulh v8.4S, v22.4S, v7.s[1] -str q12, [x0, #512] -mul v22.4S, v22.4S,v30.s[1] -ldr q12, [x0, #832] -mla v4.4S, v14.4S, v31.s[0] -sub v14.4s, v15.4s, v11.4s -add v15.4s, v15.4s, v11.4s -sqrdmulh v11.4S, v28.4S, v7.s[1] -str q19, [x0, #528] -mul v28.4S, v28.4S,v30.s[1] -ldr q19, [x0, #848] -mla v22.4S, v8.4S, v31.s[0] -sub v8.4s, v17.4s, v4.4s -add v17.4s, v17.4s, v4.4s -sqrdmulh v4.4S, v20.4S, v7.s[2] -str q16, [x0, #544] -mul v20.4S, v20.4S,v30.s[2] -ldr q16, [x0, #816] -mla v28.4S, v11.4S, v31.s[0] -sub v11.4s, v17.4s, v22.4s -add v17.4s, v17.4s, v22.4s -ldr q22, [x17, #+448] -ldr q29, [x17, #+464] -ldr q9, [x17, #+480] -ldr q0, [x17, #+496] -sqrdmulh v24.4S, v18.4S, v7.s[2] -str q6, [x0, #560] -mul v18.4S, v18.4S,v30.s[2] -ldr q6, [x0, #800] -mla v20.4S, v4.4S, v31.s[0] -sub v4.4s, v15.4s, v28.4s -add v15.4s, v15.4s, v28.4s -sqrdmulh v28.4S, v17.4S, v26.s[0] -str q23, [x0, #608] -mul v17.4S, v17.4S,v25.s[0] -ldr q23, [x0, #768] -mla v18.4S, v24.4S, v31.s[0] -sub v24.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -sqrdmulh v20.4S, v11.4S, v26.s[1] -str q1, [x0, #624] -mul v11.4S, v11.4S,v25.s[1] -ldr q1, [x0, #784] -mla v17.4S, v28.4S, v31.s[0] -sub v28.4s, v14.4s, v18.4s -add v14.4s, v14.4s, v18.4s -sqrdmulh v18.4S, v24.4S, v26.s[3] -str q5, [x0, #576] -mul v24.4S, v24.4S,v25.s[3] -mla v11.4S, v20.4S, v31.s[0] -sub v20.4s, v15.4s, v17.4s -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v8.4S, v26.s[2] -str q3, [x0, #592] -mul v8.4S, v8.4S,v25.s[2] -mla v24.4S, v18.4S, v31.s[0] -sub v18.4s, v4.4s, v11.4s -add v4.4s, v4.4s, v11.4s -sqrdmulh v11.4S, v10.4S, v29.s[0] -mul v10.4S, v10.4S,v22.s[0] -mla v8.4S, v17.4S, v31.s[0] -sub v17.4s, v28.4s, v24.4s -add v28.4s, v28.4s, v24.4s -sqrdmulh v24.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v22.s[0] -mla v10.4S, v11.4S, v31.s[0] -sub v11.4s, v14.4s, v8.4s -add v14.4s, v14.4s, v8.4s -sqrdmulh v26.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v22.s[0] -ldr q25, [x0, #1008] -mla v2.4S, v24.4S, v31.s[0] -sub v24.4s, v16.4s, v10.4s -add v16.4s, v16.4s, v10.4s -sqrdmulh v10.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v22.s[0] -ldr q7, [x0, #992] -mla v12.4S, v26.4S, v31.s[0] -sub v26.4s, v6.4s, v2.4s -add v6.4s, v6.4s, v2.4s -sqrdmulh v2.4S, v16.4S, v29.s[1] -str q15, [x0, #640] -mul v16.4S, v16.4S,v22.s[1] -ldr q15, [x0, #960] -mla v19.4S, v10.4S, v31.s[0] -sub v10.4s, v23.4s, v12.4s -add v23.4s, v23.4s, v12.4s -sqrdmulh v12.4S, v6.4S, v29.s[1] -str q20, [x0, #656] -mul v6.4S, v6.4S,v22.s[1] -ldr q20, [x0, #976] -mla v16.4S, v2.4S, v31.s[0] -sub v2.4s, v1.4s, v19.4s -add v1.4s, v1.4s, v19.4s -sqrdmulh v19.4S, v24.4S, v29.s[2] -str q4, [x0, #672] -mul v24.4S, v24.4S,v22.s[2] -ldr q4, [x0, #944] -mla v6.4S, v12.4S, v31.s[0] -sub v12.4s, v1.4s, v16.4s -add v1.4s, v1.4s, v16.4s -ldr q16, [x17, #+512] -ldr q30, [x17, #+528] -ldr q8, [x17, #+544] -ldr q3, [x17, #+560] -sqrdmulh v5.4S, v26.4S, v29.s[2] -str q18, [x0, #688] -mul v26.4S, v26.4S,v22.s[2] -ldr q18, [x0, #928] -mla v24.4S, v19.4S, v31.s[0] -sub v19.4s, v23.4s, v6.4s -add v23.4s, v23.4s, v6.4s -sqrdmulh v6.4S, v1.4S, v0.s[0] -str q28, [x0, #736] -mul v1.4S, v1.4S,v9.s[0] -ldr q28, [x0, #896] -mla v26.4S, v5.4S, v31.s[0] -sub v5.4s, v2.4s, v24.4s -add v2.4s, v2.4s, v24.4s -sqrdmulh v24.4S, v12.4S, v0.s[1] -str q17, [x0, #752] -mul v12.4S, v12.4S,v9.s[1] -ldr q17, [x0, #912] -mla v1.4S, v6.4S, v31.s[0] -sub v6.4s, v10.4s, v26.4s -add v10.4s, v10.4s, v26.4s -sqrdmulh v26.4S, v5.4S, v0.s[3] -str q14, [x0, #704] -mul v5.4S, v5.4S,v9.s[3] -mla v12.4S, v24.4S, v31.s[0] -sub v24.4s, v23.4s, v1.4s -add v23.4s, v23.4s, v1.4s -sqrdmulh v1.4S, v2.4S, v0.s[2] -str q11, [x0, #720] -mul v2.4S, v2.4S,v9.s[2] -mla v5.4S, v26.4S, v31.s[0] -sub v26.4s, v19.4s, v12.4s -add v19.4s, v19.4s, v12.4s -sqrdmulh v12.4S, v25.4S, v30.s[0] -mul v25.4S, v25.4S,v16.s[0] -mla v2.4S, v1.4S, v31.s[0] -sub v1.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v7.4S, v30.s[0] -mul v7.4S, v7.4S,v16.s[0] -mla v25.4S, v12.4S, v31.s[0] -sub v12.4s, v10.4s, v2.4s -add v10.4s, v10.4s, v2.4s -sqrdmulh v0.4S, v15.4S, v30.s[0] -mul v15.4S, v15.4S,v16.s[0] -mla v7.4S, v5.4S, v31.s[0] -sub v5.4s, v4.4s, v25.4s -add v4.4s, v4.4s, v25.4s -sqrdmulh v25.4S, v20.4S, v30.s[0] -mul v20.4S, v20.4S,v16.s[0] -mla v15.4S, v0.4S, v31.s[0] -sub v0.4s, v18.4s, v7.4s -add v18.4s, v18.4s, v7.4s -sqrdmulh v7.4S, v4.4S, v30.s[1] -str q23, [x0, #768] -mul v4.4S, v4.4S,v16.s[1] -mla v20.4S, v25.4S, v31.s[0] -sub v25.4s, v28.4s, v15.4s -add v28.4s, v28.4s, v15.4s -sqrdmulh v15.4S, v18.4S, v30.s[1] -str q24, [x0, #784] -mul v18.4S, v18.4S,v16.s[1] -mla v4.4S, v7.4S, v31.s[0] -sub v7.4s, v17.4s, v20.4s -add v17.4s, v17.4s, v20.4s -sqrdmulh v20.4S, v5.4S, v30.s[2] -str q19, [x0, #800] -mul v5.4S, v5.4S,v16.s[2] -mla v18.4S, v15.4S, v31.s[0] -sub v15.4s, v17.4s, v4.4s -add v17.4s, v17.4s, v4.4s -sqrdmulh v4.4S, v0.4S, v30.s[2] -str q26, [x0, #816] -mul v0.4S, v0.4S,v16.s[2] -mla v5.4S, v20.4S, v31.s[0] -sub v20.4s, v28.4s, v18.4s -add v28.4s, v28.4s, v18.4s -sqrdmulh v18.4S, v17.4S, v3.s[0] -str q6, [x0, #864] -mul v17.4S, v17.4S,v8.s[0] -mla v0.4S, v4.4S, v31.s[0] -sub v4.4s, v7.4s, v5.4s -add v7.4s, v7.4s, v5.4s -sqrdmulh v5.4S, v15.4S, v3.s[1] -str q1, [x0, #880] -mul v15.4S, v15.4S,v8.s[1] -mla v17.4S, v18.4S, v31.s[0] -sub v18.4s, v25.4s, v0.4s -add v25.4s, v25.4s, v0.4s -sqrdmulh v0.4S, v4.4S, v3.s[3] -str q10, [x0, #832] -mul v4.4S, v4.4S,v8.s[3] -mla v15.4S, v5.4S, v31.s[0] -sub v5.4s, v28.4s, v17.4s -add v28.4s, v28.4s, v17.4s -sqrdmulh v17.4S, v7.4S, v3.s[2] -str q12, [x0, #848] -mul v7.4S, v7.4S,v8.s[2] -mla v4.4S, v0.4S, v31.s[0] -sub v0.4s, v20.4s, v15.4s -add v20.4s, v20.4s, v15.4s -mla v7.4S, v17.4S, v31.s[0] -sub v17.4s, v18.4s, v4.4s -add v18.4s, v18.4s, v4.4s -sub v4.4s, v25.4s, v7.4s -add v25.4s, v25.4s, v7.4s -str q28, [x0, #896] -str q5, [x0, #912] -str q20, [x0, #928] -str q0, [x0, #944] -str q18, [x0, #992] -str q17, [x0, #1008] -str q25, [x0, #960] -str q4, [x0, #976] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1444 -// Instruction count: 1440 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_4.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_4.s deleted file mode 100644 index ca19281..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_4.s +++ /dev/null @@ -1,1474 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 23825509 // Layer 4, block 0 -.word 27028662 // Layer 4, block 1 -.word 0 // Layer None, block None -.word 1307297022 // Layer 3, block 0 -.word 1524716204 // Layer 4, block 0 -.word 1729702351 // Layer 4, block 1 -.word 0 // Layer None, block None -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 14626653 // Layer 3, block 1 -.word 14833295 // Layer 4, block 2 -.word 2138810 // Layer 4, block 3 -.word 0 // Layer None, block None -.word 936034350 // Layer 3, block 1 -.word 949258429 // Layer 4, block 2 -.word 136873393 // Layer 4, block 3 -.word 0 // Layer None, block None -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 29737761 // Layer 3, block 2 -.word 6490403 // Layer 4, block 4 -.word 19648405 // Layer 4, block 5 -.word 0 // Layer None, block None -.word 1903071454 // Layer 3, block 2 -.word 415354091 // Layer 4, block 4 -.word 1257401950 // Layer 4, block 5 -.word 0 // Layer None, block None -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 30285189 // Layer 3, block 3 -.word 31254932 // Layer 4, block 6 -.word 26362414 // Layer 4, block 7 -.word 0 // Layer None, block None -.word 1938104173 // Layer 3, block 3 -.word 2000162988 // Layer 4, block 6 -.word 1687065733 // Layer 4, block 7 -.word 0 // Layer None, block None -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 21289485 // Layer 3, block 4 -.word 572895 // Layer 4, block 8 -.word 26691971 // Layer 4, block 9 -.word 0 // Layer None, block None -.word 1362423055 // Layer 3, block 4 -.word 36662482 // Layer 4, block 8 -.word 1708155771 // Layer 4, block 9 -.word 0 // Layer None, block None -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 9914896 // Layer 3, block 5 -.word 9249292 // Layer 4, block 10 -.word 29292862 // Layer 4, block 11 -.word 0 // Layer None, block None -.word 634504916 // Layer 3, block 5 -.word 591909511 // Layer 4, block 10 -.word 1874600091 // Layer 4, block 11 -.word 0 // Layer None, block None -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 22603682 // Layer 3, block 6 -.word 8247799 // Layer 4, block 12 -.word 5086187 // Layer 4, block 13 -.word 0 // Layer None, block None -.word 1446525244 // Layer 3, block 6 -.word 527818851 // Layer 4, block 12 -.word 325491125 // Layer 4, block 13 -.word 0 // Layer None, block None -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 16204162 // Layer 3, block 7 -.word 28113639 // Layer 4, block 14 -.word 8471290 // Layer 4, block 15 -.word 0 // Layer None, block None -.word 1036987221 // Layer 3, block 7 -.word 1799135579 // Layer 4, block 14 -.word 542121183 // Layer 4, block 15 -.word 0 // Layer None, block None -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.text -.global ntt_u32_incomplete_neon_asm_var_3_3_4 -.global _ntt_u32_incomplete_neon_asm_var_3_3_4 -ntt_u32_incomplete_neon_asm_var_3_3_4: -_ntt_u32_incomplete_neon_asm_var_3_3_4: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x0, #960] -ldr q29, [x0, #832] -ldr q28, [x0, #576] -ldr q27, [x0, #704] -ldr q26, [x0, #448] -ldr q25, [x17, #+0] -ldr q24, [x17, #+16] -ldr q23, [x17, #+32] -ldr q22, [x17, #+48] -ldr q21, [x0, #320] -ldr q20, [x0, #64] -ldr q19, [x0, #192] -sqrdmulh v18.4S, v30.4S, v24.s[0] -mul v30.4S, v30.4S,v25.s[0] -sqrdmulh v17.4S, v29.4S, v24.s[0] -mla v30.4S, v18.4S, v31.s[0] -mul v29.4S, v29.4S,v25.s[0] -sqrdmulh v18.4S, v28.4S, v24.s[0] -mla v29.4S, v17.4S, v31.s[0] -ldr q17, [x0, #976] -mul v28.4S, v28.4S,v25.s[0] -sub v16.4s, v26.4s, v30.4s -add v26.4s, v26.4s, v30.4s -sqrdmulh v30.4S, v27.4S, v24.s[0] -mla v28.4S, v18.4S, v31.s[0] -ldr q18, [x0, #848] -mul v27.4S, v27.4S,v25.s[0] -sub v3.4s, v21.4s, v29.4s -add v21.4s, v21.4s, v29.4s -sqrdmulh v29.4S, v26.4S, v24.s[1] -mla v27.4S, v30.4S, v31.s[0] -ldr q30, [x0, #592] -mul v26.4S, v26.4S,v25.s[1] -sub v2.4s, v20.4s, v28.4s -add v20.4s, v20.4s, v28.4s -sqrdmulh v28.4S, v21.4S, v24.s[1] -mla v26.4S, v29.4S, v31.s[0] -ldr q29, [x0, #720] -mul v21.4S, v21.4S,v25.s[1] -sub v1.4s, v19.4s, v27.4s -add v19.4s, v19.4s, v27.4s -sqrdmulh v27.4S, v16.4S, v24.s[2] -mla v21.4S, v28.4S, v31.s[0] -ldr q28, [x0, #464] -mul v16.4S, v16.4S,v25.s[2] -sub v0.4s, v19.4s, v26.4s -add v19.4s, v19.4s, v26.4s -sqrdmulh v26.4S, v3.4S, v24.s[2] -mla v16.4S, v27.4S, v31.s[0] -ldr q27, [x0, #336] -mul v3.4S, v3.4S,v25.s[2] -sub v15.4s, v20.4s, v21.4s -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v19.4S, v22.s[0] -mla v3.4S, v26.4S, v31.s[0] -ldr q26, [x0, #80] -mul v19.4S, v19.4S,v23.s[0] -sub v14.4s, v1.4s, v16.4s -add v1.4s, v1.4s, v16.4s -sqrdmulh v16.4S, v0.4S, v22.s[1] -mla v19.4S, v21.4S, v31.s[0] -ldr q21, [x0, #208] -mul v0.4S, v0.4S,v23.s[1] -sub v13.4s, v2.4s, v3.4s -add v2.4s, v2.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v22.s[3] -mla v0.4S, v16.4S, v31.s[0] -mul v14.4S, v14.4S,v23.s[3] -sub v16.4s, v20.4s, v19.4s -add v20.4s, v20.4s, v19.4s -sqrdmulh v19.4S, v1.4S, v22.s[2] -mla v14.4S, v3.4S, v31.s[0] -mul v1.4S, v1.4S,v23.s[2] -sub v3.4s, v15.4s, v0.4s -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v17.4S, v24.s[0] -mla v1.4S, v19.4S, v31.s[0] -mul v17.4S, v17.4S,v25.s[0] -sub v19.4s, v13.4s, v14.4s -add v13.4s, v13.4s, v14.4s -sqrdmulh v14.4S, v18.4S, v24.s[0] -mla v17.4S, v0.4S, v31.s[0] -mul v18.4S, v18.4S,v25.s[0] -sub v0.4s, v2.4s, v1.4s -add v2.4s, v2.4s, v1.4s -sqrdmulh v1.4S, v30.4S, v24.s[0] -mla v18.4S, v14.4S, v31.s[0] -ldr q14, [x0, #992] -mul v30.4S, v30.4S,v25.s[0] -sub v12.4s, v28.4s, v17.4s -add v28.4s, v28.4s, v17.4s -sqrdmulh v17.4S, v29.4S, v24.s[0] -mla v30.4S, v1.4S, v31.s[0] -ldr q1, [x0, #864] -mul v29.4S, v29.4S,v25.s[0] -sub v11.4s, v27.4s, v18.4s -add v27.4s, v27.4s, v18.4s -sqrdmulh v18.4S, v28.4S, v24.s[1] -str q20, [x0, #64] -mla v29.4S, v17.4S, v31.s[0] -ldr q17, [x0, #608] -mul v28.4S, v28.4S,v25.s[1] -sub v20.4s, v26.4s, v30.4s -add v26.4s, v26.4s, v30.4s -sqrdmulh v30.4S, v27.4S, v24.s[1] -str q16, [x0, #192] -mla v28.4S, v18.4S, v31.s[0] -ldr q18, [x0, #736] -mul v27.4S, v27.4S,v25.s[1] -sub v16.4s, v21.4s, v29.4s -add v21.4s, v21.4s, v29.4s -sqrdmulh v29.4S, v12.4S, v24.s[2] -str q15, [x0, #320] -mla v27.4S, v30.4S, v31.s[0] -ldr q30, [x0, #480] -mul v12.4S, v12.4S,v25.s[2] -sub v15.4s, v21.4s, v28.4s -add v21.4s, v21.4s, v28.4s -sqrdmulh v28.4S, v11.4S, v24.s[2] -str q3, [x0, #448] -mla v12.4S, v29.4S, v31.s[0] -ldr q29, [x0, #352] -mul v11.4S, v11.4S,v25.s[2] -sub v3.4s, v26.4s, v27.4s -add v26.4s, v26.4s, v27.4s -sqrdmulh v27.4S, v21.4S, v22.s[0] -str q13, [x0, #832] -mla v11.4S, v28.4S, v31.s[0] -ldr q28, [x0, #96] -mul v21.4S, v21.4S,v23.s[0] -sub v13.4s, v16.4s, v12.4s -add v16.4s, v16.4s, v12.4s -sqrdmulh v12.4S, v15.4S, v22.s[1] -str q19, [x0, #960] -mla v21.4S, v27.4S, v31.s[0] -ldr q27, [x0, #224] -mul v15.4S, v15.4S,v23.s[1] -sub v19.4s, v20.4s, v11.4s -add v20.4s, v20.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v22.s[3] -str q2, [x0, #576] -mla v15.4S, v12.4S, v31.s[0] -mul v13.4S, v13.4S,v23.s[3] -sub v12.4s, v26.4s, v21.4s -add v26.4s, v26.4s, v21.4s -sqrdmulh v21.4S, v16.4S, v22.s[2] -str q0, [x0, #704] -mla v13.4S, v11.4S, v31.s[0] -mul v16.4S, v16.4S,v23.s[2] -sub v11.4s, v3.4s, v15.4s -add v3.4s, v3.4s, v15.4s -sqrdmulh v15.4S, v14.4S, v24.s[0] -mla v16.4S, v21.4S, v31.s[0] -mul v14.4S, v14.4S,v25.s[0] -sub v21.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -sqrdmulh v13.4S, v1.4S, v24.s[0] -mla v14.4S, v15.4S, v31.s[0] -mul v1.4S, v1.4S,v25.s[0] -sub v15.4s, v20.4s, v16.4s -add v20.4s, v20.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v24.s[0] -mla v1.4S, v13.4S, v31.s[0] -ldr q13, [x0, #1008] -mul v17.4S, v17.4S,v25.s[0] -sub v0.4s, v30.4s, v14.4s -add v30.4s, v30.4s, v14.4s -sqrdmulh v14.4S, v18.4S, v24.s[0] -mla v17.4S, v16.4S, v31.s[0] -ldr q16, [x0, #880] -mul v18.4S, v18.4S,v25.s[0] -sub v2.4s, v29.4s, v1.4s -add v29.4s, v29.4s, v1.4s -sqrdmulh v1.4S, v30.4S, v24.s[1] -str q26, [x0, #80] -mla v18.4S, v14.4S, v31.s[0] -ldr q14, [x0, #624] -mul v30.4S, v30.4S,v25.s[1] -sub v26.4s, v28.4s, v17.4s -add v28.4s, v28.4s, v17.4s -sqrdmulh v17.4S, v29.4S, v24.s[1] -str q12, [x0, #208] -mla v30.4S, v1.4S, v31.s[0] -ldr q1, [x0, #752] -mul v29.4S, v29.4S,v25.s[1] -sub v12.4s, v27.4s, v18.4s -add v27.4s, v27.4s, v18.4s -sqrdmulh v18.4S, v0.4S, v24.s[2] -str q3, [x0, #336] -mla v29.4S, v17.4S, v31.s[0] -ldr q17, [x0, #496] -mul v0.4S, v0.4S,v25.s[2] -sub v3.4s, v27.4s, v30.4s -add v27.4s, v27.4s, v30.4s -sqrdmulh v30.4S, v2.4S, v24.s[2] -str q11, [x0, #464] -mla v0.4S, v18.4S, v31.s[0] -ldr q18, [x0, #368] -mul v2.4S, v2.4S,v25.s[2] -sub v11.4s, v28.4s, v29.4s -add v28.4s, v28.4s, v29.4s -sqrdmulh v29.4S, v27.4S, v22.s[0] -str q19, [x0, #848] -mla v2.4S, v30.4S, v31.s[0] -ldr q30, [x0, #112] -mul v27.4S, v27.4S,v23.s[0] -sub v19.4s, v12.4s, v0.4s -add v12.4s, v12.4s, v0.4s -sqrdmulh v0.4S, v3.4S, v22.s[1] -str q21, [x0, #976] -mla v27.4S, v29.4S, v31.s[0] -ldr q29, [x0, #240] -mul v3.4S, v3.4S,v23.s[1] -sub v21.4s, v26.4s, v2.4s -add v26.4s, v26.4s, v2.4s -sqrdmulh v2.4S, v19.4S, v22.s[3] -str q20, [x0, #592] -mla v3.4S, v0.4S, v31.s[0] -mul v19.4S, v19.4S,v23.s[3] -sub v0.4s, v28.4s, v27.4s -add v28.4s, v28.4s, v27.4s -sqrdmulh v27.4S, v12.4S, v22.s[2] -str q15, [x0, #720] -mla v19.4S, v2.4S, v31.s[0] -mul v12.4S, v12.4S,v23.s[2] -sub v2.4s, v11.4s, v3.4s -add v11.4s, v11.4s, v3.4s -sqrdmulh v3.4S, v13.4S, v24.s[0] -mla v12.4S, v27.4S, v31.s[0] -mul v13.4S, v13.4S,v25.s[0] -sub v27.4s, v21.4s, v19.4s -add v21.4s, v21.4s, v19.4s -sqrdmulh v19.4S, v16.4S, v24.s[0] -mla v13.4S, v3.4S, v31.s[0] -mul v16.4S, v16.4S,v25.s[0] -sub v3.4s, v26.4s, v12.4s -add v26.4s, v26.4s, v12.4s -sqrdmulh v12.4S, v14.4S, v24.s[0] -mla v16.4S, v19.4S, v31.s[0] -ldr q19, [x0, #896] -mul v14.4S, v14.4S,v25.s[0] -sub v15.4s, v17.4s, v13.4s -add v17.4s, v17.4s, v13.4s -sqrdmulh v13.4S, v1.4S, v24.s[0] -mla v14.4S, v12.4S, v31.s[0] -ldr q12, [x0, #768] -mul v1.4S, v1.4S,v25.s[0] -sub v20.4s, v18.4s, v16.4s -add v18.4s, v18.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v24.s[1] -str q28, [x0, #96] -mla v1.4S, v13.4S, v31.s[0] -ldr q13, [x0, #512] -mul v17.4S, v17.4S,v25.s[1] -sub v28.4s, v30.4s, v14.4s -add v30.4s, v30.4s, v14.4s -sqrdmulh v14.4S, v18.4S, v24.s[1] -str q0, [x0, #224] -mla v17.4S, v16.4S, v31.s[0] -ldr q16, [x0, #640] -mul v18.4S, v18.4S,v25.s[1] -sub v0.4s, v29.4s, v1.4s -add v29.4s, v29.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v24.s[2] -str q11, [x0, #352] -mla v18.4S, v14.4S, v31.s[0] -ldr q14, [x0, #384] -mul v15.4S, v15.4S,v25.s[2] -sub v11.4s, v29.4s, v17.4s -add v29.4s, v29.4s, v17.4s -sqrdmulh v17.4S, v20.4S, v24.s[2] -str q2, [x0, #480] -mla v15.4S, v1.4S, v31.s[0] -ldr q1, [x0, #256] -mul v20.4S, v20.4S,v25.s[2] -sub v2.4s, v30.4s, v18.4s -add v30.4s, v30.4s, v18.4s -sqrdmulh v18.4S, v29.4S, v22.s[0] -str q21, [x0, #864] -mla v20.4S, v17.4S, v31.s[0] -ldr q17, [x0, #0] -mul v29.4S, v29.4S,v23.s[0] -sub v21.4s, v0.4s, v15.4s -add v0.4s, v0.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v22.s[1] -str q27, [x0, #992] -mla v29.4S, v18.4S, v31.s[0] -ldr q18, [x0, #128] -mul v11.4S, v11.4S,v23.s[1] -sub v27.4s, v28.4s, v20.4s -add v28.4s, v28.4s, v20.4s -sqrdmulh v20.4S, v21.4S, v22.s[3] -str q26, [x0, #608] -mla v11.4S, v15.4S, v31.s[0] -mul v21.4S, v21.4S,v23.s[3] -sub v15.4s, v30.4s, v29.4s -add v30.4s, v30.4s, v29.4s -sqrdmulh v29.4S, v0.4S, v22.s[2] -str q3, [x0, #736] -mla v21.4S, v20.4S, v31.s[0] -mul v0.4S, v0.4S,v23.s[2] -sub v20.4s, v2.4s, v11.4s -add v2.4s, v2.4s, v11.4s -sqrdmulh v11.4S, v19.4S, v24.s[0] -mla v0.4S, v29.4S, v31.s[0] -mul v19.4S, v19.4S,v25.s[0] -sub v29.4s, v27.4s, v21.4s -add v27.4s, v27.4s, v21.4s -sqrdmulh v21.4S, v12.4S, v24.s[0] -mla v19.4S, v11.4S, v31.s[0] -mul v12.4S, v12.4S,v25.s[0] -sub v11.4s, v28.4s, v0.4s -add v28.4s, v28.4s, v0.4s -sqrdmulh v0.4S, v13.4S, v24.s[0] -mla v12.4S, v21.4S, v31.s[0] -ldr q21, [x0, #912] -mul v13.4S, v13.4S,v25.s[0] -sub v3.4s, v14.4s, v19.4s -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v16.4S, v24.s[0] -mla v13.4S, v0.4S, v31.s[0] -ldr q0, [x0, #784] -mul v16.4S, v16.4S,v25.s[0] -sub v26.4s, v1.4s, v12.4s -add v1.4s, v1.4s, v12.4s -sqrdmulh v12.4S, v14.4S, v24.s[1] -str q30, [x0, #112] -mla v16.4S, v19.4S, v31.s[0] -ldr q19, [x0, #528] -mul v14.4S, v14.4S,v25.s[1] -sub v30.4s, v17.4s, v13.4s -add v17.4s, v17.4s, v13.4s -sqrdmulh v13.4S, v1.4S, v24.s[1] -str q15, [x0, #240] -mla v14.4S, v12.4S, v31.s[0] -ldr q12, [x0, #656] -mul v1.4S, v1.4S,v25.s[1] -sub v15.4s, v18.4s, v16.4s -add v18.4s, v18.4s, v16.4s -sqrdmulh v16.4S, v3.4S, v24.s[2] -str q2, [x0, #368] -mla v1.4S, v13.4S, v31.s[0] -ldr q13, [x0, #400] -mul v3.4S, v3.4S,v25.s[2] -sub v2.4s, v18.4s, v14.4s -add v18.4s, v18.4s, v14.4s -sqrdmulh v14.4S, v26.4S, v24.s[2] -str q20, [x0, #496] -mla v3.4S, v16.4S, v31.s[0] -ldr q16, [x0, #272] -mul v26.4S, v26.4S,v25.s[2] -sub v20.4s, v17.4s, v1.4s -add v17.4s, v17.4s, v1.4s -sqrdmulh v1.4S, v18.4S, v22.s[0] -str q27, [x0, #880] -mla v26.4S, v14.4S, v31.s[0] -ldr q14, [x0, #16] -mul v18.4S, v18.4S,v23.s[0] -sub v27.4s, v15.4s, v3.4s -add v15.4s, v15.4s, v3.4s -sqrdmulh v3.4S, v2.4S, v22.s[1] -str q29, [x0, #1008] -mla v18.4S, v1.4S, v31.s[0] -ldr q1, [x0, #144] -mul v2.4S, v2.4S,v23.s[1] -sub v29.4s, v30.4s, v26.4s -add v30.4s, v30.4s, v26.4s -sqrdmulh v26.4S, v27.4S, v22.s[3] -str q28, [x0, #624] -mla v2.4S, v3.4S, v31.s[0] -mul v27.4S, v27.4S,v23.s[3] -sub v3.4s, v17.4s, v18.4s -add v17.4s, v17.4s, v18.4s -sqrdmulh v18.4S, v15.4S, v22.s[2] -str q11, [x0, #752] -mla v27.4S, v26.4S, v31.s[0] -mul v15.4S, v15.4S,v23.s[2] -sub v26.4s, v20.4s, v2.4s -add v20.4s, v20.4s, v2.4s -sqrdmulh v2.4S, v21.4S, v24.s[0] -mla v15.4S, v18.4S, v31.s[0] -mul v21.4S, v21.4S,v25.s[0] -sub v18.4s, v29.4s, v27.4s -add v29.4s, v29.4s, v27.4s -sqrdmulh v27.4S, v0.4S, v24.s[0] -mla v21.4S, v2.4S, v31.s[0] -mul v0.4S, v0.4S,v25.s[0] -sub v2.4s, v30.4s, v15.4s -add v30.4s, v30.4s, v15.4s -sqrdmulh v15.4S, v19.4S, v24.s[0] -mla v0.4S, v27.4S, v31.s[0] -ldr q27, [x0, #928] -mul v19.4S, v19.4S,v25.s[0] -sub v11.4s, v13.4s, v21.4s -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v12.4S, v24.s[0] -mla v19.4S, v15.4S, v31.s[0] -ldr q15, [x0, #800] -mul v12.4S, v12.4S,v25.s[0] -sub v28.4s, v16.4s, v0.4s -add v16.4s, v16.4s, v0.4s -sqrdmulh v0.4S, v13.4S, v24.s[1] -str q17, [x0, #0] -mla v12.4S, v21.4S, v31.s[0] -ldr q21, [x0, #544] -mul v13.4S, v13.4S,v25.s[1] -sub v17.4s, v14.4s, v19.4s -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v16.4S, v24.s[1] -str q3, [x0, #128] -mla v13.4S, v0.4S, v31.s[0] -ldr q0, [x0, #672] -mul v16.4S, v16.4S,v25.s[1] -sub v3.4s, v1.4s, v12.4s -add v1.4s, v1.4s, v12.4s -sqrdmulh v12.4S, v11.4S, v24.s[2] -str q20, [x0, #256] -mla v16.4S, v19.4S, v31.s[0] -ldr q19, [x0, #416] -mul v11.4S, v11.4S,v25.s[2] -sub v20.4s, v1.4s, v13.4s -add v1.4s, v1.4s, v13.4s -sqrdmulh v13.4S, v28.4S, v24.s[2] -str q26, [x0, #384] -mla v11.4S, v12.4S, v31.s[0] -ldr q12, [x0, #288] -mul v28.4S, v28.4S,v25.s[2] -sub v26.4s, v14.4s, v16.4s -add v14.4s, v14.4s, v16.4s -sqrdmulh v16.4S, v1.4S, v22.s[0] -str q29, [x0, #768] -mla v28.4S, v13.4S, v31.s[0] -ldr q13, [x0, #32] -mul v1.4S, v1.4S,v23.s[0] -sub v29.4s, v3.4s, v11.4s -add v3.4s, v3.4s, v11.4s -sqrdmulh v11.4S, v20.4S, v22.s[1] -str q18, [x0, #896] -mla v1.4S, v16.4S, v31.s[0] -ldr q16, [x0, #160] -mul v20.4S, v20.4S,v23.s[1] -sub v18.4s, v17.4s, v28.4s -add v17.4s, v17.4s, v28.4s -sqrdmulh v28.4S, v29.4S, v22.s[3] -str q30, [x0, #512] -mla v20.4S, v11.4S, v31.s[0] -mul v29.4S, v29.4S,v23.s[3] -sub v11.4s, v14.4s, v1.4s -add v14.4s, v14.4s, v1.4s -sqrdmulh v1.4S, v3.4S, v22.s[2] -str q2, [x0, #640] -mla v29.4S, v28.4S, v31.s[0] -mul v3.4S, v3.4S,v23.s[2] -sub v28.4s, v26.4s, v20.4s -add v26.4s, v26.4s, v20.4s -sqrdmulh v20.4S, v27.4S, v24.s[0] -mla v3.4S, v1.4S, v31.s[0] -mul v27.4S, v27.4S,v25.s[0] -sub v1.4s, v18.4s, v29.4s -add v18.4s, v18.4s, v29.4s -sqrdmulh v29.4S, v15.4S, v24.s[0] -mla v27.4S, v20.4S, v31.s[0] -mul v15.4S, v15.4S,v25.s[0] -sub v20.4s, v17.4s, v3.4s -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v24.s[0] -mla v15.4S, v29.4S, v31.s[0] -ldr q29, [x0, #944] -mul v21.4S, v21.4S,v25.s[0] -sub v2.4s, v19.4s, v27.4s -add v19.4s, v19.4s, v27.4s -sqrdmulh v27.4S, v0.4S, v24.s[0] -mla v21.4S, v3.4S, v31.s[0] -ldr q3, [x0, #816] -mul v0.4S, v0.4S,v25.s[0] -sub v30.4s, v12.4s, v15.4s -add v12.4s, v12.4s, v15.4s -sqrdmulh v15.4S, v19.4S, v24.s[1] -str q14, [x0, #16] -mla v0.4S, v27.4S, v31.s[0] -ldr q27, [x0, #560] -mul v19.4S, v19.4S,v25.s[1] -sub v14.4s, v13.4s, v21.4s -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v12.4S, v24.s[1] -str q11, [x0, #144] -mla v19.4S, v15.4S, v31.s[0] -ldr q15, [x0, #688] -mul v12.4S, v12.4S,v25.s[1] -sub v11.4s, v16.4s, v0.4s -add v16.4s, v16.4s, v0.4s -sqrdmulh v0.4S, v2.4S, v24.s[2] -str q26, [x0, #272] -mla v12.4S, v21.4S, v31.s[0] -ldr q21, [x0, #432] -mul v2.4S, v2.4S,v25.s[2] -sub v26.4s, v16.4s, v19.4s -add v16.4s, v16.4s, v19.4s -sqrdmulh v19.4S, v30.4S, v24.s[2] -str q28, [x0, #400] -mla v2.4S, v0.4S, v31.s[0] -ldr q0, [x0, #304] -mul v30.4S, v30.4S,v25.s[2] -sub v28.4s, v13.4s, v12.4s -add v13.4s, v13.4s, v12.4s -sqrdmulh v12.4S, v16.4S, v22.s[0] -str q18, [x0, #784] -mla v30.4S, v19.4S, v31.s[0] -ldr q19, [x0, #48] -mul v16.4S, v16.4S,v23.s[0] -sub v18.4s, v11.4s, v2.4s -add v11.4s, v11.4s, v2.4s -sqrdmulh v2.4S, v26.4S, v22.s[1] -str q1, [x0, #912] -mla v16.4S, v12.4S, v31.s[0] -ldr q12, [x0, #176] -mul v26.4S, v26.4S,v23.s[1] -sub v1.4s, v14.4s, v30.4s -add v14.4s, v14.4s, v30.4s -sqrdmulh v30.4S, v18.4S, v22.s[3] -str q17, [x0, #528] -mla v26.4S, v2.4S, v31.s[0] -mul v18.4S, v18.4S,v23.s[3] -sub v2.4s, v13.4s, v16.4s -add v13.4s, v13.4s, v16.4s -sqrdmulh v16.4S, v11.4S, v22.s[2] -str q20, [x0, #656] -mla v18.4S, v30.4S, v31.s[0] -mul v11.4S, v11.4S,v23.s[2] -sub v30.4s, v28.4s, v26.4s -add v28.4s, v28.4s, v26.4s -sqrdmulh v26.4S, v29.4S, v24.s[0] -mla v11.4S, v16.4S, v31.s[0] -mul v29.4S, v29.4S,v25.s[0] -sub v16.4s, v1.4s, v18.4s -add v1.4s, v1.4s, v18.4s -sqrdmulh v18.4S, v3.4S, v24.s[0] -mla v29.4S, v26.4S, v31.s[0] -mul v3.4S, v3.4S,v25.s[0] -sub v26.4s, v14.4s, v11.4s -add v14.4s, v14.4s, v11.4s -sqrdmulh v11.4S, v27.4S, v24.s[0] -mla v3.4S, v18.4S, v31.s[0] -mul v27.4S, v27.4S,v25.s[0] -sub v18.4s, v21.4s, v29.4s -add v21.4s, v21.4s, v29.4s -sqrdmulh v29.4S, v15.4S, v24.s[0] -mla v27.4S, v11.4S, v31.s[0] -mul v15.4S, v15.4S,v25.s[0] -sub v11.4s, v0.4s, v3.4s -add v0.4s, v0.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v24.s[1] -str q13, [x0, #32] -mla v15.4S, v29.4S, v31.s[0] -mul v21.4S, v21.4S,v25.s[1] -sub v29.4s, v19.4s, v27.4s -add v19.4s, v19.4s, v27.4s -sqrdmulh v27.4S, v0.4S, v24.s[1] -str q2, [x0, #160] -mla v21.4S, v3.4S, v31.s[0] -mul v0.4S, v0.4S,v25.s[1] -sub v3.4s, v12.4s, v15.4s -add v12.4s, v12.4s, v15.4s -sqrdmulh v15.4S, v18.4S, v24.s[2] -str q28, [x0, #288] -mla v0.4S, v27.4S, v31.s[0] -mul v18.4S, v18.4S,v25.s[2] -sub v27.4s, v12.4s, v21.4s -add v12.4s, v12.4s, v21.4s -sqrdmulh v21.4S, v11.4S, v24.s[2] -str q30, [x0, #416] -mla v18.4S, v15.4S, v31.s[0] -mul v11.4S, v11.4S,v25.s[2] -sub v15.4s, v19.4s, v0.4s -add v19.4s, v19.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v22.s[0] -str q1, [x0, #800] -mla v11.4S, v21.4S, v31.s[0] -mul v12.4S, v12.4S,v23.s[0] -sub v21.4s, v3.4s, v18.4s -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v27.4S, v22.s[1] -str q16, [x0, #928] -mla v12.4S, v0.4S, v31.s[0] -mul v27.4S, v27.4S,v23.s[1] -sub v0.4s, v29.4s, v11.4s -add v29.4s, v29.4s, v11.4s -sqrdmulh v11.4S, v21.4S, v22.s[3] -str q14, [x0, #544] -mla v27.4S, v18.4S, v31.s[0] -mul v21.4S, v21.4S,v23.s[3] -sub v18.4s, v19.4s, v12.4s -add v19.4s, v19.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v22.s[2] -str q26, [x0, #672] -mla v21.4S, v11.4S, v31.s[0] -mul v3.4S, v3.4S,v23.s[2] -sub v11.4s, v15.4s, v27.4s -add v15.4s, v15.4s, v27.4s -mla v3.4S, v12.4S, v31.s[0] -sub v12.4s, v0.4s, v21.4s -add v0.4s, v0.4s, v21.4s -sub v21.4s, v29.4s, v3.4s -add v29.4s, v29.4s, v3.4s -str q19, [x0, #48] -str q18, [x0, #176] -str q15, [x0, #304] -str q11, [x0, #432] -str q0, [x0, #816] -str q12, [x0, #944] -str q29, [x0, #560] -str q21, [x0, #688] -ldr q4, [x0, #112] -ldr q5, [x0, #96] -ldr q6, [x0, #64] -ldr q7, [x0, #80] -ldr q8, [x0, #48] -ldr q9, [x17, #+64] -ldr q10, [x17, #+80] -ldr q17, [x17, #+96] -ldr q20, [x17, #+112] -ldr q13, [x0, #32] -ldr q2, [x0, #0] -ldr q28, [x0, #16] -sqrdmulh v30.4S, v4.4S, v10.s[0] -mul v4.4S, v4.4S,v9.s[0] -sqrdmulh v1.4S, v5.4S, v10.s[0] -mla v4.4S, v30.4S, v31.s[0] -mul v5.4S, v5.4S,v9.s[0] -sqrdmulh v30.4S, v6.4S, v10.s[0] -mla v5.4S, v1.4S, v31.s[0] -ldr q1, [x0, #240] -mul v6.4S, v6.4S,v9.s[0] -sub v16.4s, v8.4s, v4.4s -add v8.4s, v8.4s, v4.4s -sqrdmulh v4.4S, v7.4S, v10.s[0] -mla v6.4S, v30.4S, v31.s[0] -ldr q30, [x0, #224] -mul v7.4S, v7.4S,v9.s[0] -sub v14.4s, v13.4s, v5.4s -add v13.4s, v13.4s, v5.4s -sqrdmulh v5.4S, v8.4S, v10.s[1] -mla v7.4S, v4.4S, v31.s[0] -ldr q4, [x0, #192] -mul v8.4S, v8.4S,v9.s[1] -sub v26.4s, v2.4s, v6.4s -add v2.4s, v2.4s, v6.4s -sqrdmulh v6.4S, v13.4S, v10.s[1] -mla v8.4S, v5.4S, v31.s[0] -ldr q5, [x0, #208] -mul v13.4S, v13.4S,v9.s[1] -sub v27.4s, v28.4s, v7.4s -add v28.4s, v28.4s, v7.4s -sqrdmulh v7.4S, v16.4S, v10.s[2] -mla v13.4S, v6.4S, v31.s[0] -ldr q6, [x0, #176] -mul v16.4S, v16.4S,v9.s[2] -sub v3.4s, v28.4s, v8.4s -add v28.4s, v28.4s, v8.4s -ldr q8, [x17, #+128] -ldr q25, [x17, #+144] -ldr q24, [x17, #+160] -ldr q23, [x17, #+176] -sqrdmulh v22.4S, v14.4S, v10.s[2] -mla v16.4S, v7.4S, v31.s[0] -ldr q7, [x0, #160] -mul v14.4S, v14.4S,v9.s[2] -sub v19.4s, v2.4s, v13.4s -add v2.4s, v2.4s, v13.4s -sqrdmulh v13.4S, v28.4S, v20.s[0] -mla v14.4S, v22.4S, v31.s[0] -ldr q22, [x0, #128] -mul v28.4S, v28.4S,v17.s[0] -sub v18.4s, v27.4s, v16.4s -add v27.4s, v27.4s, v16.4s -sqrdmulh v16.4S, v3.4S, v20.s[1] -mla v28.4S, v13.4S, v31.s[0] -ldr q13, [x0, #144] -mul v3.4S, v3.4S,v17.s[1] -sub v15.4s, v26.4s, v14.4s -add v26.4s, v26.4s, v14.4s -sqrdmulh v14.4S, v18.4S, v20.s[3] -mla v3.4S, v16.4S, v31.s[0] -mul v18.4S, v18.4S,v17.s[3] -sub v16.4s, v2.4s, v28.4s -add v2.4s, v2.4s, v28.4s -sqrdmulh v28.4S, v27.4S, v20.s[2] -mla v18.4S, v14.4S, v31.s[0] -mul v27.4S, v27.4S,v17.s[2] -sub v14.4s, v19.4s, v3.4s -add v19.4s, v19.4s, v3.4s -sqrdmulh v3.4S, v1.4S, v25.s[0] -mla v27.4S, v28.4S, v31.s[0] -mul v1.4S, v1.4S,v8.s[0] -sub v28.4s, v15.4s, v18.4s -add v15.4s, v15.4s, v18.4s -sqrdmulh v18.4S, v30.4S, v25.s[0] -mla v1.4S, v3.4S, v31.s[0] -mul v30.4S, v30.4S,v8.s[0] -sub v3.4s, v26.4s, v27.4s -add v26.4s, v26.4s, v27.4s -sqrdmulh v20.4S, v4.4S, v25.s[0] -mla v30.4S, v18.4S, v31.s[0] -ldr q18, [x0, #368] -mul v4.4S, v4.4S,v8.s[0] -sub v17.4s, v6.4s, v1.4s -add v6.4s, v6.4s, v1.4s -sqrdmulh v1.4S, v5.4S, v25.s[0] -mla v4.4S, v20.4S, v31.s[0] -ldr q20, [x0, #352] -mul v5.4S, v5.4S,v8.s[0] -sub v10.4s, v7.4s, v30.4s -add v7.4s, v7.4s, v30.4s -sqrdmulh v30.4S, v6.4S, v25.s[1] -str q2, [x0, #0] -mla v5.4S, v1.4S, v31.s[0] -ldr q1, [x0, #320] -mul v6.4S, v6.4S,v8.s[1] -sub v2.4s, v22.4s, v4.4s -add v22.4s, v22.4s, v4.4s -sqrdmulh v4.4S, v7.4S, v25.s[1] -str q16, [x0, #16] -mla v6.4S, v30.4S, v31.s[0] -ldr q30, [x0, #336] -mul v7.4S, v7.4S,v8.s[1] -sub v16.4s, v13.4s, v5.4s -add v13.4s, v13.4s, v5.4s -sqrdmulh v5.4S, v17.4S, v25.s[2] -str q19, [x0, #32] -mla v7.4S, v4.4S, v31.s[0] -ldr q4, [x0, #304] -mul v17.4S, v17.4S,v8.s[2] -sub v19.4s, v13.4s, v6.4s -add v13.4s, v13.4s, v6.4s -ldr q6, [x17, #+192] -ldr q9, [x17, #+208] -ldr q27, [x17, #+224] -ldr q11, [x17, #+240] -sqrdmulh v0.4S, v10.4S, v25.s[2] -str q14, [x0, #48] -mla v17.4S, v5.4S, v31.s[0] -ldr q5, [x0, #288] -mul v10.4S, v10.4S,v8.s[2] -sub v14.4s, v22.4s, v7.4s -add v22.4s, v22.4s, v7.4s -sqrdmulh v7.4S, v13.4S, v23.s[0] -str q15, [x0, #96] -mla v10.4S, v0.4S, v31.s[0] -ldr q0, [x0, #256] -mul v13.4S, v13.4S,v24.s[0] -sub v15.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -sqrdmulh v17.4S, v19.4S, v23.s[1] -str q28, [x0, #112] -mla v13.4S, v7.4S, v31.s[0] -ldr q7, [x0, #272] -mul v19.4S, v19.4S,v24.s[1] -sub v28.4s, v2.4s, v10.4s -add v2.4s, v2.4s, v10.4s -sqrdmulh v10.4S, v15.4S, v23.s[3] -str q26, [x0, #64] -mla v19.4S, v17.4S, v31.s[0] -mul v15.4S, v15.4S,v24.s[3] -sub v17.4s, v22.4s, v13.4s -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v16.4S, v23.s[2] -str q3, [x0, #80] -mla v15.4S, v10.4S, v31.s[0] -mul v16.4S, v16.4S,v24.s[2] -sub v10.4s, v14.4s, v19.4s -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v9.s[0] -mla v16.4S, v13.4S, v31.s[0] -mul v18.4S, v18.4S,v6.s[0] -sub v13.4s, v28.4s, v15.4s -add v28.4s, v28.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v9.s[0] -mla v18.4S, v19.4S, v31.s[0] -mul v20.4S, v20.4S,v6.s[0] -sub v19.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sqrdmulh v23.4S, v1.4S, v9.s[0] -mla v20.4S, v15.4S, v31.s[0] -ldr q15, [x0, #496] -mul v1.4S, v1.4S,v6.s[0] -sub v24.4s, v4.4s, v18.4s -add v4.4s, v4.4s, v18.4s -sqrdmulh v18.4S, v30.4S, v9.s[0] -mla v1.4S, v23.4S, v31.s[0] -ldr q23, [x0, #480] -mul v30.4S, v30.4S,v6.s[0] -sub v25.4s, v5.4s, v20.4s -add v5.4s, v5.4s, v20.4s -sqrdmulh v20.4S, v4.4S, v9.s[1] -str q22, [x0, #128] -mla v30.4S, v18.4S, v31.s[0] -ldr q18, [x0, #448] -mul v4.4S, v4.4S,v6.s[1] -sub v22.4s, v0.4s, v1.4s -add v0.4s, v0.4s, v1.4s -sqrdmulh v1.4S, v5.4S, v9.s[1] -str q17, [x0, #144] -mla v4.4S, v20.4S, v31.s[0] -ldr q20, [x0, #464] -mul v5.4S, v5.4S,v6.s[1] -sub v17.4s, v7.4s, v30.4s -add v7.4s, v7.4s, v30.4s -sqrdmulh v30.4S, v24.4S, v9.s[2] -str q14, [x0, #160] -mla v5.4S, v1.4S, v31.s[0] -ldr q1, [x0, #432] -mul v24.4S, v24.4S,v6.s[2] -sub v14.4s, v7.4s, v4.4s -add v7.4s, v7.4s, v4.4s -ldr q4, [x17, #+256] -ldr q8, [x17, #+272] -ldr q16, [x17, #+288] -ldr q3, [x17, #+304] -sqrdmulh v26.4S, v25.4S, v9.s[2] -str q10, [x0, #176] -mla v24.4S, v30.4S, v31.s[0] -ldr q30, [x0, #416] -mul v25.4S, v25.4S,v6.s[2] -sub v10.4s, v0.4s, v5.4s -add v0.4s, v0.4s, v5.4s -sqrdmulh v5.4S, v7.4S, v11.s[0] -str q28, [x0, #224] -mla v25.4S, v26.4S, v31.s[0] -ldr q26, [x0, #384] -mul v7.4S, v7.4S,v27.s[0] -sub v28.4s, v17.4s, v24.4s -add v17.4s, v17.4s, v24.4s -sqrdmulh v24.4S, v14.4S, v11.s[1] -str q13, [x0, #240] -mla v7.4S, v5.4S, v31.s[0] -ldr q5, [x0, #400] -mul v14.4S, v14.4S,v27.s[1] -sub v13.4s, v22.4s, v25.4s -add v22.4s, v22.4s, v25.4s -sqrdmulh v25.4S, v28.4S, v11.s[3] -str q2, [x0, #192] -mla v14.4S, v24.4S, v31.s[0] -mul v28.4S, v28.4S,v27.s[3] -sub v24.4s, v0.4s, v7.4s -add v0.4s, v0.4s, v7.4s -sqrdmulh v7.4S, v17.4S, v11.s[2] -str q19, [x0, #208] -mla v28.4S, v25.4S, v31.s[0] -mul v17.4S, v17.4S,v27.s[2] -sub v25.4s, v10.4s, v14.4s -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v8.s[0] -mla v17.4S, v7.4S, v31.s[0] -mul v15.4S, v15.4S,v4.s[0] -sub v7.4s, v13.4s, v28.4s -add v13.4s, v13.4s, v28.4s -sqrdmulh v28.4S, v23.4S, v8.s[0] -mla v15.4S, v14.4S, v31.s[0] -mul v23.4S, v23.4S,v4.s[0] -sub v14.4s, v22.4s, v17.4s -add v22.4s, v22.4s, v17.4s -sqrdmulh v11.4S, v18.4S, v8.s[0] -mla v23.4S, v28.4S, v31.s[0] -ldr q28, [x0, #624] -mul v18.4S, v18.4S,v4.s[0] -sub v27.4s, v1.4s, v15.4s -add v1.4s, v1.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v8.s[0] -mla v18.4S, v11.4S, v31.s[0] -ldr q11, [x0, #608] -mul v20.4S, v20.4S,v4.s[0] -sub v9.4s, v30.4s, v23.4s -add v30.4s, v30.4s, v23.4s -sqrdmulh v23.4S, v1.4S, v8.s[1] -str q0, [x0, #256] -mla v20.4S, v15.4S, v31.s[0] -ldr q15, [x0, #576] -mul v1.4S, v1.4S,v4.s[1] -sub v0.4s, v26.4s, v18.4s -add v26.4s, v26.4s, v18.4s -sqrdmulh v18.4S, v30.4S, v8.s[1] -str q24, [x0, #272] -mla v1.4S, v23.4S, v31.s[0] -ldr q23, [x0, #592] -mul v30.4S, v30.4S,v4.s[1] -sub v24.4s, v5.4s, v20.4s -add v5.4s, v5.4s, v20.4s -sqrdmulh v20.4S, v27.4S, v8.s[2] -str q10, [x0, #288] -mla v30.4S, v18.4S, v31.s[0] -ldr q18, [x0, #560] -mul v27.4S, v27.4S,v4.s[2] -sub v10.4s, v5.4s, v1.4s -add v5.4s, v5.4s, v1.4s -ldr q1, [x17, #+320] -ldr q6, [x17, #+336] -ldr q17, [x17, #+352] -ldr q19, [x17, #+368] -sqrdmulh v2.4S, v9.4S, v8.s[2] -str q25, [x0, #304] -mla v27.4S, v20.4S, v31.s[0] -ldr q20, [x0, #544] -mul v9.4S, v9.4S,v4.s[2] -sub v25.4s, v26.4s, v30.4s -add v26.4s, v26.4s, v30.4s -sqrdmulh v30.4S, v5.4S, v3.s[0] -str q13, [x0, #352] -mla v9.4S, v2.4S, v31.s[0] -ldr q2, [x0, #512] -mul v5.4S, v5.4S,v16.s[0] -sub v13.4s, v24.4s, v27.4s -add v24.4s, v24.4s, v27.4s -sqrdmulh v27.4S, v10.4S, v3.s[1] -str q7, [x0, #368] -mla v5.4S, v30.4S, v31.s[0] -ldr q30, [x0, #528] -mul v10.4S, v10.4S,v16.s[1] -sub v7.4s, v0.4s, v9.4s -add v0.4s, v0.4s, v9.4s -sqrdmulh v9.4S, v13.4S, v3.s[3] -str q22, [x0, #320] -mla v10.4S, v27.4S, v31.s[0] -mul v13.4S, v13.4S,v16.s[3] -sub v27.4s, v26.4s, v5.4s -add v26.4s, v26.4s, v5.4s -sqrdmulh v5.4S, v24.4S, v3.s[2] -str q14, [x0, #336] -mla v13.4S, v9.4S, v31.s[0] -mul v24.4S, v24.4S,v16.s[2] -sub v9.4s, v25.4s, v10.4s -add v25.4s, v25.4s, v10.4s -sqrdmulh v10.4S, v28.4S, v6.s[0] -mla v24.4S, v5.4S, v31.s[0] -mul v28.4S, v28.4S,v1.s[0] -sub v5.4s, v7.4s, v13.4s -add v7.4s, v7.4s, v13.4s -sqrdmulh v13.4S, v11.4S, v6.s[0] -mla v28.4S, v10.4S, v31.s[0] -mul v11.4S, v11.4S,v1.s[0] -sub v10.4s, v0.4s, v24.4s -add v0.4s, v0.4s, v24.4s -sqrdmulh v3.4S, v15.4S, v6.s[0] -mla v11.4S, v13.4S, v31.s[0] -ldr q13, [x0, #752] -mul v15.4S, v15.4S,v1.s[0] -sub v16.4s, v18.4s, v28.4s -add v18.4s, v18.4s, v28.4s -sqrdmulh v28.4S, v23.4S, v6.s[0] -mla v15.4S, v3.4S, v31.s[0] -ldr q3, [x0, #736] -mul v23.4S, v23.4S,v1.s[0] -sub v8.4s, v20.4s, v11.4s -add v20.4s, v20.4s, v11.4s -sqrdmulh v11.4S, v18.4S, v6.s[1] -str q26, [x0, #384] -mla v23.4S, v28.4S, v31.s[0] -ldr q28, [x0, #704] -mul v18.4S, v18.4S,v1.s[1] -sub v26.4s, v2.4s, v15.4s -add v2.4s, v2.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v6.s[1] -str q27, [x0, #400] -mla v18.4S, v11.4S, v31.s[0] -ldr q11, [x0, #720] -mul v20.4S, v20.4S,v1.s[1] -sub v27.4s, v30.4s, v23.4s -add v30.4s, v30.4s, v23.4s -sqrdmulh v23.4S, v16.4S, v6.s[2] -str q25, [x0, #416] -mla v20.4S, v15.4S, v31.s[0] -ldr q15, [x0, #688] -mul v16.4S, v16.4S,v1.s[2] -sub v25.4s, v30.4s, v18.4s -add v30.4s, v30.4s, v18.4s -ldr q18, [x17, #+384] -ldr q4, [x17, #+400] -ldr q24, [x17, #+416] -ldr q14, [x17, #+432] -sqrdmulh v22.4S, v8.4S, v6.s[2] -str q9, [x0, #432] -mla v16.4S, v23.4S, v31.s[0] -ldr q23, [x0, #672] -mul v8.4S, v8.4S,v1.s[2] -sub v9.4s, v2.4s, v20.4s -add v2.4s, v2.4s, v20.4s -sqrdmulh v20.4S, v30.4S, v19.s[0] -str q7, [x0, #480] -mla v8.4S, v22.4S, v31.s[0] -ldr q22, [x0, #640] -mul v30.4S, v30.4S,v17.s[0] -sub v7.4s, v27.4s, v16.4s -add v27.4s, v27.4s, v16.4s -sqrdmulh v16.4S, v25.4S, v19.s[1] -str q5, [x0, #496] -mla v30.4S, v20.4S, v31.s[0] -ldr q20, [x0, #656] -mul v25.4S, v25.4S,v17.s[1] -sub v5.4s, v26.4s, v8.4s -add v26.4s, v26.4s, v8.4s -sqrdmulh v8.4S, v7.4S, v19.s[3] -str q0, [x0, #448] -mla v25.4S, v16.4S, v31.s[0] -mul v7.4S, v7.4S,v17.s[3] -sub v16.4s, v2.4s, v30.4s -add v2.4s, v2.4s, v30.4s -sqrdmulh v30.4S, v27.4S, v19.s[2] -str q10, [x0, #464] -mla v7.4S, v8.4S, v31.s[0] -mul v27.4S, v27.4S,v17.s[2] -sub v8.4s, v9.4s, v25.4s -add v9.4s, v9.4s, v25.4s -sqrdmulh v25.4S, v13.4S, v4.s[0] -mla v27.4S, v30.4S, v31.s[0] -mul v13.4S, v13.4S,v18.s[0] -sub v30.4s, v5.4s, v7.4s -add v5.4s, v5.4s, v7.4s -sqrdmulh v7.4S, v3.4S, v4.s[0] -mla v13.4S, v25.4S, v31.s[0] -mul v3.4S, v3.4S,v18.s[0] -sub v25.4s, v26.4s, v27.4s -add v26.4s, v26.4s, v27.4s -sqrdmulh v19.4S, v28.4S, v4.s[0] -mla v3.4S, v7.4S, v31.s[0] -ldr q7, [x0, #880] -mul v28.4S, v28.4S,v18.s[0] -sub v17.4s, v15.4s, v13.4s -add v15.4s, v15.4s, v13.4s -sqrdmulh v13.4S, v11.4S, v4.s[0] -mla v28.4S, v19.4S, v31.s[0] -ldr q19, [x0, #864] -mul v11.4S, v11.4S,v18.s[0] -sub v6.4s, v23.4s, v3.4s -add v23.4s, v23.4s, v3.4s -sqrdmulh v3.4S, v15.4S, v4.s[1] -str q2, [x0, #512] -mla v11.4S, v13.4S, v31.s[0] -ldr q13, [x0, #832] -mul v15.4S, v15.4S,v18.s[1] -sub v2.4s, v22.4s, v28.4s -add v22.4s, v22.4s, v28.4s -sqrdmulh v28.4S, v23.4S, v4.s[1] -str q16, [x0, #528] -mla v15.4S, v3.4S, v31.s[0] -ldr q3, [x0, #848] -mul v23.4S, v23.4S,v18.s[1] -sub v16.4s, v20.4s, v11.4s -add v20.4s, v20.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v4.s[2] -str q9, [x0, #544] -mla v23.4S, v28.4S, v31.s[0] -ldr q28, [x0, #816] -mul v17.4S, v17.4S,v18.s[2] -sub v9.4s, v20.4s, v15.4s -add v20.4s, v20.4s, v15.4s -ldr q15, [x17, #+448] -ldr q1, [x17, #+464] -ldr q27, [x17, #+480] -ldr q10, [x17, #+496] -sqrdmulh v0.4S, v6.4S, v4.s[2] -str q8, [x0, #560] -mla v17.4S, v11.4S, v31.s[0] -ldr q11, [x0, #800] -mul v6.4S, v6.4S,v18.s[2] -sub v8.4s, v22.4s, v23.4s -add v22.4s, v22.4s, v23.4s -sqrdmulh v23.4S, v20.4S, v14.s[0] -str q5, [x0, #608] -mla v6.4S, v0.4S, v31.s[0] -ldr q0, [x0, #768] -mul v20.4S, v20.4S,v24.s[0] -sub v5.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -sqrdmulh v17.4S, v9.4S, v14.s[1] -str q30, [x0, #624] -mla v20.4S, v23.4S, v31.s[0] -ldr q23, [x0, #784] -mul v9.4S, v9.4S,v24.s[1] -sub v30.4s, v2.4s, v6.4s -add v2.4s, v2.4s, v6.4s -sqrdmulh v6.4S, v5.4S, v14.s[3] -str q26, [x0, #576] -mla v9.4S, v17.4S, v31.s[0] -mul v5.4S, v5.4S,v24.s[3] -sub v17.4s, v22.4s, v20.4s -add v22.4s, v22.4s, v20.4s -sqrdmulh v20.4S, v16.4S, v14.s[2] -str q25, [x0, #592] -mla v5.4S, v6.4S, v31.s[0] -mul v16.4S, v16.4S,v24.s[2] -sub v6.4s, v8.4s, v9.4s -add v8.4s, v8.4s, v9.4s -sqrdmulh v9.4S, v7.4S, v1.s[0] -mla v16.4S, v20.4S, v31.s[0] -mul v7.4S, v7.4S,v15.s[0] -sub v20.4s, v30.4s, v5.4s -add v30.4s, v30.4s, v5.4s -sqrdmulh v5.4S, v19.4S, v1.s[0] -mla v7.4S, v9.4S, v31.s[0] -mul v19.4S, v19.4S,v15.s[0] -sub v9.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sqrdmulh v14.4S, v13.4S, v1.s[0] -mla v19.4S, v5.4S, v31.s[0] -ldr q5, [x0, #1008] -mul v13.4S, v13.4S,v15.s[0] -sub v24.4s, v28.4s, v7.4s -add v28.4s, v28.4s, v7.4s -sqrdmulh v7.4S, v3.4S, v1.s[0] -mla v13.4S, v14.4S, v31.s[0] -ldr q14, [x0, #992] -mul v3.4S, v3.4S,v15.s[0] -sub v4.4s, v11.4s, v19.4s -add v11.4s, v11.4s, v19.4s -sqrdmulh v19.4S, v28.4S, v1.s[1] -str q22, [x0, #640] -mla v3.4S, v7.4S, v31.s[0] -ldr q7, [x0, #960] -mul v28.4S, v28.4S,v15.s[1] -sub v22.4s, v0.4s, v13.4s -add v0.4s, v0.4s, v13.4s -sqrdmulh v13.4S, v11.4S, v1.s[1] -str q17, [x0, #656] -mla v28.4S, v19.4S, v31.s[0] -ldr q19, [x0, #976] -mul v11.4S, v11.4S,v15.s[1] -sub v17.4s, v23.4s, v3.4s -add v23.4s, v23.4s, v3.4s -sqrdmulh v3.4S, v24.4S, v1.s[2] -str q8, [x0, #672] -mla v11.4S, v13.4S, v31.s[0] -ldr q13, [x0, #944] -mul v24.4S, v24.4S,v15.s[2] -sub v8.4s, v23.4s, v28.4s -add v23.4s, v23.4s, v28.4s -ldr q28, [x17, #+512] -ldr q18, [x17, #+528] -ldr q16, [x17, #+544] -ldr q25, [x17, #+560] -sqrdmulh v26.4S, v4.4S, v1.s[2] -str q6, [x0, #688] -mla v24.4S, v3.4S, v31.s[0] -ldr q3, [x0, #928] -mul v4.4S, v4.4S,v15.s[2] -sub v6.4s, v0.4s, v11.4s -add v0.4s, v0.4s, v11.4s -sqrdmulh v11.4S, v23.4S, v10.s[0] -str q30, [x0, #736] -mla v4.4S, v26.4S, v31.s[0] -ldr q26, [x0, #896] -mul v23.4S, v23.4S,v27.s[0] -sub v30.4s, v17.4s, v24.4s -add v17.4s, v17.4s, v24.4s -sqrdmulh v24.4S, v8.4S, v10.s[1] -str q20, [x0, #752] -mla v23.4S, v11.4S, v31.s[0] -ldr q11, [x0, #912] -mul v8.4S, v8.4S,v27.s[1] -sub v20.4s, v22.4s, v4.4s -add v22.4s, v22.4s, v4.4s -sqrdmulh v4.4S, v30.4S, v10.s[3] -str q2, [x0, #704] -mla v8.4S, v24.4S, v31.s[0] -mul v30.4S, v30.4S,v27.s[3] -sub v24.4s, v0.4s, v23.4s -add v0.4s, v0.4s, v23.4s -sqrdmulh v23.4S, v17.4S, v10.s[2] -str q9, [x0, #720] -mla v30.4S, v4.4S, v31.s[0] -mul v17.4S, v17.4S,v27.s[2] -sub v4.4s, v6.4s, v8.4s -add v6.4s, v6.4s, v8.4s -sqrdmulh v8.4S, v5.4S, v18.s[0] -mla v17.4S, v23.4S, v31.s[0] -mul v5.4S, v5.4S,v28.s[0] -sub v23.4s, v20.4s, v30.4s -add v20.4s, v20.4s, v30.4s -sqrdmulh v30.4S, v14.4S, v18.s[0] -mla v5.4S, v8.4S, v31.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v8.4s, v22.4s, v17.4s -add v22.4s, v22.4s, v17.4s -sqrdmulh v10.4S, v7.4S, v18.s[0] -mla v14.4S, v30.4S, v31.s[0] -mul v7.4S, v7.4S,v28.s[0] -sub v30.4s, v13.4s, v5.4s -add v13.4s, v13.4s, v5.4s -sqrdmulh v5.4S, v19.4S, v18.s[0] -mla v7.4S, v10.4S, v31.s[0] -mul v19.4S, v19.4S,v28.s[0] -sub v10.4s, v3.4s, v14.4s -add v3.4s, v3.4s, v14.4s -sqrdmulh v14.4S, v13.4S, v18.s[1] -str q0, [x0, #768] -mla v19.4S, v5.4S, v31.s[0] -mul v13.4S, v13.4S,v28.s[1] -sub v5.4s, v26.4s, v7.4s -add v26.4s, v26.4s, v7.4s -sqrdmulh v7.4S, v3.4S, v18.s[1] -str q24, [x0, #784] -mla v13.4S, v14.4S, v31.s[0] -mul v3.4S, v3.4S,v28.s[1] -sub v14.4s, v11.4s, v19.4s -add v11.4s, v11.4s, v19.4s -sqrdmulh v19.4S, v30.4S, v18.s[2] -str q6, [x0, #800] -mla v3.4S, v7.4S, v31.s[0] -mul v30.4S, v30.4S,v28.s[2] -sub v7.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v10.4S, v18.s[2] -str q4, [x0, #816] -mla v30.4S, v19.4S, v31.s[0] -mul v10.4S, v10.4S,v28.s[2] -sub v19.4s, v26.4s, v3.4s -add v26.4s, v26.4s, v3.4s -sqrdmulh v3.4S, v11.4S, v25.s[0] -str q20, [x0, #864] -mla v10.4S, v13.4S, v31.s[0] -mul v11.4S, v11.4S,v16.s[0] -sub v13.4s, v14.4s, v30.4s -add v14.4s, v14.4s, v30.4s -sqrdmulh v30.4S, v7.4S, v25.s[1] -str q23, [x0, #880] -mla v11.4S, v3.4S, v31.s[0] -mul v7.4S, v7.4S,v16.s[1] -sub v3.4s, v5.4s, v10.4s -add v5.4s, v5.4s, v10.4s -sqrdmulh v10.4S, v13.4S, v25.s[3] -str q22, [x0, #832] -mla v7.4S, v30.4S, v31.s[0] -mul v13.4S, v13.4S,v16.s[3] -sub v30.4s, v26.4s, v11.4s -add v26.4s, v26.4s, v11.4s -sqrdmulh v11.4S, v14.4S, v25.s[2] -str q8, [x0, #848] -mla v13.4S, v10.4S, v31.s[0] -mul v14.4S, v14.4S,v16.s[2] -sub v10.4s, v19.4s, v7.4s -add v19.4s, v19.4s, v7.4s -mla v14.4S, v11.4S, v31.s[0] -sub v11.4s, v3.4s, v13.4s -add v3.4s, v3.4s, v13.4s -sub v13.4s, v5.4s, v14.4s -add v5.4s, v5.4s, v14.4s -str q26, [x0, #896] -str q30, [x0, #912] -str q19, [x0, #928] -str q10, [x0, #944] -str q3, [x0, #992] -str q11, [x0, #1008] -str q5, [x0, #960] -str q13, [x0, #976] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1444 -// Instruction count: 1440 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_5.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_5.s deleted file mode 100644 index 2225ee5..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_3_3_5.s +++ /dev/null @@ -1,1474 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 23825509 // Layer 4, block 0 -.word 27028662 // Layer 4, block 1 -.word 0 // Layer None, block None -.word 1307297022 // Layer 3, block 0 -.word 1524716204 // Layer 4, block 0 -.word 1729702351 // Layer 4, block 1 -.word 0 // Layer None, block None -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 14626653 // Layer 3, block 1 -.word 14833295 // Layer 4, block 2 -.word 2138810 // Layer 4, block 3 -.word 0 // Layer None, block None -.word 936034350 // Layer 3, block 1 -.word 949258429 // Layer 4, block 2 -.word 136873393 // Layer 4, block 3 -.word 0 // Layer None, block None -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 29737761 // Layer 3, block 2 -.word 6490403 // Layer 4, block 4 -.word 19648405 // Layer 4, block 5 -.word 0 // Layer None, block None -.word 1903071454 // Layer 3, block 2 -.word 415354091 // Layer 4, block 4 -.word 1257401950 // Layer 4, block 5 -.word 0 // Layer None, block None -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 30285189 // Layer 3, block 3 -.word 31254932 // Layer 4, block 6 -.word 26362414 // Layer 4, block 7 -.word 0 // Layer None, block None -.word 1938104173 // Layer 3, block 3 -.word 2000162988 // Layer 4, block 6 -.word 1687065733 // Layer 4, block 7 -.word 0 // Layer None, block None -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 21289485 // Layer 3, block 4 -.word 572895 // Layer 4, block 8 -.word 26691971 // Layer 4, block 9 -.word 0 // Layer None, block None -.word 1362423055 // Layer 3, block 4 -.word 36662482 // Layer 4, block 8 -.word 1708155771 // Layer 4, block 9 -.word 0 // Layer None, block None -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 9914896 // Layer 3, block 5 -.word 9249292 // Layer 4, block 10 -.word 29292862 // Layer 4, block 11 -.word 0 // Layer None, block None -.word 634504916 // Layer 3, block 5 -.word 591909511 // Layer 4, block 10 -.word 1874600091 // Layer 4, block 11 -.word 0 // Layer None, block None -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 22603682 // Layer 3, block 6 -.word 8247799 // Layer 4, block 12 -.word 5086187 // Layer 4, block 13 -.word 0 // Layer None, block None -.word 1446525244 // Layer 3, block 6 -.word 527818851 // Layer 4, block 12 -.word 325491125 // Layer 4, block 13 -.word 0 // Layer None, block None -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 16204162 // Layer 3, block 7 -.word 28113639 // Layer 4, block 14 -.word 8471290 // Layer 4, block 15 -.word 0 // Layer None, block None -.word 1036987221 // Layer 3, block 7 -.word 1799135579 // Layer 4, block 14 -.word 542121183 // Layer 4, block 15 -.word 0 // Layer None, block None -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.text -.global ntt_u32_incomplete_neon_asm_var_3_3_5 -.global _ntt_u32_incomplete_neon_asm_var_3_3_5 -ntt_u32_incomplete_neon_asm_var_3_3_5: -_ntt_u32_incomplete_neon_asm_var_3_3_5: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x0, #960] -ldr q29, [x0, #832] -ldr q28, [x0, #576] -ldr q27, [x0, #704] -ldr q26, [x0, #448] -ldr q25, [x17, #+0] -ldr q24, [x17, #+16] -ldr q23, [x17, #+32] -ldr q22, [x17, #+48] -ldr q21, [x0, #320] -ldr q20, [x0, #64] -ldr q19, [x0, #192] -sqrdmulh v18.4S, v30.4S, v24.s[0] -sqrdmulh v17.4S, v29.4S, v24.s[0] -mul v30.4S, v30.4S,v25.s[0] -mla v30.4S, v18.4S, v31.s[0] -sqrdmulh v18.4S, v28.4S, v24.s[0] -ldr q16, [x0, #976] -mul v29.4S, v29.4S,v25.s[0] -mla v29.4S, v17.4S, v31.s[0] -sub v17.4s, v26.4s, v30.4s -add v26.4s, v26.4s, v30.4s -sqrdmulh v30.4S, v27.4S, v24.s[0] -ldr q3, [x0, #848] -mul v28.4S, v28.4S,v25.s[0] -mla v28.4S, v18.4S, v31.s[0] -sub v18.4s, v21.4s, v29.4s -add v21.4s, v21.4s, v29.4s -sqrdmulh v29.4S, v26.4S, v24.s[1] -ldr q2, [x0, #592] -mul v27.4S, v27.4S,v25.s[0] -mla v27.4S, v30.4S, v31.s[0] -sub v30.4s, v20.4s, v28.4s -add v20.4s, v20.4s, v28.4s -sqrdmulh v28.4S, v21.4S, v24.s[1] -ldr q1, [x0, #720] -mul v26.4S, v26.4S,v25.s[1] -mla v26.4S, v29.4S, v31.s[0] -sub v29.4s, v19.4s, v27.4s -add v19.4s, v19.4s, v27.4s -sqrdmulh v27.4S, v17.4S, v24.s[2] -ldr q0, [x0, #464] -mul v21.4S, v21.4S,v25.s[1] -mla v21.4S, v28.4S, v31.s[0] -sub v28.4s, v19.4s, v26.4s -add v19.4s, v19.4s, v26.4s -sqrdmulh v26.4S, v18.4S, v24.s[2] -ldr q15, [x0, #336] -mul v17.4S, v17.4S,v25.s[2] -mla v17.4S, v27.4S, v31.s[0] -sub v27.4s, v20.4s, v21.4s -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v19.4S, v22.s[0] -ldr q14, [x0, #80] -mul v18.4S, v18.4S,v25.s[2] -mla v18.4S, v26.4S, v31.s[0] -sub v26.4s, v29.4s, v17.4s -add v29.4s, v29.4s, v17.4s -sqrdmulh v17.4S, v28.4S, v22.s[1] -ldr q13, [x0, #208] -mul v19.4S, v19.4S,v23.s[0] -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v30.4s, v18.4s -add v30.4s, v30.4s, v18.4s -sqrdmulh v18.4S, v26.4S, v22.s[3] -mul v28.4S, v28.4S,v23.s[1] -mla v28.4S, v17.4S, v31.s[0] -sub v17.4s, v20.4s, v19.4s -add v20.4s, v20.4s, v19.4s -sqrdmulh v19.4S, v29.4S, v22.s[2] -mul v26.4S, v26.4S,v23.s[3] -mla v26.4S, v18.4S, v31.s[0] -sub v18.4s, v27.4s, v28.4s -add v27.4s, v27.4s, v28.4s -sqrdmulh v28.4S, v16.4S, v24.s[0] -mul v29.4S, v29.4S,v23.s[2] -mla v29.4S, v19.4S, v31.s[0] -sub v19.4s, v21.4s, v26.4s -add v21.4s, v21.4s, v26.4s -sqrdmulh v26.4S, v3.4S, v24.s[0] -mul v16.4S, v16.4S,v25.s[0] -mla v16.4S, v28.4S, v31.s[0] -sub v28.4s, v30.4s, v29.4s -add v30.4s, v30.4s, v29.4s -sqrdmulh v29.4S, v2.4S, v24.s[0] -ldr q12, [x0, #992] -mul v3.4S, v3.4S,v25.s[0] -mla v3.4S, v26.4S, v31.s[0] -sub v26.4s, v0.4s, v16.4s -add v0.4s, v0.4s, v16.4s -sqrdmulh v16.4S, v1.4S, v24.s[0] -ldr q11, [x0, #864] -mul v2.4S, v2.4S,v25.s[0] -mla v2.4S, v29.4S, v31.s[0] -sub v29.4s, v15.4s, v3.4s -add v15.4s, v15.4s, v3.4s -sqrdmulh v3.4S, v0.4S, v24.s[1] -str q20, [x0, #64] -ldr q20, [x0, #608] -mul v1.4S, v1.4S,v25.s[0] -mla v1.4S, v16.4S, v31.s[0] -sub v16.4s, v14.4s, v2.4s -add v14.4s, v14.4s, v2.4s -sqrdmulh v2.4S, v15.4S, v24.s[1] -str q17, [x0, #192] -ldr q17, [x0, #736] -mul v0.4S, v0.4S,v25.s[1] -mla v0.4S, v3.4S, v31.s[0] -sub v3.4s, v13.4s, v1.4s -add v13.4s, v13.4s, v1.4s -sqrdmulh v1.4S, v26.4S, v24.s[2] -str q27, [x0, #320] -ldr q27, [x0, #480] -mul v15.4S, v15.4S,v25.s[1] -mla v15.4S, v2.4S, v31.s[0] -sub v2.4s, v13.4s, v0.4s -add v13.4s, v13.4s, v0.4s -sqrdmulh v0.4S, v29.4S, v24.s[2] -str q18, [x0, #448] -ldr q18, [x0, #352] -mul v26.4S, v26.4S,v25.s[2] -mla v26.4S, v1.4S, v31.s[0] -sub v1.4s, v14.4s, v15.4s -add v14.4s, v14.4s, v15.4s -sqrdmulh v15.4S, v13.4S, v22.s[0] -str q21, [x0, #832] -ldr q21, [x0, #96] -mul v29.4S, v29.4S,v25.s[2] -mla v29.4S, v0.4S, v31.s[0] -sub v0.4s, v3.4s, v26.4s -add v3.4s, v3.4s, v26.4s -sqrdmulh v26.4S, v2.4S, v22.s[1] -str q19, [x0, #960] -ldr q19, [x0, #224] -mul v13.4S, v13.4S,v23.s[0] -mla v13.4S, v15.4S, v31.s[0] -sub v15.4s, v16.4s, v29.4s -add v16.4s, v16.4s, v29.4s -sqrdmulh v29.4S, v0.4S, v22.s[3] -str q30, [x0, #576] -mul v2.4S, v2.4S,v23.s[1] -mla v2.4S, v26.4S, v31.s[0] -sub v26.4s, v14.4s, v13.4s -add v14.4s, v14.4s, v13.4s -sqrdmulh v13.4S, v3.4S, v22.s[2] -str q28, [x0, #704] -mul v0.4S, v0.4S,v23.s[3] -mla v0.4S, v29.4S, v31.s[0] -sub v29.4s, v1.4s, v2.4s -add v1.4s, v1.4s, v2.4s -sqrdmulh v2.4S, v12.4S, v24.s[0] -mul v3.4S, v3.4S,v23.s[2] -mla v3.4S, v13.4S, v31.s[0] -sub v13.4s, v15.4s, v0.4s -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v11.4S, v24.s[0] -mul v12.4S, v12.4S,v25.s[0] -mla v12.4S, v2.4S, v31.s[0] -sub v2.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v20.4S, v24.s[0] -ldr q28, [x0, #1008] -mul v11.4S, v11.4S,v25.s[0] -mla v11.4S, v0.4S, v31.s[0] -sub v0.4s, v27.4s, v12.4s -add v27.4s, v27.4s, v12.4s -sqrdmulh v12.4S, v17.4S, v24.s[0] -ldr q30, [x0, #880] -mul v20.4S, v20.4S,v25.s[0] -mla v20.4S, v3.4S, v31.s[0] -sub v3.4s, v18.4s, v11.4s -add v18.4s, v18.4s, v11.4s -sqrdmulh v11.4S, v27.4S, v24.s[1] -str q14, [x0, #80] -ldr q14, [x0, #624] -mul v17.4S, v17.4S,v25.s[0] -mla v17.4S, v12.4S, v31.s[0] -sub v12.4s, v21.4s, v20.4s -add v21.4s, v21.4s, v20.4s -sqrdmulh v20.4S, v18.4S, v24.s[1] -str q26, [x0, #208] -ldr q26, [x0, #752] -mul v27.4S, v27.4S,v25.s[1] -mla v27.4S, v11.4S, v31.s[0] -sub v11.4s, v19.4s, v17.4s -add v19.4s, v19.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v24.s[2] -str q1, [x0, #336] -ldr q1, [x0, #496] -mul v18.4S, v18.4S,v25.s[1] -mla v18.4S, v20.4S, v31.s[0] -sub v20.4s, v19.4s, v27.4s -add v19.4s, v19.4s, v27.4s -sqrdmulh v27.4S, v3.4S, v24.s[2] -str q29, [x0, #464] -ldr q29, [x0, #368] -mul v0.4S, v0.4S,v25.s[2] -mla v0.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v18.4s -add v21.4s, v21.4s, v18.4s -sqrdmulh v18.4S, v19.4S, v22.s[0] -str q15, [x0, #848] -ldr q15, [x0, #112] -mul v3.4S, v3.4S,v25.s[2] -mla v3.4S, v27.4S, v31.s[0] -sub v27.4s, v11.4s, v0.4s -add v11.4s, v11.4s, v0.4s -sqrdmulh v0.4S, v20.4S, v22.s[1] -str q13, [x0, #976] -ldr q13, [x0, #240] -mul v19.4S, v19.4S,v23.s[0] -mla v19.4S, v18.4S, v31.s[0] -sub v18.4s, v12.4s, v3.4s -add v12.4s, v12.4s, v3.4s -sqrdmulh v3.4S, v27.4S, v22.s[3] -str q16, [x0, #592] -mul v20.4S, v20.4S,v23.s[1] -mla v20.4S, v0.4S, v31.s[0] -sub v0.4s, v21.4s, v19.4s -add v21.4s, v21.4s, v19.4s -sqrdmulh v19.4S, v11.4S, v22.s[2] -str q2, [x0, #720] -mul v27.4S, v27.4S,v23.s[3] -mla v27.4S, v3.4S, v31.s[0] -sub v3.4s, v17.4s, v20.4s -add v17.4s, v17.4s, v20.4s -sqrdmulh v20.4S, v28.4S, v24.s[0] -mul v11.4S, v11.4S,v23.s[2] -mla v11.4S, v19.4S, v31.s[0] -sub v19.4s, v18.4s, v27.4s -add v18.4s, v18.4s, v27.4s -sqrdmulh v27.4S, v30.4S, v24.s[0] -mul v28.4S, v28.4S,v25.s[0] -mla v28.4S, v20.4S, v31.s[0] -sub v20.4s, v12.4s, v11.4s -add v12.4s, v12.4s, v11.4s -sqrdmulh v11.4S, v14.4S, v24.s[0] -ldr q2, [x0, #896] -mul v30.4S, v30.4S,v25.s[0] -mla v30.4S, v27.4S, v31.s[0] -sub v27.4s, v1.4s, v28.4s -add v1.4s, v1.4s, v28.4s -sqrdmulh v28.4S, v26.4S, v24.s[0] -ldr q16, [x0, #768] -mul v14.4S, v14.4S,v25.s[0] -mla v14.4S, v11.4S, v31.s[0] -sub v11.4s, v29.4s, v30.4s -add v29.4s, v29.4s, v30.4s -sqrdmulh v30.4S, v1.4S, v24.s[1] -str q21, [x0, #96] -ldr q21, [x0, #512] -mul v26.4S, v26.4S,v25.s[0] -mla v26.4S, v28.4S, v31.s[0] -sub v28.4s, v15.4s, v14.4s -add v15.4s, v15.4s, v14.4s -sqrdmulh v14.4S, v29.4S, v24.s[1] -str q0, [x0, #224] -ldr q0, [x0, #640] -mul v1.4S, v1.4S,v25.s[1] -mla v1.4S, v30.4S, v31.s[0] -sub v30.4s, v13.4s, v26.4s -add v13.4s, v13.4s, v26.4s -sqrdmulh v26.4S, v27.4S, v24.s[2] -str q17, [x0, #352] -ldr q17, [x0, #384] -mul v29.4S, v29.4S,v25.s[1] -mla v29.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v1.4s -add v13.4s, v13.4s, v1.4s -sqrdmulh v1.4S, v11.4S, v24.s[2] -str q3, [x0, #480] -ldr q3, [x0, #256] -mul v27.4S, v27.4S,v25.s[2] -mla v27.4S, v26.4S, v31.s[0] -sub v26.4s, v15.4s, v29.4s -add v15.4s, v15.4s, v29.4s -sqrdmulh v29.4S, v13.4S, v22.s[0] -str q18, [x0, #864] -ldr q18, [x0, #0] -mul v11.4S, v11.4S,v25.s[2] -mla v11.4S, v1.4S, v31.s[0] -sub v1.4s, v30.4s, v27.4s -add v30.4s, v30.4s, v27.4s -sqrdmulh v27.4S, v14.4S, v22.s[1] -str q19, [x0, #992] -ldr q19, [x0, #128] -mul v13.4S, v13.4S,v23.s[0] -mla v13.4S, v29.4S, v31.s[0] -sub v29.4s, v28.4s, v11.4s -add v28.4s, v28.4s, v11.4s -sqrdmulh v11.4S, v1.4S, v22.s[3] -str q12, [x0, #608] -mul v14.4S, v14.4S,v23.s[1] -mla v14.4S, v27.4S, v31.s[0] -sub v27.4s, v15.4s, v13.4s -add v15.4s, v15.4s, v13.4s -sqrdmulh v13.4S, v30.4S, v22.s[2] -str q20, [x0, #736] -mul v1.4S, v1.4S,v23.s[3] -mla v1.4S, v11.4S, v31.s[0] -sub v11.4s, v26.4s, v14.4s -add v26.4s, v26.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v24.s[0] -mul v30.4S, v30.4S,v23.s[2] -mla v30.4S, v13.4S, v31.s[0] -sub v13.4s, v29.4s, v1.4s -add v29.4s, v29.4s, v1.4s -sqrdmulh v1.4S, v16.4S, v24.s[0] -mul v2.4S, v2.4S,v25.s[0] -mla v2.4S, v14.4S, v31.s[0] -sub v14.4s, v28.4s, v30.4s -add v28.4s, v28.4s, v30.4s -sqrdmulh v30.4S, v21.4S, v24.s[0] -ldr q20, [x0, #912] -mul v16.4S, v16.4S,v25.s[0] -mla v16.4S, v1.4S, v31.s[0] -sub v1.4s, v17.4s, v2.4s -add v17.4s, v17.4s, v2.4s -sqrdmulh v2.4S, v0.4S, v24.s[0] -ldr q12, [x0, #784] -mul v21.4S, v21.4S,v25.s[0] -mla v21.4S, v30.4S, v31.s[0] -sub v30.4s, v3.4s, v16.4s -add v3.4s, v3.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v24.s[1] -str q15, [x0, #112] -ldr q15, [x0, #528] -mul v0.4S, v0.4S,v25.s[0] -mla v0.4S, v2.4S, v31.s[0] -sub v2.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v3.4S, v24.s[1] -str q27, [x0, #240] -ldr q27, [x0, #656] -mul v17.4S, v17.4S,v25.s[1] -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v19.4s, v0.4s -add v19.4s, v19.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v24.s[2] -str q26, [x0, #368] -ldr q26, [x0, #400] -mul v3.4S, v3.4S,v25.s[1] -mla v3.4S, v21.4S, v31.s[0] -sub v21.4s, v19.4s, v17.4s -add v19.4s, v19.4s, v17.4s -sqrdmulh v17.4S, v30.4S, v24.s[2] -str q11, [x0, #496] -ldr q11, [x0, #272] -mul v1.4S, v1.4S,v25.s[2] -mla v1.4S, v0.4S, v31.s[0] -sub v0.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v19.4S, v22.s[0] -str q29, [x0, #880] -ldr q29, [x0, #16] -mul v30.4S, v30.4S,v25.s[2] -mla v30.4S, v17.4S, v31.s[0] -sub v17.4s, v16.4s, v1.4s -add v16.4s, v16.4s, v1.4s -sqrdmulh v1.4S, v21.4S, v22.s[1] -str q13, [x0, #1008] -ldr q13, [x0, #144] -mul v19.4S, v19.4S,v23.s[0] -mla v19.4S, v3.4S, v31.s[0] -sub v3.4s, v2.4s, v30.4s -add v2.4s, v2.4s, v30.4s -sqrdmulh v30.4S, v17.4S, v22.s[3] -str q28, [x0, #624] -mul v21.4S, v21.4S,v23.s[1] -mla v21.4S, v1.4S, v31.s[0] -sub v1.4s, v18.4s, v19.4s -add v18.4s, v18.4s, v19.4s -sqrdmulh v19.4S, v16.4S, v22.s[2] -str q14, [x0, #752] -mul v17.4S, v17.4S,v23.s[3] -mla v17.4S, v30.4S, v31.s[0] -sub v30.4s, v0.4s, v21.4s -add v0.4s, v0.4s, v21.4s -sqrdmulh v21.4S, v20.4S, v24.s[0] -mul v16.4S, v16.4S,v23.s[2] -mla v16.4S, v19.4S, v31.s[0] -sub v19.4s, v3.4s, v17.4s -add v3.4s, v3.4s, v17.4s -sqrdmulh v17.4S, v12.4S, v24.s[0] -mul v20.4S, v20.4S,v25.s[0] -mla v20.4S, v21.4S, v31.s[0] -sub v21.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v15.4S, v24.s[0] -ldr q14, [x0, #928] -mul v12.4S, v12.4S,v25.s[0] -mla v12.4S, v17.4S, v31.s[0] -sub v17.4s, v26.4s, v20.4s -add v26.4s, v26.4s, v20.4s -sqrdmulh v20.4S, v27.4S, v24.s[0] -ldr q28, [x0, #800] -mul v15.4S, v15.4S,v25.s[0] -mla v15.4S, v16.4S, v31.s[0] -sub v16.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -sqrdmulh v12.4S, v26.4S, v24.s[1] -str q18, [x0, #0] -ldr q18, [x0, #544] -mul v27.4S, v27.4S,v25.s[0] -mla v27.4S, v20.4S, v31.s[0] -sub v20.4s, v29.4s, v15.4s -add v29.4s, v29.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v24.s[1] -str q1, [x0, #128] -ldr q1, [x0, #672] -mul v26.4S, v26.4S,v25.s[1] -mla v26.4S, v12.4S, v31.s[0] -sub v12.4s, v13.4s, v27.4s -add v13.4s, v13.4s, v27.4s -sqrdmulh v27.4S, v17.4S, v24.s[2] -str q0, [x0, #256] -ldr q0, [x0, #416] -mul v11.4S, v11.4S,v25.s[1] -mla v11.4S, v15.4S, v31.s[0] -sub v15.4s, v13.4s, v26.4s -add v13.4s, v13.4s, v26.4s -sqrdmulh v26.4S, v16.4S, v24.s[2] -str q30, [x0, #384] -ldr q30, [x0, #288] -mul v17.4S, v17.4S,v25.s[2] -mla v17.4S, v27.4S, v31.s[0] -sub v27.4s, v29.4s, v11.4s -add v29.4s, v29.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v22.s[0] -str q3, [x0, #768] -ldr q3, [x0, #32] -mul v16.4S, v16.4S,v25.s[2] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v12.4s, v17.4s -add v12.4s, v12.4s, v17.4s -sqrdmulh v17.4S, v15.4S, v22.s[1] -str q19, [x0, #896] -ldr q19, [x0, #160] -mul v13.4S, v13.4S,v23.s[0] -mla v13.4S, v11.4S, v31.s[0] -sub v11.4s, v20.4s, v16.4s -add v20.4s, v20.4s, v16.4s -sqrdmulh v16.4S, v26.4S, v22.s[3] -str q2, [x0, #512] -mul v15.4S, v15.4S,v23.s[1] -mla v15.4S, v17.4S, v31.s[0] -sub v17.4s, v29.4s, v13.4s -add v29.4s, v29.4s, v13.4s -sqrdmulh v13.4S, v12.4S, v22.s[2] -str q21, [x0, #640] -mul v26.4S, v26.4S,v23.s[3] -mla v26.4S, v16.4S, v31.s[0] -sub v16.4s, v27.4s, v15.4s -add v27.4s, v27.4s, v15.4s -sqrdmulh v15.4S, v14.4S, v24.s[0] -mul v12.4S, v12.4S,v23.s[2] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v26.4s -add v11.4s, v11.4s, v26.4s -sqrdmulh v26.4S, v28.4S, v24.s[0] -mul v14.4S, v14.4S,v25.s[0] -mla v14.4S, v15.4S, v31.s[0] -sub v15.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -sqrdmulh v12.4S, v18.4S, v24.s[0] -ldr q21, [x0, #944] -mul v28.4S, v28.4S,v25.s[0] -mla v28.4S, v26.4S, v31.s[0] -sub v26.4s, v0.4s, v14.4s -add v0.4s, v0.4s, v14.4s -sqrdmulh v14.4S, v1.4S, v24.s[0] -ldr q2, [x0, #816] -mul v18.4S, v18.4S,v25.s[0] -mla v18.4S, v12.4S, v31.s[0] -sub v12.4s, v30.4s, v28.4s -add v30.4s, v30.4s, v28.4s -sqrdmulh v28.4S, v0.4S, v24.s[1] -str q29, [x0, #16] -ldr q29, [x0, #560] -mul v1.4S, v1.4S,v25.s[0] -mla v1.4S, v14.4S, v31.s[0] -sub v14.4s, v3.4s, v18.4s -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v30.4S, v24.s[1] -str q17, [x0, #144] -ldr q17, [x0, #688] -mul v0.4S, v0.4S,v25.s[1] -mla v0.4S, v28.4S, v31.s[0] -sub v28.4s, v19.4s, v1.4s -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v26.4S, v24.s[2] -str q27, [x0, #272] -ldr q27, [x0, #432] -mul v30.4S, v30.4S,v25.s[1] -mla v30.4S, v18.4S, v31.s[0] -sub v18.4s, v19.4s, v0.4s -add v19.4s, v19.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v24.s[2] -str q16, [x0, #400] -ldr q16, [x0, #304] -mul v26.4S, v26.4S,v25.s[2] -mla v26.4S, v1.4S, v31.s[0] -sub v1.4s, v3.4s, v30.4s -add v3.4s, v3.4s, v30.4s -sqrdmulh v30.4S, v19.4S, v22.s[0] -str q11, [x0, #784] -ldr q11, [x0, #48] -mul v12.4S, v12.4S,v25.s[2] -mla v12.4S, v0.4S, v31.s[0] -sub v0.4s, v28.4s, v26.4s -add v28.4s, v28.4s, v26.4s -sqrdmulh v26.4S, v18.4S, v22.s[1] -str q13, [x0, #912] -ldr q13, [x0, #176] -mul v19.4S, v19.4S,v23.s[0] -mla v19.4S, v30.4S, v31.s[0] -sub v30.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -sqrdmulh v12.4S, v0.4S, v22.s[3] -str q20, [x0, #528] -mul v18.4S, v18.4S,v23.s[1] -mla v18.4S, v26.4S, v31.s[0] -sub v26.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v28.4S, v22.s[2] -str q15, [x0, #656] -mul v0.4S, v0.4S,v23.s[3] -mla v0.4S, v12.4S, v31.s[0] -sub v12.4s, v1.4s, v18.4s -add v1.4s, v1.4s, v18.4s -sqrdmulh v18.4S, v21.4S, v24.s[0] -mul v28.4S, v28.4S,v23.s[2] -mla v28.4S, v19.4S, v31.s[0] -sub v19.4s, v30.4s, v0.4s -add v30.4s, v30.4s, v0.4s -sqrdmulh v0.4S, v2.4S, v24.s[0] -mul v21.4S, v21.4S,v25.s[0] -mla v21.4S, v18.4S, v31.s[0] -sub v18.4s, v14.4s, v28.4s -add v14.4s, v14.4s, v28.4s -sqrdmulh v28.4S, v29.4S, v24.s[0] -mul v2.4S, v2.4S,v25.s[0] -mla v2.4S, v0.4S, v31.s[0] -sub v0.4s, v27.4s, v21.4s -add v27.4s, v27.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v24.s[0] -mul v29.4S, v29.4S,v25.s[0] -mla v29.4S, v28.4S, v31.s[0] -sub v28.4s, v16.4s, v2.4s -add v16.4s, v16.4s, v2.4s -sqrdmulh v2.4S, v27.4S, v24.s[1] -str q3, [x0, #32] -mul v17.4S, v17.4S,v25.s[0] -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v29.4s -add v11.4s, v11.4s, v29.4s -sqrdmulh v29.4S, v16.4S, v24.s[1] -str q26, [x0, #160] -mul v27.4S, v27.4S,v25.s[1] -mla v27.4S, v2.4S, v31.s[0] -sub v2.4s, v13.4s, v17.4s -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v24.s[2] -str q1, [x0, #288] -mul v16.4S, v16.4S,v25.s[1] -mla v16.4S, v29.4S, v31.s[0] -sub v29.4s, v13.4s, v27.4s -add v13.4s, v13.4s, v27.4s -sqrdmulh v27.4S, v28.4S, v24.s[2] -str q12, [x0, #416] -mul v0.4S, v0.4S,v25.s[2] -mla v0.4S, v17.4S, v31.s[0] -sub v17.4s, v11.4s, v16.4s -add v11.4s, v11.4s, v16.4s -sqrdmulh v16.4S, v13.4S, v22.s[0] -str q30, [x0, #800] -mul v28.4S, v28.4S,v25.s[2] -mla v28.4S, v27.4S, v31.s[0] -sub v27.4s, v2.4s, v0.4s -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v29.4S, v22.s[1] -str q19, [x0, #928] -mul v13.4S, v13.4S,v23.s[0] -mla v13.4S, v16.4S, v31.s[0] -sub v16.4s, v21.4s, v28.4s -add v21.4s, v21.4s, v28.4s -sqrdmulh v28.4S, v27.4S, v22.s[3] -str q14, [x0, #544] -mul v29.4S, v29.4S,v23.s[1] -mla v29.4S, v0.4S, v31.s[0] -sub v0.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v2.4S, v22.s[2] -str q18, [x0, #672] -mul v27.4S, v27.4S,v23.s[3] -mla v27.4S, v28.4S, v31.s[0] -sub v28.4s, v17.4s, v29.4s -add v17.4s, v17.4s, v29.4s -mul v2.4S, v2.4S,v23.s[2] -mla v2.4S, v13.4S, v31.s[0] -sub v13.4s, v16.4s, v27.4s -add v16.4s, v16.4s, v27.4s -sub v27.4s, v21.4s, v2.4s -add v21.4s, v21.4s, v2.4s -str q11, [x0, #48] -str q0, [x0, #176] -str q17, [x0, #304] -str q28, [x0, #432] -str q16, [x0, #816] -str q13, [x0, #944] -str q21, [x0, #560] -str q27, [x0, #688] -ldr q4, [x0, #112] -ldr q5, [x0, #96] -ldr q6, [x0, #64] -ldr q7, [x0, #80] -ldr q8, [x0, #48] -ldr q9, [x17, #+64] -ldr q10, [x17, #+80] -ldr q20, [x17, #+96] -ldr q15, [x17, #+112] -ldr q3, [x0, #32] -ldr q26, [x0, #0] -ldr q1, [x0, #16] -sqrdmulh v12.4S, v4.4S, v10.s[0] -sqrdmulh v30.4S, v5.4S, v10.s[0] -mul v4.4S, v4.4S,v9.s[0] -mla v4.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v6.4S, v10.s[0] -ldr q19, [x0, #240] -mul v5.4S, v5.4S,v9.s[0] -mla v5.4S, v30.4S, v31.s[0] -sub v30.4s, v8.4s, v4.4s -add v8.4s, v8.4s, v4.4s -sqrdmulh v4.4S, v7.4S, v10.s[0] -ldr q14, [x0, #224] -mul v6.4S, v6.4S,v9.s[0] -mla v6.4S, v12.4S, v31.s[0] -sub v12.4s, v3.4s, v5.4s -add v3.4s, v3.4s, v5.4s -sqrdmulh v5.4S, v8.4S, v10.s[1] -ldr q18, [x0, #192] -mul v7.4S, v7.4S,v9.s[0] -mla v7.4S, v4.4S, v31.s[0] -sub v4.4s, v26.4s, v6.4s -add v26.4s, v26.4s, v6.4s -sqrdmulh v6.4S, v3.4S, v10.s[1] -ldr q29, [x0, #208] -mul v8.4S, v8.4S,v9.s[1] -mla v8.4S, v5.4S, v31.s[0] -sub v5.4s, v1.4s, v7.4s -add v1.4s, v1.4s, v7.4s -sqrdmulh v7.4S, v30.4S, v10.s[2] -ldr q2, [x0, #176] -mul v3.4S, v3.4S,v9.s[1] -mla v3.4S, v6.4S, v31.s[0] -sub v6.4s, v1.4s, v8.4s -add v1.4s, v1.4s, v8.4s -ldr q8, [x17, #+128] -ldr q25, [x17, #+144] -ldr q24, [x17, #+160] -ldr q23, [x17, #+176] -sqrdmulh v22.4S, v12.4S, v10.s[2] -ldr q11, [x0, #160] -mul v30.4S, v30.4S,v9.s[2] -mla v30.4S, v7.4S, v31.s[0] -sub v7.4s, v26.4s, v3.4s -add v26.4s, v26.4s, v3.4s -sqrdmulh v3.4S, v1.4S, v15.s[0] -ldr q0, [x0, #128] -mul v12.4S, v12.4S,v9.s[2] -mla v12.4S, v22.4S, v31.s[0] -sub v22.4s, v5.4s, v30.4s -add v5.4s, v5.4s, v30.4s -sqrdmulh v30.4S, v6.4S, v15.s[1] -ldr q17, [x0, #144] -mul v1.4S, v1.4S,v20.s[0] -mla v1.4S, v3.4S, v31.s[0] -sub v3.4s, v4.4s, v12.4s -add v4.4s, v4.4s, v12.4s -sqrdmulh v12.4S, v22.4S, v15.s[3] -mul v6.4S, v6.4S,v20.s[1] -mla v6.4S, v30.4S, v31.s[0] -sub v30.4s, v26.4s, v1.4s -add v26.4s, v26.4s, v1.4s -sqrdmulh v1.4S, v5.4S, v15.s[2] -mul v22.4S, v22.4S,v20.s[3] -mla v22.4S, v12.4S, v31.s[0] -sub v12.4s, v7.4s, v6.4s -add v7.4s, v7.4s, v6.4s -sqrdmulh v6.4S, v19.4S, v25.s[0] -mul v5.4S, v5.4S,v20.s[2] -mla v5.4S, v1.4S, v31.s[0] -sub v1.4s, v3.4s, v22.4s -add v3.4s, v3.4s, v22.4s -sqrdmulh v22.4S, v14.4S, v25.s[0] -mul v19.4S, v19.4S,v8.s[0] -mla v19.4S, v6.4S, v31.s[0] -sub v6.4s, v4.4s, v5.4s -add v4.4s, v4.4s, v5.4s -sqrdmulh v15.4S, v18.4S, v25.s[0] -ldr q20, [x0, #368] -mul v14.4S, v14.4S,v8.s[0] -mla v14.4S, v22.4S, v31.s[0] -sub v22.4s, v2.4s, v19.4s -add v2.4s, v2.4s, v19.4s -sqrdmulh v19.4S, v29.4S, v25.s[0] -ldr q10, [x0, #352] -mul v18.4S, v18.4S,v8.s[0] -mla v18.4S, v15.4S, v31.s[0] -sub v15.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v25.s[1] -str q26, [x0, #0] -ldr q26, [x0, #320] -mul v29.4S, v29.4S,v8.s[0] -mla v29.4S, v19.4S, v31.s[0] -sub v19.4s, v0.4s, v18.4s -add v0.4s, v0.4s, v18.4s -sqrdmulh v18.4S, v11.4S, v25.s[1] -str q30, [x0, #16] -ldr q30, [x0, #336] -mul v2.4S, v2.4S,v8.s[1] -mla v2.4S, v14.4S, v31.s[0] -sub v14.4s, v17.4s, v29.4s -add v17.4s, v17.4s, v29.4s -sqrdmulh v29.4S, v22.4S, v25.s[2] -str q7, [x0, #32] -ldr q7, [x0, #304] -mul v11.4S, v11.4S,v8.s[1] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v17.4s, v2.4s -add v17.4s, v17.4s, v2.4s -ldr q2, [x17, #+192] -ldr q9, [x17, #+208] -ldr q5, [x17, #+224] -ldr q28, [x17, #+240] -sqrdmulh v16.4S, v15.4S, v25.s[2] -str q12, [x0, #48] -ldr q12, [x0, #288] -mul v22.4S, v22.4S,v8.s[2] -mla v22.4S, v29.4S, v31.s[0] -sub v29.4s, v0.4s, v11.4s -add v0.4s, v0.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v23.s[0] -str q3, [x0, #96] -ldr q3, [x0, #256] -mul v15.4S, v15.4S,v8.s[2] -mla v15.4S, v16.4S, v31.s[0] -sub v16.4s, v14.4s, v22.4s -add v14.4s, v14.4s, v22.4s -sqrdmulh v22.4S, v18.4S, v23.s[1] -str q1, [x0, #112] -ldr q1, [x0, #272] -mul v17.4S, v17.4S,v24.s[0] -mla v17.4S, v11.4S, v31.s[0] -sub v11.4s, v19.4s, v15.4s -add v19.4s, v19.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v23.s[3] -str q4, [x0, #64] -mul v18.4S, v18.4S,v24.s[1] -mla v18.4S, v22.4S, v31.s[0] -sub v22.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v14.4S, v23.s[2] -str q6, [x0, #80] -mul v16.4S, v16.4S,v24.s[3] -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v29.4s, v18.4s -add v29.4s, v29.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v9.s[0] -mul v14.4S, v14.4S,v24.s[2] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v11.4s, v16.4s -add v11.4s, v11.4s, v16.4s -sqrdmulh v16.4S, v10.4S, v9.s[0] -mul v20.4S, v20.4S,v2.s[0] -mla v20.4S, v18.4S, v31.s[0] -sub v18.4s, v19.4s, v14.4s -add v19.4s, v19.4s, v14.4s -sqrdmulh v23.4S, v26.4S, v9.s[0] -ldr q24, [x0, #496] -mul v10.4S, v10.4S,v2.s[0] -mla v10.4S, v16.4S, v31.s[0] -sub v16.4s, v7.4s, v20.4s -add v7.4s, v7.4s, v20.4s -sqrdmulh v20.4S, v30.4S, v9.s[0] -ldr q25, [x0, #480] -mul v26.4S, v26.4S,v2.s[0] -mla v26.4S, v23.4S, v31.s[0] -sub v23.4s, v12.4s, v10.4s -add v12.4s, v12.4s, v10.4s -sqrdmulh v10.4S, v7.4S, v9.s[1] -str q0, [x0, #128] -ldr q0, [x0, #448] -mul v30.4S, v30.4S,v2.s[0] -mla v30.4S, v20.4S, v31.s[0] -sub v20.4s, v3.4s, v26.4s -add v3.4s, v3.4s, v26.4s -sqrdmulh v26.4S, v12.4S, v9.s[1] -str q22, [x0, #144] -ldr q22, [x0, #464] -mul v7.4S, v7.4S,v2.s[1] -mla v7.4S, v10.4S, v31.s[0] -sub v10.4s, v1.4s, v30.4s -add v1.4s, v1.4s, v30.4s -sqrdmulh v30.4S, v16.4S, v9.s[2] -str q29, [x0, #160] -ldr q29, [x0, #432] -mul v12.4S, v12.4S,v2.s[1] -mla v12.4S, v26.4S, v31.s[0] -sub v26.4s, v1.4s, v7.4s -add v1.4s, v1.4s, v7.4s -ldr q7, [x17, #+256] -ldr q8, [x17, #+272] -ldr q14, [x17, #+288] -ldr q6, [x17, #+304] -sqrdmulh v4.4S, v23.4S, v9.s[2] -str q15, [x0, #176] -ldr q15, [x0, #416] -mul v16.4S, v16.4S,v2.s[2] -mla v16.4S, v30.4S, v31.s[0] -sub v30.4s, v3.4s, v12.4s -add v3.4s, v3.4s, v12.4s -sqrdmulh v12.4S, v1.4S, v28.s[0] -str q11, [x0, #224] -ldr q11, [x0, #384] -mul v23.4S, v23.4S,v2.s[2] -mla v23.4S, v4.4S, v31.s[0] -sub v4.4s, v10.4s, v16.4s -add v10.4s, v10.4s, v16.4s -sqrdmulh v16.4S, v26.4S, v28.s[1] -str q17, [x0, #240] -ldr q17, [x0, #400] -mul v1.4S, v1.4S,v5.s[0] -mla v1.4S, v12.4S, v31.s[0] -sub v12.4s, v20.4s, v23.4s -add v20.4s, v20.4s, v23.4s -sqrdmulh v23.4S, v4.4S, v28.s[3] -str q19, [x0, #192] -mul v26.4S, v26.4S,v5.s[1] -mla v26.4S, v16.4S, v31.s[0] -sub v16.4s, v3.4s, v1.4s -add v3.4s, v3.4s, v1.4s -sqrdmulh v1.4S, v10.4S, v28.s[2] -str q18, [x0, #208] -mul v4.4S, v4.4S,v5.s[3] -mla v4.4S, v23.4S, v31.s[0] -sub v23.4s, v30.4s, v26.4s -add v30.4s, v30.4s, v26.4s -sqrdmulh v26.4S, v24.4S, v8.s[0] -mul v10.4S, v10.4S,v5.s[2] -mla v10.4S, v1.4S, v31.s[0] -sub v1.4s, v12.4s, v4.4s -add v12.4s, v12.4s, v4.4s -sqrdmulh v4.4S, v25.4S, v8.s[0] -mul v24.4S, v24.4S,v7.s[0] -mla v24.4S, v26.4S, v31.s[0] -sub v26.4s, v20.4s, v10.4s -add v20.4s, v20.4s, v10.4s -sqrdmulh v28.4S, v0.4S, v8.s[0] -ldr q5, [x0, #624] -mul v25.4S, v25.4S,v7.s[0] -mla v25.4S, v4.4S, v31.s[0] -sub v4.4s, v29.4s, v24.4s -add v29.4s, v29.4s, v24.4s -sqrdmulh v24.4S, v22.4S, v8.s[0] -ldr q9, [x0, #608] -mul v0.4S, v0.4S,v7.s[0] -mla v0.4S, v28.4S, v31.s[0] -sub v28.4s, v15.4s, v25.4s -add v15.4s, v15.4s, v25.4s -sqrdmulh v25.4S, v29.4S, v8.s[1] -str q3, [x0, #256] -ldr q3, [x0, #576] -mul v22.4S, v22.4S,v7.s[0] -mla v22.4S, v24.4S, v31.s[0] -sub v24.4s, v11.4s, v0.4s -add v11.4s, v11.4s, v0.4s -sqrdmulh v0.4S, v15.4S, v8.s[1] -str q16, [x0, #272] -ldr q16, [x0, #592] -mul v29.4S, v29.4S,v7.s[1] -mla v29.4S, v25.4S, v31.s[0] -sub v25.4s, v17.4s, v22.4s -add v17.4s, v17.4s, v22.4s -sqrdmulh v22.4S, v4.4S, v8.s[2] -str q30, [x0, #288] -ldr q30, [x0, #560] -mul v15.4S, v15.4S,v7.s[1] -mla v15.4S, v0.4S, v31.s[0] -sub v0.4s, v17.4s, v29.4s -add v17.4s, v17.4s, v29.4s -ldr q29, [x17, #+320] -ldr q2, [x17, #+336] -ldr q10, [x17, #+352] -ldr q18, [x17, #+368] -sqrdmulh v19.4S, v28.4S, v8.s[2] -str q23, [x0, #304] -ldr q23, [x0, #544] -mul v4.4S, v4.4S,v7.s[2] -mla v4.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v17.4S, v6.s[0] -str q12, [x0, #352] -ldr q12, [x0, #512] -mul v28.4S, v28.4S,v7.s[2] -mla v28.4S, v19.4S, v31.s[0] -sub v19.4s, v25.4s, v4.4s -add v25.4s, v25.4s, v4.4s -sqrdmulh v4.4S, v0.4S, v6.s[1] -str q1, [x0, #368] -ldr q1, [x0, #528] -mul v17.4S, v17.4S,v14.s[0] -mla v17.4S, v15.4S, v31.s[0] -sub v15.4s, v24.4s, v28.4s -add v24.4s, v24.4s, v28.4s -sqrdmulh v28.4S, v19.4S, v6.s[3] -str q20, [x0, #320] -mul v0.4S, v0.4S,v14.s[1] -mla v0.4S, v4.4S, v31.s[0] -sub v4.4s, v11.4s, v17.4s -add v11.4s, v11.4s, v17.4s -sqrdmulh v17.4S, v25.4S, v6.s[2] -str q26, [x0, #336] -mul v19.4S, v19.4S,v14.s[3] -mla v19.4S, v28.4S, v31.s[0] -sub v28.4s, v22.4s, v0.4s -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v5.4S, v2.s[0] -mul v25.4S, v25.4S,v14.s[2] -mla v25.4S, v17.4S, v31.s[0] -sub v17.4s, v15.4s, v19.4s -add v15.4s, v15.4s, v19.4s -sqrdmulh v19.4S, v9.4S, v2.s[0] -mul v5.4S, v5.4S,v29.s[0] -mla v5.4S, v0.4S, v31.s[0] -sub v0.4s, v24.4s, v25.4s -add v24.4s, v24.4s, v25.4s -sqrdmulh v6.4S, v3.4S, v2.s[0] -ldr q14, [x0, #752] -mul v9.4S, v9.4S,v29.s[0] -mla v9.4S, v19.4S, v31.s[0] -sub v19.4s, v30.4s, v5.4s -add v30.4s, v30.4s, v5.4s -sqrdmulh v5.4S, v16.4S, v2.s[0] -ldr q8, [x0, #736] -mul v3.4S, v3.4S,v29.s[0] -mla v3.4S, v6.4S, v31.s[0] -sub v6.4s, v23.4s, v9.4s -add v23.4s, v23.4s, v9.4s -sqrdmulh v9.4S, v30.4S, v2.s[1] -str q11, [x0, #384] -ldr q11, [x0, #704] -mul v16.4S, v16.4S,v29.s[0] -mla v16.4S, v5.4S, v31.s[0] -sub v5.4s, v12.4s, v3.4s -add v12.4s, v12.4s, v3.4s -sqrdmulh v3.4S, v23.4S, v2.s[1] -str q4, [x0, #400] -ldr q4, [x0, #720] -mul v30.4S, v30.4S,v29.s[1] -mla v30.4S, v9.4S, v31.s[0] -sub v9.4s, v1.4s, v16.4s -add v1.4s, v1.4s, v16.4s -sqrdmulh v16.4S, v19.4S, v2.s[2] -str q22, [x0, #416] -ldr q22, [x0, #688] -mul v23.4S, v23.4S,v29.s[1] -mla v23.4S, v3.4S, v31.s[0] -sub v3.4s, v1.4s, v30.4s -add v1.4s, v1.4s, v30.4s -ldr q30, [x17, #+384] -ldr q7, [x17, #+400] -ldr q25, [x17, #+416] -ldr q26, [x17, #+432] -sqrdmulh v20.4S, v6.4S, v2.s[2] -str q28, [x0, #432] -ldr q28, [x0, #672] -mul v19.4S, v19.4S,v29.s[2] -mla v19.4S, v16.4S, v31.s[0] -sub v16.4s, v12.4s, v23.4s -add v12.4s, v12.4s, v23.4s -sqrdmulh v23.4S, v1.4S, v18.s[0] -str q15, [x0, #480] -ldr q15, [x0, #640] -mul v6.4S, v6.4S,v29.s[2] -mla v6.4S, v20.4S, v31.s[0] -sub v20.4s, v9.4s, v19.4s -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v3.4S, v18.s[1] -str q17, [x0, #496] -ldr q17, [x0, #656] -mul v1.4S, v1.4S,v10.s[0] -mla v1.4S, v23.4S, v31.s[0] -sub v23.4s, v5.4s, v6.4s -add v5.4s, v5.4s, v6.4s -sqrdmulh v6.4S, v20.4S, v18.s[3] -str q24, [x0, #448] -mul v3.4S, v3.4S,v10.s[1] -mla v3.4S, v19.4S, v31.s[0] -sub v19.4s, v12.4s, v1.4s -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v9.4S, v18.s[2] -str q0, [x0, #464] -mul v20.4S, v20.4S,v10.s[3] -mla v20.4S, v6.4S, v31.s[0] -sub v6.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v7.s[0] -mul v9.4S, v9.4S,v10.s[2] -mla v9.4S, v1.4S, v31.s[0] -sub v1.4s, v23.4s, v20.4s -add v23.4s, v23.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v7.s[0] -mul v14.4S, v14.4S,v30.s[0] -mla v14.4S, v3.4S, v31.s[0] -sub v3.4s, v5.4s, v9.4s -add v5.4s, v5.4s, v9.4s -sqrdmulh v18.4S, v11.4S, v7.s[0] -ldr q10, [x0, #880] -mul v8.4S, v8.4S,v30.s[0] -mla v8.4S, v20.4S, v31.s[0] -sub v20.4s, v22.4s, v14.4s -add v22.4s, v22.4s, v14.4s -sqrdmulh v14.4S, v4.4S, v7.s[0] -ldr q2, [x0, #864] -mul v11.4S, v11.4S,v30.s[0] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v28.4s, v8.4s -add v28.4s, v28.4s, v8.4s -sqrdmulh v8.4S, v22.4S, v7.s[1] -str q12, [x0, #512] -ldr q12, [x0, #832] -mul v4.4S, v4.4S,v30.s[0] -mla v4.4S, v14.4S, v31.s[0] -sub v14.4s, v15.4s, v11.4s -add v15.4s, v15.4s, v11.4s -sqrdmulh v11.4S, v28.4S, v7.s[1] -str q19, [x0, #528] -ldr q19, [x0, #848] -mul v22.4S, v22.4S,v30.s[1] -mla v22.4S, v8.4S, v31.s[0] -sub v8.4s, v17.4s, v4.4s -add v17.4s, v17.4s, v4.4s -sqrdmulh v4.4S, v20.4S, v7.s[2] -str q16, [x0, #544] -ldr q16, [x0, #816] -mul v28.4S, v28.4S,v30.s[1] -mla v28.4S, v11.4S, v31.s[0] -sub v11.4s, v17.4s, v22.4s -add v17.4s, v17.4s, v22.4s -ldr q22, [x17, #+448] -ldr q29, [x17, #+464] -ldr q9, [x17, #+480] -ldr q0, [x17, #+496] -sqrdmulh v24.4S, v18.4S, v7.s[2] -str q6, [x0, #560] -ldr q6, [x0, #800] -mul v20.4S, v20.4S,v30.s[2] -mla v20.4S, v4.4S, v31.s[0] -sub v4.4s, v15.4s, v28.4s -add v15.4s, v15.4s, v28.4s -sqrdmulh v28.4S, v17.4S, v26.s[0] -str q23, [x0, #608] -ldr q23, [x0, #768] -mul v18.4S, v18.4S,v30.s[2] -mla v18.4S, v24.4S, v31.s[0] -sub v24.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -sqrdmulh v20.4S, v11.4S, v26.s[1] -str q1, [x0, #624] -ldr q1, [x0, #784] -mul v17.4S, v17.4S,v25.s[0] -mla v17.4S, v28.4S, v31.s[0] -sub v28.4s, v14.4s, v18.4s -add v14.4s, v14.4s, v18.4s -sqrdmulh v18.4S, v24.4S, v26.s[3] -str q5, [x0, #576] -mul v11.4S, v11.4S,v25.s[1] -mla v11.4S, v20.4S, v31.s[0] -sub v20.4s, v15.4s, v17.4s -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v8.4S, v26.s[2] -str q3, [x0, #592] -mul v24.4S, v24.4S,v25.s[3] -mla v24.4S, v18.4S, v31.s[0] -sub v18.4s, v4.4s, v11.4s -add v4.4s, v4.4s, v11.4s -sqrdmulh v11.4S, v10.4S, v29.s[0] -mul v8.4S, v8.4S,v25.s[2] -mla v8.4S, v17.4S, v31.s[0] -sub v17.4s, v28.4s, v24.4s -add v28.4s, v28.4s, v24.4s -sqrdmulh v24.4S, v2.4S, v29.s[0] -mul v10.4S, v10.4S,v22.s[0] -mla v10.4S, v11.4S, v31.s[0] -sub v11.4s, v14.4s, v8.4s -add v14.4s, v14.4s, v8.4s -sqrdmulh v26.4S, v12.4S, v29.s[0] -ldr q25, [x0, #1008] -mul v2.4S, v2.4S,v22.s[0] -mla v2.4S, v24.4S, v31.s[0] -sub v24.4s, v16.4s, v10.4s -add v16.4s, v16.4s, v10.4s -sqrdmulh v10.4S, v19.4S, v29.s[0] -ldr q7, [x0, #992] -mul v12.4S, v12.4S,v22.s[0] -mla v12.4S, v26.4S, v31.s[0] -sub v26.4s, v6.4s, v2.4s -add v6.4s, v6.4s, v2.4s -sqrdmulh v2.4S, v16.4S, v29.s[1] -str q15, [x0, #640] -ldr q15, [x0, #960] -mul v19.4S, v19.4S,v22.s[0] -mla v19.4S, v10.4S, v31.s[0] -sub v10.4s, v23.4s, v12.4s -add v23.4s, v23.4s, v12.4s -sqrdmulh v12.4S, v6.4S, v29.s[1] -str q20, [x0, #656] -ldr q20, [x0, #976] -mul v16.4S, v16.4S,v22.s[1] -mla v16.4S, v2.4S, v31.s[0] -sub v2.4s, v1.4s, v19.4s -add v1.4s, v1.4s, v19.4s -sqrdmulh v19.4S, v24.4S, v29.s[2] -str q4, [x0, #672] -ldr q4, [x0, #944] -mul v6.4S, v6.4S,v22.s[1] -mla v6.4S, v12.4S, v31.s[0] -sub v12.4s, v1.4s, v16.4s -add v1.4s, v1.4s, v16.4s -ldr q16, [x17, #+512] -ldr q30, [x17, #+528] -ldr q8, [x17, #+544] -ldr q3, [x17, #+560] -sqrdmulh v5.4S, v26.4S, v29.s[2] -str q18, [x0, #688] -ldr q18, [x0, #928] -mul v24.4S, v24.4S,v22.s[2] -mla v24.4S, v19.4S, v31.s[0] -sub v19.4s, v23.4s, v6.4s -add v23.4s, v23.4s, v6.4s -sqrdmulh v6.4S, v1.4S, v0.s[0] -str q28, [x0, #736] -ldr q28, [x0, #896] -mul v26.4S, v26.4S,v22.s[2] -mla v26.4S, v5.4S, v31.s[0] -sub v5.4s, v2.4s, v24.4s -add v2.4s, v2.4s, v24.4s -sqrdmulh v24.4S, v12.4S, v0.s[1] -str q17, [x0, #752] -ldr q17, [x0, #912] -mul v1.4S, v1.4S,v9.s[0] -mla v1.4S, v6.4S, v31.s[0] -sub v6.4s, v10.4s, v26.4s -add v10.4s, v10.4s, v26.4s -sqrdmulh v26.4S, v5.4S, v0.s[3] -str q14, [x0, #704] -mul v12.4S, v12.4S,v9.s[1] -mla v12.4S, v24.4S, v31.s[0] -sub v24.4s, v23.4s, v1.4s -add v23.4s, v23.4s, v1.4s -sqrdmulh v1.4S, v2.4S, v0.s[2] -str q11, [x0, #720] -mul v5.4S, v5.4S,v9.s[3] -mla v5.4S, v26.4S, v31.s[0] -sub v26.4s, v19.4s, v12.4s -add v19.4s, v19.4s, v12.4s -sqrdmulh v12.4S, v25.4S, v30.s[0] -mul v2.4S, v2.4S,v9.s[2] -mla v2.4S, v1.4S, v31.s[0] -sub v1.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v7.4S, v30.s[0] -mul v25.4S, v25.4S,v16.s[0] -mla v25.4S, v12.4S, v31.s[0] -sub v12.4s, v10.4s, v2.4s -add v10.4s, v10.4s, v2.4s -sqrdmulh v0.4S, v15.4S, v30.s[0] -mul v7.4S, v7.4S,v16.s[0] -mla v7.4S, v5.4S, v31.s[0] -sub v5.4s, v4.4s, v25.4s -add v4.4s, v4.4s, v25.4s -sqrdmulh v25.4S, v20.4S, v30.s[0] -mul v15.4S, v15.4S,v16.s[0] -mla v15.4S, v0.4S, v31.s[0] -sub v0.4s, v18.4s, v7.4s -add v18.4s, v18.4s, v7.4s -sqrdmulh v7.4S, v4.4S, v30.s[1] -str q23, [x0, #768] -mul v20.4S, v20.4S,v16.s[0] -mla v20.4S, v25.4S, v31.s[0] -sub v25.4s, v28.4s, v15.4s -add v28.4s, v28.4s, v15.4s -sqrdmulh v15.4S, v18.4S, v30.s[1] -str q24, [x0, #784] -mul v4.4S, v4.4S,v16.s[1] -mla v4.4S, v7.4S, v31.s[0] -sub v7.4s, v17.4s, v20.4s -add v17.4s, v17.4s, v20.4s -sqrdmulh v20.4S, v5.4S, v30.s[2] -str q19, [x0, #800] -mul v18.4S, v18.4S,v16.s[1] -mla v18.4S, v15.4S, v31.s[0] -sub v15.4s, v17.4s, v4.4s -add v17.4s, v17.4s, v4.4s -sqrdmulh v4.4S, v0.4S, v30.s[2] -str q26, [x0, #816] -mul v5.4S, v5.4S,v16.s[2] -mla v5.4S, v20.4S, v31.s[0] -sub v20.4s, v28.4s, v18.4s -add v28.4s, v28.4s, v18.4s -sqrdmulh v18.4S, v17.4S, v3.s[0] -str q6, [x0, #864] -mul v0.4S, v0.4S,v16.s[2] -mla v0.4S, v4.4S, v31.s[0] -sub v4.4s, v7.4s, v5.4s -add v7.4s, v7.4s, v5.4s -sqrdmulh v5.4S, v15.4S, v3.s[1] -str q1, [x0, #880] -mul v17.4S, v17.4S,v8.s[0] -mla v17.4S, v18.4S, v31.s[0] -sub v18.4s, v25.4s, v0.4s -add v25.4s, v25.4s, v0.4s -sqrdmulh v0.4S, v4.4S, v3.s[3] -str q10, [x0, #832] -mul v15.4S, v15.4S,v8.s[1] -mla v15.4S, v5.4S, v31.s[0] -sub v5.4s, v28.4s, v17.4s -add v28.4s, v28.4s, v17.4s -sqrdmulh v17.4S, v7.4S, v3.s[2] -str q12, [x0, #848] -mul v4.4S, v4.4S,v8.s[3] -mla v4.4S, v0.4S, v31.s[0] -sub v0.4s, v20.4s, v15.4s -add v20.4s, v20.4s, v15.4s -mul v7.4S, v7.4S,v8.s[2] -mla v7.4S, v17.4S, v31.s[0] -sub v17.4s, v18.4s, v4.4s -add v18.4s, v18.4s, v4.4s -sub v4.4s, v25.4s, v7.4s -add v25.4s, v25.4s, v7.4s -str q28, [x0, #896] -str q5, [x0, #912] -str q20, [x0, #928] -str q0, [x0, #944] -str q18, [x0, #992] -str q17, [x0, #1008] -str q25, [x0, #960] -str q4, [x0, #976] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1444 -// Instruction count: 1440 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_0_0.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_0_0.s deleted file mode 100644 index 209eccf..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_0_0.s +++ /dev/null @@ -1,1494 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_0_0 -.global _ntt_u32_incomplete_neon_asm_var_4_2_0_0 -ntt_u32_incomplete_neon_asm_var_4_2_0_0: -_ntt_u32_incomplete_neon_asm_var_4_2_0_0: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #800] -ldr q21, [x0, #864] -ldr q20, [x0, #928] -ldr q19, [x0, #992] -ldr q18, [x0, #288] -ldr q17, [x0, #352] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -ldr q2, [x0, #544] -ldr q1, [x0, #608] -ldr q0, [x0, #672] -ldr q15, [x0, #736] -ldr q14, [x0, #32] -ldr q13, [x0, #96] -ldr q12, [x0, #160] -ldr q11, [x0, #224] -sqrdmulh v10.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -mla v22.4S, v10.4S, v31.s[0] -sub v10.4s, v18.4s, v22.4s -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -mla v20.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -mla v19.4S, v20.4S, v31.s[0] -sub v20.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -mla v2.4S, v19.4S, v31.s[0] -sub v19.4s, v14.4s, v2.4s -add v14.4s, v14.4s, v2.4s -sqrdmulh v2.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -mla v1.4S, v2.4S, v31.s[0] -sub v2.4s, v13.4s, v1.4s -add v13.4s, v13.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v29.s[0] -mul v0.4S, v0.4S,v30.s[0] -mla v0.4S, v1.4S, v31.s[0] -sub v1.4s, v12.4s, v0.4s -add v12.4s, v12.4s, v0.4s -sqrdmulh v0.4S, v15.4S, v29.s[0] -mul v15.4S, v15.4S,v30.s[0] -mla v15.4S, v0.4S, v31.s[0] -sub v0.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v12.4s, v16.4s -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -mla v3.4S, v16.4S, v31.s[0] -sub v16.4s, v11.4s, v3.4s -add v11.4s, v11.4s, v3.4s -sqrdmulh v3.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -mla v18.4S, v3.4S, v31.s[0] -sub v3.4s, v14.4s, v18.4s -add v14.4s, v14.4s, v18.4s -sqrdmulh v18.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -mla v17.4S, v18.4S, v31.s[0] -sub v18.4s, v13.4s, v17.4s -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -mla v21.4S, v17.4S, v31.s[0] -sub v17.4s, v1.4s, v21.4s -add v1.4s, v1.4s, v21.4s -sqrdmulh v21.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -mla v20.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v20.4s -add v0.4s, v0.4s, v20.4s -sqrdmulh v20.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -mla v10.4S, v20.4S, v31.s[0] -sub v20.4s, v19.4s, v10.4s -add v19.4s, v19.4s, v10.4s -sqrdmulh v10.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -mla v22.4S, v10.4S, v31.s[0] -sub v10.4s, v2.4s, v22.4s -add v2.4s, v2.4s, v22.4s -sqrdmulh v22.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -mla v12.4S, v22.4S, v31.s[0] -sub v22.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -sqrdmulh v12.4S, v11.4S, v27.s[0] -mul v11.4S, v11.4S,v28.s[0] -mla v11.4S, v12.4S, v31.s[0] -sub v12.4s, v13.4s, v11.4s -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -mla v15.4S, v11.4S, v31.s[0] -sub v11.4s, v3.4s, v15.4s -add v3.4s, v3.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v27.s[1] -mul v16.4S, v16.4S,v28.s[1] -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v18.4s, v16.4s -add v18.4s, v18.4s, v16.4s -sqrdmulh v16.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -mla v1.4S, v16.4S, v31.s[0] -sub v16.4s, v19.4s, v1.4s -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v27.s[2] -mul v0.4S, v0.4S,v28.s[2] -mla v0.4S, v1.4S, v31.s[0] -sub v1.4s, v2.4s, v0.4s -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v17.4S, v27.s[3] -mul v17.4S, v17.4S,v28.s[3] -mla v17.4S, v0.4S, v31.s[0] -sub v0.4s, v20.4s, v17.4s -add v20.4s, v20.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -mla v21.4S, v17.4S, v31.s[0] -sub v17.4s, v10.4s, v21.4s -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v13.4S, v25.s[0] -mul v13.4S, v13.4S,v26.s[0] -mla v13.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v13.4s -add v14.4s, v14.4s, v13.4s -sqrdmulh v13.4S, v12.4S, v25.s[1] -mul v12.4S, v12.4S,v26.s[1] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v18.4S, v25.s[2] -mul v18.4S, v18.4S,v26.s[2] -mla v18.4S, v12.4S, v31.s[0] -sub v12.4s, v3.4s, v18.4s -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v15.4S, v25.s[3] -mul v15.4S, v15.4S,v26.s[3] -mla v15.4S, v18.4S, v31.s[0] -sub v18.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v23.s[0] -mul v2.4S, v2.4S,v24.s[0] -mla v2.4S, v15.4S, v31.s[0] -sub v15.4s, v19.4s, v2.4s -add v19.4s, v19.4s, v2.4s -sqrdmulh v2.4S, v1.4S, v23.s[1] -mul v1.4S, v1.4S,v24.s[1] -mla v1.4S, v2.4S, v31.s[0] -sub v2.4s, v16.4s, v1.4s -add v16.4s, v16.4s, v1.4s -sqrdmulh v1.4S, v10.4S, v23.s[2] -mul v10.4S, v10.4S,v24.s[2] -mla v10.4S, v1.4S, v31.s[0] -sub v1.4s, v20.4s, v10.4s -add v20.4s, v20.4s, v10.4s -sqrdmulh v10.4S, v17.4S, v23.s[3] -mul v17.4S, v17.4S,v24.s[3] -mla v17.4S, v10.4S, v31.s[0] -sub v10.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -str q14, [x0, #32] -str q21, [x0, #96] -str q22, [x0, #160] -str q13, [x0, #224] -str q3, [x0, #288] -str q12, [x0, #352] -str q11, [x0, #416] -str q18, [x0, #480] -str q19, [x0, #544] -str q15, [x0, #608] -str q16, [x0, #672] -str q2, [x0, #736] -str q20, [x0, #800] -str q1, [x0, #864] -str q0, [x0, #928] -str q10, [x0, #992] -ldr q10, [x0, #816] -ldr q0, [x0, #880] -ldr q1, [x0, #944] -ldr q20, [x0, #1008] -ldr q2, [x0, #304] -ldr q16, [x0, #368] -ldr q15, [x0, #432] -ldr q19, [x0, #496] -ldr q18, [x0, #560] -ldr q11, [x0, #624] -ldr q12, [x0, #688] -ldr q3, [x0, #752] -ldr q13, [x0, #48] -ldr q22, [x0, #112] -ldr q21, [x0, #176] -ldr q14, [x0, #240] -sqrdmulh v17.4S, v10.4S, v29.s[0] -mul v10.4S, v10.4S,v30.s[0] -mla v10.4S, v17.4S, v31.s[0] -sub v17.4s, v2.4s, v10.4s -add v2.4s, v2.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v29.s[0] -mul v0.4S, v0.4S,v30.s[0] -mla v0.4S, v10.4S, v31.s[0] -sub v10.4s, v16.4s, v0.4s -add v16.4s, v16.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -mla v1.4S, v0.4S, v31.s[0] -sub v0.4s, v15.4s, v1.4s -add v15.4s, v15.4s, v1.4s -sqrdmulh v1.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -mla v20.4S, v1.4S, v31.s[0] -sub v1.4s, v19.4s, v20.4s -add v19.4s, v19.4s, v20.4s -sqrdmulh v20.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -mla v18.4S, v20.4S, v31.s[0] -sub v20.4s, v13.4s, v18.4s -add v13.4s, v13.4s, v18.4s -sqrdmulh v18.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v22.4s, v11.4s -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -mla v12.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -mla v3.4S, v12.4S, v31.s[0] -sub v12.4s, v14.4s, v3.4s -add v14.4s, v14.4s, v3.4s -sqrdmulh v3.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -mla v15.4S, v3.4S, v31.s[0] -sub v3.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -sqrdmulh v15.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -mla v19.4S, v15.4S, v31.s[0] -sub v15.4s, v14.4s, v19.4s -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v2.4S, v29.s[1] -mul v2.4S, v2.4S,v30.s[1] -mla v2.4S, v19.4S, v31.s[0] -sub v19.4s, v13.4s, v2.4s -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -mla v16.4S, v2.4S, v31.s[0] -sub v2.4s, v22.4s, v16.4s -add v22.4s, v22.4s, v16.4s -sqrdmulh v16.4S, v0.4S, v29.s[2] -mul v0.4S, v0.4S,v30.s[2] -mla v0.4S, v16.4S, v31.s[0] -sub v16.4s, v11.4s, v0.4s -add v11.4s, v11.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -mla v1.4S, v0.4S, v31.s[0] -sub v0.4s, v12.4s, v1.4s -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -mla v17.4S, v1.4S, v31.s[0] -sub v1.4s, v20.4s, v17.4s -add v20.4s, v20.4s, v17.4s -sqrdmulh v17.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -mla v10.4S, v17.4S, v31.s[0] -sub v17.4s, v18.4s, v10.4s -add v18.4s, v18.4s, v10.4s -sqrdmulh v10.4S, v21.4S, v27.s[0] -mul v21.4S, v21.4S,v28.s[0] -mla v21.4S, v10.4S, v31.s[0] -sub v10.4s, v13.4s, v21.4s -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -mla v14.4S, v21.4S, v31.s[0] -sub v21.4s, v22.4s, v14.4s -add v22.4s, v22.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -mla v3.4S, v14.4S, v31.s[0] -sub v14.4s, v19.4s, v3.4s -add v19.4s, v19.4s, v3.4s -sqrdmulh v3.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -mla v15.4S, v3.4S, v31.s[0] -sub v3.4s, v2.4s, v15.4s -add v2.4s, v2.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v27.s[2] -mul v11.4S, v11.4S,v28.s[2] -mla v11.4S, v15.4S, v31.s[0] -sub v15.4s, v20.4s, v11.4s -add v20.4s, v20.4s, v11.4s -sqrdmulh v11.4S, v12.4S, v27.s[2] -mul v12.4S, v12.4S,v28.s[2] -mla v12.4S, v11.4S, v31.s[0] -sub v11.4s, v18.4s, v12.4s -add v18.4s, v18.4s, v12.4s -sqrdmulh v12.4S, v16.4S, v27.s[3] -mul v16.4S, v16.4S,v28.s[3] -mla v16.4S, v12.4S, v31.s[0] -sub v12.4s, v1.4s, v16.4s -add v1.4s, v1.4s, v16.4s -sqrdmulh v16.4S, v0.4S, v27.s[3] -mul v0.4S, v0.4S,v28.s[3] -mla v0.4S, v16.4S, v31.s[0] -sub v16.4s, v17.4s, v0.4s -add v17.4s, v17.4s, v0.4s -sqrdmulh v0.4S, v22.4S, v25.s[0] -mul v22.4S, v22.4S,v26.s[0] -mla v22.4S, v0.4S, v31.s[0] -sub v0.4s, v13.4s, v22.4s -add v13.4s, v13.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v25.s[1] -mul v21.4S, v21.4S,v26.s[1] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v10.4s, v21.4s -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v2.4S, v25.s[2] -mul v2.4S, v2.4S,v26.s[2] -mla v2.4S, v21.4S, v31.s[0] -sub v21.4s, v19.4s, v2.4s -add v19.4s, v19.4s, v2.4s -sqrdmulh v2.4S, v3.4S, v25.s[3] -mul v3.4S, v3.4S,v26.s[3] -mla v3.4S, v2.4S, v31.s[0] -sub v2.4s, v14.4s, v3.4s -add v14.4s, v14.4s, v3.4s -sqrdmulh v3.4S, v18.4S, v23.s[0] -mul v18.4S, v18.4S,v24.s[0] -mla v18.4S, v3.4S, v31.s[0] -sub v3.4s, v20.4s, v18.4s -add v20.4s, v20.4s, v18.4s -sqrdmulh v18.4S, v11.4S, v23.s[1] -mul v11.4S, v11.4S,v24.s[1] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v15.4s, v11.4s -add v15.4s, v15.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v23.s[2] -mul v17.4S, v17.4S,v24.s[2] -mla v17.4S, v11.4S, v31.s[0] -sub v11.4s, v1.4s, v17.4s -add v1.4s, v1.4s, v17.4s -sqrdmulh v17.4S, v16.4S, v23.s[3] -mul v16.4S, v16.4S,v24.s[3] -mla v16.4S, v17.4S, v31.s[0] -sub v17.4s, v12.4s, v16.4s -add v12.4s, v12.4s, v16.4s -str q13, [x0, #48] -str q0, [x0, #112] -str q10, [x0, #176] -str q22, [x0, #240] -str q19, [x0, #304] -str q21, [x0, #368] -str q14, [x0, #432] -str q2, [x0, #496] -str q20, [x0, #560] -str q3, [x0, #624] -str q15, [x0, #688] -str q18, [x0, #752] -str q1, [x0, #816] -str q11, [x0, #880] -str q12, [x0, #944] -str q17, [x0, #1008] -ldr q17, [x0, #768] -ldr q12, [x0, #832] -ldr q11, [x0, #896] -ldr q1, [x0, #960] -ldr q18, [x0, #256] -ldr q15, [x0, #320] -ldr q3, [x0, #384] -ldr q20, [x0, #448] -ldr q2, [x0, #512] -ldr q14, [x0, #576] -ldr q21, [x0, #640] -ldr q19, [x0, #704] -ldr q22, [x0, #0] -ldr q10, [x0, #64] -ldr q0, [x0, #128] -ldr q13, [x0, #192] -sqrdmulh v16.4S, v17.4S, v29.s[0] -mul v17.4S, v17.4S,v30.s[0] -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v18.4s, v17.4s -add v18.4s, v18.4s, v17.4s -sqrdmulh v17.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -mla v12.4S, v17.4S, v31.s[0] -sub v17.4s, v15.4s, v12.4s -add v15.4s, v15.4s, v12.4s -sqrdmulh v12.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -mla v11.4S, v12.4S, v31.s[0] -sub v12.4s, v3.4s, v11.4s -add v3.4s, v3.4s, v11.4s -sqrdmulh v11.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -mla v1.4S, v11.4S, v31.s[0] -sub v11.4s, v20.4s, v1.4s -add v20.4s, v20.4s, v1.4s -sqrdmulh v1.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -mla v2.4S, v1.4S, v31.s[0] -sub v1.4s, v22.4s, v2.4s -add v22.4s, v22.4s, v2.4s -sqrdmulh v2.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -mla v14.4S, v2.4S, v31.s[0] -sub v2.4s, v10.4s, v14.4s -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -mla v21.4S, v14.4S, v31.s[0] -sub v14.4s, v0.4s, v21.4s -add v0.4s, v0.4s, v21.4s -sqrdmulh v21.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v13.4s, v19.4s -add v13.4s, v13.4s, v19.4s -sqrdmulh v19.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -mla v3.4S, v19.4S, v31.s[0] -sub v19.4s, v0.4s, v3.4s -add v0.4s, v0.4s, v3.4s -sqrdmulh v3.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -mla v20.4S, v3.4S, v31.s[0] -sub v3.4s, v13.4s, v20.4s -add v13.4s, v13.4s, v20.4s -sqrdmulh v20.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -mla v18.4S, v20.4S, v31.s[0] -sub v20.4s, v22.4s, v18.4s -add v22.4s, v22.4s, v18.4s -sqrdmulh v18.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -mla v15.4S, v18.4S, v31.s[0] -sub v18.4s, v10.4s, v15.4s -add v10.4s, v10.4s, v15.4s -sqrdmulh v15.4S, v12.4S, v29.s[2] -mul v12.4S, v12.4S,v30.s[2] -mla v12.4S, v15.4S, v31.s[0] -sub v15.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -sqrdmulh v12.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -mla v11.4S, v12.4S, v31.s[0] -sub v12.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -mla v16.4S, v11.4S, v31.s[0] -sub v11.4s, v1.4s, v16.4s -add v1.4s, v1.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v2.4s, v17.4s -add v2.4s, v2.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v27.s[0] -mul v0.4S, v0.4S,v28.s[0] -mla v0.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v0.4s -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v13.4S, v27.s[0] -mul v13.4S, v13.4S,v28.s[0] -mla v13.4S, v0.4S, v31.s[0] -sub v0.4s, v10.4s, v13.4s -add v10.4s, v10.4s, v13.4s -sqrdmulh v13.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -mla v19.4S, v13.4S, v31.s[0] -sub v13.4s, v20.4s, v19.4s -add v20.4s, v20.4s, v19.4s -sqrdmulh v19.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -mla v3.4S, v19.4S, v31.s[0] -sub v19.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v27.s[2] -mul v14.4S, v14.4S,v28.s[2] -mla v14.4S, v3.4S, v31.s[0] -sub v3.4s, v1.4s, v14.4s -add v1.4s, v1.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v27.s[2] -mul v21.4S, v21.4S,v28.s[2] -mla v21.4S, v14.4S, v31.s[0] -sub v14.4s, v2.4s, v21.4s -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -mla v15.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v12.4S, v27.s[3] -mul v12.4S, v12.4S,v28.s[3] -mla v12.4S, v15.4S, v31.s[0] -sub v15.4s, v16.4s, v12.4s -add v16.4s, v16.4s, v12.4s -sqrdmulh v12.4S, v10.4S, v25.s[0] -mul v10.4S, v10.4S,v26.s[0] -mla v10.4S, v12.4S, v31.s[0] -sub v12.4s, v22.4s, v10.4s -add v22.4s, v22.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v25.s[1] -mul v0.4S, v0.4S,v26.s[1] -mla v0.4S, v10.4S, v31.s[0] -sub v10.4s, v17.4s, v0.4s -add v17.4s, v17.4s, v0.4s -sqrdmulh v0.4S, v18.4S, v25.s[2] -mul v18.4S, v18.4S,v26.s[2] -mla v18.4S, v0.4S, v31.s[0] -sub v0.4s, v20.4s, v18.4s -add v20.4s, v20.4s, v18.4s -sqrdmulh v18.4S, v19.4S, v25.s[3] -mul v19.4S, v19.4S,v26.s[3] -mla v19.4S, v18.4S, v31.s[0] -sub v18.4s, v13.4s, v19.4s -add v13.4s, v13.4s, v19.4s -sqrdmulh v19.4S, v2.4S, v23.s[0] -mul v2.4S, v2.4S,v24.s[0] -mla v2.4S, v19.4S, v31.s[0] -sub v19.4s, v1.4s, v2.4s -add v1.4s, v1.4s, v2.4s -sqrdmulh v2.4S, v14.4S, v23.s[1] -mul v14.4S, v14.4S,v24.s[1] -mla v14.4S, v2.4S, v31.s[0] -sub v2.4s, v3.4s, v14.4s -add v3.4s, v3.4s, v14.4s -sqrdmulh v14.4S, v16.4S, v23.s[2] -mul v16.4S, v16.4S,v24.s[2] -mla v16.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v16.4s -add v11.4s, v11.4s, v16.4s -sqrdmulh v16.4S, v15.4S, v23.s[3] -mul v15.4S, v15.4S,v24.s[3] -mla v15.4S, v16.4S, v31.s[0] -sub v16.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -str q22, [x0, #0] -str q12, [x0, #64] -str q17, [x0, #128] -str q10, [x0, #192] -str q20, [x0, #256] -str q0, [x0, #320] -str q13, [x0, #384] -str q18, [x0, #448] -str q1, [x0, #512] -str q19, [x0, #576] -str q3, [x0, #640] -str q2, [x0, #704] -str q11, [x0, #768] -str q14, [x0, #832] -str q21, [x0, #896] -str q16, [x0, #960] -ldr q16, [x0, #784] -ldr q21, [x0, #848] -ldr q14, [x0, #912] -ldr q11, [x0, #976] -ldr q2, [x0, #272] -ldr q3, [x0, #336] -ldr q19, [x0, #400] -ldr q1, [x0, #464] -ldr q18, [x0, #528] -ldr q13, [x0, #592] -ldr q0, [x0, #656] -ldr q20, [x0, #720] -ldr q10, [x0, #16] -ldr q17, [x0, #80] -ldr q12, [x0, #144] -ldr q22, [x0, #208] -sqrdmulh v15.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -mla v21.4S, v16.4S, v31.s[0] -sub v16.4s, v3.4s, v21.4s -add v3.4s, v3.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -mla v14.4S, v21.4S, v31.s[0] -sub v21.4s, v19.4s, v14.4s -add v19.4s, v19.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -mla v11.4S, v14.4S, v31.s[0] -sub v14.4s, v1.4s, v11.4s -add v1.4s, v1.4s, v11.4s -sqrdmulh v11.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -mla v18.4S, v11.4S, v31.s[0] -sub v11.4s, v10.4s, v18.4s -add v10.4s, v10.4s, v18.4s -sqrdmulh v18.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -mla v13.4S, v18.4S, v31.s[0] -sub v18.4s, v17.4s, v13.4s -add v17.4s, v17.4s, v13.4s -sqrdmulh v13.4S, v0.4S, v29.s[0] -mul v0.4S, v0.4S,v30.s[0] -mla v0.4S, v13.4S, v31.s[0] -sub v13.4s, v12.4s, v0.4s -add v12.4s, v12.4s, v0.4s -sqrdmulh v0.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -mla v20.4S, v0.4S, v31.s[0] -sub v0.4s, v22.4s, v20.4s -add v22.4s, v22.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -mla v19.4S, v20.4S, v31.s[0] -sub v20.4s, v12.4s, v19.4s -add v12.4s, v12.4s, v19.4s -sqrdmulh v19.4S, v1.4S, v29.s[1] -mul v1.4S, v1.4S,v30.s[1] -mla v1.4S, v19.4S, v31.s[0] -sub v19.4s, v22.4s, v1.4s -add v22.4s, v22.4s, v1.4s -sqrdmulh v1.4S, v2.4S, v29.s[1] -mul v2.4S, v2.4S,v30.s[1] -mla v2.4S, v1.4S, v31.s[0] -sub v1.4s, v10.4s, v2.4s -add v10.4s, v10.4s, v2.4s -sqrdmulh v2.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -mla v3.4S, v2.4S, v31.s[0] -sub v2.4s, v17.4s, v3.4s -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -mla v21.4S, v3.4S, v31.s[0] -sub v3.4s, v13.4s, v21.4s -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -mla v14.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v14.4s -add v0.4s, v0.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -mla v15.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v18.4s, v16.4s -add v18.4s, v18.4s, v16.4s -sqrdmulh v16.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -mla v12.4S, v16.4S, v31.s[0] -sub v16.4s, v10.4s, v12.4s -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v22.4S, v27.s[0] -mul v22.4S, v22.4S,v28.s[0] -mla v22.4S, v12.4S, v31.s[0] -sub v12.4s, v17.4s, v22.4s -add v17.4s, v17.4s, v22.4s -sqrdmulh v22.4S, v20.4S, v27.s[1] -mul v20.4S, v20.4S,v28.s[1] -mla v20.4S, v22.4S, v31.s[0] -sub v22.4s, v1.4s, v20.4s -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -mla v19.4S, v20.4S, v31.s[0] -sub v20.4s, v2.4s, v19.4s -add v2.4s, v2.4s, v19.4s -sqrdmulh v19.4S, v13.4S, v27.s[2] -mul v13.4S, v13.4S,v28.s[2] -mla v13.4S, v19.4S, v31.s[0] -sub v19.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v0.4S, v27.s[2] -mul v0.4S, v0.4S,v28.s[2] -mla v0.4S, v13.4S, v31.s[0] -sub v13.4s, v18.4s, v0.4s -add v18.4s, v18.4s, v0.4s -sqrdmulh v0.4S, v3.4S, v27.s[3] -mul v3.4S, v3.4S,v28.s[3] -mla v3.4S, v0.4S, v31.s[0] -sub v0.4s, v14.4s, v3.4s -add v14.4s, v14.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -mla v21.4S, v3.4S, v31.s[0] -sub v3.4s, v15.4s, v21.4s -add v15.4s, v15.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v25.s[0] -mul v17.4S, v17.4S,v26.s[0] -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -sqrdmulh v17.4S, v12.4S, v25.s[1] -mul v12.4S, v12.4S,v26.s[1] -mla v12.4S, v17.4S, v31.s[0] -sub v17.4s, v16.4s, v12.4s -add v16.4s, v16.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v25.s[2] -mul v2.4S, v2.4S,v26.s[2] -mla v2.4S, v12.4S, v31.s[0] -sub v12.4s, v1.4s, v2.4s -add v1.4s, v1.4s, v2.4s -sqrdmulh v2.4S, v20.4S, v25.s[3] -mul v20.4S, v20.4S,v26.s[3] -mla v20.4S, v2.4S, v31.s[0] -sub v2.4s, v22.4s, v20.4s -add v22.4s, v22.4s, v20.4s -sqrdmulh v20.4S, v18.4S, v23.s[0] -mul v18.4S, v18.4S,v24.s[0] -mla v18.4S, v20.4S, v31.s[0] -sub v20.4s, v11.4s, v18.4s -add v11.4s, v11.4s, v18.4s -sqrdmulh v18.4S, v13.4S, v23.s[1] -mul v13.4S, v13.4S,v24.s[1] -mla v13.4S, v18.4S, v31.s[0] -sub v18.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -sqrdmulh v13.4S, v15.4S, v23.s[2] -mul v15.4S, v15.4S,v24.s[2] -mla v15.4S, v13.4S, v31.s[0] -sub v13.4s, v14.4s, v15.4s -add v14.4s, v14.4s, v15.4s -sqrdmulh v15.4S, v3.4S, v23.s[3] -mul v3.4S, v3.4S,v24.s[3] -mla v3.4S, v15.4S, v31.s[0] -sub v15.4s, v0.4s, v3.4s -add v0.4s, v0.4s, v3.4s -str q10, [x0, #16] -str q21, [x0, #80] -str q16, [x0, #144] -str q17, [x0, #208] -str q1, [x0, #272] -str q12, [x0, #336] -str q22, [x0, #400] -str q2, [x0, #464] -str q11, [x0, #528] -str q20, [x0, #592] -str q19, [x0, #656] -str q18, [x0, #720] -str q14, [x0, #784] -str q13, [x0, #848] -str q0, [x0, #912] -str q15, [x0, #976] -ldr q4, [x17, #+128] -ldr q5, [x17, #+144] -ldr q6, [x0, #32] -ldr q7, [x0, #48] -ldr q8, [x0, #0] -ldr q9, [x0, #16] -sqrdmulh v3.4S, v6.4S, v5.s[0] -mul v6.4S, v6.4S,v4.s[0] -mla v6.4S, v3.4S, v31.s[0] -sub v3.4s, v8.4s, v6.4s -add v8.4s, v8.4s, v6.4s -sqrdmulh v6.4S, v7.4S, v5.s[0] -mul v7.4S, v7.4S,v4.s[0] -mla v7.4S, v6.4S, v31.s[0] -sub v6.4s, v9.4s, v7.4s -add v9.4s, v9.4s, v7.4s -ldr q7, [x17, #+160] -ldr q10, [x17, #+176] -sqrdmulh v21.4S, v9.4S, v5.s[1] -mul v9.4S, v9.4S,v4.s[1] -mla v9.4S, v21.4S, v31.s[0] -sub v21.4s, v8.4s, v9.4s -add v8.4s, v8.4s, v9.4s -sqrdmulh v9.4S, v6.4S, v5.s[2] -mul v6.4S, v6.4S,v4.s[2] -mla v6.4S, v9.4S, v31.s[0] -sub v9.4s, v3.4s, v6.4s -add v3.4s, v3.4s, v6.4s -str q8, [x0, #0] -str q21, [x0, #16] -str q3, [x0, #32] -str q9, [x0, #48] -ldr q5, [x0, #96] -ldr q4, [x0, #112] -ldr q9, [x0, #64] -ldr q3, [x0, #80] -sqrdmulh v21.4S, v5.4S, v10.s[0] -mul v5.4S, v5.4S,v7.s[0] -mla v5.4S, v21.4S, v31.s[0] -sub v21.4s, v9.4s, v5.4s -add v9.4s, v9.4s, v5.4s -sqrdmulh v5.4S, v4.4S, v10.s[0] -mul v4.4S, v4.4S,v7.s[0] -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v3.4s, v4.4s -add v3.4s, v3.4s, v4.4s -ldr q4, [x17, #+192] -ldr q8, [x17, #+208] -sqrdmulh v6.4S, v3.4S, v10.s[1] -mul v3.4S, v3.4S,v7.s[1] -mla v3.4S, v6.4S, v31.s[0] -sub v6.4s, v9.4s, v3.4s -add v9.4s, v9.4s, v3.4s -sqrdmulh v3.4S, v5.4S, v10.s[2] -mul v5.4S, v5.4S,v7.s[2] -mla v5.4S, v3.4S, v31.s[0] -sub v3.4s, v21.4s, v5.4s -add v21.4s, v21.4s, v5.4s -str q9, [x0, #64] -str q6, [x0, #80] -str q21, [x0, #96] -str q3, [x0, #112] -ldr q10, [x0, #160] -ldr q7, [x0, #176] -ldr q3, [x0, #128] -ldr q21, [x0, #144] -sqrdmulh v6.4S, v10.4S, v8.s[0] -mul v10.4S, v10.4S,v4.s[0] -mla v10.4S, v6.4S, v31.s[0] -sub v6.4s, v3.4s, v10.4s -add v3.4s, v3.4s, v10.4s -sqrdmulh v10.4S, v7.4S, v8.s[0] -mul v7.4S, v7.4S,v4.s[0] -mla v7.4S, v10.4S, v31.s[0] -sub v10.4s, v21.4s, v7.4s -add v21.4s, v21.4s, v7.4s -ldr q7, [x17, #+224] -ldr q9, [x17, #+240] -sqrdmulh v5.4S, v21.4S, v8.s[1] -mul v21.4S, v21.4S,v4.s[1] -mla v21.4S, v5.4S, v31.s[0] -sub v5.4s, v3.4s, v21.4s -add v3.4s, v3.4s, v21.4s -sqrdmulh v21.4S, v10.4S, v8.s[2] -mul v10.4S, v10.4S,v4.s[2] -mla v10.4S, v21.4S, v31.s[0] -sub v21.4s, v6.4s, v10.4s -add v6.4s, v6.4s, v10.4s -str q3, [x0, #128] -str q5, [x0, #144] -str q6, [x0, #160] -str q21, [x0, #176] -ldr q8, [x0, #224] -ldr q4, [x0, #240] -ldr q21, [x0, #192] -ldr q6, [x0, #208] -sqrdmulh v5.4S, v8.4S, v9.s[0] -mul v8.4S, v8.4S,v7.s[0] -mla v8.4S, v5.4S, v31.s[0] -sub v5.4s, v21.4s, v8.4s -add v21.4s, v21.4s, v8.4s -sqrdmulh v8.4S, v4.4S, v9.s[0] -mul v4.4S, v4.4S,v7.s[0] -mla v4.4S, v8.4S, v31.s[0] -sub v8.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -ldr q4, [x17, #+256] -ldr q3, [x17, #+272] -sqrdmulh v10.4S, v6.4S, v9.s[1] -mul v6.4S, v6.4S,v7.s[1] -mla v6.4S, v10.4S, v31.s[0] -sub v10.4s, v21.4s, v6.4s -add v21.4s, v21.4s, v6.4s -sqrdmulh v6.4S, v8.4S, v9.s[2] -mul v8.4S, v8.4S,v7.s[2] -mla v8.4S, v6.4S, v31.s[0] -sub v6.4s, v5.4s, v8.4s -add v5.4s, v5.4s, v8.4s -str q21, [x0, #192] -str q10, [x0, #208] -str q5, [x0, #224] -str q6, [x0, #240] -ldr q9, [x0, #288] -ldr q7, [x0, #304] -ldr q6, [x0, #256] -ldr q5, [x0, #272] -sqrdmulh v10.4S, v9.4S, v3.s[0] -mul v9.4S, v9.4S,v4.s[0] -mla v9.4S, v10.4S, v31.s[0] -sub v10.4s, v6.4s, v9.4s -add v6.4s, v6.4s, v9.4s -sqrdmulh v9.4S, v7.4S, v3.s[0] -mul v7.4S, v7.4S,v4.s[0] -mla v7.4S, v9.4S, v31.s[0] -sub v9.4s, v5.4s, v7.4s -add v5.4s, v5.4s, v7.4s -ldr q7, [x17, #+288] -ldr q21, [x17, #+304] -sqrdmulh v8.4S, v5.4S, v3.s[1] -mul v5.4S, v5.4S,v4.s[1] -mla v5.4S, v8.4S, v31.s[0] -sub v8.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v9.4S, v3.s[2] -mul v9.4S, v9.4S,v4.s[2] -mla v9.4S, v5.4S, v31.s[0] -sub v5.4s, v10.4s, v9.4s -add v10.4s, v10.4s, v9.4s -str q6, [x0, #256] -str q8, [x0, #272] -str q10, [x0, #288] -str q5, [x0, #304] -ldr q3, [x0, #352] -ldr q4, [x0, #368] -ldr q5, [x0, #320] -ldr q10, [x0, #336] -sqrdmulh v8.4S, v3.4S, v21.s[0] -mul v3.4S, v3.4S,v7.s[0] -mla v3.4S, v8.4S, v31.s[0] -sub v8.4s, v5.4s, v3.4s -add v5.4s, v5.4s, v3.4s -sqrdmulh v3.4S, v4.4S, v21.s[0] -mul v4.4S, v4.4S,v7.s[0] -mla v4.4S, v3.4S, v31.s[0] -sub v3.4s, v10.4s, v4.4s -add v10.4s, v10.4s, v4.4s -ldr q4, [x17, #+320] -ldr q6, [x17, #+336] -sqrdmulh v9.4S, v10.4S, v21.s[1] -mul v10.4S, v10.4S,v7.s[1] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v5.4s, v10.4s -add v5.4s, v5.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v21.s[2] -mul v3.4S, v3.4S,v7.s[2] -mla v3.4S, v10.4S, v31.s[0] -sub v10.4s, v8.4s, v3.4s -add v8.4s, v8.4s, v3.4s -str q5, [x0, #320] -str q9, [x0, #336] -str q8, [x0, #352] -str q10, [x0, #368] -ldr q21, [x0, #416] -ldr q7, [x0, #432] -ldr q10, [x0, #384] -ldr q8, [x0, #400] -sqrdmulh v9.4S, v21.4S, v6.s[0] -mul v21.4S, v21.4S,v4.s[0] -mla v21.4S, v9.4S, v31.s[0] -sub v9.4s, v10.4s, v21.4s -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v7.4S, v6.s[0] -mul v7.4S, v7.4S,v4.s[0] -mla v7.4S, v21.4S, v31.s[0] -sub v21.4s, v8.4s, v7.4s -add v8.4s, v8.4s, v7.4s -ldr q7, [x17, #+352] -ldr q5, [x17, #+368] -sqrdmulh v3.4S, v8.4S, v6.s[1] -mul v8.4S, v8.4S,v4.s[1] -mla v8.4S, v3.4S, v31.s[0] -sub v3.4s, v10.4s, v8.4s -add v10.4s, v10.4s, v8.4s -sqrdmulh v8.4S, v21.4S, v6.s[2] -mul v21.4S, v21.4S,v4.s[2] -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v9.4s, v21.4s -add v9.4s, v9.4s, v21.4s -str q10, [x0, #384] -str q3, [x0, #400] -str q9, [x0, #416] -str q8, [x0, #432] -ldr q6, [x0, #480] -ldr q4, [x0, #496] -ldr q8, [x0, #448] -ldr q9, [x0, #464] -sqrdmulh v3.4S, v6.4S, v5.s[0] -mul v6.4S, v6.4S,v7.s[0] -mla v6.4S, v3.4S, v31.s[0] -sub v3.4s, v8.4s, v6.4s -add v8.4s, v8.4s, v6.4s -sqrdmulh v6.4S, v4.4S, v5.s[0] -mul v4.4S, v4.4S,v7.s[0] -mla v4.4S, v6.4S, v31.s[0] -sub v6.4s, v9.4s, v4.4s -add v9.4s, v9.4s, v4.4s -ldr q4, [x17, #+384] -ldr q10, [x17, #+400] -sqrdmulh v21.4S, v9.4S, v5.s[1] -mul v9.4S, v9.4S,v7.s[1] -mla v9.4S, v21.4S, v31.s[0] -sub v21.4s, v8.4s, v9.4s -add v8.4s, v8.4s, v9.4s -sqrdmulh v9.4S, v6.4S, v5.s[2] -mul v6.4S, v6.4S,v7.s[2] -mla v6.4S, v9.4S, v31.s[0] -sub v9.4s, v3.4s, v6.4s -add v3.4s, v3.4s, v6.4s -str q8, [x0, #448] -str q21, [x0, #464] -str q3, [x0, #480] -str q9, [x0, #496] -ldr q5, [x0, #544] -ldr q7, [x0, #560] -ldr q9, [x0, #512] -ldr q3, [x0, #528] -sqrdmulh v21.4S, v5.4S, v10.s[0] -mul v5.4S, v5.4S,v4.s[0] -mla v5.4S, v21.4S, v31.s[0] -sub v21.4s, v9.4s, v5.4s -add v9.4s, v9.4s, v5.4s -sqrdmulh v5.4S, v7.4S, v10.s[0] -mul v7.4S, v7.4S,v4.s[0] -mla v7.4S, v5.4S, v31.s[0] -sub v5.4s, v3.4s, v7.4s -add v3.4s, v3.4s, v7.4s -ldr q7, [x17, #+416] -ldr q8, [x17, #+432] -sqrdmulh v6.4S, v3.4S, v10.s[1] -mul v3.4S, v3.4S,v4.s[1] -mla v3.4S, v6.4S, v31.s[0] -sub v6.4s, v9.4s, v3.4s -add v9.4s, v9.4s, v3.4s -sqrdmulh v3.4S, v5.4S, v10.s[2] -mul v5.4S, v5.4S,v4.s[2] -mla v5.4S, v3.4S, v31.s[0] -sub v3.4s, v21.4s, v5.4s -add v21.4s, v21.4s, v5.4s -str q9, [x0, #512] -str q6, [x0, #528] -str q21, [x0, #544] -str q3, [x0, #560] -ldr q10, [x0, #608] -ldr q4, [x0, #624] -ldr q3, [x0, #576] -ldr q21, [x0, #592] -sqrdmulh v6.4S, v10.4S, v8.s[0] -mul v10.4S, v10.4S,v7.s[0] -mla v10.4S, v6.4S, v31.s[0] -sub v6.4s, v3.4s, v10.4s -add v3.4s, v3.4s, v10.4s -sqrdmulh v10.4S, v4.4S, v8.s[0] -mul v4.4S, v4.4S,v7.s[0] -mla v4.4S, v10.4S, v31.s[0] -sub v10.4s, v21.4s, v4.4s -add v21.4s, v21.4s, v4.4s -ldr q4, [x17, #+448] -ldr q9, [x17, #+464] -sqrdmulh v5.4S, v21.4S, v8.s[1] -mul v21.4S, v21.4S,v7.s[1] -mla v21.4S, v5.4S, v31.s[0] -sub v5.4s, v3.4s, v21.4s -add v3.4s, v3.4s, v21.4s -sqrdmulh v21.4S, v10.4S, v8.s[2] -mul v10.4S, v10.4S,v7.s[2] -mla v10.4S, v21.4S, v31.s[0] -sub v21.4s, v6.4s, v10.4s -add v6.4s, v6.4s, v10.4s -str q3, [x0, #576] -str q5, [x0, #592] -str q6, [x0, #608] -str q21, [x0, #624] -ldr q8, [x0, #672] -ldr q7, [x0, #688] -ldr q21, [x0, #640] -ldr q6, [x0, #656] -sqrdmulh v5.4S, v8.4S, v9.s[0] -mul v8.4S, v8.4S,v4.s[0] -mla v8.4S, v5.4S, v31.s[0] -sub v5.4s, v21.4s, v8.4s -add v21.4s, v21.4s, v8.4s -sqrdmulh v8.4S, v7.4S, v9.s[0] -mul v7.4S, v7.4S,v4.s[0] -mla v7.4S, v8.4S, v31.s[0] -sub v8.4s, v6.4s, v7.4s -add v6.4s, v6.4s, v7.4s -ldr q7, [x17, #+480] -ldr q3, [x17, #+496] -sqrdmulh v10.4S, v6.4S, v9.s[1] -mul v6.4S, v6.4S,v4.s[1] -mla v6.4S, v10.4S, v31.s[0] -sub v10.4s, v21.4s, v6.4s -add v21.4s, v21.4s, v6.4s -sqrdmulh v6.4S, v8.4S, v9.s[2] -mul v8.4S, v8.4S,v4.s[2] -mla v8.4S, v6.4S, v31.s[0] -sub v6.4s, v5.4s, v8.4s -add v5.4s, v5.4s, v8.4s -str q21, [x0, #640] -str q10, [x0, #656] -str q5, [x0, #672] -str q6, [x0, #688] -ldr q9, [x0, #736] -ldr q4, [x0, #752] -ldr q6, [x0, #704] -ldr q5, [x0, #720] -sqrdmulh v10.4S, v9.4S, v3.s[0] -mul v9.4S, v9.4S,v7.s[0] -mla v9.4S, v10.4S, v31.s[0] -sub v10.4s, v6.4s, v9.4s -add v6.4s, v6.4s, v9.4s -sqrdmulh v9.4S, v4.4S, v3.s[0] -mul v4.4S, v4.4S,v7.s[0] -mla v4.4S, v9.4S, v31.s[0] -sub v9.4s, v5.4s, v4.4s -add v5.4s, v5.4s, v4.4s -ldr q4, [x17, #+512] -ldr q21, [x17, #+528] -sqrdmulh v8.4S, v5.4S, v3.s[1] -mul v5.4S, v5.4S,v7.s[1] -mla v5.4S, v8.4S, v31.s[0] -sub v8.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v9.4S, v3.s[2] -mul v9.4S, v9.4S,v7.s[2] -mla v9.4S, v5.4S, v31.s[0] -sub v5.4s, v10.4s, v9.4s -add v10.4s, v10.4s, v9.4s -str q6, [x0, #704] -str q8, [x0, #720] -str q10, [x0, #736] -str q5, [x0, #752] -ldr q3, [x0, #800] -ldr q7, [x0, #816] -ldr q5, [x0, #768] -ldr q10, [x0, #784] -sqrdmulh v8.4S, v3.4S, v21.s[0] -mul v3.4S, v3.4S,v4.s[0] -mla v3.4S, v8.4S, v31.s[0] -sub v8.4s, v5.4s, v3.4s -add v5.4s, v5.4s, v3.4s -sqrdmulh v3.4S, v7.4S, v21.s[0] -mul v7.4S, v7.4S,v4.s[0] -mla v7.4S, v3.4S, v31.s[0] -sub v3.4s, v10.4s, v7.4s -add v10.4s, v10.4s, v7.4s -ldr q7, [x17, #+544] -ldr q6, [x17, #+560] -sqrdmulh v9.4S, v10.4S, v21.s[1] -mul v10.4S, v10.4S,v4.s[1] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v5.4s, v10.4s -add v5.4s, v5.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v21.s[2] -mul v3.4S, v3.4S,v4.s[2] -mla v3.4S, v10.4S, v31.s[0] -sub v10.4s, v8.4s, v3.4s -add v8.4s, v8.4s, v3.4s -str q5, [x0, #768] -str q9, [x0, #784] -str q8, [x0, #800] -str q10, [x0, #816] -ldr q21, [x0, #864] -ldr q4, [x0, #880] -ldr q10, [x0, #832] -ldr q8, [x0, #848] -sqrdmulh v9.4S, v21.4S, v6.s[0] -mul v21.4S, v21.4S,v7.s[0] -mla v21.4S, v9.4S, v31.s[0] -sub v9.4s, v10.4s, v21.4s -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v4.4S, v6.s[0] -mul v4.4S, v4.4S,v7.s[0] -mla v4.4S, v21.4S, v31.s[0] -sub v21.4s, v8.4s, v4.4s -add v8.4s, v8.4s, v4.4s -ldr q4, [x17, #+576] -ldr q5, [x17, #+592] -sqrdmulh v3.4S, v8.4S, v6.s[1] -mul v8.4S, v8.4S,v7.s[1] -mla v8.4S, v3.4S, v31.s[0] -sub v3.4s, v10.4s, v8.4s -add v10.4s, v10.4s, v8.4s -sqrdmulh v8.4S, v21.4S, v6.s[2] -mul v21.4S, v21.4S,v7.s[2] -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v9.4s, v21.4s -add v9.4s, v9.4s, v21.4s -str q10, [x0, #832] -str q3, [x0, #848] -str q9, [x0, #864] -str q8, [x0, #880] -ldr q6, [x0, #928] -ldr q7, [x0, #944] -ldr q8, [x0, #896] -ldr q9, [x0, #912] -sqrdmulh v3.4S, v6.4S, v5.s[0] -mul v6.4S, v6.4S,v4.s[0] -mla v6.4S, v3.4S, v31.s[0] -sub v3.4s, v8.4s, v6.4s -add v8.4s, v8.4s, v6.4s -sqrdmulh v6.4S, v7.4S, v5.s[0] -mul v7.4S, v7.4S,v4.s[0] -mla v7.4S, v6.4S, v31.s[0] -sub v6.4s, v9.4s, v7.4s -add v9.4s, v9.4s, v7.4s -ldr q7, [x17, #+608] -ldr q10, [x17, #+624] -sqrdmulh v21.4S, v9.4S, v5.s[1] -mul v9.4S, v9.4S,v4.s[1] -mla v9.4S, v21.4S, v31.s[0] -sub v21.4s, v8.4s, v9.4s -add v8.4s, v8.4s, v9.4s -sqrdmulh v9.4S, v6.4S, v5.s[2] -mul v6.4S, v6.4S,v4.s[2] -mla v6.4S, v9.4S, v31.s[0] -sub v9.4s, v3.4s, v6.4s -add v3.4s, v3.4s, v6.4s -str q8, [x0, #896] -str q21, [x0, #912] -str q3, [x0, #928] -str q9, [x0, #944] -ldr q5, [x0, #992] -ldr q4, [x0, #1008] -ldr q9, [x0, #960] -ldr q3, [x0, #976] -sqrdmulh v21.4S, v5.4S, v10.s[0] -mul v5.4S, v5.4S,v7.s[0] -mla v5.4S, v21.4S, v31.s[0] -sub v21.4s, v9.4s, v5.4s -add v9.4s, v9.4s, v5.4s -sqrdmulh v5.4S, v4.4S, v10.s[0] -mul v4.4S, v4.4S,v7.s[0] -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v3.4s, v4.4s -add v3.4s, v3.4s, v4.4s -sqrdmulh v4.4S, v3.4S, v10.s[1] -mul v3.4S, v3.4S,v7.s[1] -mla v3.4S, v4.4S, v31.s[0] -sub v4.4s, v9.4s, v3.4s -add v9.4s, v9.4s, v3.4s -sqrdmulh v3.4S, v5.4S, v10.s[2] -mul v5.4S, v5.4S,v7.s[2] -mla v5.4S, v3.4S, v31.s[0] -sub v3.4s, v21.4s, v5.4s -add v21.4s, v21.4s, v5.4s -str q9, [x0, #960] -str q4, [x0, #976] -str q21, [x0, #992] -str q3, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1464 -// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_0.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_0.s deleted file mode 100644 index 846ba45..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_0.s +++ /dev/null @@ -1,1494 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_0_z4_0 -.global _ntt_u32_incomplete_neon_asm_var_4_2_0_z4_0 -ntt_u32_incomplete_neon_asm_var_4_2_0_z4_0: -_ntt_u32_incomplete_neon_asm_var_4_2_0_z4_0: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #800] -ldr q21, [x0, #864] -ldr q20, [x0, #928] -ldr q19, [x0, #992] -ldr q18, [x0, #288] -ldr q17, [x0, #352] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -ldr q2, [x0, #544] -ldr q1, [x0, #608] -ldr q0, [x0, #672] -ldr q15, [x0, #736] -ldr q14, [x0, #32] -ldr q13, [x0, #96] -ldr q12, [x0, #160] -ldr q11, [x0, #224] -sqrdmulh v10.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -mla v22.4S, v10.4S, v31.s[0] -sub v10.4s, v18.4s, v22.4s -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -mla v20.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -mla v19.4S, v20.4S, v31.s[0] -sub v20.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -mla v2.4S, v19.4S, v31.s[0] -sub v19.4s, v14.4s, v2.4s -add v14.4s, v14.4s, v2.4s -sqrdmulh v2.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -mla v1.4S, v2.4S, v31.s[0] -sub v2.4s, v13.4s, v1.4s -add v13.4s, v13.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v29.s[0] -mul v0.4S, v0.4S,v30.s[0] -mla v0.4S, v1.4S, v31.s[0] -sub v1.4s, v12.4s, v0.4s -add v12.4s, v12.4s, v0.4s -sqrdmulh v0.4S, v15.4S, v29.s[0] -mul v15.4S, v15.4S,v30.s[0] -mla v15.4S, v0.4S, v31.s[0] -sub v0.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v12.4s, v16.4s -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -mla v3.4S, v16.4S, v31.s[0] -sub v16.4s, v11.4s, v3.4s -add v11.4s, v11.4s, v3.4s -sqrdmulh v3.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -mla v18.4S, v3.4S, v31.s[0] -sub v3.4s, v14.4s, v18.4s -add v14.4s, v14.4s, v18.4s -sqrdmulh v18.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -mla v17.4S, v18.4S, v31.s[0] -sub v18.4s, v13.4s, v17.4s -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -mla v21.4S, v17.4S, v31.s[0] -sub v17.4s, v1.4s, v21.4s -add v1.4s, v1.4s, v21.4s -sqrdmulh v21.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -mla v20.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v20.4s -add v0.4s, v0.4s, v20.4s -sqrdmulh v20.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -mla v10.4S, v20.4S, v31.s[0] -sub v20.4s, v19.4s, v10.4s -add v19.4s, v19.4s, v10.4s -sqrdmulh v10.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -mla v22.4S, v10.4S, v31.s[0] -sub v10.4s, v2.4s, v22.4s -add v2.4s, v2.4s, v22.4s -sqrdmulh v22.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -mla v12.4S, v22.4S, v31.s[0] -sub v22.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -sqrdmulh v12.4S, v11.4S, v27.s[0] -mul v11.4S, v11.4S,v28.s[0] -mla v11.4S, v12.4S, v31.s[0] -sub v12.4s, v13.4s, v11.4s -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -mla v15.4S, v11.4S, v31.s[0] -sub v11.4s, v3.4s, v15.4s -add v3.4s, v3.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v27.s[1] -mul v16.4S, v16.4S,v28.s[1] -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v18.4s, v16.4s -add v18.4s, v18.4s, v16.4s -sqrdmulh v16.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -mla v1.4S, v16.4S, v31.s[0] -sub v16.4s, v19.4s, v1.4s -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v27.s[2] -mul v0.4S, v0.4S,v28.s[2] -mla v0.4S, v1.4S, v31.s[0] -sub v1.4s, v2.4s, v0.4s -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v17.4S, v27.s[3] -mul v17.4S, v17.4S,v28.s[3] -mla v17.4S, v0.4S, v31.s[0] -sub v0.4s, v20.4s, v17.4s -add v20.4s, v20.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -mla v21.4S, v17.4S, v31.s[0] -sub v17.4s, v10.4s, v21.4s -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v13.4S, v25.s[0] -mul v13.4S, v13.4S,v26.s[0] -mla v13.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v13.4s -add v14.4s, v14.4s, v13.4s -sqrdmulh v13.4S, v12.4S, v25.s[1] -mul v12.4S, v12.4S,v26.s[1] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v18.4S, v25.s[2] -mul v18.4S, v18.4S,v26.s[2] -mla v18.4S, v12.4S, v31.s[0] -sub v12.4s, v3.4s, v18.4s -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v15.4S, v25.s[3] -mul v15.4S, v15.4S,v26.s[3] -mla v15.4S, v18.4S, v31.s[0] -sub v18.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v23.s[0] -mul v2.4S, v2.4S,v24.s[0] -mla v2.4S, v15.4S, v31.s[0] -sub v15.4s, v19.4s, v2.4s -add v19.4s, v19.4s, v2.4s -sqrdmulh v2.4S, v1.4S, v23.s[1] -mul v1.4S, v1.4S,v24.s[1] -mla v1.4S, v2.4S, v31.s[0] -sub v2.4s, v16.4s, v1.4s -add v16.4s, v16.4s, v1.4s -sqrdmulh v1.4S, v10.4S, v23.s[2] -mul v10.4S, v10.4S,v24.s[2] -mla v10.4S, v1.4S, v31.s[0] -sub v1.4s, v20.4s, v10.4s -add v20.4s, v20.4s, v10.4s -sqrdmulh v10.4S, v17.4S, v23.s[3] -mul v17.4S, v17.4S,v24.s[3] -mla v17.4S, v10.4S, v31.s[0] -sub v10.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -str q14, [x0, #32] -str q21, [x0, #96] -str q22, [x0, #160] -str q13, [x0, #224] -str q3, [x0, #288] -str q12, [x0, #352] -str q11, [x0, #416] -str q18, [x0, #480] -str q19, [x0, #544] -str q15, [x0, #608] -str q16, [x0, #672] -str q2, [x0, #736] -str q20, [x0, #800] -str q1, [x0, #864] -str q0, [x0, #928] -str q10, [x0, #992] -ldr q10, [x0, #816] -ldr q0, [x0, #880] -ldr q1, [x0, #944] -ldr q20, [x0, #1008] -ldr q2, [x0, #304] -ldr q16, [x0, #368] -ldr q15, [x0, #432] -ldr q19, [x0, #496] -ldr q18, [x0, #560] -ldr q11, [x0, #624] -ldr q12, [x0, #688] -ldr q3, [x0, #752] -ldr q13, [x0, #48] -ldr q22, [x0, #112] -ldr q21, [x0, #176] -ldr q14, [x0, #240] -sqrdmulh v17.4S, v10.4S, v29.s[0] -mul v10.4S, v10.4S,v30.s[0] -mla v10.4S, v17.4S, v31.s[0] -sub v17.4s, v2.4s, v10.4s -add v2.4s, v2.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v29.s[0] -mul v0.4S, v0.4S,v30.s[0] -mla v0.4S, v10.4S, v31.s[0] -sub v10.4s, v16.4s, v0.4s -add v16.4s, v16.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -mla v1.4S, v0.4S, v31.s[0] -sub v0.4s, v15.4s, v1.4s -add v15.4s, v15.4s, v1.4s -sqrdmulh v1.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -mla v20.4S, v1.4S, v31.s[0] -sub v1.4s, v19.4s, v20.4s -add v19.4s, v19.4s, v20.4s -sqrdmulh v20.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -mla v18.4S, v20.4S, v31.s[0] -sub v20.4s, v13.4s, v18.4s -add v13.4s, v13.4s, v18.4s -sqrdmulh v18.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v22.4s, v11.4s -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -mla v12.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -mla v3.4S, v12.4S, v31.s[0] -sub v12.4s, v14.4s, v3.4s -add v14.4s, v14.4s, v3.4s -sqrdmulh v3.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -mla v15.4S, v3.4S, v31.s[0] -sub v3.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -sqrdmulh v15.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -mla v19.4S, v15.4S, v31.s[0] -sub v15.4s, v14.4s, v19.4s -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v2.4S, v29.s[1] -mul v2.4S, v2.4S,v30.s[1] -mla v2.4S, v19.4S, v31.s[0] -sub v19.4s, v13.4s, v2.4s -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -mla v16.4S, v2.4S, v31.s[0] -sub v2.4s, v22.4s, v16.4s -add v22.4s, v22.4s, v16.4s -sqrdmulh v16.4S, v0.4S, v29.s[2] -mul v0.4S, v0.4S,v30.s[2] -mla v0.4S, v16.4S, v31.s[0] -sub v16.4s, v11.4s, v0.4s -add v11.4s, v11.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -mla v1.4S, v0.4S, v31.s[0] -sub v0.4s, v12.4s, v1.4s -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -mla v17.4S, v1.4S, v31.s[0] -sub v1.4s, v20.4s, v17.4s -add v20.4s, v20.4s, v17.4s -sqrdmulh v17.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -mla v10.4S, v17.4S, v31.s[0] -sub v17.4s, v18.4s, v10.4s -add v18.4s, v18.4s, v10.4s -sqrdmulh v10.4S, v21.4S, v27.s[0] -mul v21.4S, v21.4S,v28.s[0] -mla v21.4S, v10.4S, v31.s[0] -sub v10.4s, v13.4s, v21.4s -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -mla v14.4S, v21.4S, v31.s[0] -sub v21.4s, v22.4s, v14.4s -add v22.4s, v22.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -mla v3.4S, v14.4S, v31.s[0] -sub v14.4s, v19.4s, v3.4s -add v19.4s, v19.4s, v3.4s -sqrdmulh v3.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -mla v15.4S, v3.4S, v31.s[0] -sub v3.4s, v2.4s, v15.4s -add v2.4s, v2.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v27.s[2] -mul v11.4S, v11.4S,v28.s[2] -mla v11.4S, v15.4S, v31.s[0] -sub v15.4s, v20.4s, v11.4s -add v20.4s, v20.4s, v11.4s -sqrdmulh v11.4S, v12.4S, v27.s[2] -mul v12.4S, v12.4S,v28.s[2] -mla v12.4S, v11.4S, v31.s[0] -sub v11.4s, v18.4s, v12.4s -add v18.4s, v18.4s, v12.4s -sqrdmulh v12.4S, v16.4S, v27.s[3] -mul v16.4S, v16.4S,v28.s[3] -mla v16.4S, v12.4S, v31.s[0] -sub v12.4s, v1.4s, v16.4s -add v1.4s, v1.4s, v16.4s -sqrdmulh v16.4S, v0.4S, v27.s[3] -mul v0.4S, v0.4S,v28.s[3] -mla v0.4S, v16.4S, v31.s[0] -sub v16.4s, v17.4s, v0.4s -add v17.4s, v17.4s, v0.4s -sqrdmulh v0.4S, v22.4S, v25.s[0] -mul v22.4S, v22.4S,v26.s[0] -mla v22.4S, v0.4S, v31.s[0] -sub v0.4s, v13.4s, v22.4s -add v13.4s, v13.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v25.s[1] -mul v21.4S, v21.4S,v26.s[1] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v10.4s, v21.4s -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v2.4S, v25.s[2] -mul v2.4S, v2.4S,v26.s[2] -mla v2.4S, v21.4S, v31.s[0] -sub v21.4s, v19.4s, v2.4s -add v19.4s, v19.4s, v2.4s -sqrdmulh v2.4S, v3.4S, v25.s[3] -mul v3.4S, v3.4S,v26.s[3] -mla v3.4S, v2.4S, v31.s[0] -sub v2.4s, v14.4s, v3.4s -add v14.4s, v14.4s, v3.4s -sqrdmulh v3.4S, v18.4S, v23.s[0] -mul v18.4S, v18.4S,v24.s[0] -mla v18.4S, v3.4S, v31.s[0] -sub v3.4s, v20.4s, v18.4s -add v20.4s, v20.4s, v18.4s -sqrdmulh v18.4S, v11.4S, v23.s[1] -mul v11.4S, v11.4S,v24.s[1] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v15.4s, v11.4s -add v15.4s, v15.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v23.s[2] -mul v17.4S, v17.4S,v24.s[2] -mla v17.4S, v11.4S, v31.s[0] -sub v11.4s, v1.4s, v17.4s -add v1.4s, v1.4s, v17.4s -sqrdmulh v17.4S, v16.4S, v23.s[3] -mul v16.4S, v16.4S,v24.s[3] -mla v16.4S, v17.4S, v31.s[0] -sub v17.4s, v12.4s, v16.4s -add v12.4s, v12.4s, v16.4s -str q13, [x0, #48] -str q0, [x0, #112] -str q10, [x0, #176] -str q22, [x0, #240] -str q19, [x0, #304] -str q21, [x0, #368] -str q14, [x0, #432] -str q2, [x0, #496] -str q20, [x0, #560] -str q3, [x0, #624] -str q15, [x0, #688] -str q18, [x0, #752] -str q1, [x0, #816] -str q11, [x0, #880] -str q12, [x0, #944] -str q17, [x0, #1008] -ldr q17, [x0, #768] -ldr q12, [x0, #832] -ldr q11, [x0, #896] -ldr q1, [x0, #960] -ldr q18, [x0, #256] -ldr q15, [x0, #320] -ldr q3, [x0, #384] -ldr q20, [x0, #448] -ldr q2, [x0, #512] -ldr q14, [x0, #576] -ldr q21, [x0, #640] -ldr q19, [x0, #704] -ldr q22, [x0, #0] -ldr q10, [x0, #64] -ldr q0, [x0, #128] -ldr q13, [x0, #192] -sqrdmulh v16.4S, v17.4S, v29.s[0] -mul v17.4S, v17.4S,v30.s[0] -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v18.4s, v17.4s -add v18.4s, v18.4s, v17.4s -sqrdmulh v17.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -mla v12.4S, v17.4S, v31.s[0] -sub v17.4s, v15.4s, v12.4s -add v15.4s, v15.4s, v12.4s -sqrdmulh v12.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -mla v11.4S, v12.4S, v31.s[0] -sub v12.4s, v3.4s, v11.4s -add v3.4s, v3.4s, v11.4s -sqrdmulh v11.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -mla v1.4S, v11.4S, v31.s[0] -sub v11.4s, v20.4s, v1.4s -add v20.4s, v20.4s, v1.4s -sqrdmulh v1.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -mla v2.4S, v1.4S, v31.s[0] -sub v1.4s, v22.4s, v2.4s -add v22.4s, v22.4s, v2.4s -sqrdmulh v2.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -mla v14.4S, v2.4S, v31.s[0] -sub v2.4s, v10.4s, v14.4s -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -mla v21.4S, v14.4S, v31.s[0] -sub v14.4s, v0.4s, v21.4s -add v0.4s, v0.4s, v21.4s -sqrdmulh v21.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v13.4s, v19.4s -add v13.4s, v13.4s, v19.4s -sqrdmulh v19.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -mla v3.4S, v19.4S, v31.s[0] -sub v19.4s, v0.4s, v3.4s -add v0.4s, v0.4s, v3.4s -sqrdmulh v3.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -mla v20.4S, v3.4S, v31.s[0] -sub v3.4s, v13.4s, v20.4s -add v13.4s, v13.4s, v20.4s -sqrdmulh v20.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -mla v18.4S, v20.4S, v31.s[0] -sub v20.4s, v22.4s, v18.4s -add v22.4s, v22.4s, v18.4s -sqrdmulh v18.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -mla v15.4S, v18.4S, v31.s[0] -sub v18.4s, v10.4s, v15.4s -add v10.4s, v10.4s, v15.4s -sqrdmulh v15.4S, v12.4S, v29.s[2] -mul v12.4S, v12.4S,v30.s[2] -mla v12.4S, v15.4S, v31.s[0] -sub v15.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -sqrdmulh v12.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -mla v11.4S, v12.4S, v31.s[0] -sub v12.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -mla v16.4S, v11.4S, v31.s[0] -sub v11.4s, v1.4s, v16.4s -add v1.4s, v1.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v2.4s, v17.4s -add v2.4s, v2.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v27.s[0] -mul v0.4S, v0.4S,v28.s[0] -mla v0.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v0.4s -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v13.4S, v27.s[0] -mul v13.4S, v13.4S,v28.s[0] -mla v13.4S, v0.4S, v31.s[0] -sub v0.4s, v10.4s, v13.4s -add v10.4s, v10.4s, v13.4s -sqrdmulh v13.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -mla v19.4S, v13.4S, v31.s[0] -sub v13.4s, v20.4s, v19.4s -add v20.4s, v20.4s, v19.4s -sqrdmulh v19.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -mla v3.4S, v19.4S, v31.s[0] -sub v19.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v27.s[2] -mul v14.4S, v14.4S,v28.s[2] -mla v14.4S, v3.4S, v31.s[0] -sub v3.4s, v1.4s, v14.4s -add v1.4s, v1.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v27.s[2] -mul v21.4S, v21.4S,v28.s[2] -mla v21.4S, v14.4S, v31.s[0] -sub v14.4s, v2.4s, v21.4s -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -mla v15.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v12.4S, v27.s[3] -mul v12.4S, v12.4S,v28.s[3] -mla v12.4S, v15.4S, v31.s[0] -sub v15.4s, v16.4s, v12.4s -add v16.4s, v16.4s, v12.4s -sqrdmulh v12.4S, v10.4S, v25.s[0] -mul v10.4S, v10.4S,v26.s[0] -mla v10.4S, v12.4S, v31.s[0] -sub v12.4s, v22.4s, v10.4s -add v22.4s, v22.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v25.s[1] -mul v0.4S, v0.4S,v26.s[1] -mla v0.4S, v10.4S, v31.s[0] -sub v10.4s, v17.4s, v0.4s -add v17.4s, v17.4s, v0.4s -sqrdmulh v0.4S, v18.4S, v25.s[2] -mul v18.4S, v18.4S,v26.s[2] -mla v18.4S, v0.4S, v31.s[0] -sub v0.4s, v20.4s, v18.4s -add v20.4s, v20.4s, v18.4s -sqrdmulh v18.4S, v19.4S, v25.s[3] -mul v19.4S, v19.4S,v26.s[3] -mla v19.4S, v18.4S, v31.s[0] -sub v18.4s, v13.4s, v19.4s -add v13.4s, v13.4s, v19.4s -sqrdmulh v19.4S, v2.4S, v23.s[0] -mul v2.4S, v2.4S,v24.s[0] -mla v2.4S, v19.4S, v31.s[0] -sub v19.4s, v1.4s, v2.4s -add v1.4s, v1.4s, v2.4s -sqrdmulh v2.4S, v14.4S, v23.s[1] -mul v14.4S, v14.4S,v24.s[1] -mla v14.4S, v2.4S, v31.s[0] -sub v2.4s, v3.4s, v14.4s -add v3.4s, v3.4s, v14.4s -sqrdmulh v14.4S, v16.4S, v23.s[2] -mul v16.4S, v16.4S,v24.s[2] -mla v16.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v16.4s -add v11.4s, v11.4s, v16.4s -sqrdmulh v16.4S, v15.4S, v23.s[3] -mul v15.4S, v15.4S,v24.s[3] -mla v15.4S, v16.4S, v31.s[0] -sub v16.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -str q22, [x0, #0] -str q12, [x0, #64] -str q17, [x0, #128] -str q10, [x0, #192] -str q20, [x0, #256] -str q0, [x0, #320] -str q13, [x0, #384] -str q18, [x0, #448] -str q1, [x0, #512] -str q19, [x0, #576] -str q3, [x0, #640] -str q2, [x0, #704] -str q11, [x0, #768] -str q14, [x0, #832] -str q21, [x0, #896] -str q16, [x0, #960] -ldr q16, [x0, #784] -ldr q21, [x0, #848] -ldr q14, [x0, #912] -ldr q11, [x0, #976] -ldr q2, [x0, #272] -ldr q3, [x0, #336] -ldr q19, [x0, #400] -ldr q1, [x0, #464] -ldr q18, [x0, #528] -ldr q13, [x0, #592] -ldr q0, [x0, #656] -ldr q20, [x0, #720] -ldr q10, [x0, #16] -ldr q17, [x0, #80] -ldr q12, [x0, #144] -ldr q22, [x0, #208] -sqrdmulh v15.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -mla v21.4S, v16.4S, v31.s[0] -sub v16.4s, v3.4s, v21.4s -add v3.4s, v3.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -mla v14.4S, v21.4S, v31.s[0] -sub v21.4s, v19.4s, v14.4s -add v19.4s, v19.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -mla v11.4S, v14.4S, v31.s[0] -sub v14.4s, v1.4s, v11.4s -add v1.4s, v1.4s, v11.4s -sqrdmulh v11.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -mla v18.4S, v11.4S, v31.s[0] -sub v11.4s, v10.4s, v18.4s -add v10.4s, v10.4s, v18.4s -sqrdmulh v18.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -mla v13.4S, v18.4S, v31.s[0] -sub v18.4s, v17.4s, v13.4s -add v17.4s, v17.4s, v13.4s -sqrdmulh v13.4S, v0.4S, v29.s[0] -mul v0.4S, v0.4S,v30.s[0] -mla v0.4S, v13.4S, v31.s[0] -sub v13.4s, v12.4s, v0.4s -add v12.4s, v12.4s, v0.4s -sqrdmulh v0.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -mla v20.4S, v0.4S, v31.s[0] -sub v0.4s, v22.4s, v20.4s -add v22.4s, v22.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -mla v19.4S, v20.4S, v31.s[0] -sub v20.4s, v12.4s, v19.4s -add v12.4s, v12.4s, v19.4s -sqrdmulh v19.4S, v1.4S, v29.s[1] -mul v1.4S, v1.4S,v30.s[1] -mla v1.4S, v19.4S, v31.s[0] -sub v19.4s, v22.4s, v1.4s -add v22.4s, v22.4s, v1.4s -sqrdmulh v1.4S, v2.4S, v29.s[1] -mul v2.4S, v2.4S,v30.s[1] -mla v2.4S, v1.4S, v31.s[0] -sub v1.4s, v10.4s, v2.4s -add v10.4s, v10.4s, v2.4s -sqrdmulh v2.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -mla v3.4S, v2.4S, v31.s[0] -sub v2.4s, v17.4s, v3.4s -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -mla v21.4S, v3.4S, v31.s[0] -sub v3.4s, v13.4s, v21.4s -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -mla v14.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v14.4s -add v0.4s, v0.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -mla v15.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v18.4s, v16.4s -add v18.4s, v18.4s, v16.4s -sqrdmulh v16.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -mla v12.4S, v16.4S, v31.s[0] -sub v16.4s, v10.4s, v12.4s -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v22.4S, v27.s[0] -mul v22.4S, v22.4S,v28.s[0] -mla v22.4S, v12.4S, v31.s[0] -sub v12.4s, v17.4s, v22.4s -add v17.4s, v17.4s, v22.4s -sqrdmulh v22.4S, v20.4S, v27.s[1] -mul v20.4S, v20.4S,v28.s[1] -mla v20.4S, v22.4S, v31.s[0] -sub v22.4s, v1.4s, v20.4s -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -mla v19.4S, v20.4S, v31.s[0] -sub v20.4s, v2.4s, v19.4s -add v2.4s, v2.4s, v19.4s -sqrdmulh v19.4S, v13.4S, v27.s[2] -mul v13.4S, v13.4S,v28.s[2] -mla v13.4S, v19.4S, v31.s[0] -sub v19.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v0.4S, v27.s[2] -mul v0.4S, v0.4S,v28.s[2] -mla v0.4S, v13.4S, v31.s[0] -sub v13.4s, v18.4s, v0.4s -add v18.4s, v18.4s, v0.4s -sqrdmulh v0.4S, v3.4S, v27.s[3] -mul v3.4S, v3.4S,v28.s[3] -mla v3.4S, v0.4S, v31.s[0] -sub v0.4s, v14.4s, v3.4s -add v14.4s, v14.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -mla v21.4S, v3.4S, v31.s[0] -sub v3.4s, v15.4s, v21.4s -add v15.4s, v15.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v25.s[0] -mul v17.4S, v17.4S,v26.s[0] -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -sqrdmulh v17.4S, v12.4S, v25.s[1] -mul v12.4S, v12.4S,v26.s[1] -mla v12.4S, v17.4S, v31.s[0] -sub v17.4s, v16.4s, v12.4s -add v16.4s, v16.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v25.s[2] -mul v2.4S, v2.4S,v26.s[2] -mla v2.4S, v12.4S, v31.s[0] -sub v12.4s, v1.4s, v2.4s -add v1.4s, v1.4s, v2.4s -sqrdmulh v2.4S, v20.4S, v25.s[3] -mul v20.4S, v20.4S,v26.s[3] -mla v20.4S, v2.4S, v31.s[0] -sub v2.4s, v22.4s, v20.4s -add v22.4s, v22.4s, v20.4s -sqrdmulh v20.4S, v18.4S, v23.s[0] -mul v18.4S, v18.4S,v24.s[0] -mla v18.4S, v20.4S, v31.s[0] -sub v20.4s, v11.4s, v18.4s -add v11.4s, v11.4s, v18.4s -sqrdmulh v18.4S, v13.4S, v23.s[1] -mul v13.4S, v13.4S,v24.s[1] -mla v13.4S, v18.4S, v31.s[0] -sub v18.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -sqrdmulh v13.4S, v15.4S, v23.s[2] -mul v15.4S, v15.4S,v24.s[2] -mla v15.4S, v13.4S, v31.s[0] -sub v13.4s, v14.4s, v15.4s -add v14.4s, v14.4s, v15.4s -sqrdmulh v15.4S, v3.4S, v23.s[3] -mul v3.4S, v3.4S,v24.s[3] -mla v3.4S, v15.4S, v31.s[0] -sub v15.4s, v0.4s, v3.4s -add v0.4s, v0.4s, v3.4s -str q10, [x0, #16] -str q21, [x0, #80] -str q16, [x0, #144] -str q17, [x0, #208] -str q1, [x0, #272] -str q12, [x0, #336] -str q22, [x0, #400] -str q2, [x0, #464] -str q11, [x0, #528] -str q20, [x0, #592] -str q19, [x0, #656] -str q18, [x0, #720] -str q14, [x0, #784] -str q13, [x0, #848] -str q0, [x0, #912] -str q15, [x0, #976] -ldr q4, [x17, #+128] -ldr q5, [x17, #+144] -ldr q6, [x17, #+160] -ldr q7, [x17, #+176] -ldr q8, [x17, #+192] -ldr q9, [x17, #+208] -ldr q3, [x17, #+224] -ldr q10, [x17, #+240] -ldr q21, [x0, #32] -ldr q16, [x0, #48] -ldr q17, [x0, #0] -ldr q1, [x0, #16] -sqrdmulh v12.4S, v21.4S, v5.s[0] -mul v21.4S, v21.4S,v4.s[0] -mla v21.4S, v12.4S, v31.s[0] -sub v12.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v16.4S, v5.s[0] -mul v16.4S, v16.4S,v4.s[0] -mla v16.4S, v21.4S, v31.s[0] -sub v21.4s, v1.4s, v16.4s -add v1.4s, v1.4s, v16.4s -ldr q16, [x17, #+256] -ldr q22, [x17, #+272] -sqrdmulh v2.4S, v1.4S, v5.s[1] -mul v1.4S, v1.4S,v4.s[1] -mla v1.4S, v2.4S, v31.s[0] -sub v2.4s, v17.4s, v1.4s -add v17.4s, v17.4s, v1.4s -sqrdmulh v1.4S, v21.4S, v5.s[2] -mul v21.4S, v21.4S,v4.s[2] -mla v21.4S, v1.4S, v31.s[0] -sub v1.4s, v12.4s, v21.4s -add v12.4s, v12.4s, v21.4s -str q17, [x0, #0] -str q2, [x0, #16] -str q12, [x0, #32] -str q1, [x0, #48] -ldr q1, [x0, #96] -ldr q12, [x0, #112] -ldr q2, [x0, #64] -ldr q17, [x0, #80] -sqrdmulh v21.4S, v1.4S, v7.s[0] -mul v1.4S, v1.4S,v6.s[0] -mla v1.4S, v21.4S, v31.s[0] -sub v21.4s, v2.4s, v1.4s -add v2.4s, v2.4s, v1.4s -sqrdmulh v1.4S, v12.4S, v7.s[0] -mul v12.4S, v12.4S,v6.s[0] -mla v12.4S, v1.4S, v31.s[0] -sub v1.4s, v17.4s, v12.4s -add v17.4s, v17.4s, v12.4s -ldr q12, [x17, #+288] -ldr q11, [x17, #+304] -sqrdmulh v20.4S, v17.4S, v7.s[1] -mul v17.4S, v17.4S,v6.s[1] -mla v17.4S, v20.4S, v31.s[0] -sub v20.4s, v2.4s, v17.4s -add v2.4s, v2.4s, v17.4s -sqrdmulh v17.4S, v1.4S, v7.s[2] -mul v1.4S, v1.4S,v6.s[2] -mla v1.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v1.4s -add v21.4s, v21.4s, v1.4s -str q2, [x0, #64] -str q20, [x0, #80] -str q21, [x0, #96] -str q17, [x0, #112] -ldr q17, [x0, #160] -ldr q21, [x0, #176] -ldr q20, [x0, #128] -ldr q2, [x0, #144] -sqrdmulh v1.4S, v17.4S, v9.s[0] -mul v17.4S, v17.4S,v8.s[0] -mla v17.4S, v1.4S, v31.s[0] -sub v1.4s, v20.4s, v17.4s -add v20.4s, v20.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v9.s[0] -mul v21.4S, v21.4S,v8.s[0] -mla v21.4S, v17.4S, v31.s[0] -sub v17.4s, v2.4s, v21.4s -add v2.4s, v2.4s, v21.4s -ldr q21, [x17, #+320] -ldr q19, [x17, #+336] -sqrdmulh v18.4S, v2.4S, v9.s[1] -mul v2.4S, v2.4S,v8.s[1] -mla v2.4S, v18.4S, v31.s[0] -sub v18.4s, v20.4s, v2.4s -add v20.4s, v20.4s, v2.4s -sqrdmulh v2.4S, v17.4S, v9.s[2] -mul v17.4S, v17.4S,v8.s[2] -mla v17.4S, v2.4S, v31.s[0] -sub v2.4s, v1.4s, v17.4s -add v1.4s, v1.4s, v17.4s -str q20, [x0, #128] -str q18, [x0, #144] -str q1, [x0, #160] -str q2, [x0, #176] -ldr q2, [x0, #224] -ldr q1, [x0, #240] -ldr q18, [x0, #192] -ldr q20, [x0, #208] -sqrdmulh v17.4S, v2.4S, v10.s[0] -mul v2.4S, v2.4S,v3.s[0] -mla v2.4S, v17.4S, v31.s[0] -sub v17.4s, v18.4s, v2.4s -add v18.4s, v18.4s, v2.4s -sqrdmulh v2.4S, v1.4S, v10.s[0] -mul v1.4S, v1.4S,v3.s[0] -mla v1.4S, v2.4S, v31.s[0] -sub v2.4s, v20.4s, v1.4s -add v20.4s, v20.4s, v1.4s -ldr q1, [x17, #+352] -ldr q14, [x17, #+368] -sqrdmulh v13.4S, v20.4S, v10.s[1] -mul v20.4S, v20.4S,v3.s[1] -mla v20.4S, v13.4S, v31.s[0] -sub v13.4s, v18.4s, v20.4s -add v18.4s, v18.4s, v20.4s -sqrdmulh v20.4S, v2.4S, v10.s[2] -mul v2.4S, v2.4S,v3.s[2] -mla v2.4S, v20.4S, v31.s[0] -sub v20.4s, v17.4s, v2.4s -add v17.4s, v17.4s, v2.4s -str q18, [x0, #192] -str q13, [x0, #208] -str q17, [x0, #224] -str q20, [x0, #240] -ldr q20, [x0, #288] -ldr q17, [x0, #304] -ldr q13, [x0, #256] -ldr q18, [x0, #272] -sqrdmulh v2.4S, v20.4S, v22.s[0] -mul v20.4S, v20.4S,v16.s[0] -mla v20.4S, v2.4S, v31.s[0] -sub v2.4s, v13.4s, v20.4s -add v13.4s, v13.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v22.s[0] -mul v17.4S, v17.4S,v16.s[0] -mla v17.4S, v20.4S, v31.s[0] -sub v20.4s, v18.4s, v17.4s -add v18.4s, v18.4s, v17.4s -ldr q17, [x17, #+384] -ldr q0, [x17, #+400] -sqrdmulh v15.4S, v18.4S, v22.s[1] -mul v18.4S, v18.4S,v16.s[1] -mla v18.4S, v15.4S, v31.s[0] -sub v15.4s, v13.4s, v18.4s -add v13.4s, v13.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v22.s[2] -mul v20.4S, v20.4S,v16.s[2] -mla v20.4S, v18.4S, v31.s[0] -sub v18.4s, v2.4s, v20.4s -add v2.4s, v2.4s, v20.4s -str q13, [x0, #256] -str q15, [x0, #272] -str q2, [x0, #288] -str q18, [x0, #304] -ldr q5, [x0, #352] -ldr q4, [x0, #368] -ldr q18, [x0, #320] -ldr q2, [x0, #336] -sqrdmulh v15.4S, v5.4S, v11.s[0] -mul v5.4S, v5.4S,v12.s[0] -mla v5.4S, v15.4S, v31.s[0] -sub v15.4s, v18.4s, v5.4s -add v18.4s, v18.4s, v5.4s -sqrdmulh v5.4S, v4.4S, v11.s[0] -mul v4.4S, v4.4S,v12.s[0] -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v2.4s, v4.4s -add v2.4s, v2.4s, v4.4s -ldr q4, [x17, #+416] -ldr q13, [x17, #+432] -sqrdmulh v20.4S, v2.4S, v11.s[1] -mul v2.4S, v2.4S,v12.s[1] -mla v2.4S, v20.4S, v31.s[0] -sub v20.4s, v18.4s, v2.4s -add v18.4s, v18.4s, v2.4s -sqrdmulh v2.4S, v5.4S, v11.s[2] -mul v5.4S, v5.4S,v12.s[2] -mla v5.4S, v2.4S, v31.s[0] -sub v2.4s, v15.4s, v5.4s -add v15.4s, v15.4s, v5.4s -str q18, [x0, #320] -str q20, [x0, #336] -str q15, [x0, #352] -str q2, [x0, #368] -ldr q7, [x0, #416] -ldr q6, [x0, #432] -ldr q2, [x0, #384] -ldr q15, [x0, #400] -sqrdmulh v20.4S, v7.4S, v19.s[0] -mul v7.4S, v7.4S,v21.s[0] -mla v7.4S, v20.4S, v31.s[0] -sub v20.4s, v2.4s, v7.4s -add v2.4s, v2.4s, v7.4s -sqrdmulh v7.4S, v6.4S, v19.s[0] -mul v6.4S, v6.4S,v21.s[0] -mla v6.4S, v7.4S, v31.s[0] -sub v7.4s, v15.4s, v6.4s -add v15.4s, v15.4s, v6.4s -ldr q6, [x17, #+448] -ldr q18, [x17, #+464] -sqrdmulh v5.4S, v15.4S, v19.s[1] -mul v15.4S, v15.4S,v21.s[1] -mla v15.4S, v5.4S, v31.s[0] -sub v5.4s, v2.4s, v15.4s -add v2.4s, v2.4s, v15.4s -sqrdmulh v15.4S, v7.4S, v19.s[2] -mul v7.4S, v7.4S,v21.s[2] -mla v7.4S, v15.4S, v31.s[0] -sub v15.4s, v20.4s, v7.4s -add v20.4s, v20.4s, v7.4s -str q2, [x0, #384] -str q5, [x0, #400] -str q20, [x0, #416] -str q15, [x0, #432] -ldr q9, [x0, #480] -ldr q8, [x0, #496] -ldr q15, [x0, #448] -ldr q20, [x0, #464] -sqrdmulh v5.4S, v9.4S, v14.s[0] -mul v9.4S, v9.4S,v1.s[0] -mla v9.4S, v5.4S, v31.s[0] -sub v5.4s, v15.4s, v9.4s -add v15.4s, v15.4s, v9.4s -sqrdmulh v9.4S, v8.4S, v14.s[0] -mul v8.4S, v8.4S,v1.s[0] -mla v8.4S, v9.4S, v31.s[0] -sub v9.4s, v20.4s, v8.4s -add v20.4s, v20.4s, v8.4s -ldr q8, [x17, #+480] -ldr q2, [x17, #+496] -sqrdmulh v7.4S, v20.4S, v14.s[1] -mul v20.4S, v20.4S,v1.s[1] -mla v20.4S, v7.4S, v31.s[0] -sub v7.4s, v15.4s, v20.4s -add v15.4s, v15.4s, v20.4s -sqrdmulh v20.4S, v9.4S, v14.s[2] -mul v9.4S, v9.4S,v1.s[2] -mla v9.4S, v20.4S, v31.s[0] -sub v20.4s, v5.4s, v9.4s -add v5.4s, v5.4s, v9.4s -str q15, [x0, #448] -str q7, [x0, #464] -str q5, [x0, #480] -str q20, [x0, #496] -ldr q10, [x0, #544] -ldr q3, [x0, #560] -ldr q20, [x0, #512] -ldr q5, [x0, #528] -sqrdmulh v7.4S, v10.4S, v0.s[0] -mul v10.4S, v10.4S,v17.s[0] -mla v10.4S, v7.4S, v31.s[0] -sub v7.4s, v20.4s, v10.4s -add v20.4s, v20.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v0.s[0] -mul v3.4S, v3.4S,v17.s[0] -mla v3.4S, v10.4S, v31.s[0] -sub v10.4s, v5.4s, v3.4s -add v5.4s, v5.4s, v3.4s -ldr q3, [x17, #+512] -ldr q15, [x17, #+528] -sqrdmulh v9.4S, v5.4S, v0.s[1] -mul v5.4S, v5.4S,v17.s[1] -mla v5.4S, v9.4S, v31.s[0] -sub v9.4s, v20.4s, v5.4s -add v20.4s, v20.4s, v5.4s -sqrdmulh v5.4S, v10.4S, v0.s[2] -mul v10.4S, v10.4S,v17.s[2] -mla v10.4S, v5.4S, v31.s[0] -sub v5.4s, v7.4s, v10.4s -add v7.4s, v7.4s, v10.4s -str q20, [x0, #512] -str q9, [x0, #528] -str q7, [x0, #544] -str q5, [x0, #560] -ldr q22, [x0, #608] -ldr q16, [x0, #624] -ldr q5, [x0, #576] -ldr q7, [x0, #592] -sqrdmulh v9.4S, v22.4S, v13.s[0] -mul v22.4S, v22.4S,v4.s[0] -mla v22.4S, v9.4S, v31.s[0] -sub v9.4s, v5.4s, v22.4s -add v5.4s, v5.4s, v22.4s -sqrdmulh v22.4S, v16.4S, v13.s[0] -mul v16.4S, v16.4S,v4.s[0] -mla v16.4S, v22.4S, v31.s[0] -sub v22.4s, v7.4s, v16.4s -add v7.4s, v7.4s, v16.4s -ldr q16, [x17, #+544] -ldr q20, [x17, #+560] -sqrdmulh v10.4S, v7.4S, v13.s[1] -mul v7.4S, v7.4S,v4.s[1] -mla v7.4S, v10.4S, v31.s[0] -sub v10.4s, v5.4s, v7.4s -add v5.4s, v5.4s, v7.4s -sqrdmulh v7.4S, v22.4S, v13.s[2] -mul v22.4S, v22.4S,v4.s[2] -mla v22.4S, v7.4S, v31.s[0] -sub v7.4s, v9.4s, v22.4s -add v9.4s, v9.4s, v22.4s -str q5, [x0, #576] -str q10, [x0, #592] -str q9, [x0, #608] -str q7, [x0, #624] -ldr q11, [x0, #672] -ldr q12, [x0, #688] -ldr q7, [x0, #640] -ldr q9, [x0, #656] -sqrdmulh v10.4S, v11.4S, v18.s[0] -mul v11.4S, v11.4S,v6.s[0] -mla v11.4S, v10.4S, v31.s[0] -sub v10.4s, v7.4s, v11.4s -add v7.4s, v7.4s, v11.4s -sqrdmulh v11.4S, v12.4S, v18.s[0] -mul v12.4S, v12.4S,v6.s[0] -mla v12.4S, v11.4S, v31.s[0] -sub v11.4s, v9.4s, v12.4s -add v9.4s, v9.4s, v12.4s -ldr q12, [x17, #+576] -ldr q5, [x17, #+592] -sqrdmulh v22.4S, v9.4S, v18.s[1] -mul v9.4S, v9.4S,v6.s[1] -mla v9.4S, v22.4S, v31.s[0] -sub v22.4s, v7.4s, v9.4s -add v7.4s, v7.4s, v9.4s -sqrdmulh v9.4S, v11.4S, v18.s[2] -mul v11.4S, v11.4S,v6.s[2] -mla v11.4S, v9.4S, v31.s[0] -sub v9.4s, v10.4s, v11.4s -add v10.4s, v10.4s, v11.4s -str q7, [x0, #640] -str q22, [x0, #656] -str q10, [x0, #672] -str q9, [x0, #688] -ldr q19, [x0, #736] -ldr q21, [x0, #752] -ldr q9, [x0, #704] -ldr q10, [x0, #720] -sqrdmulh v22.4S, v19.4S, v2.s[0] -mul v19.4S, v19.4S,v8.s[0] -mla v19.4S, v22.4S, v31.s[0] -sub v22.4s, v9.4s, v19.4s -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v21.4S, v2.s[0] -mul v21.4S, v21.4S,v8.s[0] -mla v21.4S, v19.4S, v31.s[0] -sub v19.4s, v10.4s, v21.4s -add v10.4s, v10.4s, v21.4s -ldr q21, [x17, #+608] -ldr q7, [x17, #+624] -sqrdmulh v11.4S, v10.4S, v2.s[1] -mul v10.4S, v10.4S,v8.s[1] -mla v10.4S, v11.4S, v31.s[0] -sub v11.4s, v9.4s, v10.4s -add v9.4s, v9.4s, v10.4s -sqrdmulh v10.4S, v19.4S, v2.s[2] -mul v19.4S, v19.4S,v8.s[2] -mla v19.4S, v10.4S, v31.s[0] -sub v10.4s, v22.4s, v19.4s -add v22.4s, v22.4s, v19.4s -str q9, [x0, #704] -str q11, [x0, #720] -str q22, [x0, #736] -str q10, [x0, #752] -ldr q14, [x0, #800] -ldr q1, [x0, #816] -ldr q10, [x0, #768] -ldr q22, [x0, #784] -sqrdmulh v11.4S, v14.4S, v15.s[0] -mul v14.4S, v14.4S,v3.s[0] -mla v14.4S, v11.4S, v31.s[0] -sub v11.4s, v10.4s, v14.4s -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v1.4S, v15.s[0] -mul v1.4S, v1.4S,v3.s[0] -mla v1.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v1.4s -add v22.4s, v22.4s, v1.4s -sqrdmulh v1.4S, v22.4S, v15.s[1] -mul v22.4S, v22.4S,v3.s[1] -mla v22.4S, v1.4S, v31.s[0] -sub v1.4s, v10.4s, v22.4s -add v10.4s, v10.4s, v22.4s -sqrdmulh v22.4S, v14.4S, v15.s[2] -mul v14.4S, v14.4S,v3.s[2] -mla v14.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -str q10, [x0, #768] -str q1, [x0, #784] -str q11, [x0, #800] -str q22, [x0, #816] -ldr q0, [x0, #864] -ldr q17, [x0, #880] -ldr q22, [x0, #832] -ldr q11, [x0, #848] -sqrdmulh v1.4S, v0.4S, v20.s[0] -mul v0.4S, v0.4S,v16.s[0] -mla v0.4S, v1.4S, v31.s[0] -sub v1.4s, v22.4s, v0.4s -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v17.4S, v20.s[0] -mul v17.4S, v17.4S,v16.s[0] -mla v17.4S, v0.4S, v31.s[0] -sub v0.4s, v11.4s, v17.4s -add v11.4s, v11.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v20.s[1] -mul v11.4S, v11.4S,v16.s[1] -mla v11.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v11.4s -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v0.4S, v20.s[2] -mul v0.4S, v0.4S,v16.s[2] -mla v0.4S, v11.4S, v31.s[0] -sub v11.4s, v1.4s, v0.4s -add v1.4s, v1.4s, v0.4s -str q22, [x0, #832] -str q17, [x0, #848] -str q1, [x0, #864] -str q11, [x0, #880] -ldr q13, [x0, #928] -ldr q4, [x0, #944] -ldr q11, [x0, #896] -ldr q1, [x0, #912] -sqrdmulh v17.4S, v13.4S, v5.s[0] -mul v13.4S, v13.4S,v12.s[0] -mla v13.4S, v17.4S, v31.s[0] -sub v17.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v4.4S, v5.s[0] -mul v4.4S, v4.4S,v12.s[0] -mla v4.4S, v13.4S, v31.s[0] -sub v13.4s, v1.4s, v4.4s -add v1.4s, v1.4s, v4.4s -sqrdmulh v4.4S, v1.4S, v5.s[1] -mul v1.4S, v1.4S,v12.s[1] -mla v1.4S, v4.4S, v31.s[0] -sub v4.4s, v11.4s, v1.4s -add v11.4s, v11.4s, v1.4s -sqrdmulh v1.4S, v13.4S, v5.s[2] -mul v13.4S, v13.4S,v12.s[2] -mla v13.4S, v1.4S, v31.s[0] -sub v1.4s, v17.4s, v13.4s -add v17.4s, v17.4s, v13.4s -str q11, [x0, #896] -str q4, [x0, #912] -str q17, [x0, #928] -str q1, [x0, #944] -ldr q18, [x0, #992] -ldr q6, [x0, #1008] -ldr q1, [x0, #960] -ldr q17, [x0, #976] -sqrdmulh v4.4S, v18.4S, v7.s[0] -mul v18.4S, v18.4S,v21.s[0] -mla v18.4S, v4.4S, v31.s[0] -sub v4.4s, v1.4s, v18.4s -add v1.4s, v1.4s, v18.4s -sqrdmulh v18.4S, v6.4S, v7.s[0] -mul v6.4S, v6.4S,v21.s[0] -mla v6.4S, v18.4S, v31.s[0] -sub v18.4s, v17.4s, v6.4s -add v17.4s, v17.4s, v6.4s -sqrdmulh v6.4S, v17.4S, v7.s[1] -mul v17.4S, v17.4S,v21.s[1] -mla v17.4S, v6.4S, v31.s[0] -sub v6.4s, v1.4s, v17.4s -add v1.4s, v1.4s, v17.4s -sqrdmulh v17.4S, v18.4S, v7.s[2] -mul v18.4S, v18.4S,v21.s[2] -mla v18.4S, v17.4S, v31.s[0] -sub v17.4s, v4.4s, v18.4s -add v4.4s, v4.4s, v18.4s -str q1, [x0, #960] -str q6, [x0, #976] -str q4, [x0, #992] -str q17, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1464 -// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_16.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_16.s deleted file mode 100644 index 1443403..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_0_z4_16.s +++ /dev/null @@ -1,1494 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_0_z4_16 -.global _ntt_u32_incomplete_neon_asm_var_4_2_0_z4_16 -ntt_u32_incomplete_neon_asm_var_4_2_0_z4_16: -_ntt_u32_incomplete_neon_asm_var_4_2_0_z4_16: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #800] -ldr q21, [x0, #864] -ldr q20, [x0, #928] -ldr q19, [x0, #992] -ldr q18, [x0, #288] -ldr q17, [x0, #352] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -ldr q2, [x0, #544] -ldr q1, [x0, #608] -ldr q0, [x0, #672] -ldr q15, [x0, #736] -ldr q14, [x0, #32] -ldr q13, [x0, #96] -ldr q12, [x0, #160] -ldr q11, [x0, #224] -sqrdmulh v10.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -mla v22.4S, v10.4S, v31.s[0] -sub v10.4s, v18.4s, v22.4s -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -mla v20.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -mla v19.4S, v20.4S, v31.s[0] -sub v20.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -mla v2.4S, v19.4S, v31.s[0] -sub v19.4s, v14.4s, v2.4s -add v14.4s, v14.4s, v2.4s -sqrdmulh v2.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -mla v1.4S, v2.4S, v31.s[0] -sub v2.4s, v13.4s, v1.4s -add v13.4s, v13.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v29.s[0] -mul v0.4S, v0.4S,v30.s[0] -mla v0.4S, v1.4S, v31.s[0] -sub v1.4s, v12.4s, v0.4s -add v12.4s, v12.4s, v0.4s -sqrdmulh v0.4S, v15.4S, v29.s[0] -mul v15.4S, v15.4S,v30.s[0] -mla v15.4S, v0.4S, v31.s[0] -sub v0.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v12.4s, v16.4s -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -mla v3.4S, v16.4S, v31.s[0] -sub v16.4s, v11.4s, v3.4s -add v11.4s, v11.4s, v3.4s -sqrdmulh v3.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -mla v18.4S, v3.4S, v31.s[0] -sub v3.4s, v14.4s, v18.4s -add v14.4s, v14.4s, v18.4s -sqrdmulh v18.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -mla v17.4S, v18.4S, v31.s[0] -sub v18.4s, v13.4s, v17.4s -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -mla v21.4S, v17.4S, v31.s[0] -sub v17.4s, v1.4s, v21.4s -add v1.4s, v1.4s, v21.4s -sqrdmulh v21.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -mla v20.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v20.4s -add v0.4s, v0.4s, v20.4s -sqrdmulh v20.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -mla v10.4S, v20.4S, v31.s[0] -sub v20.4s, v19.4s, v10.4s -add v19.4s, v19.4s, v10.4s -sqrdmulh v10.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -mla v22.4S, v10.4S, v31.s[0] -sub v10.4s, v2.4s, v22.4s -add v2.4s, v2.4s, v22.4s -sqrdmulh v22.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -mla v12.4S, v22.4S, v31.s[0] -sub v22.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -sqrdmulh v12.4S, v11.4S, v27.s[0] -mul v11.4S, v11.4S,v28.s[0] -mla v11.4S, v12.4S, v31.s[0] -sub v12.4s, v13.4s, v11.4s -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -mla v15.4S, v11.4S, v31.s[0] -sub v11.4s, v3.4s, v15.4s -add v3.4s, v3.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v27.s[1] -mul v16.4S, v16.4S,v28.s[1] -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v18.4s, v16.4s -add v18.4s, v18.4s, v16.4s -sqrdmulh v16.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -mla v1.4S, v16.4S, v31.s[0] -sub v16.4s, v19.4s, v1.4s -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v27.s[2] -mul v0.4S, v0.4S,v28.s[2] -mla v0.4S, v1.4S, v31.s[0] -sub v1.4s, v2.4s, v0.4s -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v17.4S, v27.s[3] -mul v17.4S, v17.4S,v28.s[3] -mla v17.4S, v0.4S, v31.s[0] -sub v0.4s, v20.4s, v17.4s -add v20.4s, v20.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -mla v21.4S, v17.4S, v31.s[0] -sub v17.4s, v10.4s, v21.4s -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v13.4S, v25.s[0] -mul v13.4S, v13.4S,v26.s[0] -mla v13.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v13.4s -add v14.4s, v14.4s, v13.4s -sqrdmulh v13.4S, v12.4S, v25.s[1] -mul v12.4S, v12.4S,v26.s[1] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v18.4S, v25.s[2] -mul v18.4S, v18.4S,v26.s[2] -mla v18.4S, v12.4S, v31.s[0] -sub v12.4s, v3.4s, v18.4s -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v15.4S, v25.s[3] -mul v15.4S, v15.4S,v26.s[3] -mla v15.4S, v18.4S, v31.s[0] -sub v18.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v23.s[0] -mul v2.4S, v2.4S,v24.s[0] -mla v2.4S, v15.4S, v31.s[0] -sub v15.4s, v19.4s, v2.4s -add v19.4s, v19.4s, v2.4s -sqrdmulh v2.4S, v1.4S, v23.s[1] -mul v1.4S, v1.4S,v24.s[1] -mla v1.4S, v2.4S, v31.s[0] -sub v2.4s, v16.4s, v1.4s -add v16.4s, v16.4s, v1.4s -sqrdmulh v1.4S, v10.4S, v23.s[2] -mul v10.4S, v10.4S,v24.s[2] -mla v10.4S, v1.4S, v31.s[0] -sub v1.4s, v20.4s, v10.4s -add v20.4s, v20.4s, v10.4s -sqrdmulh v10.4S, v17.4S, v23.s[3] -mul v17.4S, v17.4S,v24.s[3] -mla v17.4S, v10.4S, v31.s[0] -sub v10.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -str q14, [x0, #32] -str q21, [x0, #96] -str q22, [x0, #160] -str q13, [x0, #224] -str q3, [x0, #288] -str q12, [x0, #352] -str q11, [x0, #416] -str q18, [x0, #480] -str q19, [x0, #544] -str q15, [x0, #608] -str q16, [x0, #672] -str q2, [x0, #736] -str q20, [x0, #800] -str q1, [x0, #864] -str q0, [x0, #928] -str q10, [x0, #992] -ldr q10, [x0, #816] -ldr q0, [x0, #880] -ldr q1, [x0, #944] -ldr q20, [x0, #1008] -ldr q2, [x0, #304] -ldr q16, [x0, #368] -ldr q15, [x0, #432] -ldr q19, [x0, #496] -ldr q18, [x0, #560] -ldr q11, [x0, #624] -ldr q12, [x0, #688] -ldr q3, [x0, #752] -ldr q13, [x0, #48] -ldr q22, [x0, #112] -ldr q21, [x0, #176] -ldr q14, [x0, #240] -sqrdmulh v17.4S, v10.4S, v29.s[0] -mul v10.4S, v10.4S,v30.s[0] -mla v10.4S, v17.4S, v31.s[0] -sub v17.4s, v2.4s, v10.4s -add v2.4s, v2.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v29.s[0] -mul v0.4S, v0.4S,v30.s[0] -mla v0.4S, v10.4S, v31.s[0] -sub v10.4s, v16.4s, v0.4s -add v16.4s, v16.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -mla v1.4S, v0.4S, v31.s[0] -sub v0.4s, v15.4s, v1.4s -add v15.4s, v15.4s, v1.4s -sqrdmulh v1.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -mla v20.4S, v1.4S, v31.s[0] -sub v1.4s, v19.4s, v20.4s -add v19.4s, v19.4s, v20.4s -sqrdmulh v20.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -mla v18.4S, v20.4S, v31.s[0] -sub v20.4s, v13.4s, v18.4s -add v13.4s, v13.4s, v18.4s -sqrdmulh v18.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v22.4s, v11.4s -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -mla v12.4S, v11.4S, v31.s[0] -sub v11.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -mla v3.4S, v12.4S, v31.s[0] -sub v12.4s, v14.4s, v3.4s -add v14.4s, v14.4s, v3.4s -sqrdmulh v3.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -mla v15.4S, v3.4S, v31.s[0] -sub v3.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -sqrdmulh v15.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -mla v19.4S, v15.4S, v31.s[0] -sub v15.4s, v14.4s, v19.4s -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v2.4S, v29.s[1] -mul v2.4S, v2.4S,v30.s[1] -mla v2.4S, v19.4S, v31.s[0] -sub v19.4s, v13.4s, v2.4s -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -mla v16.4S, v2.4S, v31.s[0] -sub v2.4s, v22.4s, v16.4s -add v22.4s, v22.4s, v16.4s -sqrdmulh v16.4S, v0.4S, v29.s[2] -mul v0.4S, v0.4S,v30.s[2] -mla v0.4S, v16.4S, v31.s[0] -sub v16.4s, v11.4s, v0.4s -add v11.4s, v11.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -mla v1.4S, v0.4S, v31.s[0] -sub v0.4s, v12.4s, v1.4s -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -mla v17.4S, v1.4S, v31.s[0] -sub v1.4s, v20.4s, v17.4s -add v20.4s, v20.4s, v17.4s -sqrdmulh v17.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -mla v10.4S, v17.4S, v31.s[0] -sub v17.4s, v18.4s, v10.4s -add v18.4s, v18.4s, v10.4s -sqrdmulh v10.4S, v21.4S, v27.s[0] -mul v21.4S, v21.4S,v28.s[0] -mla v21.4S, v10.4S, v31.s[0] -sub v10.4s, v13.4s, v21.4s -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -mla v14.4S, v21.4S, v31.s[0] -sub v21.4s, v22.4s, v14.4s -add v22.4s, v22.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -mla v3.4S, v14.4S, v31.s[0] -sub v14.4s, v19.4s, v3.4s -add v19.4s, v19.4s, v3.4s -sqrdmulh v3.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -mla v15.4S, v3.4S, v31.s[0] -sub v3.4s, v2.4s, v15.4s -add v2.4s, v2.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v27.s[2] -mul v11.4S, v11.4S,v28.s[2] -mla v11.4S, v15.4S, v31.s[0] -sub v15.4s, v20.4s, v11.4s -add v20.4s, v20.4s, v11.4s -sqrdmulh v11.4S, v12.4S, v27.s[2] -mul v12.4S, v12.4S,v28.s[2] -mla v12.4S, v11.4S, v31.s[0] -sub v11.4s, v18.4s, v12.4s -add v18.4s, v18.4s, v12.4s -sqrdmulh v12.4S, v16.4S, v27.s[3] -mul v16.4S, v16.4S,v28.s[3] -mla v16.4S, v12.4S, v31.s[0] -sub v12.4s, v1.4s, v16.4s -add v1.4s, v1.4s, v16.4s -sqrdmulh v16.4S, v0.4S, v27.s[3] -mul v0.4S, v0.4S,v28.s[3] -mla v0.4S, v16.4S, v31.s[0] -sub v16.4s, v17.4s, v0.4s -add v17.4s, v17.4s, v0.4s -sqrdmulh v0.4S, v22.4S, v25.s[0] -mul v22.4S, v22.4S,v26.s[0] -mla v22.4S, v0.4S, v31.s[0] -sub v0.4s, v13.4s, v22.4s -add v13.4s, v13.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v25.s[1] -mul v21.4S, v21.4S,v26.s[1] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v10.4s, v21.4s -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v2.4S, v25.s[2] -mul v2.4S, v2.4S,v26.s[2] -mla v2.4S, v21.4S, v31.s[0] -sub v21.4s, v19.4s, v2.4s -add v19.4s, v19.4s, v2.4s -sqrdmulh v2.4S, v3.4S, v25.s[3] -mul v3.4S, v3.4S,v26.s[3] -mla v3.4S, v2.4S, v31.s[0] -sub v2.4s, v14.4s, v3.4s -add v14.4s, v14.4s, v3.4s -sqrdmulh v3.4S, v18.4S, v23.s[0] -mul v18.4S, v18.4S,v24.s[0] -mla v18.4S, v3.4S, v31.s[0] -sub v3.4s, v20.4s, v18.4s -add v20.4s, v20.4s, v18.4s -sqrdmulh v18.4S, v11.4S, v23.s[1] -mul v11.4S, v11.4S,v24.s[1] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v15.4s, v11.4s -add v15.4s, v15.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v23.s[2] -mul v17.4S, v17.4S,v24.s[2] -mla v17.4S, v11.4S, v31.s[0] -sub v11.4s, v1.4s, v17.4s -add v1.4s, v1.4s, v17.4s -sqrdmulh v17.4S, v16.4S, v23.s[3] -mul v16.4S, v16.4S,v24.s[3] -mla v16.4S, v17.4S, v31.s[0] -sub v17.4s, v12.4s, v16.4s -add v12.4s, v12.4s, v16.4s -str q13, [x0, #48] -str q0, [x0, #112] -str q10, [x0, #176] -str q22, [x0, #240] -str q19, [x0, #304] -str q21, [x0, #368] -str q14, [x0, #432] -str q2, [x0, #496] -str q20, [x0, #560] -str q3, [x0, #624] -str q15, [x0, #688] -str q18, [x0, #752] -str q1, [x0, #816] -str q11, [x0, #880] -str q12, [x0, #944] -str q17, [x0, #1008] -ldr q17, [x0, #768] -ldr q12, [x0, #832] -ldr q11, [x0, #896] -ldr q1, [x0, #960] -ldr q18, [x0, #256] -ldr q15, [x0, #320] -ldr q3, [x0, #384] -ldr q20, [x0, #448] -ldr q2, [x0, #512] -ldr q14, [x0, #576] -ldr q21, [x0, #640] -ldr q19, [x0, #704] -ldr q22, [x0, #0] -ldr q10, [x0, #64] -ldr q0, [x0, #128] -ldr q13, [x0, #192] -sqrdmulh v16.4S, v17.4S, v29.s[0] -mul v17.4S, v17.4S,v30.s[0] -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v18.4s, v17.4s -add v18.4s, v18.4s, v17.4s -sqrdmulh v17.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -mla v12.4S, v17.4S, v31.s[0] -sub v17.4s, v15.4s, v12.4s -add v15.4s, v15.4s, v12.4s -sqrdmulh v12.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -mla v11.4S, v12.4S, v31.s[0] -sub v12.4s, v3.4s, v11.4s -add v3.4s, v3.4s, v11.4s -sqrdmulh v11.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -mla v1.4S, v11.4S, v31.s[0] -sub v11.4s, v20.4s, v1.4s -add v20.4s, v20.4s, v1.4s -sqrdmulh v1.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -mla v2.4S, v1.4S, v31.s[0] -sub v1.4s, v22.4s, v2.4s -add v22.4s, v22.4s, v2.4s -sqrdmulh v2.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -mla v14.4S, v2.4S, v31.s[0] -sub v2.4s, v10.4s, v14.4s -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -mla v21.4S, v14.4S, v31.s[0] -sub v14.4s, v0.4s, v21.4s -add v0.4s, v0.4s, v21.4s -sqrdmulh v21.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v13.4s, v19.4s -add v13.4s, v13.4s, v19.4s -sqrdmulh v19.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -mla v3.4S, v19.4S, v31.s[0] -sub v19.4s, v0.4s, v3.4s -add v0.4s, v0.4s, v3.4s -sqrdmulh v3.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -mla v20.4S, v3.4S, v31.s[0] -sub v3.4s, v13.4s, v20.4s -add v13.4s, v13.4s, v20.4s -sqrdmulh v20.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -mla v18.4S, v20.4S, v31.s[0] -sub v20.4s, v22.4s, v18.4s -add v22.4s, v22.4s, v18.4s -sqrdmulh v18.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -mla v15.4S, v18.4S, v31.s[0] -sub v18.4s, v10.4s, v15.4s -add v10.4s, v10.4s, v15.4s -sqrdmulh v15.4S, v12.4S, v29.s[2] -mul v12.4S, v12.4S,v30.s[2] -mla v12.4S, v15.4S, v31.s[0] -sub v15.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -sqrdmulh v12.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -mla v11.4S, v12.4S, v31.s[0] -sub v12.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -mla v16.4S, v11.4S, v31.s[0] -sub v11.4s, v1.4s, v16.4s -add v1.4s, v1.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v2.4s, v17.4s -add v2.4s, v2.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v27.s[0] -mul v0.4S, v0.4S,v28.s[0] -mla v0.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v0.4s -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v13.4S, v27.s[0] -mul v13.4S, v13.4S,v28.s[0] -mla v13.4S, v0.4S, v31.s[0] -sub v0.4s, v10.4s, v13.4s -add v10.4s, v10.4s, v13.4s -sqrdmulh v13.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -mla v19.4S, v13.4S, v31.s[0] -sub v13.4s, v20.4s, v19.4s -add v20.4s, v20.4s, v19.4s -sqrdmulh v19.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -mla v3.4S, v19.4S, v31.s[0] -sub v19.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v27.s[2] -mul v14.4S, v14.4S,v28.s[2] -mla v14.4S, v3.4S, v31.s[0] -sub v3.4s, v1.4s, v14.4s -add v1.4s, v1.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v27.s[2] -mul v21.4S, v21.4S,v28.s[2] -mla v21.4S, v14.4S, v31.s[0] -sub v14.4s, v2.4s, v21.4s -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -mla v15.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v12.4S, v27.s[3] -mul v12.4S, v12.4S,v28.s[3] -mla v12.4S, v15.4S, v31.s[0] -sub v15.4s, v16.4s, v12.4s -add v16.4s, v16.4s, v12.4s -sqrdmulh v12.4S, v10.4S, v25.s[0] -mul v10.4S, v10.4S,v26.s[0] -mla v10.4S, v12.4S, v31.s[0] -sub v12.4s, v22.4s, v10.4s -add v22.4s, v22.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v25.s[1] -mul v0.4S, v0.4S,v26.s[1] -mla v0.4S, v10.4S, v31.s[0] -sub v10.4s, v17.4s, v0.4s -add v17.4s, v17.4s, v0.4s -sqrdmulh v0.4S, v18.4S, v25.s[2] -mul v18.4S, v18.4S,v26.s[2] -mla v18.4S, v0.4S, v31.s[0] -sub v0.4s, v20.4s, v18.4s -add v20.4s, v20.4s, v18.4s -sqrdmulh v18.4S, v19.4S, v25.s[3] -mul v19.4S, v19.4S,v26.s[3] -mla v19.4S, v18.4S, v31.s[0] -sub v18.4s, v13.4s, v19.4s -add v13.4s, v13.4s, v19.4s -sqrdmulh v19.4S, v2.4S, v23.s[0] -mul v2.4S, v2.4S,v24.s[0] -mla v2.4S, v19.4S, v31.s[0] -sub v19.4s, v1.4s, v2.4s -add v1.4s, v1.4s, v2.4s -sqrdmulh v2.4S, v14.4S, v23.s[1] -mul v14.4S, v14.4S,v24.s[1] -mla v14.4S, v2.4S, v31.s[0] -sub v2.4s, v3.4s, v14.4s -add v3.4s, v3.4s, v14.4s -sqrdmulh v14.4S, v16.4S, v23.s[2] -mul v16.4S, v16.4S,v24.s[2] -mla v16.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v16.4s -add v11.4s, v11.4s, v16.4s -sqrdmulh v16.4S, v15.4S, v23.s[3] -mul v15.4S, v15.4S,v24.s[3] -mla v15.4S, v16.4S, v31.s[0] -sub v16.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -str q22, [x0, #0] -str q12, [x0, #64] -str q17, [x0, #128] -str q10, [x0, #192] -str q20, [x0, #256] -str q0, [x0, #320] -str q13, [x0, #384] -str q18, [x0, #448] -str q1, [x0, #512] -str q19, [x0, #576] -str q3, [x0, #640] -str q2, [x0, #704] -str q11, [x0, #768] -str q14, [x0, #832] -str q21, [x0, #896] -str q16, [x0, #960] -ldr q16, [x0, #784] -ldr q21, [x0, #848] -ldr q14, [x0, #912] -ldr q11, [x0, #976] -ldr q2, [x0, #272] -ldr q3, [x0, #336] -ldr q19, [x0, #400] -ldr q1, [x0, #464] -ldr q18, [x0, #528] -ldr q13, [x0, #592] -ldr q0, [x0, #656] -ldr q20, [x0, #720] -ldr q10, [x0, #16] -ldr q17, [x0, #80] -ldr q12, [x0, #144] -ldr q22, [x0, #208] -sqrdmulh v15.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -mla v21.4S, v16.4S, v31.s[0] -sub v16.4s, v3.4s, v21.4s -add v3.4s, v3.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -mla v14.4S, v21.4S, v31.s[0] -sub v21.4s, v19.4s, v14.4s -add v19.4s, v19.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -mla v11.4S, v14.4S, v31.s[0] -sub v14.4s, v1.4s, v11.4s -add v1.4s, v1.4s, v11.4s -sqrdmulh v11.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -mla v18.4S, v11.4S, v31.s[0] -sub v11.4s, v10.4s, v18.4s -add v10.4s, v10.4s, v18.4s -sqrdmulh v18.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -mla v13.4S, v18.4S, v31.s[0] -sub v18.4s, v17.4s, v13.4s -add v17.4s, v17.4s, v13.4s -sqrdmulh v13.4S, v0.4S, v29.s[0] -mul v0.4S, v0.4S,v30.s[0] -mla v0.4S, v13.4S, v31.s[0] -sub v13.4s, v12.4s, v0.4s -add v12.4s, v12.4s, v0.4s -sqrdmulh v0.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -mla v20.4S, v0.4S, v31.s[0] -sub v0.4s, v22.4s, v20.4s -add v22.4s, v22.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -mla v19.4S, v20.4S, v31.s[0] -sub v20.4s, v12.4s, v19.4s -add v12.4s, v12.4s, v19.4s -sqrdmulh v19.4S, v1.4S, v29.s[1] -mul v1.4S, v1.4S,v30.s[1] -mla v1.4S, v19.4S, v31.s[0] -sub v19.4s, v22.4s, v1.4s -add v22.4s, v22.4s, v1.4s -sqrdmulh v1.4S, v2.4S, v29.s[1] -mul v2.4S, v2.4S,v30.s[1] -mla v2.4S, v1.4S, v31.s[0] -sub v1.4s, v10.4s, v2.4s -add v10.4s, v10.4s, v2.4s -sqrdmulh v2.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -mla v3.4S, v2.4S, v31.s[0] -sub v2.4s, v17.4s, v3.4s -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -mla v21.4S, v3.4S, v31.s[0] -sub v3.4s, v13.4s, v21.4s -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -mla v14.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v14.4s -add v0.4s, v0.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -mla v15.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v18.4s, v16.4s -add v18.4s, v18.4s, v16.4s -sqrdmulh v16.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -mla v12.4S, v16.4S, v31.s[0] -sub v16.4s, v10.4s, v12.4s -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v22.4S, v27.s[0] -mul v22.4S, v22.4S,v28.s[0] -mla v22.4S, v12.4S, v31.s[0] -sub v12.4s, v17.4s, v22.4s -add v17.4s, v17.4s, v22.4s -sqrdmulh v22.4S, v20.4S, v27.s[1] -mul v20.4S, v20.4S,v28.s[1] -mla v20.4S, v22.4S, v31.s[0] -sub v22.4s, v1.4s, v20.4s -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -mla v19.4S, v20.4S, v31.s[0] -sub v20.4s, v2.4s, v19.4s -add v2.4s, v2.4s, v19.4s -sqrdmulh v19.4S, v13.4S, v27.s[2] -mul v13.4S, v13.4S,v28.s[2] -mla v13.4S, v19.4S, v31.s[0] -sub v19.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v0.4S, v27.s[2] -mul v0.4S, v0.4S,v28.s[2] -mla v0.4S, v13.4S, v31.s[0] -sub v13.4s, v18.4s, v0.4s -add v18.4s, v18.4s, v0.4s -sqrdmulh v0.4S, v3.4S, v27.s[3] -mul v3.4S, v3.4S,v28.s[3] -mla v3.4S, v0.4S, v31.s[0] -sub v0.4s, v14.4s, v3.4s -add v14.4s, v14.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -mla v21.4S, v3.4S, v31.s[0] -sub v3.4s, v15.4s, v21.4s -add v15.4s, v15.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v25.s[0] -mul v17.4S, v17.4S,v26.s[0] -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -sqrdmulh v17.4S, v12.4S, v25.s[1] -mul v12.4S, v12.4S,v26.s[1] -mla v12.4S, v17.4S, v31.s[0] -sub v17.4s, v16.4s, v12.4s -add v16.4s, v16.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v25.s[2] -mul v2.4S, v2.4S,v26.s[2] -mla v2.4S, v12.4S, v31.s[0] -sub v12.4s, v1.4s, v2.4s -add v1.4s, v1.4s, v2.4s -sqrdmulh v2.4S, v20.4S, v25.s[3] -mul v20.4S, v20.4S,v26.s[3] -mla v20.4S, v2.4S, v31.s[0] -sub v2.4s, v22.4s, v20.4s -add v22.4s, v22.4s, v20.4s -sqrdmulh v20.4S, v18.4S, v23.s[0] -mul v18.4S, v18.4S,v24.s[0] -mla v18.4S, v20.4S, v31.s[0] -sub v20.4s, v11.4s, v18.4s -add v11.4s, v11.4s, v18.4s -sqrdmulh v18.4S, v13.4S, v23.s[1] -mul v13.4S, v13.4S,v24.s[1] -mla v13.4S, v18.4S, v31.s[0] -sub v18.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -sqrdmulh v13.4S, v15.4S, v23.s[2] -mul v15.4S, v15.4S,v24.s[2] -mla v15.4S, v13.4S, v31.s[0] -sub v13.4s, v14.4s, v15.4s -add v14.4s, v14.4s, v15.4s -sqrdmulh v15.4S, v3.4S, v23.s[3] -mul v3.4S, v3.4S,v24.s[3] -mla v3.4S, v15.4S, v31.s[0] -sub v15.4s, v0.4s, v3.4s -add v0.4s, v0.4s, v3.4s -str q10, [x0, #16] -str q21, [x0, #80] -str q16, [x0, #144] -str q17, [x0, #208] -str q1, [x0, #272] -str q12, [x0, #336] -str q22, [x0, #400] -str q2, [x0, #464] -str q11, [x0, #528] -str q20, [x0, #592] -str q19, [x0, #656] -str q18, [x0, #720] -str q14, [x0, #784] -str q13, [x0, #848] -str q0, [x0, #912] -str q15, [x0, #976] -ldr q4, [x17, #+128] -ldr q5, [x17, #+144] -ldr q6, [x17, #+160] -ldr q7, [x17, #+176] -ldr q8, [x17, #+192] -ldr q9, [x17, #+208] -ldr q3, [x17, #+224] -ldr q10, [x17, #+240] -ldr q21, [x0, #32] -ldr q16, [x0, #48] -ldr q17, [x0, #0] -ldr q1, [x0, #16] -ldr q12, [x17, #+256] -ldr q22, [x17, #+272] -sqrdmulh v2.4S, v21.4S, v5.s[0] -mul v21.4S, v21.4S,v4.s[0] -mla v21.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v16.4S, v5.s[0] -mul v16.4S, v16.4S,v4.s[0] -mla v16.4S, v2.4S, v31.s[0] -sub v2.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -sub v21.4s, v1.4s, v16.4s -add v1.4s, v1.4s, v16.4s -sqrdmulh v16.4S, v1.4S, v5.s[1] -mul v1.4S, v1.4S,v4.s[1] -mla v1.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v21.4S, v5.s[2] -mul v21.4S, v21.4S,v4.s[2] -mla v21.4S, v16.4S, v31.s[0] -sub v16.4s, v17.4s, v1.4s -add v17.4s, v17.4s, v1.4s -sub v1.4s, v2.4s, v21.4s -add v2.4s, v2.4s, v21.4s -str q17, [x0, #0] -str q16, [x0, #16] -str q2, [x0, #32] -str q1, [x0, #48] -ldr q1, [x0, #96] -ldr q2, [x0, #112] -ldr q16, [x0, #64] -ldr q17, [x0, #80] -ldr q21, [x17, #+288] -ldr q11, [x17, #+304] -sqrdmulh v20.4S, v1.4S, v7.s[0] -mul v1.4S, v1.4S,v6.s[0] -mla v1.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v2.4S, v7.s[0] -mul v2.4S, v2.4S,v6.s[0] -mla v2.4S, v20.4S, v31.s[0] -sub v20.4s, v16.4s, v1.4s -add v16.4s, v16.4s, v1.4s -sub v1.4s, v17.4s, v2.4s -add v17.4s, v17.4s, v2.4s -sqrdmulh v2.4S, v17.4S, v7.s[1] -mul v17.4S, v17.4S,v6.s[1] -mla v17.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v1.4S, v7.s[2] -mul v1.4S, v1.4S,v6.s[2] -mla v1.4S, v2.4S, v31.s[0] -sub v2.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -sub v17.4s, v20.4s, v1.4s -add v20.4s, v20.4s, v1.4s -str q16, [x0, #64] -str q2, [x0, #80] -str q20, [x0, #96] -str q17, [x0, #112] -ldr q17, [x0, #160] -ldr q20, [x0, #176] -ldr q2, [x0, #128] -ldr q16, [x0, #144] -ldr q1, [x17, #+320] -ldr q19, [x17, #+336] -sqrdmulh v18.4S, v17.4S, v9.s[0] -mul v17.4S, v17.4S,v8.s[0] -mla v17.4S, v18.4S, v31.s[0] -sqrdmulh v18.4S, v20.4S, v9.s[0] -mul v20.4S, v20.4S,v8.s[0] -mla v20.4S, v18.4S, v31.s[0] -sub v18.4s, v2.4s, v17.4s -add v2.4s, v2.4s, v17.4s -sub v17.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v16.4S, v9.s[1] -mul v16.4S, v16.4S,v8.s[1] -mla v16.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v17.4S, v9.s[2] -mul v17.4S, v17.4S,v8.s[2] -mla v17.4S, v20.4S, v31.s[0] -sub v20.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sub v16.4s, v18.4s, v17.4s -add v18.4s, v18.4s, v17.4s -str q2, [x0, #128] -str q20, [x0, #144] -str q18, [x0, #160] -str q16, [x0, #176] -ldr q16, [x0, #224] -ldr q18, [x0, #240] -ldr q20, [x0, #192] -ldr q2, [x0, #208] -ldr q17, [x17, #+352] -ldr q14, [x17, #+368] -sqrdmulh v13.4S, v16.4S, v10.s[0] -mul v16.4S, v16.4S,v3.s[0] -mla v16.4S, v13.4S, v31.s[0] -sqrdmulh v13.4S, v18.4S, v10.s[0] -mul v18.4S, v18.4S,v3.s[0] -mla v18.4S, v13.4S, v31.s[0] -sub v13.4s, v20.4s, v16.4s -add v20.4s, v20.4s, v16.4s -sub v16.4s, v2.4s, v18.4s -add v2.4s, v2.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v10.s[1] -mul v2.4S, v2.4S,v3.s[1] -mla v2.4S, v18.4S, v31.s[0] -sqrdmulh v18.4S, v16.4S, v10.s[2] -mul v16.4S, v16.4S,v3.s[2] -mla v16.4S, v18.4S, v31.s[0] -sub v18.4s, v20.4s, v2.4s -add v20.4s, v20.4s, v2.4s -sub v2.4s, v13.4s, v16.4s -add v13.4s, v13.4s, v16.4s -str q20, [x0, #192] -str q18, [x0, #208] -str q13, [x0, #224] -str q2, [x0, #240] -ldr q2, [x0, #288] -ldr q13, [x0, #304] -ldr q18, [x0, #256] -ldr q20, [x0, #272] -ldr q16, [x17, #+384] -ldr q0, [x17, #+400] -sqrdmulh v15.4S, v2.4S, v22.s[0] -mul v2.4S, v2.4S,v12.s[0] -mla v2.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v13.4S, v22.s[0] -mul v13.4S, v13.4S,v12.s[0] -mla v13.4S, v15.4S, v31.s[0] -sub v15.4s, v18.4s, v2.4s -add v18.4s, v18.4s, v2.4s -sub v2.4s, v20.4s, v13.4s -add v20.4s, v20.4s, v13.4s -sqrdmulh v13.4S, v20.4S, v22.s[1] -mul v20.4S, v20.4S,v12.s[1] -mla v20.4S, v13.4S, v31.s[0] -sqrdmulh v13.4S, v2.4S, v22.s[2] -mul v2.4S, v2.4S,v12.s[2] -mla v2.4S, v13.4S, v31.s[0] -sub v13.4s, v18.4s, v20.4s -add v18.4s, v18.4s, v20.4s -sub v20.4s, v15.4s, v2.4s -add v15.4s, v15.4s, v2.4s -str q18, [x0, #256] -str q13, [x0, #272] -str q15, [x0, #288] -str q20, [x0, #304] -ldr q5, [x0, #352] -ldr q4, [x0, #368] -ldr q20, [x0, #320] -ldr q15, [x0, #336] -ldr q13, [x17, #+416] -ldr q18, [x17, #+432] -sqrdmulh v2.4S, v5.4S, v11.s[0] -mul v5.4S, v5.4S,v21.s[0] -mla v5.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v4.4S, v11.s[0] -mul v4.4S, v4.4S,v21.s[0] -mla v4.4S, v2.4S, v31.s[0] -sub v2.4s, v20.4s, v5.4s -add v20.4s, v20.4s, v5.4s -sub v5.4s, v15.4s, v4.4s -add v15.4s, v15.4s, v4.4s -sqrdmulh v4.4S, v15.4S, v11.s[1] -mul v15.4S, v15.4S,v21.s[1] -mla v15.4S, v4.4S, v31.s[0] -sqrdmulh v4.4S, v5.4S, v11.s[2] -mul v5.4S, v5.4S,v21.s[2] -mla v5.4S, v4.4S, v31.s[0] -sub v4.4s, v20.4s, v15.4s -add v20.4s, v20.4s, v15.4s -sub v15.4s, v2.4s, v5.4s -add v2.4s, v2.4s, v5.4s -str q20, [x0, #320] -str q4, [x0, #336] -str q2, [x0, #352] -str q15, [x0, #368] -ldr q7, [x0, #416] -ldr q6, [x0, #432] -ldr q15, [x0, #384] -ldr q2, [x0, #400] -ldr q4, [x17, #+448] -ldr q20, [x17, #+464] -sqrdmulh v5.4S, v7.4S, v19.s[0] -mul v7.4S, v7.4S,v1.s[0] -mla v7.4S, v5.4S, v31.s[0] -sqrdmulh v5.4S, v6.4S, v19.s[0] -mul v6.4S, v6.4S,v1.s[0] -mla v6.4S, v5.4S, v31.s[0] -sub v5.4s, v15.4s, v7.4s -add v15.4s, v15.4s, v7.4s -sub v7.4s, v2.4s, v6.4s -add v2.4s, v2.4s, v6.4s -sqrdmulh v6.4S, v2.4S, v19.s[1] -mul v2.4S, v2.4S,v1.s[1] -mla v2.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v7.4S, v19.s[2] -mul v7.4S, v7.4S,v1.s[2] -mla v7.4S, v6.4S, v31.s[0] -sub v6.4s, v15.4s, v2.4s -add v15.4s, v15.4s, v2.4s -sub v2.4s, v5.4s, v7.4s -add v5.4s, v5.4s, v7.4s -str q15, [x0, #384] -str q6, [x0, #400] -str q5, [x0, #416] -str q2, [x0, #432] -ldr q9, [x0, #480] -ldr q8, [x0, #496] -ldr q2, [x0, #448] -ldr q5, [x0, #464] -ldr q6, [x17, #+480] -ldr q15, [x17, #+496] -sqrdmulh v7.4S, v9.4S, v14.s[0] -mul v9.4S, v9.4S,v17.s[0] -mla v9.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v8.4S, v14.s[0] -mul v8.4S, v8.4S,v17.s[0] -mla v8.4S, v7.4S, v31.s[0] -sub v7.4s, v2.4s, v9.4s -add v2.4s, v2.4s, v9.4s -sub v9.4s, v5.4s, v8.4s -add v5.4s, v5.4s, v8.4s -sqrdmulh v8.4S, v5.4S, v14.s[1] -mul v5.4S, v5.4S,v17.s[1] -mla v5.4S, v8.4S, v31.s[0] -sqrdmulh v8.4S, v9.4S, v14.s[2] -mul v9.4S, v9.4S,v17.s[2] -mla v9.4S, v8.4S, v31.s[0] -sub v8.4s, v2.4s, v5.4s -add v2.4s, v2.4s, v5.4s -sub v5.4s, v7.4s, v9.4s -add v7.4s, v7.4s, v9.4s -str q2, [x0, #448] -str q8, [x0, #464] -str q7, [x0, #480] -str q5, [x0, #496] -ldr q10, [x0, #544] -ldr q3, [x0, #560] -ldr q5, [x0, #512] -ldr q7, [x0, #528] -ldr q8, [x17, #+512] -ldr q2, [x17, #+528] -sqrdmulh v9.4S, v10.4S, v0.s[0] -mul v10.4S, v10.4S,v16.s[0] -mla v10.4S, v9.4S, v31.s[0] -sqrdmulh v9.4S, v3.4S, v0.s[0] -mul v3.4S, v3.4S,v16.s[0] -mla v3.4S, v9.4S, v31.s[0] -sub v9.4s, v5.4s, v10.4s -add v5.4s, v5.4s, v10.4s -sub v10.4s, v7.4s, v3.4s -add v7.4s, v7.4s, v3.4s -sqrdmulh v3.4S, v7.4S, v0.s[1] -mul v7.4S, v7.4S,v16.s[1] -mla v7.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v10.4S, v0.s[2] -mul v10.4S, v10.4S,v16.s[2] -mla v10.4S, v3.4S, v31.s[0] -sub v3.4s, v5.4s, v7.4s -add v5.4s, v5.4s, v7.4s -sub v7.4s, v9.4s, v10.4s -add v9.4s, v9.4s, v10.4s -str q5, [x0, #512] -str q3, [x0, #528] -str q9, [x0, #544] -str q7, [x0, #560] -ldr q22, [x0, #608] -ldr q12, [x0, #624] -ldr q7, [x0, #576] -ldr q9, [x0, #592] -ldr q3, [x17, #+544] -ldr q5, [x17, #+560] -sqrdmulh v10.4S, v22.4S, v18.s[0] -mul v22.4S, v22.4S,v13.s[0] -mla v22.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v12.4S, v18.s[0] -mul v12.4S, v12.4S,v13.s[0] -mla v12.4S, v10.4S, v31.s[0] -sub v10.4s, v7.4s, v22.4s -add v7.4s, v7.4s, v22.4s -sub v22.4s, v9.4s, v12.4s -add v9.4s, v9.4s, v12.4s -sqrdmulh v12.4S, v9.4S, v18.s[1] -mul v9.4S, v9.4S,v13.s[1] -mla v9.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v22.4S, v18.s[2] -mul v22.4S, v22.4S,v13.s[2] -mla v22.4S, v12.4S, v31.s[0] -sub v12.4s, v7.4s, v9.4s -add v7.4s, v7.4s, v9.4s -sub v9.4s, v10.4s, v22.4s -add v10.4s, v10.4s, v22.4s -str q7, [x0, #576] -str q12, [x0, #592] -str q10, [x0, #608] -str q9, [x0, #624] -ldr q11, [x0, #672] -ldr q21, [x0, #688] -ldr q9, [x0, #640] -ldr q10, [x0, #656] -ldr q12, [x17, #+576] -ldr q7, [x17, #+592] -sqrdmulh v22.4S, v11.4S, v20.s[0] -mul v11.4S, v11.4S,v4.s[0] -mla v11.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v21.4S, v20.s[0] -mul v21.4S, v21.4S,v4.s[0] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v9.4s, v11.4s -add v9.4s, v9.4s, v11.4s -sub v11.4s, v10.4s, v21.4s -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v10.4S, v20.s[1] -mul v10.4S, v10.4S,v4.s[1] -mla v10.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v11.4S, v20.s[2] -mul v11.4S, v11.4S,v4.s[2] -mla v11.4S, v21.4S, v31.s[0] -sub v21.4s, v9.4s, v10.4s -add v9.4s, v9.4s, v10.4s -sub v10.4s, v22.4s, v11.4s -add v22.4s, v22.4s, v11.4s -str q9, [x0, #640] -str q21, [x0, #656] -str q22, [x0, #672] -str q10, [x0, #688] -ldr q19, [x0, #736] -ldr q1, [x0, #752] -ldr q10, [x0, #704] -ldr q22, [x0, #720] -ldr q21, [x17, #+608] -ldr q9, [x17, #+624] -sqrdmulh v11.4S, v19.4S, v15.s[0] -mul v19.4S, v19.4S,v6.s[0] -mla v19.4S, v11.4S, v31.s[0] -sqrdmulh v11.4S, v1.4S, v15.s[0] -mul v1.4S, v1.4S,v6.s[0] -mla v1.4S, v11.4S, v31.s[0] -sub v11.4s, v10.4s, v19.4s -add v10.4s, v10.4s, v19.4s -sub v19.4s, v22.4s, v1.4s -add v22.4s, v22.4s, v1.4s -sqrdmulh v1.4S, v22.4S, v15.s[1] -mul v22.4S, v22.4S,v6.s[1] -mla v22.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v19.4S, v15.s[2] -mul v19.4S, v19.4S,v6.s[2] -mla v19.4S, v1.4S, v31.s[0] -sub v1.4s, v10.4s, v22.4s -add v10.4s, v10.4s, v22.4s -sub v22.4s, v11.4s, v19.4s -add v11.4s, v11.4s, v19.4s -str q10, [x0, #704] -str q1, [x0, #720] -str q11, [x0, #736] -str q22, [x0, #752] -ldr q14, [x0, #800] -ldr q17, [x0, #816] -ldr q22, [x0, #768] -ldr q11, [x0, #784] -sqrdmulh v1.4S, v14.4S, v2.s[0] -mul v14.4S, v14.4S,v8.s[0] -mla v14.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v17.4S, v2.s[0] -mul v17.4S, v17.4S,v8.s[0] -mla v17.4S, v1.4S, v31.s[0] -sub v1.4s, v22.4s, v14.4s -add v22.4s, v22.4s, v14.4s -sub v14.4s, v11.4s, v17.4s -add v11.4s, v11.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v2.s[1] -mul v11.4S, v11.4S,v8.s[1] -mla v11.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v14.4S, v2.s[2] -mul v14.4S, v14.4S,v8.s[2] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v11.4s -add v22.4s, v22.4s, v11.4s -sub v11.4s, v1.4s, v14.4s -add v1.4s, v1.4s, v14.4s -str q22, [x0, #768] -str q17, [x0, #784] -str q1, [x0, #800] -str q11, [x0, #816] -ldr q0, [x0, #864] -ldr q16, [x0, #880] -ldr q11, [x0, #832] -ldr q1, [x0, #848] -sqrdmulh v17.4S, v0.4S, v5.s[0] -mul v0.4S, v0.4S,v3.s[0] -mla v0.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v16.4S, v5.s[0] -mul v16.4S, v16.4S,v3.s[0] -mla v16.4S, v17.4S, v31.s[0] -sub v17.4s, v11.4s, v0.4s -add v11.4s, v11.4s, v0.4s -sub v0.4s, v1.4s, v16.4s -add v1.4s, v1.4s, v16.4s -sqrdmulh v16.4S, v1.4S, v5.s[1] -mul v1.4S, v1.4S,v3.s[1] -mla v1.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v0.4S, v5.s[2] -mul v0.4S, v0.4S,v3.s[2] -mla v0.4S, v16.4S, v31.s[0] -sub v16.4s, v11.4s, v1.4s -add v11.4s, v11.4s, v1.4s -sub v1.4s, v17.4s, v0.4s -add v17.4s, v17.4s, v0.4s -str q11, [x0, #832] -str q16, [x0, #848] -str q17, [x0, #864] -str q1, [x0, #880] -ldr q18, [x0, #928] -ldr q13, [x0, #944] -ldr q1, [x0, #896] -ldr q17, [x0, #912] -sqrdmulh v16.4S, v18.4S, v7.s[0] -mul v18.4S, v18.4S,v12.s[0] -mla v18.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v13.4S, v7.s[0] -mul v13.4S, v13.4S,v12.s[0] -mla v13.4S, v16.4S, v31.s[0] -sub v16.4s, v1.4s, v18.4s -add v1.4s, v1.4s, v18.4s -sub v18.4s, v17.4s, v13.4s -add v17.4s, v17.4s, v13.4s -sqrdmulh v13.4S, v17.4S, v7.s[1] -mul v17.4S, v17.4S,v12.s[1] -mla v17.4S, v13.4S, v31.s[0] -sqrdmulh v13.4S, v18.4S, v7.s[2] -mul v18.4S, v18.4S,v12.s[2] -mla v18.4S, v13.4S, v31.s[0] -sub v13.4s, v1.4s, v17.4s -add v1.4s, v1.4s, v17.4s -sub v17.4s, v16.4s, v18.4s -add v16.4s, v16.4s, v18.4s -str q1, [x0, #896] -str q13, [x0, #912] -str q16, [x0, #928] -str q17, [x0, #944] -ldr q20, [x0, #992] -ldr q4, [x0, #1008] -ldr q17, [x0, #960] -ldr q16, [x0, #976] -sqrdmulh v13.4S, v20.4S, v9.s[0] -mul v20.4S, v20.4S,v21.s[0] -mla v20.4S, v13.4S, v31.s[0] -sqrdmulh v13.4S, v4.4S, v9.s[0] -mul v4.4S, v4.4S,v21.s[0] -mla v4.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v20.4s -add v17.4s, v17.4s, v20.4s -sub v20.4s, v16.4s, v4.4s -add v16.4s, v16.4s, v4.4s -sqrdmulh v4.4S, v16.4S, v9.s[1] -mul v16.4S, v16.4S,v21.s[1] -mla v16.4S, v4.4S, v31.s[0] -sqrdmulh v4.4S, v20.4S, v9.s[2] -mul v20.4S, v20.4S,v21.s[2] -mla v20.4S, v4.4S, v31.s[0] -sub v4.4s, v17.4s, v16.4s -add v17.4s, v17.4s, v16.4s -sub v16.4s, v13.4s, v20.4s -add v13.4s, v13.4s, v20.4s -str q17, [x0, #960] -str q4, [x0, #976] -str q13, [x0, #992] -str q16, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1464 -// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_10_z4_7.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_10_z4_7.s deleted file mode 100644 index 7075abc..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_10_z4_7.s +++ /dev/null @@ -1,1558 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_10_z4_7 -.global _ntt_u32_incomplete_neon_asm_var_4_2_10_z4_7 -ntt_u32_incomplete_neon_asm_var_4_2_10_z4_7: -_ntt_u32_incomplete_neon_asm_var_4_2_10_z4_7: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #928] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #992] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q18, [x0, #800] -sqrdmulh v17.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -ldr q16, [x0, #864] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -mla v22.4S, v21.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -mla v18.4S, v17.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -ldr q3, [x0, #544] -sqrdmulh v17.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -ldr q19, [x0, #608] -sqrdmulh v21.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q2, [x0, #672] -ldr q1, [x0, #416] -sqrdmulh v0.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v15.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -ldr q22, [x0, #736] -ldr q14, [x0, #480] -sqrdmulh v13.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v12.4s, v14.4s, v20.4s -add v14.4s, v14.4s, v20.4s -ldr q20, [x0, #288] -mla v3.4S, v17.4S, v31.s[0] -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v20.4s, v18.4s -mla v2.4S, v0.4S, v31.s[0] -mla v22.4S, v13.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -ldr q18, [x0, #352] -sqrdmulh v13.4S, v1.4S, v29.s[1] -mul v1.4S, v1.4S,v30.s[1] -sub v0.4s, v18.4s, v16.4s -sqrdmulh v17.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -add v18.4s, v18.4s, v16.4s -ldr q16, [x0, #32] -sqrdmulh v11.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v10.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -ldr q3, [x0, #96] -sqrdmulh v9.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v8.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -ldr q19, [x0, #160] -mla v1.4S, v13.4S, v31.s[0] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v19.4s, v2.4s -mla v20.4S, v11.4S, v31.s[0] -mla v18.4S, v9.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -ldr q2, [x0, #224] -sqrdmulh v9.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v11.4s, v2.4s, v22.4s -sqrdmulh v13.4S, v12.4S, v29.s[2] -mul v12.4S, v12.4S,v30.s[2] -add v2.4s, v2.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v7.4s, v19.4s, v1.4s -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v29.s[2] -mul v0.4S, v0.4S,v30.s[2] -sub v6.4s, v2.4s, v14.4s -add v2.4s, v2.4s, v14.4s -mla v15.4S, v9.4S, v31.s[0] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v16.4s, v20.4s -nop -mla v21.4S, v22.4S, v31.s[0] -mla v0.4S, v1.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -nop -sqrdmulh v20.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -sub v1.4s, v3.4s, v18.4s -nop -sqrdmulh v22.4S, v6.4S, v27.s[1] -mul v6.4S, v6.4S,v28.s[1] -add v3.4s, v3.4s, v18.4s -nop -sqrdmulh v18.4S, v19.4S, v27.s[0] -mul v19.4S, v19.4S,v28.s[0] -sub v9.4s, v17.4s, v15.4s -add v17.4s, v17.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v14.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -mla v7.4S, v20.4S, v31.s[0] -mla v6.4S, v22.4S, v31.s[0] -sub v22.4s, v10.4s, v21.4s -nop -mla v19.4S, v18.4S, v31.s[0] -mla v2.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v21.4s -nop -sqrdmulh v21.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v15.4s, v8.4s, v0.4s -nop -sqrdmulh v18.4S, v11.4S, v27.s[2] -mul v11.4S, v11.4S,v28.s[2] -add v8.4s, v8.4s, v0.4s -nop -sqrdmulh v0.4S, v9.4S, v27.s[3] -mul v9.4S, v9.4S,v28.s[3] -sub v20.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[3] -mul v14.4S, v14.4S,v28.s[3] -sub v12.4s, v1.4s, v6.4s -add v1.4s, v1.4s, v6.4s -mla v17.4S, v21.4S, v31.s[0] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v16.4s, v19.4s -nop -mla v9.4S, v0.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v16.4s, v16.4s, v19.4s -nop -sqrdmulh v19.4S, v1.4S, v25.s[2] -mul v1.4S, v1.4S,v26.s[2] -sub v7.4s, v3.4s, v2.4s -nop -sqrdmulh v0.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -add v3.4s, v3.4s, v2.4s -nop -sqrdmulh v2.4S, v7.4S, v25.s[1] -mul v7.4S, v7.4S,v26.s[1] -sub v21.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -sqrdmulh v17.4S, v3.4S, v25.s[0] -mul v3.4S, v3.4S,v26.s[0] -sub v6.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -mla v1.4S, v19.4S, v31.s[0] -mla v12.4S, v0.4S, v31.s[0] -sub v0.4s, v22.4s, v9.4s -nop -mla v7.4S, v2.4S, v31.s[0] -mla v3.4S, v17.4S, v31.s[0] -add v22.4s, v22.4s, v9.4s -nop -sqrdmulh v9.4S, v8.4S, v23.s[0] -mul v8.4S, v8.4S,v24.s[0] -sub v17.4s, v15.4s, v14.4s -nop -sqrdmulh v2.4S, v6.4S, v23.s[1] -mul v6.4S, v6.4S,v24.s[1] -add v15.4s, v15.4s, v14.4s -nop -sqrdmulh v14.4S, v15.4S, v23.s[2] -mul v15.4S, v15.4S,v24.s[2] -sub v19.4s, v13.4s, v1.4s -add v13.4s, v13.4s, v1.4s -sqrdmulh v1.4S, v17.4S, v23.s[3] -mul v17.4S, v17.4S,v24.s[3] -sub v11.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -mla v8.4S, v9.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -sub v2.4s, v18.4s, v7.4s -str q13, [x0, #288] -mla v15.4S, v14.4S, v31.s[0] -mla v17.4S, v1.4S, v31.s[0] -add v18.4s, v18.4s, v7.4s -str q19, [x0, #352] -ldr q19, [x0, #944] -sqrdmulh v7.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v1.4s, v16.4s, v3.4s -str q20, [x0, #416] -ldr q20, [x0, #1008] -sqrdmulh v14.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v16.4s, v16.4s, v3.4s -str q11, [x0, #480] -ldr q11, [x0, #816] -sqrdmulh v3.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -sub v13.4s, v10.4s, v8.4s -add v10.4s, v10.4s, v8.4s -ldr q8, [x0, #880] -sqrdmulh v9.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v12.4s, v21.4s, v6.4s -add v21.4s, v21.4s, v6.4s -mla v19.4S, v7.4S, v31.s[0] -mla v20.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v15.4s -str q18, [x0, #160] -mla v11.4S, v3.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -str q2, [x0, #224] -ldr q2, [x0, #560] -sqrdmulh v15.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v9.4s, v0.4s, v17.4s -str q16, [x0, #32] -ldr q16, [x0, #624] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -add v0.4s, v0.4s, v17.4s -str q1, [x0, #96] -ldr q1, [x0, #688] -ldr q17, [x0, #432] -sqrdmulh v18.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -sub v7.4s, v17.4s, v19.4s -add v17.4s, v17.4s, v19.4s -ldr q19, [x0, #752] -ldr q6, [x0, #496] -sqrdmulh v5.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v4.4s, v6.4s, v20.4s -add v6.4s, v6.4s, v20.4s -ldr q20, [x0, #304] -mla v2.4S, v15.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -sub v3.4s, v20.4s, v11.4s -str q10, [x0, #544] -mla v1.4S, v18.4S, v31.s[0] -mla v19.4S, v5.4S, v31.s[0] -add v20.4s, v20.4s, v11.4s -str q13, [x0, #608] -ldr q13, [x0, #368] -sqrdmulh v11.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v5.4s, v13.4s, v8.4s -str q21, [x0, #672] -sqrdmulh v21.4S, v6.4S, v29.s[1] -mul v6.4S, v6.4S,v30.s[1] -add v13.4s, v13.4s, v8.4s -str q12, [x0, #736] -ldr q12, [x0, #48] -sqrdmulh v8.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v18.4s, v12.4s, v2.4s -add v12.4s, v12.4s, v2.4s -ldr q2, [x0, #112] -sqrdmulh v10.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v15.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -ldr q16, [x0, #176] -mla v17.4S, v11.4S, v31.s[0] -mla v6.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v1.4s -str q22, [x0, #800] -mla v20.4S, v8.4S, v31.s[0] -mla v13.4S, v10.4S, v31.s[0] -add v16.4s, v16.4s, v1.4s -str q14, [x0, #864] -ldr q14, [x0, #240] -sqrdmulh v1.4S, v7.4S, v29.s[2] -mul v7.4S, v7.4S,v30.s[2] -sub v10.4s, v14.4s, v19.4s -str q0, [x0, #928] -sqrdmulh v0.4S, v4.4S, v29.s[2] -mul v4.4S, v4.4S,v30.s[2] -add v14.4s, v14.4s, v19.4s -str q9, [x0, #992] -sqrdmulh v9.4S, v3.4S, v29.s[2] -mul v3.4S, v3.4S,v30.s[2] -sub v19.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -sqrdmulh v17.4S, v5.4S, v29.s[2] -mul v5.4S, v5.4S,v30.s[2] -sub v8.4s, v14.4s, v6.4s -add v14.4s, v14.4s, v6.4s -mla v7.4S, v1.4S, v31.s[0] -mla v4.4S, v0.4S, v31.s[0] -sub v0.4s, v12.4s, v20.4s -nop -mla v3.4S, v9.4S, v31.s[0] -mla v5.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v20.4s -nop -sqrdmulh v20.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -sub v17.4s, v2.4s, v13.4s -nop -sqrdmulh v9.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -add v2.4s, v2.4s, v13.4s -nop -sqrdmulh v13.4S, v16.4S, v27.s[0] -mul v16.4S, v16.4S,v28.s[0] -sub v1.4s, v21.4s, v7.4s -add v21.4s, v21.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v6.4s, v10.4s, v4.4s -add v10.4s, v10.4s, v4.4s -mla v19.4S, v20.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -sub v9.4s, v18.4s, v3.4s -nop -mla v16.4S, v13.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -nop -sqrdmulh v3.4S, v21.4S, v27.s[2] -mul v21.4S, v21.4S,v28.s[2] -sub v7.4s, v15.4s, v5.4s -nop -sqrdmulh v13.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -add v15.4s, v15.4s, v5.4s -nop -sqrdmulh v5.4S, v1.4S, v27.s[3] -mul v1.4S, v1.4S,v28.s[3] -sub v20.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v27.s[3] -mul v6.4S, v6.4S,v28.s[3] -sub v4.4s, v17.4s, v8.4s -add v17.4s, v17.4s, v8.4s -mla v21.4S, v3.4S, v31.s[0] -mla v10.4S, v13.4S, v31.s[0] -sub v13.4s, v12.4s, v16.4s -nop -mla v1.4S, v5.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -nop -sqrdmulh v16.4S, v17.4S, v25.s[2] -mul v17.4S, v17.4S,v26.s[2] -sub v19.4s, v2.4s, v14.4s -nop -sqrdmulh v5.4S, v4.4S, v25.s[3] -mul v4.4S, v4.4S,v26.s[3] -add v2.4s, v2.4s, v14.4s -nop -sqrdmulh v14.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v3.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v15.4s, v10.4s -add v15.4s, v15.4s, v10.4s -mla v17.4S, v16.4S, v31.s[0] -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v9.4s, v1.4s -nop -mla v19.4S, v14.4S, v31.s[0] -mla v2.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -nop -sqrdmulh v1.4S, v15.4S, v23.s[0] -mul v15.4S, v15.4S,v24.s[0] -sub v21.4s, v7.4s, v6.4s -nop -sqrdmulh v14.4S, v8.4S, v23.s[1] -mul v8.4S, v8.4S,v24.s[1] -add v7.4s, v7.4s, v6.4s -nop -sqrdmulh v6.4S, v7.4S, v23.s[2] -mul v7.4S, v7.4S,v24.s[2] -sub v16.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v23.s[3] -mul v21.4S, v21.4S,v24.s[3] -sub v10.4s, v20.4s, v4.4s -add v20.4s, v20.4s, v4.4s -mla v15.4S, v1.4S, v31.s[0] -mla v8.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v19.4s -str q0, [x0, #304] -mla v7.4S, v6.4S, v31.s[0] -mla v21.4S, v17.4S, v31.s[0] -add v13.4s, v13.4s, v19.4s -str q16, [x0, #368] -ldr q16, [x0, #896] -sqrdmulh v19.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v17.4s, v12.4s, v2.4s -str q20, [x0, #432] -ldr q20, [x0, #960] -sqrdmulh v6.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v12.4s, v12.4s, v2.4s -str q10, [x0, #496] -ldr q10, [x0, #768] -sqrdmulh v2.4S, v10.4S, v29.s[0] -mul v10.4S, v10.4S,v30.s[0] -sub v0.4s, v18.4s, v15.4s -add v18.4s, v18.4s, v15.4s -ldr q15, [x0, #832] -sqrdmulh v1.4S, v15.4S, v29.s[0] -mul v15.4S, v15.4S,v30.s[0] -sub v4.4s, v3.4s, v8.4s -add v3.4s, v3.4s, v8.4s -mla v16.4S, v19.4S, v31.s[0] -mla v20.4S, v6.4S, v31.s[0] -sub v6.4s, v9.4s, v7.4s -str q13, [x0, #176] -mla v10.4S, v2.4S, v31.s[0] -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v7.4s -str q14, [x0, #240] -ldr q14, [x0, #512] -sqrdmulh v7.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v1.4s, v5.4s, v21.4s -str q12, [x0, #48] -ldr q12, [x0, #576] -sqrdmulh v2.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -add v5.4s, v5.4s, v21.4s -str q17, [x0, #112] -ldr q17, [x0, #640] -ldr q21, [x0, #384] -sqrdmulh v13.4S, v17.4S, v29.s[0] -mul v17.4S, v17.4S,v30.s[0] -sub v19.4s, v21.4s, v16.4s -add v21.4s, v21.4s, v16.4s -ldr q16, [x0, #704] -ldr q8, [x0, #448] -sqrdmulh v22.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v11.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -ldr q20, [x0, #256] -mla v14.4S, v7.4S, v31.s[0] -mla v12.4S, v2.4S, v31.s[0] -sub v2.4s, v20.4s, v10.4s -str q18, [x0, #560] -mla v17.4S, v13.4S, v31.s[0] -mla v16.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -str q0, [x0, #624] -ldr q0, [x0, #320] -sqrdmulh v10.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v22.4s, v0.4s, v15.4s -str q3, [x0, #688] -sqrdmulh v3.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -add v0.4s, v0.4s, v15.4s -str q4, [x0, #752] -ldr q4, [x0, #0] -sqrdmulh v15.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v13.4s, v4.4s, v14.4s -add v4.4s, v4.4s, v14.4s -ldr q14, [x0, #64] -sqrdmulh v18.4S, v0.4S, v29.s[1] -mul v0.4S, v0.4S,v30.s[1] -sub v7.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -ldr q12, [x0, #128] -mla v21.4S, v10.4S, v31.s[0] -mla v8.4S, v3.4S, v31.s[0] -sub v3.4s, v12.4s, v17.4s -str q9, [x0, #816] -mla v20.4S, v15.4S, v31.s[0] -mla v0.4S, v18.4S, v31.s[0] -add v12.4s, v12.4s, v17.4s -str q6, [x0, #880] -ldr q6, [x0, #192] -sqrdmulh v17.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v18.4s, v6.4s, v16.4s -str q5, [x0, #944] -sqrdmulh v5.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -add v6.4s, v6.4s, v16.4s -str q1, [x0, #1008] -sqrdmulh v1.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v16.4s, v12.4s, v21.4s -add v12.4s, v12.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v15.4s, v6.4s, v8.4s -add v6.4s, v6.4s, v8.4s -mla v19.4S, v17.4S, v31.s[0] -mla v11.4S, v5.4S, v31.s[0] -sub v5.4s, v4.4s, v20.4s -nop -mla v2.4S, v1.4S, v31.s[0] -mla v22.4S, v21.4S, v31.s[0] -add v4.4s, v4.4s, v20.4s -nop -sqrdmulh v20.4S, v16.4S, v27.s[1] -mul v16.4S, v16.4S,v28.s[1] -sub v21.4s, v14.4s, v0.4s -nop -sqrdmulh v1.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -add v14.4s, v14.4s, v0.4s -nop -sqrdmulh v0.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v17.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v27.s[0] -mul v6.4S, v6.4S,v28.s[0] -sub v8.4s, v18.4s, v11.4s -add v18.4s, v18.4s, v11.4s -mla v16.4S, v20.4S, v31.s[0] -mla v15.4S, v1.4S, v31.s[0] -sub v1.4s, v13.4s, v2.4s -nop -mla v12.4S, v0.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -nop -sqrdmulh v2.4S, v3.4S, v27.s[2] -mul v3.4S, v3.4S,v28.s[2] -sub v19.4s, v7.4s, v22.4s -nop -sqrdmulh v0.4S, v18.4S, v27.s[2] -mul v18.4S, v18.4S,v28.s[2] -add v7.4s, v7.4s, v22.4s -nop -sqrdmulh v22.4S, v17.4S, v27.s[3] -mul v17.4S, v17.4S,v28.s[3] -sub v20.4s, v5.4s, v16.4s -add v5.4s, v5.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v11.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -mla v3.4S, v2.4S, v31.s[0] -mla v18.4S, v0.4S, v31.s[0] -sub v0.4s, v4.4s, v12.4s -nop -mla v17.4S, v22.4S, v31.s[0] -mla v8.4S, v16.4S, v31.s[0] -add v4.4s, v4.4s, v12.4s -nop -sqrdmulh v12.4S, v21.4S, v25.s[2] -mul v21.4S, v21.4S,v26.s[2] -sub v16.4s, v14.4s, v6.4s -nop -sqrdmulh v22.4S, v11.4S, v25.s[3] -mul v11.4S, v11.4S,v26.s[3] -add v14.4s, v14.4s, v6.4s -nop -sqrdmulh v6.4S, v16.4S, v25.s[1] -mul v16.4S, v16.4S,v26.s[1] -sub v2.4s, v13.4s, v3.4s -add v13.4s, v13.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v25.s[0] -mul v14.4S, v14.4S,v26.s[0] -sub v15.4s, v7.4s, v18.4s -add v7.4s, v7.4s, v18.4s -mla v21.4S, v12.4S, v31.s[0] -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v1.4s, v17.4s -nop -mla v16.4S, v6.4S, v31.s[0] -mla v14.4S, v3.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -nop -sqrdmulh v17.4S, v7.4S, v23.s[0] -mul v7.4S, v7.4S,v24.s[0] -sub v3.4s, v19.4s, v8.4s -nop -sqrdmulh v6.4S, v15.4S, v23.s[1] -mul v15.4S, v15.4S,v24.s[1] -add v19.4s, v19.4s, v8.4s -nop -sqrdmulh v8.4S, v19.4S, v23.s[2] -mul v19.4S, v19.4S,v24.s[2] -sub v12.4s, v5.4s, v21.4s -add v5.4s, v5.4s, v21.4s -sqrdmulh v21.4S, v3.4S, v23.s[3] -mul v3.4S, v3.4S,v24.s[3] -sub v18.4s, v20.4s, v11.4s -add v20.4s, v20.4s, v11.4s -mla v7.4S, v17.4S, v31.s[0] -mla v15.4S, v6.4S, v31.s[0] -sub v6.4s, v0.4s, v16.4s -str q5, [x0, #256] -mla v19.4S, v8.4S, v31.s[0] -mla v3.4S, v21.4S, v31.s[0] -add v0.4s, v0.4s, v16.4s -str q12, [x0, #320] -ldr q12, [x0, #912] -sqrdmulh v16.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -sub v21.4s, v4.4s, v14.4s -str q20, [x0, #384] -ldr q20, [x0, #976] -sqrdmulh v8.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v4.4s, v4.4s, v14.4s -str q18, [x0, #448] -ldr q18, [x0, #784] -sqrdmulh v14.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -sub v5.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -ldr q7, [x0, #848] -sqrdmulh v17.4S, v7.4S, v29.s[0] -mul v7.4S, v7.4S,v30.s[0] -sub v11.4s, v2.4s, v15.4s -add v2.4s, v2.4s, v15.4s -mla v12.4S, v16.4S, v31.s[0] -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v1.4s, v19.4s -str q0, [x0, #128] -mla v18.4S, v14.4S, v31.s[0] -mla v7.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v19.4s -str q6, [x0, #192] -ldr q6, [x0, #528] -sqrdmulh v19.4S, v6.4S, v29.s[0] -mul v6.4S, v6.4S,v30.s[0] -sub v17.4s, v22.4s, v3.4s -str q4, [x0, #0] -ldr q4, [x0, #592] -sqrdmulh v14.4S, v4.4S, v29.s[0] -mul v4.4S, v4.4S,v30.s[0] -add v22.4s, v22.4s, v3.4s -str q21, [x0, #64] -ldr q21, [x0, #656] -ldr q3, [x0, #400] -sqrdmulh v0.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -sub v16.4s, v3.4s, v12.4s -add v3.4s, v3.4s, v12.4s -ldr q12, [x0, #720] -ldr q15, [x0, #464] -sqrdmulh v9.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -sub v10.4s, v15.4s, v20.4s -add v15.4s, v15.4s, v20.4s -ldr q20, [x0, #272] -mla v6.4S, v19.4S, v31.s[0] -mla v4.4S, v14.4S, v31.s[0] -sub v14.4s, v20.4s, v18.4s -str q13, [x0, #512] -mla v21.4S, v0.4S, v31.s[0] -mla v12.4S, v9.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -str q5, [x0, #576] -ldr q5, [x0, #336] -sqrdmulh v18.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v9.4s, v5.4s, v7.4s -str q2, [x0, #640] -sqrdmulh v2.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -add v5.4s, v5.4s, v7.4s -str q11, [x0, #704] -ldr q11, [x0, #16] -sqrdmulh v7.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v0.4s, v11.4s, v6.4s -add v11.4s, v11.4s, v6.4s -ldr q6, [x0, #80] -sqrdmulh v13.4S, v5.4S, v29.s[1] -mul v5.4S, v5.4S,v30.s[1] -sub v19.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -ldr q4, [x0, #144] -mla v3.4S, v18.4S, v31.s[0] -mla v15.4S, v2.4S, v31.s[0] -sub v2.4s, v4.4s, v21.4s -str q1, [x0, #768] -mla v20.4S, v7.4S, v31.s[0] -mla v5.4S, v13.4S, v31.s[0] -add v4.4s, v4.4s, v21.4s -str q8, [x0, #832] -ldr q8, [x0, #208] -sqrdmulh v21.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -sub v13.4s, v8.4s, v12.4s -str q22, [x0, #896] -sqrdmulh v22.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -add v8.4s, v8.4s, v12.4s -str q17, [x0, #960] -sqrdmulh v17.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v12.4s, v4.4s, v3.4s -add v4.4s, v4.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v7.4s, v8.4s, v15.4s -add v8.4s, v8.4s, v15.4s -mla v16.4S, v21.4S, v31.s[0] -mla v10.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v20.4s -nop -mla v14.4S, v17.4S, v31.s[0] -mla v9.4S, v3.4S, v31.s[0] -add v11.4s, v11.4s, v20.4s -nop -sqrdmulh v20.4S, v12.4S, v27.s[1] -mul v12.4S, v12.4S,v28.s[1] -sub v3.4s, v6.4s, v5.4s -nop -sqrdmulh v17.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -add v6.4s, v6.4s, v5.4s -nop -sqrdmulh v5.4S, v4.4S, v27.s[0] -mul v4.4S, v4.4S,v28.s[0] -sub v21.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v27.s[0] -mul v8.4S, v8.4S,v28.s[0] -sub v15.4s, v13.4s, v10.4s -add v13.4s, v13.4s, v10.4s -mla v12.4S, v20.4S, v31.s[0] -mla v7.4S, v17.4S, v31.s[0] -sub v17.4s, v0.4s, v14.4s -nop -mla v4.4S, v5.4S, v31.s[0] -mla v8.4S, v16.4S, v31.s[0] -add v0.4s, v0.4s, v14.4s -nop -sqrdmulh v14.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v16.4s, v19.4s, v9.4s -nop -sqrdmulh v5.4S, v13.4S, v27.s[2] -mul v13.4S, v13.4S,v28.s[2] -add v19.4s, v19.4s, v9.4s -nop -sqrdmulh v9.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v10.4s, v3.4s, v7.4s -add v3.4s, v3.4s, v7.4s -mla v2.4S, v14.4S, v31.s[0] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v11.4s, v4.4s -nop -mla v21.4S, v9.4S, v31.s[0] -mla v15.4S, v12.4S, v31.s[0] -add v11.4s, v11.4s, v4.4s -nop -sqrdmulh v4.4S, v3.4S, v25.s[2] -mul v3.4S, v3.4S,v26.s[2] -sub v12.4s, v6.4s, v8.4s -nop -sqrdmulh v9.4S, v10.4S, v25.s[3] -mul v10.4S, v10.4S,v26.s[3] -add v6.4s, v6.4s, v8.4s -nop -sqrdmulh v8.4S, v12.4S, v25.s[1] -mul v12.4S, v12.4S,v26.s[1] -sub v14.4s, v0.4s, v2.4s -add v0.4s, v0.4s, v2.4s -sqrdmulh v2.4S, v6.4S, v25.s[0] -mul v6.4S, v6.4S,v26.s[0] -sub v7.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -mla v3.4S, v4.4S, v31.s[0] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v17.4s, v21.4s -nop -mla v12.4S, v8.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -nop -sqrdmulh v21.4S, v19.4S, v23.s[0] -mul v19.4S, v19.4S,v24.s[0] -sub v2.4s, v16.4s, v15.4s -nop -sqrdmulh v8.4S, v7.4S, v23.s[1] -mul v7.4S, v7.4S,v24.s[1] -add v16.4s, v16.4s, v15.4s -nop -sqrdmulh v15.4S, v16.4S, v23.s[2] -mul v16.4S, v16.4S,v24.s[2] -sub v4.4s, v22.4s, v3.4s -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v2.4S, v23.s[3] -mul v2.4S, v2.4S,v24.s[3] -sub v13.4s, v20.4s, v10.4s -add v20.4s, v20.4s, v10.4s -mla v19.4S, v21.4S, v31.s[0] -mla v7.4S, v8.4S, v31.s[0] -sub v8.4s, v5.4s, v12.4s -str q22, [x0, #272] -mla v16.4S, v15.4S, v31.s[0] -mla v2.4S, v3.4S, v31.s[0] -add v5.4s, v5.4s, v12.4s -str q4, [x0, #336] -sub v23.4s, v11.4s, v6.4s -str q20, [x0, #400] -add v11.4s, v11.4s, v6.4s -str q13, [x0, #464] -sub v13.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sub v19.4s, v14.4s, v7.4s -add v14.4s, v14.4s, v7.4s -sub v7.4s, v17.4s, v16.4s -str q5, [x0, #144] -add v17.4s, v17.4s, v16.4s -str q8, [x0, #208] -sub v8.4s, v9.4s, v2.4s -str q11, [x0, #16] -add v9.4s, v9.4s, v2.4s -str q23, [x0, #80] -str q0, [x0, #528] -str q13, [x0, #592] -str q14, [x0, #656] -str q19, [x0, #720] -str q17, [x0, #784] -str q7, [x0, #848] -str q9, [x0, #912] -str q8, [x0, #976] -ldr q18, [x0, #224] -ldr q1, [x0, #160] -ldr q10, [x0, #32] -ldr q21, [x17, #+128] -ldr q22, [x17, #+144] -sqrdmulh v15.4S, v10.4S, v22.s[0] -mul v10.4S, v10.4S,v21.s[0] -ldr q3, [x0, #48] -sqrdmulh v12.4S, v3.4S, v22.s[0] -mul v3.4S, v3.4S,v21.s[0] -ldr q4, [x17, #+160] -ldr q30, [x17, #+176] -ldr q29, [x0, #96] -sqrdmulh v28.4S, v29.4S, v30.s[0] -mul v29.4S, v29.4S,v4.s[0] -ldr q27, [x0, #112] -sqrdmulh v26.4S, v27.4S, v30.s[0] -mul v27.4S, v27.4S,v4.s[0] -ldr q25, [x17, #+192] -ldr q24, [x17, #+208] -mla v10.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v1.4S, v24.s[0] -ldr q20, [x0, #176] -mla v3.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v20.4S, v24.s[0] -ldr q6, [x17, #+224] -ldr q5, [x17, #+240] -mla v29.4S, v28.4S, v31.s[0] -sqrdmulh v28.4S, v18.4S, v5.s[0] -ldr q16, [x0, #240] -mla v27.4S, v26.4S, v31.s[0] -sqrdmulh v26.4S, v16.4S, v5.s[0] -ldr q11, [x0, #0] -ldr q2, [x0, #128] -mul v1.4S, v1.4S,v25.s[0] -sub v23.4s, v11.4s, v10.4s -ldr q0, [x0, #16] -mul v20.4S, v20.4S,v25.s[0] -add v11.4s, v11.4s, v10.4s -ldr q10, [x0, #144] -mla v1.4S, v15.4S, v31.s[0] -sub v15.4s, v0.4s, v3.4s -ldr q13, [x0, #64] -mla v20.4S, v12.4S, v31.s[0] -add v0.4s, v0.4s, v3.4s -ldr q3, [x0, #192] -mul v18.4S, v18.4S,v6.s[0] -sub v12.4s, v13.4s, v29.4s -ldr q14, [x0, #80] -mul v16.4S, v16.4S,v6.s[0] -add v13.4s, v13.4s, v29.4s -ldr q29, [x0, #208] -mla v18.4S, v28.4S, v31.s[0] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v14.4s, v27.4s -sqrdmulh v28.4S, v0.4S, v22.s[1] -add v14.4s, v14.4s, v27.4s -mul v0.4S, v0.4S,v21.s[1] -sqrdmulh v27.4S, v15.4S, v22.s[2] -sub v19.4s, v2.4s, v1.4s -mul v15.4S, v15.4S,v21.s[2] -add v2.4s, v2.4s, v1.4s -sqrdmulh v22.4S, v14.4S, v30.s[1] -sub v21.4s, v10.4s, v20.4s -mul v14.4S, v14.4S,v4.s[1] -add v10.4s, v10.4s, v20.4s -sqrdmulh v20.4S, v26.4S, v30.s[2] -sub v1.4s, v3.4s, v18.4s -mul v26.4S, v26.4S,v4.s[2] -add v3.4s, v3.4s, v18.4s -mla v0.4S, v28.4S, v31.s[0] -sub v28.4s, v29.4s, v16.4s -ldr q30, [x0, #480] -sqrdmulh v4.4S, v10.4S, v24.s[1] -add v29.4s, v29.4s, v16.4s -mla v15.4S, v27.4S, v31.s[0] -ldr q27, [x0, #416] -sqrdmulh v16.4S, v21.4S, v24.s[2] -sub v18.4s, v11.4s, v0.4s -mla v14.4S, v22.4S, v31.s[0] -ldr q22, [x0, #288] -sqrdmulh v17.4S, v29.4S, v5.s[1] -add v11.4s, v11.4s, v0.4s -str q18, [x0, #16] -mla v26.4S, v20.4S, v31.s[0] -ldr q20, [x17, #+256] -ldr q18, [x17, #+272] -sqrdmulh v0.4S, v28.4S, v5.s[2] -sub v7.4s, v23.4s, v15.4s -str q11, [x0, #0] -mul v10.4S, v10.4S,v25.s[1] -add v23.4s, v23.4s, v15.4s -mul v21.4S, v21.4S,v25.s[2] -str q7, [x0, #48] -mla v10.4S, v4.4S, v31.s[0] -sub v4.4s, v13.4s, v14.4s -mla v21.4S, v16.4S, v31.s[0] -str q23, [x0, #32] -mul v29.4S, v29.4S,v6.s[1] -str q4, [x0, #80] -mul v28.4S, v28.4S,v6.s[2] -add v13.4s, v13.4s, v14.4s -str q13, [x0, #64] -mla v29.4S, v17.4S, v31.s[0] -sub v17.4s, v12.4s, v26.4s -str q17, [x0, #112] -mla v28.4S, v0.4S, v31.s[0] -add v12.4s, v12.4s, v26.4s -str q12, [x0, #96] -sqrdmulh v5.4S, v22.4S, v18.s[0] -sub v6.4s, v2.4s, v10.4s -mul v22.4S, v22.4S,v20.s[0] -str q6, [x0, #144] -ldr q6, [x0, #304] -sqrdmulh v12.4S, v6.4S, v18.s[0] -add v2.4s, v2.4s, v10.4s -mul v6.4S, v6.4S,v20.s[0] -str q2, [x0, #128] -ldr q2, [x17, #+288] -ldr q10, [x17, #+304] -ldr q26, [x0, #352] -sqrdmulh v0.4S, v26.4S, v10.s[0] -sub v17.4s, v19.4s, v21.4s -mul v26.4S, v26.4S,v2.s[0] -str q17, [x0, #176] -ldr q17, [x0, #368] -sqrdmulh v13.4S, v17.4S, v10.s[0] -add v19.4s, v19.4s, v21.4s -mul v17.4S, v17.4S,v2.s[0] -str q19, [x0, #160] -ldr q19, [x17, #+320] -ldr q21, [x17, #+336] -mla v22.4S, v5.4S, v31.s[0] -sub v5.4s, v3.4s, v29.4s -sqrdmulh v14.4S, v27.4S, v21.s[0] -str q5, [x0, #208] -ldr q5, [x0, #432] -mla v6.4S, v12.4S, v31.s[0] -add v3.4s, v3.4s, v29.4s -sqrdmulh v29.4S, v5.4S, v21.s[0] -str q3, [x0, #192] -ldr q3, [x17, #+352] -ldr q12, [x17, #+368] -mla v26.4S, v0.4S, v31.s[0] -sub v0.4s, v1.4s, v28.4s -sqrdmulh v4.4S, v30.4S, v12.s[0] -str q0, [x0, #240] -ldr q0, [x0, #496] -mla v17.4S, v13.4S, v31.s[0] -add v1.4s, v1.4s, v28.4s -sqrdmulh v28.4S, v0.4S, v12.s[0] -str q1, [x0, #224] -ldr q1, [x0, #256] -ldr q13, [x0, #384] -mul v27.4S, v27.4S,v19.s[0] -sub v24.4s, v1.4s, v22.4s -ldr q25, [x0, #272] -mul v5.4S, v5.4S,v19.s[0] -add v1.4s, v1.4s, v22.4s -ldr q22, [x0, #400] -mla v27.4S, v14.4S, v31.s[0] -sub v14.4s, v25.4s, v6.4s -ldr q23, [x0, #320] -mla v5.4S, v29.4S, v31.s[0] -add v25.4s, v25.4s, v6.4s -ldr q6, [x0, #448] -mul v30.4S, v30.4S,v3.s[0] -sub v29.4s, v23.4s, v26.4s -ldr q16, [x0, #336] -mul v0.4S, v0.4S,v3.s[0] -add v23.4s, v23.4s, v26.4s -ldr q26, [x0, #464] -mla v30.4S, v4.4S, v31.s[0] -mla v0.4S, v28.4S, v31.s[0] -sub v28.4s, v16.4s, v17.4s -sqrdmulh v4.4S, v25.4S, v18.s[1] -add v16.4s, v16.4s, v17.4s -mul v25.4S, v25.4S,v20.s[1] -sqrdmulh v17.4S, v14.4S, v18.s[2] -sub v7.4s, v13.4s, v27.4s -mul v14.4S, v14.4S,v20.s[2] -add v13.4s, v13.4s, v27.4s -sqrdmulh v18.4S, v16.4S, v10.s[1] -sub v20.4s, v22.4s, v5.4s -mul v16.4S, v16.4S,v2.s[1] -add v22.4s, v22.4s, v5.4s -sqrdmulh v5.4S, v28.4S, v10.s[2] -sub v27.4s, v6.4s, v30.4s -mul v28.4S, v28.4S,v2.s[2] -add v6.4s, v6.4s, v30.4s -mla v25.4S, v4.4S, v31.s[0] -sub v4.4s, v26.4s, v0.4s -ldr q10, [x0, #736] -sqrdmulh v2.4S, v22.4S, v21.s[1] -add v26.4s, v26.4s, v0.4s -mla v14.4S, v17.4S, v31.s[0] -ldr q17, [x0, #672] -sqrdmulh v0.4S, v20.4S, v21.s[2] -sub v30.4s, v1.4s, v25.4s -mla v16.4S, v18.4S, v31.s[0] -ldr q18, [x0, #544] -sqrdmulh v15.4S, v26.4S, v12.s[1] -add v1.4s, v1.4s, v25.4s -str q30, [x0, #272] -mla v28.4S, v5.4S, v31.s[0] -ldr q5, [x17, #+384] -ldr q30, [x17, #+400] -sqrdmulh v25.4S, v4.4S, v12.s[2] -sub v11.4s, v24.4s, v14.4s -str q1, [x0, #256] -mul v22.4S, v22.4S,v19.s[1] -add v24.4s, v24.4s, v14.4s -mul v20.4S, v20.4S,v19.s[2] -str q11, [x0, #304] -mla v22.4S, v2.4S, v31.s[0] -sub v2.4s, v23.4s, v16.4s -mla v20.4S, v0.4S, v31.s[0] -str q24, [x0, #288] -mul v26.4S, v26.4S,v3.s[1] -str q2, [x0, #336] -mul v4.4S, v4.4S,v3.s[2] -add v23.4s, v23.4s, v16.4s -str q23, [x0, #320] -mla v26.4S, v15.4S, v31.s[0] -sub v15.4s, v29.4s, v28.4s -str q15, [x0, #368] -mla v4.4S, v25.4S, v31.s[0] -add v29.4s, v29.4s, v28.4s -str q29, [x0, #352] -sqrdmulh v12.4S, v18.4S, v30.s[0] -sub v3.4s, v13.4s, v22.4s -mul v18.4S, v18.4S,v5.s[0] -str q3, [x0, #400] -ldr q3, [x0, #560] -sqrdmulh v29.4S, v3.4S, v30.s[0] -add v13.4s, v13.4s, v22.4s -mul v3.4S, v3.4S,v5.s[0] -str q13, [x0, #384] -ldr q13, [x17, #+416] -ldr q22, [x17, #+432] -ldr q28, [x0, #608] -sqrdmulh v25.4S, v28.4S, v22.s[0] -sub v15.4s, v7.4s, v20.4s -mul v28.4S, v28.4S,v13.s[0] -str q15, [x0, #432] -ldr q15, [x0, #624] -sqrdmulh v23.4S, v15.4S, v22.s[0] -add v7.4s, v7.4s, v20.4s -mul v15.4S, v15.4S,v13.s[0] -str q7, [x0, #416] -ldr q7, [x17, #+448] -ldr q20, [x17, #+464] -mla v18.4S, v12.4S, v31.s[0] -sub v12.4s, v6.4s, v26.4s -sqrdmulh v16.4S, v17.4S, v20.s[0] -str q12, [x0, #464] -ldr q12, [x0, #688] -mla v3.4S, v29.4S, v31.s[0] -add v6.4s, v6.4s, v26.4s -sqrdmulh v26.4S, v12.4S, v20.s[0] -str q6, [x0, #448] -ldr q6, [x17, #+480] -ldr q29, [x17, #+496] -mla v28.4S, v25.4S, v31.s[0] -sub v25.4s, v27.4s, v4.4s -sqrdmulh v2.4S, v10.4S, v29.s[0] -str q25, [x0, #496] -ldr q25, [x0, #752] -mla v15.4S, v23.4S, v31.s[0] -add v27.4s, v27.4s, v4.4s -sqrdmulh v4.4S, v25.4S, v29.s[0] -str q27, [x0, #480] -ldr q27, [x0, #512] -ldr q23, [x0, #640] -mul v17.4S, v17.4S,v7.s[0] -sub v21.4s, v27.4s, v18.4s -ldr q19, [x0, #528] -mul v12.4S, v12.4S,v7.s[0] -add v27.4s, v27.4s, v18.4s -ldr q18, [x0, #656] -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v19.4s, v3.4s -ldr q24, [x0, #576] -mla v12.4S, v26.4S, v31.s[0] -add v19.4s, v19.4s, v3.4s -ldr q3, [x0, #704] -mul v10.4S, v10.4S,v6.s[0] -sub v26.4s, v24.4s, v28.4s -ldr q0, [x0, #592] -mul v25.4S, v25.4S,v6.s[0] -add v24.4s, v24.4s, v28.4s -ldr q28, [x0, #720] -mla v10.4S, v2.4S, v31.s[0] -mla v25.4S, v4.4S, v31.s[0] -sub v4.4s, v0.4s, v15.4s -sqrdmulh v2.4S, v19.4S, v30.s[1] -add v0.4s, v0.4s, v15.4s -mul v19.4S, v19.4S,v5.s[1] -sqrdmulh v15.4S, v16.4S, v30.s[2] -sub v11.4s, v23.4s, v17.4s -mul v16.4S, v16.4S,v5.s[2] -add v23.4s, v23.4s, v17.4s -sqrdmulh v30.4S, v0.4S, v22.s[1] -sub v5.4s, v18.4s, v12.4s -mul v0.4S, v0.4S,v13.s[1] -add v18.4s, v18.4s, v12.4s -sqrdmulh v12.4S, v4.4S, v22.s[2] -sub v17.4s, v3.4s, v10.4s -mul v4.4S, v4.4S,v13.s[2] -add v3.4s, v3.4s, v10.4s -mla v19.4S, v2.4S, v31.s[0] -sub v2.4s, v28.4s, v25.4s -ldr q22, [x0, #992] -sqrdmulh v13.4S, v18.4S, v20.s[1] -add v28.4s, v28.4s, v25.4s -mla v16.4S, v15.4S, v31.s[0] -ldr q15, [x0, #928] -sqrdmulh v25.4S, v5.4S, v20.s[2] -sub v10.4s, v27.4s, v19.4s -mla v0.4S, v30.4S, v31.s[0] -ldr q30, [x0, #800] -sqrdmulh v14.4S, v28.4S, v29.s[1] -add v27.4s, v27.4s, v19.4s -str q10, [x0, #528] -mla v4.4S, v12.4S, v31.s[0] -ldr q12, [x17, #+512] -ldr q10, [x17, #+528] -sqrdmulh v19.4S, v2.4S, v29.s[2] -sub v1.4s, v21.4s, v16.4s -str q27, [x0, #512] -mul v18.4S, v18.4S,v7.s[1] -add v21.4s, v21.4s, v16.4s -mul v5.4S, v5.4S,v7.s[2] -str q1, [x0, #560] -mla v18.4S, v13.4S, v31.s[0] -sub v13.4s, v24.4s, v0.4s -mla v5.4S, v25.4S, v31.s[0] -str q21, [x0, #544] -mul v28.4S, v28.4S,v6.s[1] -str q13, [x0, #592] -mul v2.4S, v2.4S,v6.s[2] -add v24.4s, v24.4s, v0.4s -str q24, [x0, #576] -mla v28.4S, v14.4S, v31.s[0] -sub v14.4s, v26.4s, v4.4s -str q14, [x0, #624] -mla v2.4S, v19.4S, v31.s[0] -add v26.4s, v26.4s, v4.4s -str q26, [x0, #608] -sqrdmulh v29.4S, v30.4S, v10.s[0] -sub v6.4s, v23.4s, v18.4s -mul v30.4S, v30.4S,v12.s[0] -str q6, [x0, #656] -ldr q6, [x0, #816] -sqrdmulh v26.4S, v6.4S, v10.s[0] -add v23.4s, v23.4s, v18.4s -mul v6.4S, v6.4S,v12.s[0] -str q23, [x0, #640] -ldr q23, [x17, #+544] -ldr q18, [x17, #+560] -ldr q4, [x0, #864] -sqrdmulh v19.4S, v4.4S, v18.s[0] -sub v14.4s, v11.4s, v5.4s -mul v4.4S, v4.4S,v23.s[0] -str q14, [x0, #688] -ldr q14, [x0, #880] -sqrdmulh v24.4S, v14.4S, v18.s[0] -add v11.4s, v11.4s, v5.4s -mul v14.4S, v14.4S,v23.s[0] -str q11, [x0, #672] -ldr q11, [x17, #+576] -ldr q5, [x17, #+592] -mla v30.4S, v29.4S, v31.s[0] -sub v29.4s, v3.4s, v28.4s -sqrdmulh v0.4S, v15.4S, v5.s[0] -str q29, [x0, #720] -ldr q29, [x0, #944] -mla v6.4S, v26.4S, v31.s[0] -add v3.4s, v3.4s, v28.4s -sqrdmulh v28.4S, v29.4S, v5.s[0] -str q3, [x0, #704] -ldr q3, [x17, #+608] -ldr q26, [x17, #+624] -mla v4.4S, v19.4S, v31.s[0] -sub v19.4s, v17.4s, v2.4s -sqrdmulh v13.4S, v22.4S, v26.s[0] -str q19, [x0, #752] -ldr q19, [x0, #1008] -mla v14.4S, v24.4S, v31.s[0] -add v17.4s, v17.4s, v2.4s -sqrdmulh v2.4S, v19.4S, v26.s[0] -str q17, [x0, #736] -ldr q17, [x0, #768] -ldr q24, [x0, #896] -mul v15.4S, v15.4S,v11.s[0] -sub v20.4s, v17.4s, v30.4s -ldr q7, [x0, #784] -mul v29.4S, v29.4S,v11.s[0] -add v17.4s, v17.4s, v30.4s -ldr q30, [x0, #912] -mla v15.4S, v0.4S, v31.s[0] -sub v0.4s, v7.4s, v6.4s -ldr q21, [x0, #832] -mla v29.4S, v28.4S, v31.s[0] -add v7.4s, v7.4s, v6.4s -ldr q6, [x0, #960] -mul v22.4S, v22.4S,v3.s[0] -sub v28.4s, v21.4s, v4.4s -ldr q25, [x0, #848] -mul v19.4S, v19.4S,v3.s[0] -add v21.4s, v21.4s, v4.4s -ldr q4, [x0, #976] -mla v22.4S, v13.4S, v31.s[0] -mla v19.4S, v2.4S, v31.s[0] -sub v2.4s, v25.4s, v14.4s -sqrdmulh v13.4S, v7.4S, v10.s[1] -add v25.4s, v25.4s, v14.4s -mul v7.4S, v7.4S,v12.s[1] -sqrdmulh v14.4S, v0.4S, v10.s[2] -sub v1.4s, v24.4s, v15.4s -mul v0.4S, v0.4S,v12.s[2] -add v24.4s, v24.4s, v15.4s -sqrdmulh v10.4S, v25.4S, v18.s[1] -sub v12.4s, v30.4s, v29.4s -mul v25.4S, v25.4S,v23.s[1] -add v30.4s, v30.4s, v29.4s -sqrdmulh v29.4S, v2.4S, v18.s[2] -sub v15.4s, v6.4s, v22.4s -mul v2.4S, v2.4S,v23.s[2] -add v6.4s, v6.4s, v22.4s -mla v7.4S, v13.4S, v31.s[0] -sub v13.4s, v4.4s, v19.4s -sqrdmulh v18.4S, v30.4S, v5.s[1] -add v4.4s, v4.4s, v19.4s -mla v0.4S, v14.4S, v31.s[0] -sqrdmulh v14.4S, v12.4S, v5.s[2] -sub v19.4s, v17.4s, v7.4s -mla v25.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v4.4S, v26.s[1] -add v17.4s, v17.4s, v7.4s -str q19, [x0, #784] -mla v2.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v13.4S, v26.s[2] -sub v19.4s, v20.4s, v0.4s -str q17, [x0, #768] -mul v30.4S, v30.4S,v11.s[1] -add v20.4s, v20.4s, v0.4s -mul v12.4S, v12.4S,v11.s[2] -str q19, [x0, #816] -mla v30.4S, v18.4S, v31.s[0] -sub v18.4s, v21.4s, v25.4s -mla v12.4S, v14.4S, v31.s[0] -str q20, [x0, #800] -mul v4.4S, v4.4S,v3.s[1] -str q18, [x0, #848] -mul v13.4S, v13.4S,v3.s[2] -add v21.4s, v21.4s, v25.4s -str q21, [x0, #832] -mla v4.4S, v10.4S, v31.s[0] -sub v10.4s, v28.4s, v2.4s -str q10, [x0, #880] -mla v13.4S, v29.4S, v31.s[0] -add v28.4s, v28.4s, v2.4s -str q28, [x0, #864] -sub v26.4s, v24.4s, v30.4s -str q26, [x0, #912] -add v24.4s, v24.4s, v30.4s -str q24, [x0, #896] -sub v24.4s, v1.4s, v12.4s -str q24, [x0, #944] -add v1.4s, v1.4s, v12.4s -str q1, [x0, #928] -sub v1.4s, v6.4s, v4.4s -str q1, [x0, #976] -add v6.4s, v6.4s, v4.4s -str q6, [x0, #960] -sub v6.4s, v15.4s, v13.4s -str q6, [x0, #1008] -add v15.4s, v15.4s, v13.4s -str q15, [x0, #992] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1528 -// Instruction count: 1524 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_11_z4_7.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_11_z4_7.s deleted file mode 100644 index 025fcad..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_11_z4_7.s +++ /dev/null @@ -1,1494 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_11_z4_7 -.global _ntt_u32_incomplete_neon_asm_var_4_2_11_z4_7 -ntt_u32_incomplete_neon_asm_var_4_2_11_z4_7: -_ntt_u32_incomplete_neon_asm_var_4_2_11_z4_7: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x0, #928] -ldr q29, [x17, #+0] -ldr q28, [x17, #+16] -sqrdmulh v27.4S, v30.4S, v28.s[0] -mul v30.4S, v30.4S,v29.s[0] -ldr q26, [x0, #992] -sqrdmulh v25.4S, v26.4S, v28.s[0] -mul v26.4S, v26.4S,v29.s[0] -ldr q24, [x0, #800] -sqrdmulh v23.4S, v24.4S, v28.s[0] -mul v24.4S, v24.4S,v29.s[0] -ldr q22, [x0, #864] -sqrdmulh v21.4S, v22.4S, v28.s[0] -mul v22.4S, v22.4S,v29.s[0] -ldr q20, [x0, #544] -mla v30.4S, v27.4S, v31.s[0] -sqrdmulh v27.4S, v20.4S, v28.s[0] -ldr q19, [x0, #608] -mla v26.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v19.4S, v28.s[0] -ldr q18, [x0, #672] -mla v24.4S, v23.4S, v31.s[0] -sqrdmulh v23.4S, v18.4S, v28.s[0] -ldr q17, [x0, #736] -mla v22.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v17.4S, v28.s[0] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -mul v20.4S, v20.4S,v29.s[0] -sub v2.4s, v16.4s, v30.4s -mul v19.4S, v19.4S,v29.s[0] -add v16.4s, v16.4s, v30.4s -ldr q30, [x0, #288] -ldr q1, [x0, #352] -mla v20.4S, v27.4S, v31.s[0] -sub v27.4s, v3.4s, v26.4s -mla v19.4S, v25.4S, v31.s[0] -add v3.4s, v3.4s, v26.4s -ldr q26, [x0, #32] -ldr q25, [x0, #96] -mul v18.4S, v18.4S,v29.s[0] -sub v0.4s, v30.4s, v24.4s -mul v17.4S, v17.4S,v29.s[0] -add v30.4s, v30.4s, v24.4s -ldr q24, [x0, #160] -ldr q15, [x0, #224] -mla v18.4S, v23.4S, v31.s[0] -sub v23.4s, v1.4s, v22.4s -mla v17.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v16.4S, v28.s[1] -mul v16.4S, v16.4S,v29.s[1] -sqrdmulh v21.4S, v3.4S, v28.s[1] -sub v14.4s, v26.4s, v20.4s -mul v3.4S, v3.4S,v29.s[1] -add v26.4s, v26.4s, v20.4s -sqrdmulh v20.4S, v30.4S, v28.s[1] -sub v13.4s, v25.4s, v19.4s -mul v30.4S, v30.4S,v29.s[1] -add v25.4s, v25.4s, v19.4s -sqrdmulh v19.4S, v1.4S, v28.s[1] -sub v12.4s, v24.4s, v18.4s -mul v1.4S, v1.4S,v29.s[1] -add v24.4s, v24.4s, v18.4s -mla v16.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v17.4s -sqrdmulh v18.4S, v2.4S, v28.s[2] -add v15.4s, v15.4s, v17.4s -mla v3.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v27.4S, v28.s[2] -mla v30.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v0.4S, v28.s[2] -mla v1.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v23.4S, v28.s[2] -ldr q17, [x17, #+32] -ldr q11, [x17, #+48] -mul v2.4S, v2.4S,v29.s[2] -sub v10.4s, v24.4s, v16.4s -mul v27.4S, v27.4S,v29.s[2] -add v24.4s, v24.4s, v16.4s -mla v2.4S, v18.4S, v31.s[0] -sub v18.4s, v15.4s, v3.4s -mla v27.4S, v21.4S, v31.s[0] -add v15.4s, v15.4s, v3.4s -mul v0.4S, v0.4S,v29.s[2] -sub v3.4s, v26.4s, v30.4s -mul v23.4S, v23.4S,v29.s[2] -add v26.4s, v26.4s, v30.4s -mla v0.4S, v20.4S, v31.s[0] -sub v20.4s, v25.4s, v1.4s -mla v23.4S, v19.4S, v31.s[0] -add v25.4s, v25.4s, v1.4s -sqrdmulh v1.4S, v10.4S, v11.s[1] -mul v10.4S, v10.4S,v17.s[1] -sqrdmulh v19.4S, v18.4S, v11.s[1] -sub v30.4s, v12.4s, v2.4s -mul v18.4S, v18.4S,v17.s[1] -add v12.4s, v12.4s, v2.4s -sqrdmulh v2.4S, v24.4S, v11.s[0] -sub v21.4s, v22.4s, v27.4s -mul v24.4S, v24.4S,v17.s[0] -add v22.4s, v22.4s, v27.4s -sqrdmulh v27.4S, v15.4S, v11.s[0] -sub v16.4s, v14.4s, v0.4s -mul v15.4S, v15.4S,v17.s[0] -add v14.4s, v14.4s, v0.4s -ldr q0, [x17, #+64] -ldr q9, [x17, #+80] -mla v10.4S, v1.4S, v31.s[0] -sub v1.4s, v13.4s, v23.4s -sqrdmulh v8.4S, v12.4S, v11.s[2] -add v13.4s, v13.4s, v23.4s -mla v18.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v22.4S, v11.s[2] -mla v24.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v30.4S, v11.s[3] -mla v15.4S, v27.4S, v31.s[0] -sqrdmulh v27.4S, v21.4S, v11.s[3] -ldr q23, [x17, #+96] -ldr q7, [x17, #+112] -mul v12.4S, v12.4S,v17.s[2] -sub v6.4s, v3.4s, v10.4s -mul v22.4S, v22.4S,v17.s[2] -add v3.4s, v3.4s, v10.4s -mla v12.4S, v8.4S, v31.s[0] -sub v8.4s, v20.4s, v18.4s -mla v22.4S, v19.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -mul v30.4S, v30.4S,v17.s[3] -sub v18.4s, v26.4s, v24.4s -mul v21.4S, v21.4S,v17.s[3] -add v26.4s, v26.4s, v24.4s -mla v30.4S, v2.4S, v31.s[0] -sub v2.4s, v25.4s, v15.4s -mla v21.4S, v27.4S, v31.s[0] -add v25.4s, v25.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v9.s[2] -mul v20.4S, v20.4S,v0.s[2] -sqrdmulh v27.4S, v8.4S, v9.s[3] -sub v24.4s, v14.4s, v12.4s -mul v8.4S, v8.4S,v0.s[3] -add v14.4s, v14.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v9.s[1] -sub v19.4s, v13.4s, v22.4s -mul v2.4S, v2.4S,v0.s[1] -add v13.4s, v13.4s, v22.4s -sqrdmulh v22.4S, v25.4S, v9.s[0] -sub v10.4s, v16.4s, v30.4s -mul v25.4S, v25.4S,v0.s[0] -add v16.4s, v16.4s, v30.4s -mla v20.4S, v15.4S, v31.s[0] -sub v15.4s, v1.4s, v21.4s -sqrdmulh v30.4S, v13.4S, v7.s[0] -add v1.4s, v1.4s, v21.4s -mla v8.4S, v27.4S, v31.s[0] -sqrdmulh v27.4S, v19.4S, v7.s[1] -mla v2.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v1.4S, v7.s[2] -mla v25.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v15.4S, v7.s[3] -mul v13.4S, v13.4S,v23.s[0] -sub v21.4s, v3.4s, v20.4s -str q21, [x0, #352] -mul v19.4S, v19.4S,v23.s[1] -add v3.4s, v3.4s, v20.4s -str q3, [x0, #288] -mla v13.4S, v30.4S, v31.s[0] -sub v30.4s, v6.4s, v8.4s -str q30, [x0, #480] -mla v19.4S, v27.4S, v31.s[0] -add v6.4s, v6.4s, v8.4s -str q6, [x0, #416] -mul v1.4S, v1.4S,v23.s[2] -sub v6.4s, v18.4s, v2.4s -str q6, [x0, #224] -mul v15.4S, v15.4S,v23.s[3] -add v18.4s, v18.4s, v2.4s -str q18, [x0, #160] -mla v1.4S, v12.4S, v31.s[0] -sub v12.4s, v26.4s, v25.4s -str q12, [x0, #96] -mla v15.4S, v22.4S, v31.s[0] -add v26.4s, v26.4s, v25.4s -str q26, [x0, #32] -ldr q26, [x0, #944] -sqrdmulh v25.4S, v26.4S, v28.s[0] -mul v26.4S, v26.4S,v29.s[0] -ldr q22, [x0, #1008] -sqrdmulh v12.4S, v22.4S, v28.s[0] -sub v18.4s, v14.4s, v13.4s -str q18, [x0, #608] -mul v22.4S, v22.4S,v29.s[0] -add v14.4s, v14.4s, v13.4s -str q14, [x0, #544] -ldr q14, [x0, #816] -sqrdmulh v13.4S, v14.4S, v28.s[0] -sub v18.4s, v24.4s, v19.4s -str q18, [x0, #736] -mul v14.4S, v14.4S,v29.s[0] -add v24.4s, v24.4s, v19.4s -str q24, [x0, #672] -ldr q24, [x0, #880] -sqrdmulh v19.4S, v24.4S, v28.s[0] -sub v18.4s, v16.4s, v1.4s -str q18, [x0, #864] -mul v24.4S, v24.4S,v29.s[0] -add v16.4s, v16.4s, v1.4s -str q16, [x0, #800] -ldr q16, [x0, #560] -mla v26.4S, v25.4S, v31.s[0] -sub v25.4s, v10.4s, v15.4s -str q25, [x0, #992] -sqrdmulh v25.4S, v16.4S, v28.s[0] -add v10.4s, v10.4s, v15.4s -str q10, [x0, #928] -ldr q10, [x0, #624] -mla v22.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v10.4S, v28.s[0] -ldr q15, [x0, #688] -mla v14.4S, v13.4S, v31.s[0] -sqrdmulh v13.4S, v15.4S, v28.s[0] -ldr q1, [x0, #752] -mla v24.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v1.4S, v28.s[0] -ldr q18, [x0, #432] -ldr q2, [x0, #496] -mul v16.4S, v16.4S,v29.s[0] -sub v6.4s, v18.4s, v26.4s -mul v10.4S, v10.4S,v29.s[0] -add v18.4s, v18.4s, v26.4s -ldr q26, [x0, #304] -ldr q8, [x0, #368] -mla v16.4S, v25.4S, v31.s[0] -sub v25.4s, v2.4s, v22.4s -mla v10.4S, v12.4S, v31.s[0] -add v2.4s, v2.4s, v22.4s -ldr q22, [x0, #48] -ldr q12, [x0, #112] -mul v15.4S, v15.4S,v29.s[0] -sub v27.4s, v26.4s, v14.4s -mul v1.4S, v1.4S,v29.s[0] -add v26.4s, v26.4s, v14.4s -ldr q14, [x0, #176] -ldr q30, [x0, #240] -mla v15.4S, v13.4S, v31.s[0] -sub v13.4s, v8.4s, v24.4s -mla v1.4S, v19.4S, v31.s[0] -add v8.4s, v8.4s, v24.4s -sqrdmulh v24.4S, v18.4S, v28.s[1] -mul v18.4S, v18.4S,v29.s[1] -sqrdmulh v19.4S, v2.4S, v28.s[1] -sub v3.4s, v22.4s, v16.4s -mul v2.4S, v2.4S,v29.s[1] -add v22.4s, v22.4s, v16.4s -sqrdmulh v16.4S, v26.4S, v28.s[1] -sub v20.4s, v12.4s, v10.4s -mul v26.4S, v26.4S,v29.s[1] -add v12.4s, v12.4s, v10.4s -sqrdmulh v10.4S, v8.4S, v28.s[1] -sub v21.4s, v14.4s, v15.4s -mul v8.4S, v8.4S,v29.s[1] -add v14.4s, v14.4s, v15.4s -mla v18.4S, v24.4S, v31.s[0] -sub v24.4s, v30.4s, v1.4s -sqrdmulh v15.4S, v6.4S, v28.s[2] -add v30.4s, v30.4s, v1.4s -mla v2.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v25.4S, v28.s[2] -mla v26.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v27.4S, v28.s[2] -mla v8.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v13.4S, v28.s[2] -mul v6.4S, v6.4S,v29.s[2] -sub v1.4s, v14.4s, v18.4s -mul v25.4S, v25.4S,v29.s[2] -add v14.4s, v14.4s, v18.4s -mla v6.4S, v15.4S, v31.s[0] -sub v15.4s, v30.4s, v2.4s -mla v25.4S, v19.4S, v31.s[0] -add v30.4s, v30.4s, v2.4s -mul v27.4S, v27.4S,v29.s[2] -sub v2.4s, v22.4s, v26.4s -mul v13.4S, v13.4S,v29.s[2] -add v22.4s, v22.4s, v26.4s -mla v27.4S, v16.4S, v31.s[0] -sub v16.4s, v12.4s, v8.4s -mla v13.4S, v10.4S, v31.s[0] -add v12.4s, v12.4s, v8.4s -sqrdmulh v8.4S, v1.4S, v11.s[1] -mul v1.4S, v1.4S,v17.s[1] -sqrdmulh v10.4S, v15.4S, v11.s[1] -sub v26.4s, v21.4s, v6.4s -mul v15.4S, v15.4S,v17.s[1] -add v21.4s, v21.4s, v6.4s -sqrdmulh v6.4S, v14.4S, v11.s[0] -sub v19.4s, v24.4s, v25.4s -mul v14.4S, v14.4S,v17.s[0] -add v24.4s, v24.4s, v25.4s -sqrdmulh v25.4S, v30.4S, v11.s[0] -sub v18.4s, v3.4s, v27.4s -mul v30.4S, v30.4S,v17.s[0] -add v3.4s, v3.4s, v27.4s -mla v1.4S, v8.4S, v31.s[0] -sub v8.4s, v20.4s, v13.4s -sqrdmulh v27.4S, v21.4S, v11.s[2] -add v20.4s, v20.4s, v13.4s -mla v15.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v24.4S, v11.s[2] -mla v14.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v26.4S, v11.s[3] -mla v30.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v19.4S, v11.s[3] -mul v21.4S, v21.4S,v17.s[2] -sub v13.4s, v2.4s, v1.4s -mul v24.4S, v24.4S,v17.s[2] -add v2.4s, v2.4s, v1.4s -mla v21.4S, v27.4S, v31.s[0] -sub v27.4s, v16.4s, v15.4s -mla v24.4S, v10.4S, v31.s[0] -add v16.4s, v16.4s, v15.4s -mul v26.4S, v26.4S,v17.s[3] -sub v15.4s, v22.4s, v14.4s -mul v19.4S, v19.4S,v17.s[3] -add v22.4s, v22.4s, v14.4s -mla v26.4S, v6.4S, v31.s[0] -sub v6.4s, v12.4s, v30.4s -mla v19.4S, v25.4S, v31.s[0] -add v12.4s, v12.4s, v30.4s -sqrdmulh v30.4S, v16.4S, v9.s[2] -mul v16.4S, v16.4S,v0.s[2] -sqrdmulh v25.4S, v27.4S, v9.s[3] -sub v14.4s, v3.4s, v21.4s -mul v27.4S, v27.4S,v0.s[3] -add v3.4s, v3.4s, v21.4s -sqrdmulh v21.4S, v6.4S, v9.s[1] -sub v10.4s, v20.4s, v24.4s -mul v6.4S, v6.4S,v0.s[1] -add v20.4s, v20.4s, v24.4s -sqrdmulh v24.4S, v12.4S, v9.s[0] -sub v1.4s, v18.4s, v26.4s -mul v12.4S, v12.4S,v0.s[0] -add v18.4s, v18.4s, v26.4s -mla v16.4S, v30.4S, v31.s[0] -sub v30.4s, v8.4s, v19.4s -sqrdmulh v26.4S, v20.4S, v7.s[0] -add v8.4s, v8.4s, v19.4s -mla v27.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v10.4S, v7.s[1] -mla v6.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v8.4S, v7.s[2] -mla v12.4S, v24.4S, v31.s[0] -sqrdmulh v24.4S, v30.4S, v7.s[3] -mul v20.4S, v20.4S,v23.s[0] -sub v19.4s, v2.4s, v16.4s -str q19, [x0, #368] -mul v10.4S, v10.4S,v23.s[1] -add v2.4s, v2.4s, v16.4s -str q2, [x0, #304] -mla v20.4S, v26.4S, v31.s[0] -sub v26.4s, v13.4s, v27.4s -str q26, [x0, #496] -mla v10.4S, v25.4S, v31.s[0] -add v13.4s, v13.4s, v27.4s -str q13, [x0, #432] -mul v8.4S, v8.4S,v23.s[2] -sub v13.4s, v15.4s, v6.4s -str q13, [x0, #240] -mul v30.4S, v30.4S,v23.s[3] -add v15.4s, v15.4s, v6.4s -str q15, [x0, #176] -mla v8.4S, v21.4S, v31.s[0] -sub v21.4s, v22.4s, v12.4s -str q21, [x0, #112] -mla v30.4S, v24.4S, v31.s[0] -add v22.4s, v22.4s, v12.4s -str q22, [x0, #48] -ldr q22, [x0, #896] -sqrdmulh v12.4S, v22.4S, v28.s[0] -mul v22.4S, v22.4S,v29.s[0] -ldr q24, [x0, #960] -sqrdmulh v21.4S, v24.4S, v28.s[0] -sub v15.4s, v3.4s, v20.4s -str q15, [x0, #624] -mul v24.4S, v24.4S,v29.s[0] -add v3.4s, v3.4s, v20.4s -str q3, [x0, #560] -ldr q3, [x0, #768] -sqrdmulh v20.4S, v3.4S, v28.s[0] -sub v15.4s, v14.4s, v10.4s -str q15, [x0, #752] -mul v3.4S, v3.4S,v29.s[0] -add v14.4s, v14.4s, v10.4s -str q14, [x0, #688] -ldr q14, [x0, #832] -sqrdmulh v10.4S, v14.4S, v28.s[0] -sub v15.4s, v18.4s, v8.4s -str q15, [x0, #880] -mul v14.4S, v14.4S,v29.s[0] -add v18.4s, v18.4s, v8.4s -str q18, [x0, #816] -ldr q18, [x0, #512] -mla v22.4S, v12.4S, v31.s[0] -sub v12.4s, v1.4s, v30.4s -str q12, [x0, #1008] -sqrdmulh v12.4S, v18.4S, v28.s[0] -add v1.4s, v1.4s, v30.4s -str q1, [x0, #944] -ldr q1, [x0, #576] -mla v24.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v1.4S, v28.s[0] -ldr q30, [x0, #640] -mla v3.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v30.4S, v28.s[0] -ldr q8, [x0, #704] -mla v14.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v8.4S, v28.s[0] -ldr q15, [x0, #384] -ldr q6, [x0, #448] -mul v18.4S, v18.4S,v29.s[0] -sub v13.4s, v15.4s, v22.4s -mul v1.4S, v1.4S,v29.s[0] -add v15.4s, v15.4s, v22.4s -ldr q22, [x0, #256] -ldr q27, [x0, #320] -mla v18.4S, v12.4S, v31.s[0] -sub v12.4s, v6.4s, v24.4s -mla v1.4S, v21.4S, v31.s[0] -add v6.4s, v6.4s, v24.4s -ldr q24, [x0, #0] -ldr q21, [x0, #64] -mul v30.4S, v30.4S,v29.s[0] -sub v25.4s, v22.4s, v3.4s -mul v8.4S, v8.4S,v29.s[0] -add v22.4s, v22.4s, v3.4s -ldr q3, [x0, #128] -ldr q26, [x0, #192] -mla v30.4S, v20.4S, v31.s[0] -sub v20.4s, v27.4s, v14.4s -mla v8.4S, v10.4S, v31.s[0] -add v27.4s, v27.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v28.s[1] -mul v15.4S, v15.4S,v29.s[1] -sqrdmulh v10.4S, v6.4S, v28.s[1] -sub v2.4s, v24.4s, v18.4s -mul v6.4S, v6.4S,v29.s[1] -add v24.4s, v24.4s, v18.4s -sqrdmulh v18.4S, v22.4S, v28.s[1] -sub v16.4s, v21.4s, v1.4s -mul v22.4S, v22.4S,v29.s[1] -add v21.4s, v21.4s, v1.4s -sqrdmulh v1.4S, v27.4S, v28.s[1] -sub v19.4s, v3.4s, v30.4s -mul v27.4S, v27.4S,v29.s[1] -add v3.4s, v3.4s, v30.4s -mla v15.4S, v14.4S, v31.s[0] -sub v14.4s, v26.4s, v8.4s -sqrdmulh v30.4S, v13.4S, v28.s[2] -add v26.4s, v26.4s, v8.4s -mla v6.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v12.4S, v28.s[2] -mla v22.4S, v18.4S, v31.s[0] -sqrdmulh v18.4S, v25.4S, v28.s[2] -mla v27.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v20.4S, v28.s[2] -mul v13.4S, v13.4S,v29.s[2] -sub v8.4s, v3.4s, v15.4s -mul v12.4S, v12.4S,v29.s[2] -add v3.4s, v3.4s, v15.4s -mla v13.4S, v30.4S, v31.s[0] -sub v30.4s, v26.4s, v6.4s -mla v12.4S, v10.4S, v31.s[0] -add v26.4s, v26.4s, v6.4s -mul v25.4S, v25.4S,v29.s[2] -sub v6.4s, v24.4s, v22.4s -mul v20.4S, v20.4S,v29.s[2] -add v24.4s, v24.4s, v22.4s -mla v25.4S, v18.4S, v31.s[0] -sub v18.4s, v21.4s, v27.4s -mla v20.4S, v1.4S, v31.s[0] -add v21.4s, v21.4s, v27.4s -sqrdmulh v27.4S, v8.4S, v11.s[1] -mul v8.4S, v8.4S,v17.s[1] -sqrdmulh v1.4S, v30.4S, v11.s[1] -sub v22.4s, v19.4s, v13.4s -mul v30.4S, v30.4S,v17.s[1] -add v19.4s, v19.4s, v13.4s -sqrdmulh v13.4S, v3.4S, v11.s[0] -sub v10.4s, v14.4s, v12.4s -mul v3.4S, v3.4S,v17.s[0] -add v14.4s, v14.4s, v12.4s -sqrdmulh v12.4S, v26.4S, v11.s[0] -sub v15.4s, v2.4s, v25.4s -mul v26.4S, v26.4S,v17.s[0] -add v2.4s, v2.4s, v25.4s -mla v8.4S, v27.4S, v31.s[0] -sub v27.4s, v16.4s, v20.4s -sqrdmulh v25.4S, v19.4S, v11.s[2] -add v16.4s, v16.4s, v20.4s -mla v30.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v14.4S, v11.s[2] -mla v3.4S, v13.4S, v31.s[0] -sqrdmulh v13.4S, v22.4S, v11.s[3] -mla v26.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v10.4S, v11.s[3] -mul v19.4S, v19.4S,v17.s[2] -sub v20.4s, v6.4s, v8.4s -mul v14.4S, v14.4S,v17.s[2] -add v6.4s, v6.4s, v8.4s -mla v19.4S, v25.4S, v31.s[0] -sub v25.4s, v18.4s, v30.4s -mla v14.4S, v1.4S, v31.s[0] -add v18.4s, v18.4s, v30.4s -mul v22.4S, v22.4S,v17.s[3] -sub v30.4s, v24.4s, v3.4s -mul v10.4S, v10.4S,v17.s[3] -add v24.4s, v24.4s, v3.4s -mla v22.4S, v13.4S, v31.s[0] -sub v13.4s, v21.4s, v26.4s -mla v10.4S, v12.4S, v31.s[0] -add v21.4s, v21.4s, v26.4s -sqrdmulh v26.4S, v18.4S, v9.s[2] -mul v18.4S, v18.4S,v0.s[2] -sqrdmulh v12.4S, v25.4S, v9.s[3] -sub v3.4s, v2.4s, v19.4s -mul v25.4S, v25.4S,v0.s[3] -add v2.4s, v2.4s, v19.4s -sqrdmulh v19.4S, v13.4S, v9.s[1] -sub v1.4s, v16.4s, v14.4s -mul v13.4S, v13.4S,v0.s[1] -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v9.s[0] -sub v8.4s, v15.4s, v22.4s -mul v21.4S, v21.4S,v0.s[0] -add v15.4s, v15.4s, v22.4s -mla v18.4S, v26.4S, v31.s[0] -sub v26.4s, v27.4s, v10.4s -sqrdmulh v22.4S, v16.4S, v7.s[0] -add v27.4s, v27.4s, v10.4s -mla v25.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v1.4S, v7.s[1] -mla v13.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v27.4S, v7.s[2] -mla v21.4S, v14.4S, v31.s[0] -sqrdmulh v14.4S, v26.4S, v7.s[3] -mul v16.4S, v16.4S,v23.s[0] -sub v10.4s, v6.4s, v18.4s -str q10, [x0, #320] -mul v1.4S, v1.4S,v23.s[1] -add v6.4s, v6.4s, v18.4s -str q6, [x0, #256] -mla v16.4S, v22.4S, v31.s[0] -sub v22.4s, v20.4s, v25.4s -str q22, [x0, #448] -mla v1.4S, v12.4S, v31.s[0] -add v20.4s, v20.4s, v25.4s -str q20, [x0, #384] -mul v27.4S, v27.4S,v23.s[2] -sub v20.4s, v30.4s, v13.4s -str q20, [x0, #192] -mul v26.4S, v26.4S,v23.s[3] -add v30.4s, v30.4s, v13.4s -str q30, [x0, #128] -mla v27.4S, v19.4S, v31.s[0] -sub v19.4s, v24.4s, v21.4s -str q19, [x0, #64] -mla v26.4S, v14.4S, v31.s[0] -add v24.4s, v24.4s, v21.4s -str q24, [x0, #0] -ldr q24, [x0, #912] -sqrdmulh v21.4S, v24.4S, v28.s[0] -mul v24.4S, v24.4S,v29.s[0] -ldr q14, [x0, #976] -sqrdmulh v19.4S, v14.4S, v28.s[0] -sub v30.4s, v2.4s, v16.4s -str q30, [x0, #576] -mul v14.4S, v14.4S,v29.s[0] -add v2.4s, v2.4s, v16.4s -str q2, [x0, #512] -ldr q2, [x0, #784] -sqrdmulh v16.4S, v2.4S, v28.s[0] -sub v30.4s, v3.4s, v1.4s -str q30, [x0, #704] -mul v2.4S, v2.4S,v29.s[0] -add v3.4s, v3.4s, v1.4s -str q3, [x0, #640] -ldr q3, [x0, #848] -sqrdmulh v1.4S, v3.4S, v28.s[0] -sub v30.4s, v15.4s, v27.4s -str q30, [x0, #832] -mul v3.4S, v3.4S,v29.s[0] -add v15.4s, v15.4s, v27.4s -str q15, [x0, #768] -ldr q15, [x0, #528] -mla v24.4S, v21.4S, v31.s[0] -sub v21.4s, v8.4s, v26.4s -str q21, [x0, #960] -sqrdmulh v21.4S, v15.4S, v28.s[0] -add v8.4s, v8.4s, v26.4s -str q8, [x0, #896] -ldr q8, [x0, #592] -mla v14.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v8.4S, v28.s[0] -ldr q26, [x0, #656] -mla v2.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v26.4S, v28.s[0] -ldr q27, [x0, #720] -mla v3.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v27.4S, v28.s[0] -ldr q30, [x0, #400] -ldr q13, [x0, #464] -mul v15.4S, v15.4S,v29.s[0] -sub v20.4s, v30.4s, v24.4s -mul v8.4S, v8.4S,v29.s[0] -add v30.4s, v30.4s, v24.4s -ldr q24, [x0, #272] -ldr q25, [x0, #336] -mla v15.4S, v21.4S, v31.s[0] -sub v21.4s, v13.4s, v14.4s -mla v8.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v14.4s -ldr q14, [x0, #16] -ldr q19, [x0, #80] -mul v26.4S, v26.4S,v29.s[0] -sub v12.4s, v24.4s, v2.4s -mul v27.4S, v27.4S,v29.s[0] -add v24.4s, v24.4s, v2.4s -ldr q2, [x0, #144] -ldr q22, [x0, #208] -mla v26.4S, v16.4S, v31.s[0] -sub v16.4s, v25.4s, v3.4s -mla v27.4S, v1.4S, v31.s[0] -add v25.4s, v25.4s, v3.4s -sqrdmulh v3.4S, v30.4S, v28.s[1] -mul v30.4S, v30.4S,v29.s[1] -sqrdmulh v1.4S, v13.4S, v28.s[1] -sub v6.4s, v14.4s, v15.4s -mul v13.4S, v13.4S,v29.s[1] -add v14.4s, v14.4s, v15.4s -sqrdmulh v15.4S, v24.4S, v28.s[1] -sub v18.4s, v19.4s, v8.4s -mul v24.4S, v24.4S,v29.s[1] -add v19.4s, v19.4s, v8.4s -sqrdmulh v8.4S, v25.4S, v28.s[1] -sub v10.4s, v2.4s, v26.4s -mul v25.4S, v25.4S,v29.s[1] -add v2.4s, v2.4s, v26.4s -mla v30.4S, v3.4S, v31.s[0] -sub v3.4s, v22.4s, v27.4s -sqrdmulh v26.4S, v20.4S, v28.s[2] -add v22.4s, v22.4s, v27.4s -mla v13.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v21.4S, v28.s[2] -mla v24.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v12.4S, v28.s[2] -mla v25.4S, v8.4S, v31.s[0] -sqrdmulh v8.4S, v16.4S, v28.s[2] -mul v20.4S, v20.4S,v29.s[2] -sub v27.4s, v2.4s, v30.4s -mul v21.4S, v21.4S,v29.s[2] -add v2.4s, v2.4s, v30.4s -mla v20.4S, v26.4S, v31.s[0] -sub v26.4s, v22.4s, v13.4s -mla v21.4S, v1.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -mul v12.4S, v12.4S,v29.s[2] -sub v13.4s, v14.4s, v24.4s -mul v16.4S, v16.4S,v29.s[2] -add v14.4s, v14.4s, v24.4s -mla v12.4S, v15.4S, v31.s[0] -sub v15.4s, v19.4s, v25.4s -mla v16.4S, v8.4S, v31.s[0] -add v19.4s, v19.4s, v25.4s -sqrdmulh v28.4S, v27.4S, v11.s[1] -mul v27.4S, v27.4S,v17.s[1] -sqrdmulh v29.4S, v26.4S, v11.s[1] -sub v25.4s, v10.4s, v20.4s -mul v26.4S, v26.4S,v17.s[1] -add v10.4s, v10.4s, v20.4s -sqrdmulh v20.4S, v2.4S, v11.s[0] -sub v8.4s, v3.4s, v21.4s -mul v2.4S, v2.4S,v17.s[0] -add v3.4s, v3.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v11.s[0] -sub v24.4s, v6.4s, v12.4s -mul v22.4S, v22.4S,v17.s[0] -add v6.4s, v6.4s, v12.4s -mla v27.4S, v28.4S, v31.s[0] -sub v28.4s, v18.4s, v16.4s -sqrdmulh v12.4S, v10.4S, v11.s[2] -add v18.4s, v18.4s, v16.4s -mla v26.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v3.4S, v11.s[2] -mla v2.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v25.4S, v11.s[3] -mla v22.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v8.4S, v11.s[3] -mul v10.4S, v10.4S,v17.s[2] -sub v16.4s, v13.4s, v27.4s -mul v3.4S, v3.4S,v17.s[2] -add v13.4s, v13.4s, v27.4s -mla v10.4S, v12.4S, v31.s[0] -sub v12.4s, v15.4s, v26.4s -mla v3.4S, v29.4S, v31.s[0] -add v15.4s, v15.4s, v26.4s -mul v25.4S, v25.4S,v17.s[3] -sub v26.4s, v14.4s, v2.4s -mul v8.4S, v8.4S,v17.s[3] -add v14.4s, v14.4s, v2.4s -mla v25.4S, v20.4S, v31.s[0] -sub v20.4s, v19.4s, v22.4s -mla v8.4S, v21.4S, v31.s[0] -add v19.4s, v19.4s, v22.4s -sqrdmulh v11.4S, v15.4S, v9.s[2] -mul v15.4S, v15.4S,v0.s[2] -sqrdmulh v17.4S, v12.4S, v9.s[3] -sub v22.4s, v6.4s, v10.4s -mul v12.4S, v12.4S,v0.s[3] -add v6.4s, v6.4s, v10.4s -sqrdmulh v10.4S, v20.4S, v9.s[1] -sub v21.4s, v18.4s, v3.4s -mul v20.4S, v20.4S,v0.s[1] -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v19.4S, v9.s[0] -sub v2.4s, v24.4s, v25.4s -mul v19.4S, v19.4S,v0.s[0] -add v24.4s, v24.4s, v25.4s -mla v15.4S, v11.4S, v31.s[0] -sub v11.4s, v28.4s, v8.4s -sqrdmulh v25.4S, v18.4S, v7.s[0] -add v28.4s, v28.4s, v8.4s -mla v12.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v21.4S, v7.s[1] -mla v20.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v28.4S, v7.s[2] -mla v19.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v11.4S, v7.s[3] -mul v18.4S, v18.4S,v23.s[0] -sub v8.4s, v13.4s, v15.4s -str q8, [x0, #336] -mul v21.4S, v21.4S,v23.s[1] -add v13.4s, v13.4s, v15.4s -str q13, [x0, #272] -mla v18.4S, v25.4S, v31.s[0] -sub v25.4s, v16.4s, v12.4s -str q25, [x0, #464] -mla v21.4S, v17.4S, v31.s[0] -add v16.4s, v16.4s, v12.4s -str q16, [x0, #400] -mul v28.4S, v28.4S,v23.s[2] -sub v16.4s, v26.4s, v20.4s -str q16, [x0, #208] -mul v11.4S, v11.4S,v23.s[3] -add v26.4s, v26.4s, v20.4s -str q26, [x0, #144] -mla v28.4S, v10.4S, v31.s[0] -sub v10.4s, v14.4s, v19.4s -str q10, [x0, #80] -mla v11.4S, v3.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -str q14, [x0, #16] -sub v7.4s, v6.4s, v18.4s -str q7, [x0, #592] -add v6.4s, v6.4s, v18.4s -str q6, [x0, #528] -sub v6.4s, v22.4s, v21.4s -str q6, [x0, #720] -add v22.4s, v22.4s, v21.4s -str q22, [x0, #656] -sub v22.4s, v24.4s, v28.4s -str q22, [x0, #848] -add v24.4s, v24.4s, v28.4s -str q24, [x0, #784] -sub v24.4s, v2.4s, v11.4s -str q24, [x0, #976] -add v2.4s, v2.4s, v11.4s -str q2, [x0, #912] -ldr q4, [x0, #224] -ldr q5, [x0, #160] -ldr q30, [x0, #32] -ldr q1, [x17, #+128] -ldr q27, [x17, #+144] -sqrdmulh v29.4S, v30.4S, v27.s[0] -mul v30.4S, v30.4S,v1.s[0] -ldr q8, [x0, #48] -sqrdmulh v15.4S, v8.4S, v27.s[0] -mul v8.4S, v8.4S,v1.s[0] -ldr q13, [x17, #+160] -ldr q25, [x17, #+176] -ldr q17, [x0, #96] -sqrdmulh v12.4S, v17.4S, v25.s[0] -mul v17.4S, v17.4S,v13.s[0] -ldr q16, [x0, #112] -sqrdmulh v20.4S, v16.4S, v25.s[0] -mul v16.4S, v16.4S,v13.s[0] -ldr q26, [x17, #+192] -ldr q10, [x17, #+208] -mla v30.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v5.4S, v10.s[0] -ldr q3, [x0, #176] -mla v8.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v3.4S, v10.s[0] -ldr q19, [x17, #+224] -ldr q14, [x17, #+240] -mla v17.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v4.4S, v14.s[0] -ldr q0, [x0, #240] -mla v16.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v0.4S, v14.s[0] -ldr q9, [x0, #0] -ldr q23, [x0, #128] -mul v5.4S, v5.4S,v26.s[0] -sub v7.4s, v9.4s, v30.4s -ldr q18, [x0, #16] -mul v3.4S, v3.4S,v26.s[0] -add v9.4s, v9.4s, v30.4s -ldr q30, [x0, #144] -mla v5.4S, v29.4S, v31.s[0] -sub v29.4s, v18.4s, v8.4s -ldr q6, [x0, #64] -mla v3.4S, v15.4S, v31.s[0] -add v18.4s, v18.4s, v8.4s -ldr q8, [x0, #192] -mul v4.4S, v4.4S,v19.s[0] -sub v15.4s, v6.4s, v17.4s -ldr q21, [x0, #80] -mul v0.4S, v0.4S,v19.s[0] -add v6.4s, v6.4s, v17.4s -ldr q17, [x0, #208] -mla v4.4S, v12.4S, v31.s[0] -mla v0.4S, v20.4S, v31.s[0] -sub v20.4s, v21.4s, v16.4s -sqrdmulh v12.4S, v18.4S, v27.s[1] -add v21.4s, v21.4s, v16.4s -mul v18.4S, v18.4S,v1.s[1] -sqrdmulh v16.4S, v29.4S, v27.s[2] -sub v22.4s, v23.4s, v5.4s -mul v29.4S, v29.4S,v1.s[2] -add v23.4s, v23.4s, v5.4s -sqrdmulh v27.4S, v21.4S, v25.s[1] -sub v1.4s, v30.4s, v3.4s -mul v21.4S, v21.4S,v13.s[1] -add v30.4s, v30.4s, v3.4s -sqrdmulh v3.4S, v20.4S, v25.s[2] -sub v5.4s, v8.4s, v4.4s -mul v20.4S, v20.4S,v13.s[2] -add v8.4s, v8.4s, v4.4s -mla v18.4S, v12.4S, v31.s[0] -sub v12.4s, v17.4s, v0.4s -ldr q25, [x0, #480] -sqrdmulh v13.4S, v30.4S, v10.s[1] -add v17.4s, v17.4s, v0.4s -mla v29.4S, v16.4S, v31.s[0] -ldr q16, [x0, #416] -sqrdmulh v0.4S, v1.4S, v10.s[2] -sub v4.4s, v9.4s, v18.4s -mla v21.4S, v27.4S, v31.s[0] -ldr q27, [x0, #288] -sqrdmulh v28.4S, v17.4S, v14.s[1] -add v9.4s, v9.4s, v18.4s -str q4, [x0, #16] -mla v20.4S, v3.4S, v31.s[0] -ldr q3, [x17, #+256] -ldr q4, [x17, #+272] -sqrdmulh v18.4S, v12.4S, v14.s[2] -sub v24.4s, v7.4s, v29.4s -str q9, [x0, #0] -mul v30.4S, v30.4S,v26.s[1] -add v7.4s, v7.4s, v29.4s -mul v1.4S, v1.4S,v26.s[2] -str q24, [x0, #48] -mla v30.4S, v13.4S, v31.s[0] -sub v13.4s, v6.4s, v21.4s -mla v1.4S, v0.4S, v31.s[0] -str q7, [x0, #32] -mul v17.4S, v17.4S,v19.s[1] -str q13, [x0, #80] -mul v12.4S, v12.4S,v19.s[2] -add v6.4s, v6.4s, v21.4s -str q6, [x0, #64] -mla v17.4S, v28.4S, v31.s[0] -sub v28.4s, v15.4s, v20.4s -str q28, [x0, #112] -mla v12.4S, v18.4S, v31.s[0] -add v15.4s, v15.4s, v20.4s -str q15, [x0, #96] -sqrdmulh v14.4S, v27.4S, v4.s[0] -sub v19.4s, v23.4s, v30.4s -mul v27.4S, v27.4S,v3.s[0] -str q19, [x0, #144] -ldr q19, [x0, #304] -sqrdmulh v15.4S, v19.4S, v4.s[0] -add v23.4s, v23.4s, v30.4s -mul v19.4S, v19.4S,v3.s[0] -str q23, [x0, #128] -ldr q23, [x17, #+288] -ldr q30, [x17, #+304] -ldr q20, [x0, #352] -sqrdmulh v18.4S, v20.4S, v30.s[0] -sub v28.4s, v22.4s, v1.4s -mul v20.4S, v20.4S,v23.s[0] -str q28, [x0, #176] -ldr q28, [x0, #368] -sqrdmulh v6.4S, v28.4S, v30.s[0] -add v22.4s, v22.4s, v1.4s -mul v28.4S, v28.4S,v23.s[0] -str q22, [x0, #160] -ldr q22, [x17, #+320] -ldr q1, [x17, #+336] -mla v27.4S, v14.4S, v31.s[0] -sub v14.4s, v8.4s, v17.4s -sqrdmulh v21.4S, v16.4S, v1.s[0] -str q14, [x0, #208] -ldr q14, [x0, #432] -mla v19.4S, v15.4S, v31.s[0] -add v8.4s, v8.4s, v17.4s -sqrdmulh v17.4S, v14.4S, v1.s[0] -str q8, [x0, #192] -ldr q8, [x17, #+352] -ldr q15, [x17, #+368] -mla v20.4S, v18.4S, v31.s[0] -sub v18.4s, v5.4s, v12.4s -sqrdmulh v13.4S, v25.4S, v15.s[0] -str q18, [x0, #240] -ldr q18, [x0, #496] -mla v28.4S, v6.4S, v31.s[0] -add v5.4s, v5.4s, v12.4s -sqrdmulh v12.4S, v18.4S, v15.s[0] -str q5, [x0, #224] -ldr q5, [x0, #256] -ldr q6, [x0, #384] -mul v16.4S, v16.4S,v22.s[0] -sub v10.4s, v5.4s, v27.4s -ldr q26, [x0, #272] -mul v14.4S, v14.4S,v22.s[0] -add v5.4s, v5.4s, v27.4s -ldr q27, [x0, #400] -mla v16.4S, v21.4S, v31.s[0] -sub v21.4s, v26.4s, v19.4s -ldr q7, [x0, #320] -mla v14.4S, v17.4S, v31.s[0] -add v26.4s, v26.4s, v19.4s -ldr q19, [x0, #448] -mul v25.4S, v25.4S,v8.s[0] -sub v17.4s, v7.4s, v20.4s -ldr q0, [x0, #336] -mul v18.4S, v18.4S,v8.s[0] -add v7.4s, v7.4s, v20.4s -ldr q20, [x0, #464] -mla v25.4S, v13.4S, v31.s[0] -mla v18.4S, v12.4S, v31.s[0] -sub v12.4s, v0.4s, v28.4s -sqrdmulh v13.4S, v26.4S, v4.s[1] -add v0.4s, v0.4s, v28.4s -mul v26.4S, v26.4S,v3.s[1] -sqrdmulh v28.4S, v21.4S, v4.s[2] -sub v24.4s, v6.4s, v16.4s -mul v21.4S, v21.4S,v3.s[2] -add v6.4s, v6.4s, v16.4s -sqrdmulh v4.4S, v0.4S, v30.s[1] -sub v3.4s, v27.4s, v14.4s -mul v0.4S, v0.4S,v23.s[1] -add v27.4s, v27.4s, v14.4s -sqrdmulh v14.4S, v12.4S, v30.s[2] -sub v16.4s, v19.4s, v25.4s -mul v12.4S, v12.4S,v23.s[2] -add v19.4s, v19.4s, v25.4s -mla v26.4S, v13.4S, v31.s[0] -sub v13.4s, v20.4s, v18.4s -ldr q30, [x0, #736] -sqrdmulh v23.4S, v27.4S, v1.s[1] -add v20.4s, v20.4s, v18.4s -mla v21.4S, v28.4S, v31.s[0] -ldr q28, [x0, #672] -sqrdmulh v18.4S, v3.4S, v1.s[2] -sub v25.4s, v5.4s, v26.4s -mla v0.4S, v4.4S, v31.s[0] -ldr q4, [x0, #544] -sqrdmulh v29.4S, v20.4S, v15.s[1] -add v5.4s, v5.4s, v26.4s -str q25, [x0, #272] -mla v12.4S, v14.4S, v31.s[0] -ldr q14, [x17, #+384] -ldr q25, [x17, #+400] -sqrdmulh v26.4S, v13.4S, v15.s[2] -sub v9.4s, v10.4s, v21.4s -str q5, [x0, #256] -mul v27.4S, v27.4S,v22.s[1] -add v10.4s, v10.4s, v21.4s -mul v3.4S, v3.4S,v22.s[2] -str q9, [x0, #304] -mla v27.4S, v23.4S, v31.s[0] -sub v23.4s, v7.4s, v0.4s -mla v3.4S, v18.4S, v31.s[0] -str q10, [x0, #288] -mul v20.4S, v20.4S,v8.s[1] -str q23, [x0, #336] -mul v13.4S, v13.4S,v8.s[2] -add v7.4s, v7.4s, v0.4s -str q7, [x0, #320] -mla v20.4S, v29.4S, v31.s[0] -sub v29.4s, v17.4s, v12.4s -str q29, [x0, #368] -mla v13.4S, v26.4S, v31.s[0] -add v17.4s, v17.4s, v12.4s -str q17, [x0, #352] -sqrdmulh v15.4S, v4.4S, v25.s[0] -sub v8.4s, v6.4s, v27.4s -mul v4.4S, v4.4S,v14.s[0] -str q8, [x0, #400] -ldr q8, [x0, #560] -sqrdmulh v17.4S, v8.4S, v25.s[0] -add v6.4s, v6.4s, v27.4s -mul v8.4S, v8.4S,v14.s[0] -str q6, [x0, #384] -ldr q6, [x17, #+416] -ldr q27, [x17, #+432] -ldr q12, [x0, #608] -sqrdmulh v26.4S, v12.4S, v27.s[0] -sub v29.4s, v24.4s, v3.4s -mul v12.4S, v12.4S,v6.s[0] -str q29, [x0, #432] -ldr q29, [x0, #624] -sqrdmulh v7.4S, v29.4S, v27.s[0] -add v24.4s, v24.4s, v3.4s -mul v29.4S, v29.4S,v6.s[0] -str q24, [x0, #416] -ldr q24, [x17, #+448] -ldr q3, [x17, #+464] -mla v4.4S, v15.4S, v31.s[0] -sub v15.4s, v19.4s, v20.4s -sqrdmulh v0.4S, v28.4S, v3.s[0] -str q15, [x0, #464] -ldr q15, [x0, #688] -mla v8.4S, v17.4S, v31.s[0] -add v19.4s, v19.4s, v20.4s -sqrdmulh v20.4S, v15.4S, v3.s[0] -str q19, [x0, #448] -ldr q19, [x17, #+480] -ldr q17, [x17, #+496] -mla v12.4S, v26.4S, v31.s[0] -sub v26.4s, v16.4s, v13.4s -sqrdmulh v23.4S, v30.4S, v17.s[0] -str q26, [x0, #496] -ldr q26, [x0, #752] -mla v29.4S, v7.4S, v31.s[0] -add v16.4s, v16.4s, v13.4s -sqrdmulh v13.4S, v26.4S, v17.s[0] -str q16, [x0, #480] -ldr q16, [x0, #512] -ldr q7, [x0, #640] -mul v28.4S, v28.4S,v24.s[0] -sub v1.4s, v16.4s, v4.4s -ldr q22, [x0, #528] -mul v15.4S, v15.4S,v24.s[0] -add v16.4s, v16.4s, v4.4s -ldr q4, [x0, #656] -mla v28.4S, v0.4S, v31.s[0] -sub v0.4s, v22.4s, v8.4s -ldr q10, [x0, #576] -mla v15.4S, v20.4S, v31.s[0] -add v22.4s, v22.4s, v8.4s -ldr q8, [x0, #704] -mul v30.4S, v30.4S,v19.s[0] -sub v20.4s, v10.4s, v12.4s -ldr q18, [x0, #592] -mul v26.4S, v26.4S,v19.s[0] -add v10.4s, v10.4s, v12.4s -ldr q12, [x0, #720] -mla v30.4S, v23.4S, v31.s[0] -mla v26.4S, v13.4S, v31.s[0] -sub v13.4s, v18.4s, v29.4s -sqrdmulh v23.4S, v22.4S, v25.s[1] -add v18.4s, v18.4s, v29.4s -mul v22.4S, v22.4S,v14.s[1] -sqrdmulh v29.4S, v0.4S, v25.s[2] -sub v9.4s, v7.4s, v28.4s -mul v0.4S, v0.4S,v14.s[2] -add v7.4s, v7.4s, v28.4s -sqrdmulh v25.4S, v18.4S, v27.s[1] -sub v14.4s, v4.4s, v15.4s -mul v18.4S, v18.4S,v6.s[1] -add v4.4s, v4.4s, v15.4s -sqrdmulh v15.4S, v13.4S, v27.s[2] -sub v28.4s, v8.4s, v30.4s -mul v13.4S, v13.4S,v6.s[2] -add v8.4s, v8.4s, v30.4s -mla v22.4S, v23.4S, v31.s[0] -sub v23.4s, v12.4s, v26.4s -ldr q27, [x0, #992] -sqrdmulh v6.4S, v4.4S, v3.s[1] -add v12.4s, v12.4s, v26.4s -mla v0.4S, v29.4S, v31.s[0] -ldr q29, [x0, #928] -sqrdmulh v26.4S, v14.4S, v3.s[2] -sub v30.4s, v16.4s, v22.4s -mla v18.4S, v25.4S, v31.s[0] -ldr q25, [x0, #800] -sqrdmulh v21.4S, v12.4S, v17.s[1] -add v16.4s, v16.4s, v22.4s -str q30, [x0, #528] -mla v13.4S, v15.4S, v31.s[0] -ldr q15, [x17, #+512] -ldr q30, [x17, #+528] -sqrdmulh v22.4S, v23.4S, v17.s[2] -sub v5.4s, v1.4s, v0.4s -str q16, [x0, #512] -mul v4.4S, v4.4S,v24.s[1] -add v1.4s, v1.4s, v0.4s -mul v14.4S, v14.4S,v24.s[2] -str q5, [x0, #560] -mla v4.4S, v6.4S, v31.s[0] -sub v6.4s, v10.4s, v18.4s -mla v14.4S, v26.4S, v31.s[0] -str q1, [x0, #544] -mul v12.4S, v12.4S,v19.s[1] -str q6, [x0, #592] -mul v23.4S, v23.4S,v19.s[2] -add v10.4s, v10.4s, v18.4s -str q10, [x0, #576] -mla v12.4S, v21.4S, v31.s[0] -sub v21.4s, v20.4s, v13.4s -str q21, [x0, #624] -mla v23.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v13.4s -str q20, [x0, #608] -sqrdmulh v17.4S, v25.4S, v30.s[0] -sub v19.4s, v7.4s, v4.4s -mul v25.4S, v25.4S,v15.s[0] -str q19, [x0, #656] -ldr q19, [x0, #816] -sqrdmulh v20.4S, v19.4S, v30.s[0] -add v7.4s, v7.4s, v4.4s -mul v19.4S, v19.4S,v15.s[0] -str q7, [x0, #640] -ldr q7, [x17, #+544] -ldr q4, [x17, #+560] -ldr q13, [x0, #864] -sqrdmulh v22.4S, v13.4S, v4.s[0] -sub v21.4s, v9.4s, v14.4s -mul v13.4S, v13.4S,v7.s[0] -str q21, [x0, #688] -ldr q21, [x0, #880] -sqrdmulh v10.4S, v21.4S, v4.s[0] -add v9.4s, v9.4s, v14.4s -mul v21.4S, v21.4S,v7.s[0] -str q9, [x0, #672] -ldr q9, [x17, #+576] -ldr q14, [x17, #+592] -mla v25.4S, v17.4S, v31.s[0] -sub v17.4s, v8.4s, v12.4s -sqrdmulh v18.4S, v29.4S, v14.s[0] -str q17, [x0, #720] -ldr q17, [x0, #944] -mla v19.4S, v20.4S, v31.s[0] -add v8.4s, v8.4s, v12.4s -sqrdmulh v12.4S, v17.4S, v14.s[0] -str q8, [x0, #704] -ldr q8, [x17, #+608] -ldr q20, [x17, #+624] -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v28.4s, v23.4s -sqrdmulh v6.4S, v27.4S, v20.s[0] -str q22, [x0, #752] -ldr q22, [x0, #1008] -mla v21.4S, v10.4S, v31.s[0] -add v28.4s, v28.4s, v23.4s -sqrdmulh v23.4S, v22.4S, v20.s[0] -str q28, [x0, #736] -ldr q28, [x0, #768] -ldr q10, [x0, #896] -mul v29.4S, v29.4S,v9.s[0] -sub v3.4s, v28.4s, v25.4s -ldr q24, [x0, #784] -mul v17.4S, v17.4S,v9.s[0] -add v28.4s, v28.4s, v25.4s -ldr q25, [x0, #912] -mla v29.4S, v18.4S, v31.s[0] -sub v18.4s, v24.4s, v19.4s -ldr q1, [x0, #832] -mla v17.4S, v12.4S, v31.s[0] -add v24.4s, v24.4s, v19.4s -ldr q19, [x0, #960] -mul v27.4S, v27.4S,v8.s[0] -sub v12.4s, v1.4s, v13.4s -ldr q26, [x0, #848] -mul v22.4S, v22.4S,v8.s[0] -add v1.4s, v1.4s, v13.4s -ldr q13, [x0, #976] -mla v27.4S, v6.4S, v31.s[0] -mla v22.4S, v23.4S, v31.s[0] -sub v23.4s, v26.4s, v21.4s -sqrdmulh v6.4S, v24.4S, v30.s[1] -add v26.4s, v26.4s, v21.4s -mul v24.4S, v24.4S,v15.s[1] -sqrdmulh v21.4S, v18.4S, v30.s[2] -sub v5.4s, v10.4s, v29.4s -mul v18.4S, v18.4S,v15.s[2] -add v10.4s, v10.4s, v29.4s -sqrdmulh v30.4S, v26.4S, v4.s[1] -sub v15.4s, v25.4s, v17.4s -mul v26.4S, v26.4S,v7.s[1] -add v25.4s, v25.4s, v17.4s -sqrdmulh v17.4S, v23.4S, v4.s[2] -sub v29.4s, v19.4s, v27.4s -mul v23.4S, v23.4S,v7.s[2] -add v19.4s, v19.4s, v27.4s -mla v24.4S, v6.4S, v31.s[0] -sub v6.4s, v13.4s, v22.4s -sqrdmulh v4.4S, v25.4S, v14.s[1] -add v13.4s, v13.4s, v22.4s -mla v18.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v15.4S, v14.s[2] -sub v22.4s, v28.4s, v24.4s -mla v26.4S, v30.4S, v31.s[0] -sqrdmulh v30.4S, v13.4S, v20.s[1] -add v28.4s, v28.4s, v24.4s -str q22, [x0, #784] -mla v23.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v6.4S, v20.s[2] -sub v22.4s, v3.4s, v18.4s -str q28, [x0, #768] -mul v25.4S, v25.4S,v9.s[1] -add v3.4s, v3.4s, v18.4s -mul v15.4S, v15.4S,v9.s[2] -str q22, [x0, #816] -mla v25.4S, v4.4S, v31.s[0] -sub v4.4s, v1.4s, v26.4s -mla v15.4S, v21.4S, v31.s[0] -str q3, [x0, #800] -mul v13.4S, v13.4S,v8.s[1] -str q4, [x0, #848] -mul v6.4S, v6.4S,v8.s[2] -add v1.4s, v1.4s, v26.4s -str q1, [x0, #832] -mla v13.4S, v30.4S, v31.s[0] -sub v30.4s, v12.4s, v23.4s -str q30, [x0, #880] -mla v6.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v23.4s -str q12, [x0, #864] -sub v20.4s, v10.4s, v25.4s -str q20, [x0, #912] -add v10.4s, v10.4s, v25.4s -str q10, [x0, #896] -sub v10.4s, v5.4s, v15.4s -str q10, [x0, #944] -add v5.4s, v5.4s, v15.4s -str q5, [x0, #928] -sub v5.4s, v19.4s, v13.4s -str q5, [x0, #976] -add v19.4s, v19.4s, v13.4s -str q19, [x0, #960] -sub v19.4s, v29.4s, v6.4s -str q19, [x0, #1008] -add v29.4s, v29.4s, v6.4s -str q29, [x0, #992] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1464 -// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_12_z4_7.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_12_z4_7.s deleted file mode 100644 index b2cda5a..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_12_z4_7.s +++ /dev/null @@ -1,1494 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_12_z4_7 -.global _ntt_u32_incomplete_neon_asm_var_4_2_12_z4_7 -ntt_u32_incomplete_neon_asm_var_4_2_12_z4_7: -_ntt_u32_incomplete_neon_asm_var_4_2_12_z4_7: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x0, #928] -ldr q29, [x17, #+0] -ldr q28, [x17, #+16] -sqrdmulh v27.4S, v30.4S, v28.s[0] -mul v30.4S, v30.4S,v29.s[0] -ldr q26, [x0, #992] -sqrdmulh v25.4S, v26.4S, v28.s[0] -mul v26.4S, v26.4S,v29.s[0] -ldr q24, [x0, #800] -sqrdmulh v23.4S, v24.4S, v28.s[0] -mul v24.4S, v24.4S,v29.s[0] -ldr q22, [x0, #864] -sqrdmulh v21.4S, v22.4S, v28.s[0] -mul v22.4S, v22.4S,v29.s[0] -ldr q20, [x0, #544] -mla v30.4S, v27.4S, v31.s[0] -sqrdmulh v27.4S, v20.4S, v28.s[0] -ldr q19, [x0, #608] -mla v26.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v19.4S, v28.s[0] -ldr q18, [x0, #672] -mla v24.4S, v23.4S, v31.s[0] -sqrdmulh v23.4S, v18.4S, v28.s[0] -ldr q17, [x0, #736] -mla v22.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v17.4S, v28.s[0] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -mul v20.4S, v20.4S,v29.s[0] -sub v2.4s, v16.4s, v30.4s -mul v19.4S, v19.4S,v29.s[0] -add v16.4s, v16.4s, v30.4s -ldr q30, [x0, #288] -ldr q1, [x0, #352] -mla v20.4S, v27.4S, v31.s[0] -sub v27.4s, v3.4s, v26.4s -mla v19.4S, v25.4S, v31.s[0] -add v3.4s, v3.4s, v26.4s -ldr q26, [x0, #32] -ldr q25, [x0, #96] -mul v18.4S, v18.4S,v29.s[0] -sub v0.4s, v30.4s, v24.4s -mul v17.4S, v17.4S,v29.s[0] -add v30.4s, v30.4s, v24.4s -ldr q24, [x0, #160] -ldr q15, [x0, #224] -mla v18.4S, v23.4S, v31.s[0] -sub v23.4s, v1.4s, v22.4s -mla v17.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v16.4S, v28.s[1] -mul v16.4S, v16.4S,v29.s[1] -sqrdmulh v21.4S, v3.4S, v28.s[1] -sub v14.4s, v26.4s, v20.4s -mul v3.4S, v3.4S,v29.s[1] -add v26.4s, v26.4s, v20.4s -sqrdmulh v20.4S, v30.4S, v28.s[1] -sub v13.4s, v25.4s, v19.4s -mul v30.4S, v30.4S,v29.s[1] -add v25.4s, v25.4s, v19.4s -sqrdmulh v19.4S, v1.4S, v28.s[1] -sub v12.4s, v24.4s, v18.4s -mul v1.4S, v1.4S,v29.s[1] -add v24.4s, v24.4s, v18.4s -mla v16.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v17.4s -sqrdmulh v18.4S, v2.4S, v28.s[2] -add v15.4s, v15.4s, v17.4s -mla v3.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v27.4S, v28.s[2] -mla v30.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v0.4S, v28.s[2] -mla v1.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v23.4S, v28.s[2] -ldr q17, [x17, #+32] -ldr q11, [x17, #+48] -mul v2.4S, v2.4S,v29.s[2] -sub v10.4s, v24.4s, v16.4s -mul v27.4S, v27.4S,v29.s[2] -add v24.4s, v24.4s, v16.4s -mla v2.4S, v18.4S, v31.s[0] -sub v18.4s, v15.4s, v3.4s -mla v27.4S, v21.4S, v31.s[0] -add v15.4s, v15.4s, v3.4s -mul v0.4S, v0.4S,v29.s[2] -sub v3.4s, v26.4s, v30.4s -mul v23.4S, v23.4S,v29.s[2] -add v26.4s, v26.4s, v30.4s -mla v0.4S, v20.4S, v31.s[0] -sub v20.4s, v25.4s, v1.4s -mla v23.4S, v19.4S, v31.s[0] -add v25.4s, v25.4s, v1.4s -sqrdmulh v1.4S, v10.4S, v11.s[1] -mul v10.4S, v10.4S,v17.s[1] -sqrdmulh v19.4S, v18.4S, v11.s[1] -sub v30.4s, v12.4s, v2.4s -mul v18.4S, v18.4S,v17.s[1] -add v12.4s, v12.4s, v2.4s -sqrdmulh v2.4S, v24.4S, v11.s[0] -sub v21.4s, v22.4s, v27.4s -mul v24.4S, v24.4S,v17.s[0] -add v22.4s, v22.4s, v27.4s -sqrdmulh v27.4S, v15.4S, v11.s[0] -sub v16.4s, v14.4s, v0.4s -mul v15.4S, v15.4S,v17.s[0] -add v14.4s, v14.4s, v0.4s -ldr q0, [x17, #+64] -ldr q9, [x17, #+80] -mla v10.4S, v1.4S, v31.s[0] -sub v1.4s, v13.4s, v23.4s -sqrdmulh v8.4S, v12.4S, v11.s[2] -add v13.4s, v13.4s, v23.4s -mla v18.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v22.4S, v11.s[2] -mla v24.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v30.4S, v11.s[3] -mla v15.4S, v27.4S, v31.s[0] -sqrdmulh v27.4S, v21.4S, v11.s[3] -ldr q23, [x17, #+96] -ldr q7, [x17, #+112] -mul v12.4S, v12.4S,v17.s[2] -sub v6.4s, v3.4s, v10.4s -mul v22.4S, v22.4S,v17.s[2] -add v3.4s, v3.4s, v10.4s -mla v12.4S, v8.4S, v31.s[0] -sub v8.4s, v20.4s, v18.4s -mla v22.4S, v19.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -mul v30.4S, v30.4S,v17.s[3] -sub v18.4s, v26.4s, v24.4s -mul v21.4S, v21.4S,v17.s[3] -add v26.4s, v26.4s, v24.4s -mla v30.4S, v2.4S, v31.s[0] -sub v2.4s, v25.4s, v15.4s -mla v21.4S, v27.4S, v31.s[0] -add v25.4s, v25.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v9.s[2] -mul v20.4S, v20.4S,v0.s[2] -sqrdmulh v27.4S, v8.4S, v9.s[3] -sub v24.4s, v14.4s, v12.4s -mul v8.4S, v8.4S,v0.s[3] -add v14.4s, v14.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v9.s[1] -sub v19.4s, v13.4s, v22.4s -mul v2.4S, v2.4S,v0.s[1] -add v13.4s, v13.4s, v22.4s -sqrdmulh v22.4S, v25.4S, v9.s[0] -sub v10.4s, v16.4s, v30.4s -mul v25.4S, v25.4S,v0.s[0] -add v16.4s, v16.4s, v30.4s -mla v20.4S, v15.4S, v31.s[0] -sub v15.4s, v1.4s, v21.4s -sqrdmulh v30.4S, v13.4S, v7.s[0] -add v1.4s, v1.4s, v21.4s -mla v8.4S, v27.4S, v31.s[0] -sub v27.4s, v3.4s, v20.4s -sqrdmulh v21.4S, v19.4S, v7.s[1] -add v3.4s, v3.4s, v20.4s -mla v2.4S, v12.4S, v31.s[0] -sub v12.4s, v6.4s, v8.4s -sqrdmulh v20.4S, v1.4S, v7.s[2] -add v6.4s, v6.4s, v8.4s -mla v25.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v2.4s -sqrdmulh v8.4S, v15.4S, v7.s[3] -add v18.4s, v18.4s, v2.4s -mul v13.4S, v13.4S,v23.s[0] -sub v2.4s, v26.4s, v25.4s -mul v19.4S, v19.4S,v23.s[1] -add v26.4s, v26.4s, v25.4s -mla v13.4S, v30.4S, v31.s[0] -str q27, [x0, #352] -mla v19.4S, v21.4S, v31.s[0] -str q3, [x0, #288] -mul v1.4S, v1.4S,v23.s[2] -str q12, [x0, #480] -mul v15.4S, v15.4S,v23.s[3] -str q6, [x0, #416] -mla v1.4S, v20.4S, v31.s[0] -str q22, [x0, #224] -mla v15.4S, v8.4S, v31.s[0] -str q18, [x0, #160] -ldr q18, [x0, #944] -sqrdmulh v8.4S, v18.4S, v28.s[0] -str q2, [x0, #96] -mul v18.4S, v18.4S,v29.s[0] -str q26, [x0, #32] -ldr q26, [x0, #1008] -sqrdmulh v2.4S, v26.4S, v28.s[0] -sub v22.4s, v14.4s, v13.4s -str q22, [x0, #608] -mul v26.4S, v26.4S,v29.s[0] -add v14.4s, v14.4s, v13.4s -ldr q13, [x0, #816] -sqrdmulh v22.4S, v13.4S, v28.s[0] -sub v20.4s, v24.4s, v19.4s -str q14, [x0, #544] -mul v13.4S, v13.4S,v29.s[0] -add v24.4s, v24.4s, v19.4s -ldr q19, [x0, #880] -sqrdmulh v14.4S, v19.4S, v28.s[0] -sub v6.4s, v16.4s, v1.4s -str q20, [x0, #736] -mul v19.4S, v19.4S,v29.s[0] -add v16.4s, v16.4s, v1.4s -ldr q1, [x0, #560] -mla v18.4S, v8.4S, v31.s[0] -sub v8.4s, v10.4s, v15.4s -str q24, [x0, #672] -sqrdmulh v24.4S, v1.4S, v28.s[0] -add v10.4s, v10.4s, v15.4s -ldr q15, [x0, #624] -mla v26.4S, v2.4S, v31.s[0] -str q6, [x0, #864] -sqrdmulh v6.4S, v15.4S, v28.s[0] -ldr q2, [x0, #688] -mla v13.4S, v22.4S, v31.s[0] -str q16, [x0, #800] -sqrdmulh v16.4S, v2.4S, v28.s[0] -ldr q22, [x0, #752] -mla v19.4S, v14.4S, v31.s[0] -str q8, [x0, #992] -sqrdmulh v8.4S, v22.4S, v28.s[0] -ldr q14, [x0, #432] -ldr q20, [x0, #496] -mul v1.4S, v1.4S,v29.s[0] -sub v12.4s, v14.4s, v18.4s -str q10, [x0, #928] -mul v15.4S, v15.4S,v29.s[0] -add v14.4s, v14.4s, v18.4s -ldr q18, [x0, #304] -ldr q10, [x0, #368] -mla v1.4S, v24.4S, v31.s[0] -sub v24.4s, v20.4s, v26.4s -mla v15.4S, v6.4S, v31.s[0] -add v20.4s, v20.4s, v26.4s -ldr q26, [x0, #48] -ldr q6, [x0, #112] -mul v2.4S, v2.4S,v29.s[0] -sub v3.4s, v18.4s, v13.4s -mul v22.4S, v22.4S,v29.s[0] -add v18.4s, v18.4s, v13.4s -ldr q13, [x0, #176] -ldr q21, [x0, #240] -mla v2.4S, v16.4S, v31.s[0] -sub v16.4s, v10.4s, v19.4s -mla v22.4S, v8.4S, v31.s[0] -add v10.4s, v10.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v28.s[1] -mul v14.4S, v14.4S,v29.s[1] -sqrdmulh v8.4S, v20.4S, v28.s[1] -sub v27.4s, v26.4s, v1.4s -mul v20.4S, v20.4S,v29.s[1] -add v26.4s, v26.4s, v1.4s -sqrdmulh v1.4S, v18.4S, v28.s[1] -sub v30.4s, v6.4s, v15.4s -mul v18.4S, v18.4S,v29.s[1] -add v6.4s, v6.4s, v15.4s -sqrdmulh v15.4S, v10.4S, v28.s[1] -sub v25.4s, v13.4s, v2.4s -mul v10.4S, v10.4S,v29.s[1] -add v13.4s, v13.4s, v2.4s -mla v14.4S, v19.4S, v31.s[0] -sub v19.4s, v21.4s, v22.4s -sqrdmulh v2.4S, v12.4S, v28.s[2] -add v21.4s, v21.4s, v22.4s -mla v20.4S, v8.4S, v31.s[0] -sqrdmulh v8.4S, v24.4S, v28.s[2] -mla v18.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v3.4S, v28.s[2] -mla v10.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v16.4S, v28.s[2] -mul v12.4S, v12.4S,v29.s[2] -sub v22.4s, v13.4s, v14.4s -mul v24.4S, v24.4S,v29.s[2] -add v13.4s, v13.4s, v14.4s -mla v12.4S, v2.4S, v31.s[0] -sub v2.4s, v21.4s, v20.4s -mla v24.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v20.4s -mul v3.4S, v3.4S,v29.s[2] -sub v20.4s, v26.4s, v18.4s -mul v16.4S, v16.4S,v29.s[2] -add v26.4s, v26.4s, v18.4s -mla v3.4S, v1.4S, v31.s[0] -sub v1.4s, v6.4s, v10.4s -mla v16.4S, v15.4S, v31.s[0] -add v6.4s, v6.4s, v10.4s -sqrdmulh v10.4S, v22.4S, v11.s[1] -mul v22.4S, v22.4S,v17.s[1] -sqrdmulh v15.4S, v2.4S, v11.s[1] -sub v18.4s, v25.4s, v12.4s -mul v2.4S, v2.4S,v17.s[1] -add v25.4s, v25.4s, v12.4s -sqrdmulh v12.4S, v13.4S, v11.s[0] -sub v8.4s, v19.4s, v24.4s -mul v13.4S, v13.4S,v17.s[0] -add v19.4s, v19.4s, v24.4s -sqrdmulh v24.4S, v21.4S, v11.s[0] -sub v14.4s, v27.4s, v3.4s -mul v21.4S, v21.4S,v17.s[0] -add v27.4s, v27.4s, v3.4s -mla v22.4S, v10.4S, v31.s[0] -sub v10.4s, v30.4s, v16.4s -sqrdmulh v3.4S, v25.4S, v11.s[2] -add v30.4s, v30.4s, v16.4s -mla v2.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v19.4S, v11.s[2] -mla v13.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v18.4S, v11.s[3] -mla v21.4S, v24.4S, v31.s[0] -sqrdmulh v24.4S, v8.4S, v11.s[3] -mul v25.4S, v25.4S,v17.s[2] -sub v16.4s, v20.4s, v22.4s -mul v19.4S, v19.4S,v17.s[2] -add v20.4s, v20.4s, v22.4s -mla v25.4S, v3.4S, v31.s[0] -sub v3.4s, v1.4s, v2.4s -mla v19.4S, v15.4S, v31.s[0] -add v1.4s, v1.4s, v2.4s -mul v18.4S, v18.4S,v17.s[3] -sub v2.4s, v26.4s, v13.4s -mul v8.4S, v8.4S,v17.s[3] -add v26.4s, v26.4s, v13.4s -mla v18.4S, v12.4S, v31.s[0] -sub v12.4s, v6.4s, v21.4s -mla v8.4S, v24.4S, v31.s[0] -add v6.4s, v6.4s, v21.4s -sqrdmulh v21.4S, v1.4S, v9.s[2] -mul v1.4S, v1.4S,v0.s[2] -sqrdmulh v24.4S, v3.4S, v9.s[3] -sub v13.4s, v27.4s, v25.4s -mul v3.4S, v3.4S,v0.s[3] -add v27.4s, v27.4s, v25.4s -sqrdmulh v25.4S, v12.4S, v9.s[1] -sub v15.4s, v30.4s, v19.4s -mul v12.4S, v12.4S,v0.s[1] -add v30.4s, v30.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v9.s[0] -sub v22.4s, v14.4s, v18.4s -mul v6.4S, v6.4S,v0.s[0] -add v14.4s, v14.4s, v18.4s -mla v1.4S, v21.4S, v31.s[0] -sub v21.4s, v10.4s, v8.4s -sqrdmulh v18.4S, v30.4S, v7.s[0] -add v10.4s, v10.4s, v8.4s -mla v3.4S, v24.4S, v31.s[0] -sub v24.4s, v20.4s, v1.4s -sqrdmulh v8.4S, v15.4S, v7.s[1] -add v20.4s, v20.4s, v1.4s -mla v12.4S, v25.4S, v31.s[0] -sub v25.4s, v16.4s, v3.4s -sqrdmulh v1.4S, v10.4S, v7.s[2] -add v16.4s, v16.4s, v3.4s -mla v6.4S, v19.4S, v31.s[0] -sub v19.4s, v2.4s, v12.4s -sqrdmulh v3.4S, v21.4S, v7.s[3] -add v2.4s, v2.4s, v12.4s -mul v30.4S, v30.4S,v23.s[0] -sub v12.4s, v26.4s, v6.4s -mul v15.4S, v15.4S,v23.s[1] -add v26.4s, v26.4s, v6.4s -mla v30.4S, v18.4S, v31.s[0] -str q24, [x0, #368] -mla v15.4S, v8.4S, v31.s[0] -str q20, [x0, #304] -mul v10.4S, v10.4S,v23.s[2] -str q25, [x0, #496] -mul v21.4S, v21.4S,v23.s[3] -str q16, [x0, #432] -mla v10.4S, v1.4S, v31.s[0] -str q19, [x0, #240] -mla v21.4S, v3.4S, v31.s[0] -str q2, [x0, #176] -ldr q2, [x0, #896] -sqrdmulh v3.4S, v2.4S, v28.s[0] -str q12, [x0, #112] -mul v2.4S, v2.4S,v29.s[0] -str q26, [x0, #48] -ldr q26, [x0, #960] -sqrdmulh v12.4S, v26.4S, v28.s[0] -sub v19.4s, v27.4s, v30.4s -str q19, [x0, #624] -mul v26.4S, v26.4S,v29.s[0] -add v27.4s, v27.4s, v30.4s -ldr q30, [x0, #768] -sqrdmulh v19.4S, v30.4S, v28.s[0] -sub v1.4s, v13.4s, v15.4s -str q27, [x0, #560] -mul v30.4S, v30.4S,v29.s[0] -add v13.4s, v13.4s, v15.4s -ldr q15, [x0, #832] -sqrdmulh v27.4S, v15.4S, v28.s[0] -sub v16.4s, v14.4s, v10.4s -str q1, [x0, #752] -mul v15.4S, v15.4S,v29.s[0] -add v14.4s, v14.4s, v10.4s -ldr q10, [x0, #512] -mla v2.4S, v3.4S, v31.s[0] -sub v3.4s, v22.4s, v21.4s -str q13, [x0, #688] -sqrdmulh v13.4S, v10.4S, v28.s[0] -add v22.4s, v22.4s, v21.4s -ldr q21, [x0, #576] -mla v26.4S, v12.4S, v31.s[0] -str q16, [x0, #880] -sqrdmulh v16.4S, v21.4S, v28.s[0] -ldr q12, [x0, #640] -mla v30.4S, v19.4S, v31.s[0] -str q14, [x0, #816] -sqrdmulh v14.4S, v12.4S, v28.s[0] -ldr q19, [x0, #704] -mla v15.4S, v27.4S, v31.s[0] -str q3, [x0, #1008] -sqrdmulh v3.4S, v19.4S, v28.s[0] -ldr q27, [x0, #384] -ldr q1, [x0, #448] -mul v10.4S, v10.4S,v29.s[0] -sub v25.4s, v27.4s, v2.4s -str q22, [x0, #944] -mul v21.4S, v21.4S,v29.s[0] -add v27.4s, v27.4s, v2.4s -ldr q2, [x0, #256] -ldr q22, [x0, #320] -mla v10.4S, v13.4S, v31.s[0] -sub v13.4s, v1.4s, v26.4s -mla v21.4S, v16.4S, v31.s[0] -add v1.4s, v1.4s, v26.4s -ldr q26, [x0, #0] -ldr q16, [x0, #64] -mul v12.4S, v12.4S,v29.s[0] -sub v20.4s, v2.4s, v30.4s -mul v19.4S, v19.4S,v29.s[0] -add v2.4s, v2.4s, v30.4s -ldr q30, [x0, #128] -ldr q8, [x0, #192] -mla v12.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v15.4s -mla v19.4S, v3.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -sqrdmulh v15.4S, v27.4S, v28.s[1] -mul v27.4S, v27.4S,v29.s[1] -sqrdmulh v3.4S, v1.4S, v28.s[1] -sub v24.4s, v26.4s, v10.4s -mul v1.4S, v1.4S,v29.s[1] -add v26.4s, v26.4s, v10.4s -sqrdmulh v10.4S, v2.4S, v28.s[1] -sub v18.4s, v16.4s, v21.4s -mul v2.4S, v2.4S,v29.s[1] -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v28.s[1] -sub v6.4s, v30.4s, v12.4s -mul v22.4S, v22.4S,v29.s[1] -add v30.4s, v30.4s, v12.4s -mla v27.4S, v15.4S, v31.s[0] -sub v15.4s, v8.4s, v19.4s -sqrdmulh v12.4S, v25.4S, v28.s[2] -add v8.4s, v8.4s, v19.4s -mla v1.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v13.4S, v28.s[2] -mla v2.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v20.4S, v28.s[2] -mla v22.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v14.4S, v28.s[2] -mul v25.4S, v25.4S,v29.s[2] -sub v19.4s, v30.4s, v27.4s -mul v13.4S, v13.4S,v29.s[2] -add v30.4s, v30.4s, v27.4s -mla v25.4S, v12.4S, v31.s[0] -sub v12.4s, v8.4s, v1.4s -mla v13.4S, v3.4S, v31.s[0] -add v8.4s, v8.4s, v1.4s -mul v20.4S, v20.4S,v29.s[2] -sub v1.4s, v26.4s, v2.4s -mul v14.4S, v14.4S,v29.s[2] -add v26.4s, v26.4s, v2.4s -mla v20.4S, v10.4S, v31.s[0] -sub v10.4s, v16.4s, v22.4s -mla v14.4S, v21.4S, v31.s[0] -add v16.4s, v16.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v11.s[1] -mul v19.4S, v19.4S,v17.s[1] -sqrdmulh v21.4S, v12.4S, v11.s[1] -sub v2.4s, v6.4s, v25.4s -mul v12.4S, v12.4S,v17.s[1] -add v6.4s, v6.4s, v25.4s -sqrdmulh v25.4S, v30.4S, v11.s[0] -sub v3.4s, v15.4s, v13.4s -mul v30.4S, v30.4S,v17.s[0] -add v15.4s, v15.4s, v13.4s -sqrdmulh v13.4S, v8.4S, v11.s[0] -sub v27.4s, v24.4s, v20.4s -mul v8.4S, v8.4S,v17.s[0] -add v24.4s, v24.4s, v20.4s -mla v19.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v14.4s -sqrdmulh v20.4S, v6.4S, v11.s[2] -add v18.4s, v18.4s, v14.4s -mla v12.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v15.4S, v11.s[2] -mla v30.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v2.4S, v11.s[3] -mla v8.4S, v13.4S, v31.s[0] -sqrdmulh v13.4S, v3.4S, v11.s[3] -mul v6.4S, v6.4S,v17.s[2] -sub v14.4s, v1.4s, v19.4s -mul v15.4S, v15.4S,v17.s[2] -add v1.4s, v1.4s, v19.4s -mla v6.4S, v20.4S, v31.s[0] -sub v20.4s, v10.4s, v12.4s -mla v15.4S, v21.4S, v31.s[0] -add v10.4s, v10.4s, v12.4s -mul v2.4S, v2.4S,v17.s[3] -sub v12.4s, v26.4s, v30.4s -mul v3.4S, v3.4S,v17.s[3] -add v26.4s, v26.4s, v30.4s -mla v2.4S, v25.4S, v31.s[0] -sub v25.4s, v16.4s, v8.4s -mla v3.4S, v13.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v10.4S, v9.s[2] -mul v10.4S, v10.4S,v0.s[2] -sqrdmulh v13.4S, v20.4S, v9.s[3] -sub v30.4s, v24.4s, v6.4s -mul v20.4S, v20.4S,v0.s[3] -add v24.4s, v24.4s, v6.4s -sqrdmulh v6.4S, v25.4S, v9.s[1] -sub v21.4s, v18.4s, v15.4s -mul v25.4S, v25.4S,v0.s[1] -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v9.s[0] -sub v19.4s, v27.4s, v2.4s -mul v16.4S, v16.4S,v0.s[0] -add v27.4s, v27.4s, v2.4s -mla v10.4S, v8.4S, v31.s[0] -sub v8.4s, v22.4s, v3.4s -sqrdmulh v2.4S, v18.4S, v7.s[0] -add v22.4s, v22.4s, v3.4s -mla v20.4S, v13.4S, v31.s[0] -sub v13.4s, v1.4s, v10.4s -sqrdmulh v3.4S, v21.4S, v7.s[1] -add v1.4s, v1.4s, v10.4s -mla v25.4S, v6.4S, v31.s[0] -sub v6.4s, v14.4s, v20.4s -sqrdmulh v10.4S, v22.4S, v7.s[2] -add v14.4s, v14.4s, v20.4s -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v12.4s, v25.4s -sqrdmulh v20.4S, v8.4S, v7.s[3] -add v12.4s, v12.4s, v25.4s -mul v18.4S, v18.4S,v23.s[0] -sub v25.4s, v26.4s, v16.4s -mul v21.4S, v21.4S,v23.s[1] -add v26.4s, v26.4s, v16.4s -mla v18.4S, v2.4S, v31.s[0] -str q13, [x0, #320] -mla v21.4S, v3.4S, v31.s[0] -str q1, [x0, #256] -mul v22.4S, v22.4S,v23.s[2] -str q6, [x0, #448] -mul v8.4S, v8.4S,v23.s[3] -str q14, [x0, #384] -mla v22.4S, v10.4S, v31.s[0] -str q15, [x0, #192] -mla v8.4S, v20.4S, v31.s[0] -str q12, [x0, #128] -ldr q12, [x0, #912] -sqrdmulh v20.4S, v12.4S, v28.s[0] -str q25, [x0, #64] -mul v12.4S, v12.4S,v29.s[0] -str q26, [x0, #0] -ldr q26, [x0, #976] -sqrdmulh v25.4S, v26.4S, v28.s[0] -sub v15.4s, v24.4s, v18.4s -str q15, [x0, #576] -mul v26.4S, v26.4S,v29.s[0] -add v24.4s, v24.4s, v18.4s -ldr q18, [x0, #784] -sqrdmulh v15.4S, v18.4S, v28.s[0] -sub v10.4s, v30.4s, v21.4s -str q24, [x0, #512] -mul v18.4S, v18.4S,v29.s[0] -add v30.4s, v30.4s, v21.4s -ldr q21, [x0, #848] -sqrdmulh v24.4S, v21.4S, v28.s[0] -sub v14.4s, v27.4s, v22.4s -str q10, [x0, #704] -mul v21.4S, v21.4S,v29.s[0] -add v27.4s, v27.4s, v22.4s -ldr q22, [x0, #528] -mla v12.4S, v20.4S, v31.s[0] -sub v20.4s, v19.4s, v8.4s -str q30, [x0, #640] -sqrdmulh v30.4S, v22.4S, v28.s[0] -add v19.4s, v19.4s, v8.4s -ldr q8, [x0, #592] -mla v26.4S, v25.4S, v31.s[0] -str q14, [x0, #832] -sqrdmulh v14.4S, v8.4S, v28.s[0] -ldr q25, [x0, #656] -mla v18.4S, v15.4S, v31.s[0] -str q27, [x0, #768] -sqrdmulh v27.4S, v25.4S, v28.s[0] -ldr q15, [x0, #720] -mla v21.4S, v24.4S, v31.s[0] -str q20, [x0, #960] -sqrdmulh v20.4S, v15.4S, v28.s[0] -ldr q24, [x0, #400] -ldr q10, [x0, #464] -mul v22.4S, v22.4S,v29.s[0] -sub v6.4s, v24.4s, v12.4s -str q19, [x0, #896] -mul v8.4S, v8.4S,v29.s[0] -add v24.4s, v24.4s, v12.4s -ldr q12, [x0, #272] -ldr q19, [x0, #336] -mla v22.4S, v30.4S, v31.s[0] -sub v30.4s, v10.4s, v26.4s -mla v8.4S, v14.4S, v31.s[0] -add v10.4s, v10.4s, v26.4s -ldr q26, [x0, #16] -ldr q14, [x0, #80] -mul v25.4S, v25.4S,v29.s[0] -sub v1.4s, v12.4s, v18.4s -mul v15.4S, v15.4S,v29.s[0] -add v12.4s, v12.4s, v18.4s -ldr q18, [x0, #144] -ldr q3, [x0, #208] -mla v25.4S, v27.4S, v31.s[0] -sub v27.4s, v19.4s, v21.4s -mla v15.4S, v20.4S, v31.s[0] -add v19.4s, v19.4s, v21.4s -sqrdmulh v21.4S, v24.4S, v28.s[1] -mul v24.4S, v24.4S,v29.s[1] -sqrdmulh v20.4S, v10.4S, v28.s[1] -sub v13.4s, v26.4s, v22.4s -mul v10.4S, v10.4S,v29.s[1] -add v26.4s, v26.4s, v22.4s -sqrdmulh v22.4S, v12.4S, v28.s[1] -sub v2.4s, v14.4s, v8.4s -mul v12.4S, v12.4S,v29.s[1] -add v14.4s, v14.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v28.s[1] -sub v16.4s, v18.4s, v25.4s -mul v19.4S, v19.4S,v29.s[1] -add v18.4s, v18.4s, v25.4s -mla v24.4S, v21.4S, v31.s[0] -sub v21.4s, v3.4s, v15.4s -sqrdmulh v25.4S, v6.4S, v28.s[2] -add v3.4s, v3.4s, v15.4s -mla v10.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v30.4S, v28.s[2] -mla v12.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v1.4S, v28.s[2] -mla v19.4S, v8.4S, v31.s[0] -sqrdmulh v8.4S, v27.4S, v28.s[2] -mul v6.4S, v6.4S,v29.s[2] -sub v15.4s, v18.4s, v24.4s -mul v30.4S, v30.4S,v29.s[2] -add v18.4s, v18.4s, v24.4s -mla v6.4S, v25.4S, v31.s[0] -sub v25.4s, v3.4s, v10.4s -mla v30.4S, v20.4S, v31.s[0] -add v3.4s, v3.4s, v10.4s -mul v1.4S, v1.4S,v29.s[2] -sub v10.4s, v26.4s, v12.4s -mul v27.4S, v27.4S,v29.s[2] -add v26.4s, v26.4s, v12.4s -mla v1.4S, v22.4S, v31.s[0] -sub v22.4s, v14.4s, v19.4s -mla v27.4S, v8.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -sqrdmulh v28.4S, v15.4S, v11.s[1] -mul v15.4S, v15.4S,v17.s[1] -sqrdmulh v29.4S, v25.4S, v11.s[1] -sub v19.4s, v16.4s, v6.4s -mul v25.4S, v25.4S,v17.s[1] -add v16.4s, v16.4s, v6.4s -sqrdmulh v6.4S, v18.4S, v11.s[0] -sub v8.4s, v21.4s, v30.4s -mul v18.4S, v18.4S,v17.s[0] -add v21.4s, v21.4s, v30.4s -sqrdmulh v30.4S, v3.4S, v11.s[0] -sub v12.4s, v13.4s, v1.4s -mul v3.4S, v3.4S,v17.s[0] -add v13.4s, v13.4s, v1.4s -mla v15.4S, v28.4S, v31.s[0] -sub v28.4s, v2.4s, v27.4s -sqrdmulh v1.4S, v16.4S, v11.s[2] -add v2.4s, v2.4s, v27.4s -mla v25.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v21.4S, v11.s[2] -mla v18.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v19.4S, v11.s[3] -mla v3.4S, v30.4S, v31.s[0] -sqrdmulh v30.4S, v8.4S, v11.s[3] -mul v16.4S, v16.4S,v17.s[2] -sub v27.4s, v10.4s, v15.4s -mul v21.4S, v21.4S,v17.s[2] -add v10.4s, v10.4s, v15.4s -mla v16.4S, v1.4S, v31.s[0] -sub v1.4s, v22.4s, v25.4s -mla v21.4S, v29.4S, v31.s[0] -add v22.4s, v22.4s, v25.4s -mul v19.4S, v19.4S,v17.s[3] -sub v25.4s, v26.4s, v18.4s -mul v8.4S, v8.4S,v17.s[3] -add v26.4s, v26.4s, v18.4s -mla v19.4S, v6.4S, v31.s[0] -sub v6.4s, v14.4s, v3.4s -mla v8.4S, v30.4S, v31.s[0] -add v14.4s, v14.4s, v3.4s -sqrdmulh v11.4S, v22.4S, v9.s[2] -mul v22.4S, v22.4S,v0.s[2] -sqrdmulh v17.4S, v1.4S, v9.s[3] -sub v3.4s, v13.4s, v16.4s -mul v1.4S, v1.4S,v0.s[3] -add v13.4s, v13.4s, v16.4s -sqrdmulh v16.4S, v6.4S, v9.s[1] -sub v30.4s, v2.4s, v21.4s -mul v6.4S, v6.4S,v0.s[1] -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v9.s[0] -sub v18.4s, v12.4s, v19.4s -mul v14.4S, v14.4S,v0.s[0] -add v12.4s, v12.4s, v19.4s -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v28.4s, v8.4s -sqrdmulh v9.4S, v2.4S, v7.s[0] -add v28.4s, v28.4s, v8.4s -mla v1.4S, v17.4S, v31.s[0] -sub v17.4s, v10.4s, v22.4s -sqrdmulh v8.4S, v30.4S, v7.s[1] -add v10.4s, v10.4s, v22.4s -mla v6.4S, v16.4S, v31.s[0] -sub v16.4s, v27.4s, v1.4s -sqrdmulh v22.4S, v28.4S, v7.s[2] -add v27.4s, v27.4s, v1.4s -mla v14.4S, v21.4S, v31.s[0] -sub v21.4s, v25.4s, v6.4s -sqrdmulh v1.4S, v11.4S, v7.s[3] -add v25.4s, v25.4s, v6.4s -mul v2.4S, v2.4S,v23.s[0] -sub v6.4s, v26.4s, v14.4s -mul v30.4S, v30.4S,v23.s[1] -add v26.4s, v26.4s, v14.4s -mla v2.4S, v9.4S, v31.s[0] -str q17, [x0, #336] -mla v30.4S, v8.4S, v31.s[0] -str q10, [x0, #272] -mul v28.4S, v28.4S,v23.s[2] -str q16, [x0, #464] -mul v11.4S, v11.4S,v23.s[3] -str q27, [x0, #400] -mla v28.4S, v22.4S, v31.s[0] -str q21, [x0, #208] -mla v11.4S, v1.4S, v31.s[0] -str q25, [x0, #144] -str q6, [x0, #80] -str q26, [x0, #16] -sub v26.4s, v13.4s, v2.4s -str q26, [x0, #592] -add v13.4s, v13.4s, v2.4s -sub v2.4s, v3.4s, v30.4s -str q13, [x0, #528] -add v3.4s, v3.4s, v30.4s -sub v30.4s, v12.4s, v28.4s -str q2, [x0, #720] -add v12.4s, v12.4s, v28.4s -sub v28.4s, v18.4s, v11.4s -str q3, [x0, #656] -add v18.4s, v18.4s, v11.4s -str q30, [x0, #848] -str q12, [x0, #784] -str q28, [x0, #976] -str q18, [x0, #912] -ldr q4, [x0, #224] -ldr q5, [x0, #160] -ldr q24, [x0, #32] -ldr q20, [x17, #+128] -ldr q15, [x17, #+144] -sqrdmulh v29.4S, v24.4S, v15.s[0] -mul v24.4S, v24.4S,v20.s[0] -ldr q19, [x0, #48] -sqrdmulh v0.4S, v19.4S, v15.s[0] -mul v19.4S, v19.4S,v20.s[0] -ldr q14, [x17, #+160] -ldr q9, [x17, #+176] -ldr q17, [x0, #96] -sqrdmulh v8.4S, v17.4S, v9.s[0] -mul v17.4S, v17.4S,v14.s[0] -ldr q10, [x0, #112] -sqrdmulh v16.4S, v10.4S, v9.s[0] -mul v10.4S, v10.4S,v14.s[0] -ldr q27, [x17, #+192] -ldr q22, [x17, #+208] -mla v24.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v5.4S, v22.s[0] -ldr q21, [x0, #176] -mla v19.4S, v0.4S, v31.s[0] -sqrdmulh v0.4S, v21.4S, v22.s[0] -ldr q1, [x17, #+224] -ldr q25, [x17, #+240] -mla v17.4S, v8.4S, v31.s[0] -sqrdmulh v8.4S, v4.4S, v25.s[0] -ldr q23, [x0, #240] -mla v10.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v23.4S, v25.s[0] -ldr q7, [x0, #0] -ldr q6, [x0, #128] -mul v5.4S, v5.4S,v27.s[0] -sub v26.4s, v7.4s, v24.4s -ldr q13, [x0, #16] -mul v21.4S, v21.4S,v27.s[0] -add v7.4s, v7.4s, v24.4s -ldr q24, [x0, #144] -mla v5.4S, v29.4S, v31.s[0] -sub v29.4s, v13.4s, v19.4s -ldr q2, [x0, #64] -mla v21.4S, v0.4S, v31.s[0] -add v13.4s, v13.4s, v19.4s -ldr q19, [x0, #192] -mul v4.4S, v4.4S,v1.s[0] -sub v0.4s, v2.4s, v17.4s -ldr q3, [x0, #80] -mul v23.4S, v23.4S,v1.s[0] -add v2.4s, v2.4s, v17.4s -ldr q17, [x0, #208] -mla v4.4S, v8.4S, v31.s[0] -mla v23.4S, v16.4S, v31.s[0] -sub v16.4s, v3.4s, v10.4s -sqrdmulh v8.4S, v13.4S, v15.s[1] -add v3.4s, v3.4s, v10.4s -mul v13.4S, v13.4S,v20.s[1] -sqrdmulh v10.4S, v29.4S, v15.s[2] -sub v11.4s, v6.4s, v5.4s -mul v29.4S, v29.4S,v20.s[2] -add v6.4s, v6.4s, v5.4s -sqrdmulh v15.4S, v3.4S, v9.s[1] -sub v20.4s, v24.4s, v21.4s -mul v3.4S, v3.4S,v14.s[1] -add v24.4s, v24.4s, v21.4s -sqrdmulh v21.4S, v16.4S, v9.s[2] -sub v5.4s, v19.4s, v4.4s -mul v16.4S, v16.4S,v14.s[2] -add v19.4s, v19.4s, v4.4s -mla v13.4S, v8.4S, v31.s[0] -sub v8.4s, v17.4s, v23.4s -ldr q9, [x0, #480] -sqrdmulh v14.4S, v24.4S, v22.s[1] -add v17.4s, v17.4s, v23.4s -mla v29.4S, v10.4S, v31.s[0] -ldr q10, [x0, #416] -sqrdmulh v23.4S, v20.4S, v22.s[2] -sub v4.4s, v7.4s, v13.4s -mla v3.4S, v15.4S, v31.s[0] -ldr q15, [x0, #288] -sqrdmulh v30.4S, v17.4S, v25.s[1] -add v7.4s, v7.4s, v13.4s -str q4, [x0, #16] -mla v16.4S, v21.4S, v31.s[0] -ldr q21, [x17, #+256] -ldr q4, [x17, #+272] -sqrdmulh v13.4S, v8.4S, v25.s[2] -sub v12.4s, v26.4s, v29.4s -str q7, [x0, #0] -mul v24.4S, v24.4S,v27.s[1] -add v26.4s, v26.4s, v29.4s -mul v20.4S, v20.4S,v27.s[2] -str q12, [x0, #48] -mla v24.4S, v14.4S, v31.s[0] -sub v14.4s, v2.4s, v3.4s -mla v20.4S, v23.4S, v31.s[0] -str q26, [x0, #32] -mul v17.4S, v17.4S,v1.s[1] -str q14, [x0, #80] -mul v8.4S, v8.4S,v1.s[2] -add v2.4s, v2.4s, v3.4s -str q2, [x0, #64] -mla v17.4S, v30.4S, v31.s[0] -sub v30.4s, v0.4s, v16.4s -str q30, [x0, #112] -mla v8.4S, v13.4S, v31.s[0] -add v0.4s, v0.4s, v16.4s -str q0, [x0, #96] -sqrdmulh v25.4S, v15.4S, v4.s[0] -sub v1.4s, v6.4s, v24.4s -mul v15.4S, v15.4S,v21.s[0] -str q1, [x0, #144] -ldr q1, [x0, #304] -sqrdmulh v0.4S, v1.4S, v4.s[0] -add v6.4s, v6.4s, v24.4s -mul v1.4S, v1.4S,v21.s[0] -str q6, [x0, #128] -ldr q6, [x17, #+288] -ldr q24, [x17, #+304] -ldr q16, [x0, #352] -sqrdmulh v13.4S, v16.4S, v24.s[0] -sub v30.4s, v11.4s, v20.4s -mul v16.4S, v16.4S,v6.s[0] -str q30, [x0, #176] -ldr q30, [x0, #368] -sqrdmulh v2.4S, v30.4S, v24.s[0] -add v11.4s, v11.4s, v20.4s -mul v30.4S, v30.4S,v6.s[0] -str q11, [x0, #160] -ldr q11, [x17, #+320] -ldr q20, [x17, #+336] -mla v15.4S, v25.4S, v31.s[0] -sub v25.4s, v19.4s, v17.4s -sqrdmulh v3.4S, v10.4S, v20.s[0] -str q25, [x0, #208] -ldr q25, [x0, #432] -mla v1.4S, v0.4S, v31.s[0] -add v19.4s, v19.4s, v17.4s -sqrdmulh v17.4S, v25.4S, v20.s[0] -str q19, [x0, #192] -ldr q19, [x17, #+352] -ldr q0, [x17, #+368] -mla v16.4S, v13.4S, v31.s[0] -sub v13.4s, v5.4s, v8.4s -sqrdmulh v14.4S, v9.4S, v0.s[0] -str q13, [x0, #240] -ldr q13, [x0, #496] -mla v30.4S, v2.4S, v31.s[0] -add v5.4s, v5.4s, v8.4s -sqrdmulh v8.4S, v13.4S, v0.s[0] -str q5, [x0, #224] -ldr q5, [x0, #256] -ldr q2, [x0, #384] -mul v10.4S, v10.4S,v11.s[0] -sub v22.4s, v5.4s, v15.4s -ldr q27, [x0, #272] -mul v25.4S, v25.4S,v11.s[0] -add v5.4s, v5.4s, v15.4s -ldr q15, [x0, #400] -mla v10.4S, v3.4S, v31.s[0] -sub v3.4s, v27.4s, v1.4s -ldr q26, [x0, #320] -mla v25.4S, v17.4S, v31.s[0] -add v27.4s, v27.4s, v1.4s -ldr q1, [x0, #448] -mul v9.4S, v9.4S,v19.s[0] -sub v17.4s, v26.4s, v16.4s -ldr q23, [x0, #336] -mul v13.4S, v13.4S,v19.s[0] -add v26.4s, v26.4s, v16.4s -ldr q16, [x0, #464] -mla v9.4S, v14.4S, v31.s[0] -mla v13.4S, v8.4S, v31.s[0] -sub v8.4s, v23.4s, v30.4s -sqrdmulh v14.4S, v27.4S, v4.s[1] -add v23.4s, v23.4s, v30.4s -mul v27.4S, v27.4S,v21.s[1] -sqrdmulh v30.4S, v3.4S, v4.s[2] -sub v12.4s, v2.4s, v10.4s -mul v3.4S, v3.4S,v21.s[2] -add v2.4s, v2.4s, v10.4s -sqrdmulh v4.4S, v23.4S, v24.s[1] -sub v21.4s, v15.4s, v25.4s -mul v23.4S, v23.4S,v6.s[1] -add v15.4s, v15.4s, v25.4s -sqrdmulh v25.4S, v8.4S, v24.s[2] -sub v10.4s, v1.4s, v9.4s -mul v8.4S, v8.4S,v6.s[2] -add v1.4s, v1.4s, v9.4s -mla v27.4S, v14.4S, v31.s[0] -sub v14.4s, v16.4s, v13.4s -ldr q24, [x0, #736] -sqrdmulh v6.4S, v15.4S, v20.s[1] -add v16.4s, v16.4s, v13.4s -mla v3.4S, v30.4S, v31.s[0] -ldr q30, [x0, #672] -sqrdmulh v13.4S, v21.4S, v20.s[2] -sub v9.4s, v5.4s, v27.4s -mla v23.4S, v4.4S, v31.s[0] -ldr q4, [x0, #544] -sqrdmulh v29.4S, v16.4S, v0.s[1] -add v5.4s, v5.4s, v27.4s -str q9, [x0, #272] -mla v8.4S, v25.4S, v31.s[0] -ldr q25, [x17, #+384] -ldr q9, [x17, #+400] -sqrdmulh v27.4S, v14.4S, v0.s[2] -sub v7.4s, v22.4s, v3.4s -str q5, [x0, #256] -mul v15.4S, v15.4S,v11.s[1] -add v22.4s, v22.4s, v3.4s -mul v21.4S, v21.4S,v11.s[2] -str q7, [x0, #304] -mla v15.4S, v6.4S, v31.s[0] -sub v6.4s, v26.4s, v23.4s -mla v21.4S, v13.4S, v31.s[0] -str q22, [x0, #288] -mul v16.4S, v16.4S,v19.s[1] -str q6, [x0, #336] -mul v14.4S, v14.4S,v19.s[2] -add v26.4s, v26.4s, v23.4s -str q26, [x0, #320] -mla v16.4S, v29.4S, v31.s[0] -sub v29.4s, v17.4s, v8.4s -str q29, [x0, #368] -mla v14.4S, v27.4S, v31.s[0] -add v17.4s, v17.4s, v8.4s -str q17, [x0, #352] -sqrdmulh v0.4S, v4.4S, v9.s[0] -sub v19.4s, v2.4s, v15.4s -mul v4.4S, v4.4S,v25.s[0] -str q19, [x0, #400] -ldr q19, [x0, #560] -sqrdmulh v17.4S, v19.4S, v9.s[0] -add v2.4s, v2.4s, v15.4s -mul v19.4S, v19.4S,v25.s[0] -str q2, [x0, #384] -ldr q2, [x17, #+416] -ldr q15, [x17, #+432] -ldr q8, [x0, #608] -sqrdmulh v27.4S, v8.4S, v15.s[0] -sub v29.4s, v12.4s, v21.4s -mul v8.4S, v8.4S,v2.s[0] -str q29, [x0, #432] -ldr q29, [x0, #624] -sqrdmulh v26.4S, v29.4S, v15.s[0] -add v12.4s, v12.4s, v21.4s -mul v29.4S, v29.4S,v2.s[0] -str q12, [x0, #416] -ldr q12, [x17, #+448] -ldr q21, [x17, #+464] -mla v4.4S, v0.4S, v31.s[0] -sub v0.4s, v1.4s, v16.4s -sqrdmulh v23.4S, v30.4S, v21.s[0] -str q0, [x0, #464] -ldr q0, [x0, #688] -mla v19.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v16.4s -sqrdmulh v16.4S, v0.4S, v21.s[0] -str q1, [x0, #448] -ldr q1, [x17, #+480] -ldr q17, [x17, #+496] -mla v8.4S, v27.4S, v31.s[0] -sub v27.4s, v10.4s, v14.4s -sqrdmulh v6.4S, v24.4S, v17.s[0] -str q27, [x0, #496] -ldr q27, [x0, #752] -mla v29.4S, v26.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v27.4S, v17.s[0] -str q10, [x0, #480] -ldr q10, [x0, #512] -ldr q26, [x0, #640] -mul v30.4S, v30.4S,v12.s[0] -sub v20.4s, v10.4s, v4.4s -ldr q11, [x0, #528] -mul v0.4S, v0.4S,v12.s[0] -add v10.4s, v10.4s, v4.4s -ldr q4, [x0, #656] -mla v30.4S, v23.4S, v31.s[0] -sub v23.4s, v11.4s, v19.4s -ldr q22, [x0, #576] -mla v0.4S, v16.4S, v31.s[0] -add v11.4s, v11.4s, v19.4s -ldr q19, [x0, #704] -mul v24.4S, v24.4S,v1.s[0] -sub v16.4s, v22.4s, v8.4s -ldr q13, [x0, #592] -mul v27.4S, v27.4S,v1.s[0] -add v22.4s, v22.4s, v8.4s -ldr q8, [x0, #720] -mla v24.4S, v6.4S, v31.s[0] -mla v27.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v29.4s -sqrdmulh v6.4S, v11.4S, v9.s[1] -add v13.4s, v13.4s, v29.4s -mul v11.4S, v11.4S,v25.s[1] -sqrdmulh v29.4S, v23.4S, v9.s[2] -sub v7.4s, v26.4s, v30.4s -mul v23.4S, v23.4S,v25.s[2] -add v26.4s, v26.4s, v30.4s -sqrdmulh v9.4S, v13.4S, v15.s[1] -sub v25.4s, v4.4s, v0.4s -mul v13.4S, v13.4S,v2.s[1] -add v4.4s, v4.4s, v0.4s -sqrdmulh v0.4S, v14.4S, v15.s[2] -sub v30.4s, v19.4s, v24.4s -mul v14.4S, v14.4S,v2.s[2] -add v19.4s, v19.4s, v24.4s -mla v11.4S, v6.4S, v31.s[0] -sub v6.4s, v8.4s, v27.4s -ldr q15, [x0, #992] -sqrdmulh v2.4S, v4.4S, v21.s[1] -add v8.4s, v8.4s, v27.4s -mla v23.4S, v29.4S, v31.s[0] -ldr q29, [x0, #928] -sqrdmulh v27.4S, v25.4S, v21.s[2] -sub v24.4s, v10.4s, v11.4s -mla v13.4S, v9.4S, v31.s[0] -ldr q9, [x0, #800] -sqrdmulh v3.4S, v8.4S, v17.s[1] -add v10.4s, v10.4s, v11.4s -str q24, [x0, #528] -mla v14.4S, v0.4S, v31.s[0] -ldr q0, [x17, #+512] -ldr q24, [x17, #+528] -sqrdmulh v11.4S, v6.4S, v17.s[2] -sub v5.4s, v20.4s, v23.4s -str q10, [x0, #512] -mul v4.4S, v4.4S,v12.s[1] -add v20.4s, v20.4s, v23.4s -mul v25.4S, v25.4S,v12.s[2] -str q5, [x0, #560] -mla v4.4S, v2.4S, v31.s[0] -sub v2.4s, v22.4s, v13.4s -mla v25.4S, v27.4S, v31.s[0] -str q20, [x0, #544] -mul v8.4S, v8.4S,v1.s[1] -str q2, [x0, #592] -mul v6.4S, v6.4S,v1.s[2] -add v22.4s, v22.4s, v13.4s -str q22, [x0, #576] -mla v8.4S, v3.4S, v31.s[0] -sub v3.4s, v16.4s, v14.4s -str q3, [x0, #624] -mla v6.4S, v11.4S, v31.s[0] -add v16.4s, v16.4s, v14.4s -str q16, [x0, #608] -sqrdmulh v17.4S, v9.4S, v24.s[0] -sub v1.4s, v26.4s, v4.4s -mul v9.4S, v9.4S,v0.s[0] -str q1, [x0, #656] -ldr q1, [x0, #816] -sqrdmulh v16.4S, v1.4S, v24.s[0] -add v26.4s, v26.4s, v4.4s -mul v1.4S, v1.4S,v0.s[0] -str q26, [x0, #640] -ldr q26, [x17, #+544] -ldr q4, [x17, #+560] -ldr q14, [x0, #864] -sqrdmulh v11.4S, v14.4S, v4.s[0] -sub v3.4s, v7.4s, v25.4s -mul v14.4S, v14.4S,v26.s[0] -str q3, [x0, #688] -ldr q3, [x0, #880] -sqrdmulh v22.4S, v3.4S, v4.s[0] -add v7.4s, v7.4s, v25.4s -mul v3.4S, v3.4S,v26.s[0] -str q7, [x0, #672] -ldr q7, [x17, #+576] -ldr q25, [x17, #+592] -mla v9.4S, v17.4S, v31.s[0] -sub v17.4s, v19.4s, v8.4s -sqrdmulh v13.4S, v29.4S, v25.s[0] -str q17, [x0, #720] -ldr q17, [x0, #944] -mla v1.4S, v16.4S, v31.s[0] -add v19.4s, v19.4s, v8.4s -sqrdmulh v8.4S, v17.4S, v25.s[0] -str q19, [x0, #704] -ldr q19, [x17, #+608] -ldr q16, [x17, #+624] -mla v14.4S, v11.4S, v31.s[0] -sub v11.4s, v30.4s, v6.4s -sqrdmulh v2.4S, v15.4S, v16.s[0] -str q11, [x0, #752] -ldr q11, [x0, #1008] -mla v3.4S, v22.4S, v31.s[0] -add v30.4s, v30.4s, v6.4s -sqrdmulh v6.4S, v11.4S, v16.s[0] -str q30, [x0, #736] -ldr q30, [x0, #768] -ldr q22, [x0, #896] -mul v29.4S, v29.4S,v7.s[0] -sub v21.4s, v30.4s, v9.4s -ldr q12, [x0, #784] -mul v17.4S, v17.4S,v7.s[0] -add v30.4s, v30.4s, v9.4s -ldr q9, [x0, #912] -mla v29.4S, v13.4S, v31.s[0] -sub v13.4s, v12.4s, v1.4s -ldr q20, [x0, #832] -mla v17.4S, v8.4S, v31.s[0] -add v12.4s, v12.4s, v1.4s -ldr q1, [x0, #960] -mul v15.4S, v15.4S,v19.s[0] -sub v8.4s, v20.4s, v14.4s -ldr q27, [x0, #848] -mul v11.4S, v11.4S,v19.s[0] -add v20.4s, v20.4s, v14.4s -ldr q14, [x0, #976] -mla v15.4S, v2.4S, v31.s[0] -mla v11.4S, v6.4S, v31.s[0] -sub v6.4s, v27.4s, v3.4s -sqrdmulh v2.4S, v12.4S, v24.s[1] -add v27.4s, v27.4s, v3.4s -mul v12.4S, v12.4S,v0.s[1] -sqrdmulh v3.4S, v13.4S, v24.s[2] -sub v5.4s, v22.4s, v29.4s -mul v13.4S, v13.4S,v0.s[2] -add v22.4s, v22.4s, v29.4s -sqrdmulh v24.4S, v27.4S, v4.s[1] -sub v0.4s, v9.4s, v17.4s -mul v27.4S, v27.4S,v26.s[1] -add v9.4s, v9.4s, v17.4s -sqrdmulh v17.4S, v6.4S, v4.s[2] -sub v29.4s, v1.4s, v15.4s -mul v6.4S, v6.4S,v26.s[2] -add v1.4s, v1.4s, v15.4s -mla v12.4S, v2.4S, v31.s[0] -sub v2.4s, v14.4s, v11.4s -sqrdmulh v4.4S, v9.4S, v25.s[1] -add v14.4s, v14.4s, v11.4s -mla v13.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v0.4S, v25.s[2] -sub v11.4s, v30.4s, v12.4s -mla v27.4S, v24.4S, v31.s[0] -sqrdmulh v24.4S, v14.4S, v16.s[1] -add v30.4s, v30.4s, v12.4s -str q11, [x0, #784] -mla v6.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v2.4S, v16.s[2] -sub v11.4s, v21.4s, v13.4s -str q30, [x0, #768] -mul v9.4S, v9.4S,v7.s[1] -add v21.4s, v21.4s, v13.4s -mul v0.4S, v0.4S,v7.s[2] -str q11, [x0, #816] -mla v9.4S, v4.4S, v31.s[0] -sub v4.4s, v20.4s, v27.4s -mla v0.4S, v3.4S, v31.s[0] -str q21, [x0, #800] -mul v14.4S, v14.4S,v19.s[1] -str q4, [x0, #848] -mul v2.4S, v2.4S,v19.s[2] -add v20.4s, v20.4s, v27.4s -str q20, [x0, #832] -mla v14.4S, v24.4S, v31.s[0] -sub v24.4s, v8.4s, v6.4s -str q24, [x0, #880] -mla v2.4S, v17.4S, v31.s[0] -add v8.4s, v8.4s, v6.4s -str q8, [x0, #864] -sub v16.4s, v22.4s, v9.4s -str q16, [x0, #912] -add v22.4s, v22.4s, v9.4s -str q22, [x0, #896] -sub v22.4s, v5.4s, v0.4s -str q22, [x0, #944] -add v5.4s, v5.4s, v0.4s -str q5, [x0, #928] -sub v5.4s, v1.4s, v14.4s -str q5, [x0, #976] -add v1.4s, v1.4s, v14.4s -str q1, [x0, #960] -sub v1.4s, v29.4s, v2.4s -str q1, [x0, #1008] -add v29.4s, v29.4s, v2.4s -str q29, [x0, #992] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1464 -// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_13_z4_7.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_13_z4_7.s deleted file mode 100644 index 4533d8c..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_13_z4_7.s +++ /dev/null @@ -1,1494 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_13_z4_7 -.global _ntt_u32_incomplete_neon_asm_var_4_2_13_z4_7 -ntt_u32_incomplete_neon_asm_var_4_2_13_z4_7: -_ntt_u32_incomplete_neon_asm_var_4_2_13_z4_7: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #928] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #992] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q18, [x0, #800] -sqrdmulh v17.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -ldr q16, [x0, #864] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -ldr q2, [x0, #544] -mla v22.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v2.4S, v29.s[0] -ldr q1, [x0, #608] -mla v20.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v1.4S, v29.s[0] -ldr q0, [x0, #672] -mla v18.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v0.4S, v29.s[0] -ldr q15, [x0, #736] -mla v16.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v15.4S, v29.s[0] -ldr q14, [x0, #416] -ldr q13, [x0, #480] -mul v2.4S, v2.4S,v30.s[0] -sub v12.4s, v14.4s, v22.4s -mul v1.4S, v1.4S,v30.s[0] -add v14.4s, v14.4s, v22.4s -ldr q22, [x0, #288] -ldr q11, [x0, #352] -mla v2.4S, v21.4S, v31.s[0] -sub v21.4s, v13.4s, v20.4s -mla v1.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v20.4s -ldr q20, [x0, #32] -ldr q19, [x0, #96] -mul v0.4S, v0.4S,v30.s[0] -sub v10.4s, v22.4s, v18.4s -mul v15.4S, v15.4S,v30.s[0] -add v22.4s, v22.4s, v18.4s -ldr q18, [x0, #160] -ldr q9, [x0, #224] -mla v0.4S, v17.4S, v31.s[0] -sub v17.4s, v11.4s, v16.4s -mla v15.4S, v3.4S, v31.s[0] -add v11.4s, v11.4s, v16.4s -sqrdmulh v16.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -sqrdmulh v3.4S, v13.4S, v29.s[1] -sub v8.4s, v20.4s, v2.4s -mul v13.4S, v13.4S,v30.s[1] -add v20.4s, v20.4s, v2.4s -sqrdmulh v2.4S, v22.4S, v29.s[1] -sub v7.4s, v19.4s, v1.4s -mul v22.4S, v22.4S,v30.s[1] -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v11.4S, v29.s[1] -sub v6.4s, v18.4s, v0.4s -mul v11.4S, v11.4S,v30.s[1] -add v18.4s, v18.4s, v0.4s -mla v14.4S, v16.4S, v31.s[0] -sub v16.4s, v9.4s, v15.4s -sqrdmulh v0.4S, v12.4S, v29.s[2] -add v9.4s, v9.4s, v15.4s -mla v13.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v21.4S, v29.s[2] -mla v22.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v10.4S, v29.s[2] -mla v11.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v17.4S, v29.s[2] -mul v12.4S, v12.4S,v30.s[2] -sub v15.4s, v18.4s, v14.4s -mul v21.4S, v21.4S,v30.s[2] -add v18.4s, v18.4s, v14.4s -mla v12.4S, v0.4S, v31.s[0] -sub v0.4s, v9.4s, v13.4s -mla v21.4S, v3.4S, v31.s[0] -add v9.4s, v9.4s, v13.4s -mul v10.4S, v10.4S,v30.s[2] -sub v13.4s, v20.4s, v22.4s -mul v17.4S, v17.4S,v30.s[2] -add v20.4s, v20.4s, v22.4s -mla v10.4S, v2.4S, v31.s[0] -sub v2.4s, v19.4s, v11.4s -mla v17.4S, v1.4S, v31.s[0] -add v19.4s, v19.4s, v11.4s -sqrdmulh v11.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -sqrdmulh v1.4S, v0.4S, v27.s[1] -sub v22.4s, v6.4s, v12.4s -mul v0.4S, v0.4S,v28.s[1] -add v6.4s, v6.4s, v12.4s -sqrdmulh v12.4S, v18.4S, v27.s[0] -sub v3.4s, v16.4s, v21.4s -mul v18.4S, v18.4S,v28.s[0] -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v9.4S, v27.s[0] -sub v14.4s, v8.4s, v10.4s -mul v9.4S, v9.4S,v28.s[0] -add v8.4s, v8.4s, v10.4s -mla v15.4S, v11.4S, v31.s[0] -sub v11.4s, v7.4s, v17.4s -sqrdmulh v10.4S, v6.4S, v27.s[2] -add v7.4s, v7.4s, v17.4s -mla v0.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v16.4S, v27.s[2] -mla v18.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v22.4S, v27.s[3] -mla v9.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v3.4S, v27.s[3] -mul v6.4S, v6.4S,v28.s[2] -sub v17.4s, v13.4s, v15.4s -mul v16.4S, v16.4S,v28.s[2] -add v13.4s, v13.4s, v15.4s -mla v6.4S, v10.4S, v31.s[0] -sub v10.4s, v2.4s, v0.4s -mla v16.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v0.4s -mul v22.4S, v22.4S,v28.s[3] -sub v0.4s, v20.4s, v18.4s -mul v3.4S, v3.4S,v28.s[3] -add v20.4s, v20.4s, v18.4s -mla v22.4S, v12.4S, v31.s[0] -sub v12.4s, v19.4s, v9.4s -mla v3.4S, v21.4S, v31.s[0] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v2.4S, v25.s[2] -mul v2.4S, v2.4S,v26.s[2] -sqrdmulh v21.4S, v10.4S, v25.s[3] -sub v18.4s, v8.4s, v6.4s -mul v10.4S, v10.4S,v26.s[3] -add v8.4s, v8.4s, v6.4s -sqrdmulh v6.4S, v12.4S, v25.s[1] -sub v1.4s, v7.4s, v16.4s -mul v12.4S, v12.4S,v26.s[1] -add v7.4s, v7.4s, v16.4s -sqrdmulh v16.4S, v19.4S, v25.s[0] -sub v15.4s, v14.4s, v22.4s -mul v19.4S, v19.4S,v26.s[0] -add v14.4s, v14.4s, v22.4s -mla v2.4S, v9.4S, v31.s[0] -sub v9.4s, v11.4s, v3.4s -sqrdmulh v22.4S, v7.4S, v23.s[0] -add v11.4s, v11.4s, v3.4s -mla v10.4S, v21.4S, v31.s[0] -sub v21.4s, v13.4s, v2.4s -sqrdmulh v3.4S, v1.4S, v23.s[1] -add v13.4s, v13.4s, v2.4s -mla v12.4S, v6.4S, v31.s[0] -sub v6.4s, v17.4s, v10.4s -sqrdmulh v2.4S, v11.4S, v23.s[2] -add v17.4s, v17.4s, v10.4s -mla v19.4S, v16.4S, v31.s[0] -sub v16.4s, v0.4s, v12.4s -sqrdmulh v10.4S, v9.4S, v23.s[3] -add v0.4s, v0.4s, v12.4s -mul v7.4S, v7.4S,v24.s[0] -sub v12.4s, v20.4s, v19.4s -mul v1.4S, v1.4S,v24.s[1] -add v20.4s, v20.4s, v19.4s -mla v7.4S, v22.4S, v31.s[0] -str q21, [x0, #352] -mla v1.4S, v3.4S, v31.s[0] -str q13, [x0, #288] -mul v11.4S, v11.4S,v24.s[2] -str q6, [x0, #480] -mul v9.4S, v9.4S,v24.s[3] -str q17, [x0, #416] -mla v11.4S, v2.4S, v31.s[0] -str q16, [x0, #224] -mla v9.4S, v10.4S, v31.s[0] -str q0, [x0, #160] -ldr q0, [x0, #944] -sqrdmulh v10.4S, v0.4S, v29.s[0] -str q12, [x0, #96] -mul v0.4S, v0.4S,v30.s[0] -str q20, [x0, #32] -ldr q20, [x0, #1008] -sqrdmulh v12.4S, v20.4S, v29.s[0] -sub v16.4s, v8.4s, v7.4s -str q16, [x0, #608] -mul v20.4S, v20.4S,v30.s[0] -add v8.4s, v8.4s, v7.4s -ldr q7, [x0, #816] -sqrdmulh v16.4S, v7.4S, v29.s[0] -sub v2.4s, v18.4s, v1.4s -str q8, [x0, #544] -mul v7.4S, v7.4S,v30.s[0] -add v18.4s, v18.4s, v1.4s -ldr q1, [x0, #880] -sqrdmulh v8.4S, v1.4S, v29.s[0] -sub v17.4s, v14.4s, v11.4s -str q2, [x0, #736] -mul v1.4S, v1.4S,v30.s[0] -add v14.4s, v14.4s, v11.4s -ldr q11, [x0, #560] -mla v0.4S, v10.4S, v31.s[0] -sub v10.4s, v15.4s, v9.4s -str q18, [x0, #672] -sqrdmulh v18.4S, v11.4S, v29.s[0] -add v15.4s, v15.4s, v9.4s -ldr q9, [x0, #624] -mla v20.4S, v12.4S, v31.s[0] -str q17, [x0, #864] -sqrdmulh v17.4S, v9.4S, v29.s[0] -ldr q12, [x0, #688] -mla v7.4S, v16.4S, v31.s[0] -str q14, [x0, #800] -sqrdmulh v14.4S, v12.4S, v29.s[0] -ldr q16, [x0, #752] -mla v1.4S, v8.4S, v31.s[0] -str q10, [x0, #992] -sqrdmulh v10.4S, v16.4S, v29.s[0] -ldr q8, [x0, #432] -ldr q2, [x0, #496] -mul v11.4S, v11.4S,v30.s[0] -sub v6.4s, v8.4s, v0.4s -str q15, [x0, #928] -mul v9.4S, v9.4S,v30.s[0] -add v8.4s, v8.4s, v0.4s -ldr q0, [x0, #304] -ldr q15, [x0, #368] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v2.4s, v20.4s -mla v9.4S, v17.4S, v31.s[0] -add v2.4s, v2.4s, v20.4s -ldr q20, [x0, #48] -ldr q17, [x0, #112] -mul v12.4S, v12.4S,v30.s[0] -sub v13.4s, v0.4s, v7.4s -mul v16.4S, v16.4S,v30.s[0] -add v0.4s, v0.4s, v7.4s -ldr q7, [x0, #176] -ldr q3, [x0, #240] -mla v12.4S, v14.4S, v31.s[0] -sub v14.4s, v15.4s, v1.4s -mla v16.4S, v10.4S, v31.s[0] -add v15.4s, v15.4s, v1.4s -sqrdmulh v1.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sqrdmulh v10.4S, v2.4S, v29.s[1] -sub v21.4s, v20.4s, v11.4s -mul v2.4S, v2.4S,v30.s[1] -add v20.4s, v20.4s, v11.4s -sqrdmulh v11.4S, v0.4S, v29.s[1] -sub v22.4s, v17.4s, v9.4s -mul v0.4S, v0.4S,v30.s[1] -add v17.4s, v17.4s, v9.4s -sqrdmulh v9.4S, v15.4S, v29.s[1] -sub v19.4s, v7.4s, v12.4s -mul v15.4S, v15.4S,v30.s[1] -add v7.4s, v7.4s, v12.4s -mla v8.4S, v1.4S, v31.s[0] -sub v1.4s, v3.4s, v16.4s -sqrdmulh v12.4S, v6.4S, v29.s[2] -add v3.4s, v3.4s, v16.4s -mla v2.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v18.4S, v29.s[2] -mla v0.4S, v11.4S, v31.s[0] -sqrdmulh v11.4S, v13.4S, v29.s[2] -mla v15.4S, v9.4S, v31.s[0] -sqrdmulh v9.4S, v14.4S, v29.s[2] -mul v6.4S, v6.4S,v30.s[2] -sub v16.4s, v7.4s, v8.4s -mul v18.4S, v18.4S,v30.s[2] -add v7.4s, v7.4s, v8.4s -mla v6.4S, v12.4S, v31.s[0] -sub v12.4s, v3.4s, v2.4s -mla v18.4S, v10.4S, v31.s[0] -add v3.4s, v3.4s, v2.4s -mul v13.4S, v13.4S,v30.s[2] -sub v2.4s, v20.4s, v0.4s -mul v14.4S, v14.4S,v30.s[2] -add v20.4s, v20.4s, v0.4s -mla v13.4S, v11.4S, v31.s[0] -sub v11.4s, v17.4s, v15.4s -mla v14.4S, v9.4S, v31.s[0] -add v17.4s, v17.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v27.s[1] -mul v16.4S, v16.4S,v28.s[1] -sqrdmulh v9.4S, v12.4S, v27.s[1] -sub v0.4s, v19.4s, v6.4s -mul v12.4S, v12.4S,v28.s[1] -add v19.4s, v19.4s, v6.4s -sqrdmulh v6.4S, v7.4S, v27.s[0] -sub v10.4s, v1.4s, v18.4s -mul v7.4S, v7.4S,v28.s[0] -add v1.4s, v1.4s, v18.4s -sqrdmulh v18.4S, v3.4S, v27.s[0] -sub v8.4s, v21.4s, v13.4s -mul v3.4S, v3.4S,v28.s[0] -add v21.4s, v21.4s, v13.4s -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v22.4s, v14.4s -sqrdmulh v13.4S, v19.4S, v27.s[2] -add v22.4s, v22.4s, v14.4s -mla v12.4S, v9.4S, v31.s[0] -sqrdmulh v9.4S, v1.4S, v27.s[2] -mla v7.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v0.4S, v27.s[3] -mla v3.4S, v18.4S, v31.s[0] -sqrdmulh v18.4S, v10.4S, v27.s[3] -mul v19.4S, v19.4S,v28.s[2] -sub v14.4s, v2.4s, v16.4s -mul v1.4S, v1.4S,v28.s[2] -add v2.4s, v2.4s, v16.4s -mla v19.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v12.4s -mla v1.4S, v9.4S, v31.s[0] -add v11.4s, v11.4s, v12.4s -mul v0.4S, v0.4S,v28.s[3] -sub v12.4s, v20.4s, v7.4s -mul v10.4S, v10.4S,v28.s[3] -add v20.4s, v20.4s, v7.4s -mla v0.4S, v6.4S, v31.s[0] -sub v6.4s, v17.4s, v3.4s -mla v10.4S, v18.4S, v31.s[0] -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v11.4S, v25.s[2] -mul v11.4S, v11.4S,v26.s[2] -sqrdmulh v18.4S, v13.4S, v25.s[3] -sub v7.4s, v21.4s, v19.4s -mul v13.4S, v13.4S,v26.s[3] -add v21.4s, v21.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v25.s[1] -sub v9.4s, v22.4s, v1.4s -mul v6.4S, v6.4S,v26.s[1] -add v22.4s, v22.4s, v1.4s -sqrdmulh v1.4S, v17.4S, v25.s[0] -sub v16.4s, v8.4s, v0.4s -mul v17.4S, v17.4S,v26.s[0] -add v8.4s, v8.4s, v0.4s -mla v11.4S, v3.4S, v31.s[0] -sub v3.4s, v15.4s, v10.4s -sqrdmulh v0.4S, v22.4S, v23.s[0] -add v15.4s, v15.4s, v10.4s -mla v13.4S, v18.4S, v31.s[0] -sub v18.4s, v2.4s, v11.4s -sqrdmulh v10.4S, v9.4S, v23.s[1] -add v2.4s, v2.4s, v11.4s -mla v6.4S, v19.4S, v31.s[0] -sub v19.4s, v14.4s, v13.4s -sqrdmulh v11.4S, v15.4S, v23.s[2] -add v14.4s, v14.4s, v13.4s -mla v17.4S, v1.4S, v31.s[0] -sub v1.4s, v12.4s, v6.4s -sqrdmulh v13.4S, v3.4S, v23.s[3] -add v12.4s, v12.4s, v6.4s -mul v22.4S, v22.4S,v24.s[0] -sub v6.4s, v20.4s, v17.4s -mul v9.4S, v9.4S,v24.s[1] -add v20.4s, v20.4s, v17.4s -mla v22.4S, v0.4S, v31.s[0] -str q18, [x0, #368] -mla v9.4S, v10.4S, v31.s[0] -str q2, [x0, #304] -mul v15.4S, v15.4S,v24.s[2] -str q19, [x0, #496] -mul v3.4S, v3.4S,v24.s[3] -str q14, [x0, #432] -mla v15.4S, v11.4S, v31.s[0] -str q1, [x0, #240] -mla v3.4S, v13.4S, v31.s[0] -str q12, [x0, #176] -ldr q12, [x0, #896] -sqrdmulh v13.4S, v12.4S, v29.s[0] -str q6, [x0, #112] -mul v12.4S, v12.4S,v30.s[0] -str q20, [x0, #48] -ldr q20, [x0, #960] -sqrdmulh v6.4S, v20.4S, v29.s[0] -sub v1.4s, v21.4s, v22.4s -str q1, [x0, #624] -mul v20.4S, v20.4S,v30.s[0] -add v21.4s, v21.4s, v22.4s -ldr q22, [x0, #768] -sqrdmulh v1.4S, v22.4S, v29.s[0] -sub v11.4s, v7.4s, v9.4s -str q21, [x0, #560] -mul v22.4S, v22.4S,v30.s[0] -add v7.4s, v7.4s, v9.4s -ldr q9, [x0, #832] -sqrdmulh v21.4S, v9.4S, v29.s[0] -sub v14.4s, v8.4s, v15.4s -str q11, [x0, #752] -mul v9.4S, v9.4S,v30.s[0] -add v8.4s, v8.4s, v15.4s -ldr q15, [x0, #512] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v16.4s, v3.4s -str q7, [x0, #688] -sqrdmulh v7.4S, v15.4S, v29.s[0] -add v16.4s, v16.4s, v3.4s -ldr q3, [x0, #576] -mla v20.4S, v6.4S, v31.s[0] -str q14, [x0, #880] -sqrdmulh v14.4S, v3.4S, v29.s[0] -ldr q6, [x0, #640] -mla v22.4S, v1.4S, v31.s[0] -str q8, [x0, #816] -sqrdmulh v8.4S, v6.4S, v29.s[0] -ldr q1, [x0, #704] -mla v9.4S, v21.4S, v31.s[0] -str q13, [x0, #1008] -sqrdmulh v13.4S, v1.4S, v29.s[0] -ldr q21, [x0, #384] -ldr q11, [x0, #448] -mul v15.4S, v15.4S,v30.s[0] -sub v19.4s, v21.4s, v12.4s -str q16, [x0, #944] -mul v3.4S, v3.4S,v30.s[0] -add v21.4s, v21.4s, v12.4s -ldr q12, [x0, #256] -ldr q16, [x0, #320] -mla v15.4S, v7.4S, v31.s[0] -sub v7.4s, v11.4s, v20.4s -mla v3.4S, v14.4S, v31.s[0] -add v11.4s, v11.4s, v20.4s -ldr q20, [x0, #0] -ldr q14, [x0, #64] -mul v6.4S, v6.4S,v30.s[0] -sub v2.4s, v12.4s, v22.4s -mul v1.4S, v1.4S,v30.s[0] -add v12.4s, v12.4s, v22.4s -ldr q22, [x0, #128] -ldr q10, [x0, #192] -mla v6.4S, v8.4S, v31.s[0] -sub v8.4s, v16.4s, v9.4s -mla v1.4S, v13.4S, v31.s[0] -add v16.4s, v16.4s, v9.4s -sqrdmulh v9.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sqrdmulh v13.4S, v11.4S, v29.s[1] -sub v18.4s, v20.4s, v15.4s -mul v11.4S, v11.4S,v30.s[1] -add v20.4s, v20.4s, v15.4s -sqrdmulh v15.4S, v12.4S, v29.s[1] -sub v0.4s, v14.4s, v3.4s -mul v12.4S, v12.4S,v30.s[1] -add v14.4s, v14.4s, v3.4s -sqrdmulh v3.4S, v16.4S, v29.s[1] -sub v17.4s, v22.4s, v6.4s -mul v16.4S, v16.4S,v30.s[1] -add v22.4s, v22.4s, v6.4s -mla v21.4S, v9.4S, v31.s[0] -sub v9.4s, v10.4s, v1.4s -sqrdmulh v6.4S, v19.4S, v29.s[2] -add v10.4s, v10.4s, v1.4s -mla v11.4S, v13.4S, v31.s[0] -sqrdmulh v13.4S, v7.4S, v29.s[2] -mla v12.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v2.4S, v29.s[2] -mla v16.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v8.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v1.4s, v22.4s, v21.4s -mul v7.4S, v7.4S,v30.s[2] -add v22.4s, v22.4s, v21.4s -mla v19.4S, v6.4S, v31.s[0] -sub v6.4s, v10.4s, v11.4s -mla v7.4S, v13.4S, v31.s[0] -add v10.4s, v10.4s, v11.4s -mul v2.4S, v2.4S,v30.s[2] -sub v11.4s, v20.4s, v12.4s -mul v8.4S, v8.4S,v30.s[2] -add v20.4s, v20.4s, v12.4s -mla v2.4S, v15.4S, v31.s[0] -sub v15.4s, v14.4s, v16.4s -mla v8.4S, v3.4S, v31.s[0] -add v14.4s, v14.4s, v16.4s -sqrdmulh v16.4S, v1.4S, v27.s[1] -mul v1.4S, v1.4S,v28.s[1] -sqrdmulh v3.4S, v6.4S, v27.s[1] -sub v12.4s, v17.4s, v19.4s -mul v6.4S, v6.4S,v28.s[1] -add v17.4s, v17.4s, v19.4s -sqrdmulh v19.4S, v22.4S, v27.s[0] -sub v13.4s, v9.4s, v7.4s -mul v22.4S, v22.4S,v28.s[0] -add v9.4s, v9.4s, v7.4s -sqrdmulh v7.4S, v10.4S, v27.s[0] -sub v21.4s, v18.4s, v2.4s -mul v10.4S, v10.4S,v28.s[0] -add v18.4s, v18.4s, v2.4s -mla v1.4S, v16.4S, v31.s[0] -sub v16.4s, v0.4s, v8.4s -sqrdmulh v2.4S, v17.4S, v27.s[2] -add v0.4s, v0.4s, v8.4s -mla v6.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v9.4S, v27.s[2] -mla v22.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v12.4S, v27.s[3] -mla v10.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v13.4S, v27.s[3] -mul v17.4S, v17.4S,v28.s[2] -sub v8.4s, v11.4s, v1.4s -mul v9.4S, v9.4S,v28.s[2] -add v11.4s, v11.4s, v1.4s -mla v17.4S, v2.4S, v31.s[0] -sub v2.4s, v15.4s, v6.4s -mla v9.4S, v3.4S, v31.s[0] -add v15.4s, v15.4s, v6.4s -mul v12.4S, v12.4S,v28.s[3] -sub v6.4s, v20.4s, v22.4s -mul v13.4S, v13.4S,v28.s[3] -add v20.4s, v20.4s, v22.4s -mla v12.4S, v19.4S, v31.s[0] -sub v19.4s, v14.4s, v10.4s -mla v13.4S, v7.4S, v31.s[0] -add v14.4s, v14.4s, v10.4s -sqrdmulh v10.4S, v15.4S, v25.s[2] -mul v15.4S, v15.4S,v26.s[2] -sqrdmulh v7.4S, v2.4S, v25.s[3] -sub v22.4s, v18.4s, v17.4s -mul v2.4S, v2.4S,v26.s[3] -add v18.4s, v18.4s, v17.4s -sqrdmulh v17.4S, v19.4S, v25.s[1] -sub v3.4s, v0.4s, v9.4s -mul v19.4S, v19.4S,v26.s[1] -add v0.4s, v0.4s, v9.4s -sqrdmulh v9.4S, v14.4S, v25.s[0] -sub v1.4s, v21.4s, v12.4s -mul v14.4S, v14.4S,v26.s[0] -add v21.4s, v21.4s, v12.4s -mla v15.4S, v10.4S, v31.s[0] -sub v10.4s, v16.4s, v13.4s -sqrdmulh v12.4S, v0.4S, v23.s[0] -add v16.4s, v16.4s, v13.4s -mla v2.4S, v7.4S, v31.s[0] -sub v7.4s, v11.4s, v15.4s -sqrdmulh v13.4S, v3.4S, v23.s[1] -add v11.4s, v11.4s, v15.4s -mla v19.4S, v17.4S, v31.s[0] -sub v17.4s, v8.4s, v2.4s -sqrdmulh v15.4S, v16.4S, v23.s[2] -add v8.4s, v8.4s, v2.4s -mla v14.4S, v9.4S, v31.s[0] -sub v9.4s, v6.4s, v19.4s -sqrdmulh v2.4S, v10.4S, v23.s[3] -add v6.4s, v6.4s, v19.4s -mul v0.4S, v0.4S,v24.s[0] -sub v19.4s, v20.4s, v14.4s -mul v3.4S, v3.4S,v24.s[1] -add v20.4s, v20.4s, v14.4s -mla v0.4S, v12.4S, v31.s[0] -str q7, [x0, #320] -mla v3.4S, v13.4S, v31.s[0] -str q11, [x0, #256] -mul v16.4S, v16.4S,v24.s[2] -str q17, [x0, #448] -mul v10.4S, v10.4S,v24.s[3] -str q8, [x0, #384] -mla v16.4S, v15.4S, v31.s[0] -str q9, [x0, #192] -mla v10.4S, v2.4S, v31.s[0] -str q6, [x0, #128] -ldr q6, [x0, #912] -sqrdmulh v2.4S, v6.4S, v29.s[0] -str q19, [x0, #64] -mul v6.4S, v6.4S,v30.s[0] -str q20, [x0, #0] -ldr q20, [x0, #976] -sqrdmulh v19.4S, v20.4S, v29.s[0] -sub v9.4s, v18.4s, v0.4s -str q9, [x0, #576] -mul v20.4S, v20.4S,v30.s[0] -add v18.4s, v18.4s, v0.4s -ldr q0, [x0, #784] -sqrdmulh v9.4S, v0.4S, v29.s[0] -sub v15.4s, v22.4s, v3.4s -str q18, [x0, #512] -mul v0.4S, v0.4S,v30.s[0] -add v22.4s, v22.4s, v3.4s -ldr q3, [x0, #848] -sqrdmulh v18.4S, v3.4S, v29.s[0] -sub v8.4s, v21.4s, v16.4s -str q15, [x0, #704] -mul v3.4S, v3.4S,v30.s[0] -add v21.4s, v21.4s, v16.4s -ldr q16, [x0, #528] -mla v6.4S, v2.4S, v31.s[0] -sub v2.4s, v1.4s, v10.4s -str q22, [x0, #640] -sqrdmulh v22.4S, v16.4S, v29.s[0] -add v1.4s, v1.4s, v10.4s -ldr q10, [x0, #592] -mla v20.4S, v19.4S, v31.s[0] -str q8, [x0, #832] -sqrdmulh v8.4S, v10.4S, v29.s[0] -ldr q19, [x0, #656] -mla v0.4S, v9.4S, v31.s[0] -str q21, [x0, #768] -sqrdmulh v21.4S, v19.4S, v29.s[0] -ldr q9, [x0, #720] -mla v3.4S, v18.4S, v31.s[0] -str q2, [x0, #960] -sqrdmulh v2.4S, v9.4S, v29.s[0] -ldr q18, [x0, #400] -ldr q15, [x0, #464] -mul v16.4S, v16.4S,v30.s[0] -sub v17.4s, v18.4s, v6.4s -str q1, [x0, #896] -mul v10.4S, v10.4S,v30.s[0] -add v18.4s, v18.4s, v6.4s -ldr q6, [x0, #272] -ldr q1, [x0, #336] -mla v16.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v20.4s -mla v10.4S, v8.4S, v31.s[0] -add v15.4s, v15.4s, v20.4s -ldr q20, [x0, #16] -ldr q8, [x0, #80] -mul v19.4S, v19.4S,v30.s[0] -sub v11.4s, v6.4s, v0.4s -mul v9.4S, v9.4S,v30.s[0] -add v6.4s, v6.4s, v0.4s -ldr q0, [x0, #144] -ldr q13, [x0, #208] -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v1.4s, v3.4s -mla v9.4S, v2.4S, v31.s[0] -add v1.4s, v1.4s, v3.4s -sqrdmulh v3.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sqrdmulh v2.4S, v15.4S, v29.s[1] -sub v7.4s, v20.4s, v16.4s -mul v15.4S, v15.4S,v30.s[1] -add v20.4s, v20.4s, v16.4s -sqrdmulh v16.4S, v6.4S, v29.s[1] -sub v12.4s, v8.4s, v10.4s -mul v6.4S, v6.4S,v30.s[1] -add v8.4s, v8.4s, v10.4s -sqrdmulh v10.4S, v1.4S, v29.s[1] -sub v14.4s, v0.4s, v19.4s -mul v1.4S, v1.4S,v30.s[1] -add v0.4s, v0.4s, v19.4s -mla v18.4S, v3.4S, v31.s[0] -sub v3.4s, v13.4s, v9.4s -sqrdmulh v19.4S, v17.4S, v29.s[2] -add v13.4s, v13.4s, v9.4s -mla v15.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v22.4S, v29.s[2] -mla v6.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v11.4S, v29.s[2] -mla v1.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v21.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v9.4s, v0.4s, v18.4s -mul v22.4S, v22.4S,v30.s[2] -add v0.4s, v0.4s, v18.4s -mla v17.4S, v19.4S, v31.s[0] -sub v19.4s, v13.4s, v15.4s -mla v22.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v15.4s -mul v11.4S, v11.4S,v30.s[2] -sub v15.4s, v20.4s, v6.4s -mul v21.4S, v21.4S,v30.s[2] -add v20.4s, v20.4s, v6.4s -mla v11.4S, v16.4S, v31.s[0] -sub v16.4s, v8.4s, v1.4s -mla v21.4S, v10.4S, v31.s[0] -add v8.4s, v8.4s, v1.4s -sqrdmulh v29.4S, v9.4S, v27.s[1] -mul v9.4S, v9.4S,v28.s[1] -sqrdmulh v30.4S, v19.4S, v27.s[1] -sub v1.4s, v14.4s, v17.4s -mul v19.4S, v19.4S,v28.s[1] -add v14.4s, v14.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v27.s[0] -sub v10.4s, v3.4s, v22.4s -mul v0.4S, v0.4S,v28.s[0] -add v3.4s, v3.4s, v22.4s -sqrdmulh v22.4S, v13.4S, v27.s[0] -sub v6.4s, v7.4s, v11.4s -mul v13.4S, v13.4S,v28.s[0] -add v7.4s, v7.4s, v11.4s -mla v9.4S, v29.4S, v31.s[0] -sub v29.4s, v12.4s, v21.4s -sqrdmulh v11.4S, v14.4S, v27.s[2] -add v12.4s, v12.4s, v21.4s -mla v19.4S, v30.4S, v31.s[0] -sqrdmulh v30.4S, v3.4S, v27.s[2] -mla v0.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v1.4S, v27.s[3] -mla v13.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v10.4S, v27.s[3] -mul v14.4S, v14.4S,v28.s[2] -sub v21.4s, v15.4s, v9.4s -mul v3.4S, v3.4S,v28.s[2] -add v15.4s, v15.4s, v9.4s -mla v14.4S, v11.4S, v31.s[0] -sub v11.4s, v16.4s, v19.4s -mla v3.4S, v30.4S, v31.s[0] -add v16.4s, v16.4s, v19.4s -mul v1.4S, v1.4S,v28.s[3] -sub v19.4s, v20.4s, v0.4s -mul v10.4S, v10.4S,v28.s[3] -add v20.4s, v20.4s, v0.4s -mla v1.4S, v17.4S, v31.s[0] -sub v17.4s, v8.4s, v13.4s -mla v10.4S, v22.4S, v31.s[0] -add v8.4s, v8.4s, v13.4s -sqrdmulh v27.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sqrdmulh v28.4S, v11.4S, v25.s[3] -sub v13.4s, v7.4s, v14.4s -mul v11.4S, v11.4S,v26.s[3] -add v7.4s, v7.4s, v14.4s -sqrdmulh v14.4S, v17.4S, v25.s[1] -sub v22.4s, v12.4s, v3.4s -mul v17.4S, v17.4S,v26.s[1] -add v12.4s, v12.4s, v3.4s -sqrdmulh v3.4S, v8.4S, v25.s[0] -sub v0.4s, v6.4s, v1.4s -mul v8.4S, v8.4S,v26.s[0] -add v6.4s, v6.4s, v1.4s -mla v16.4S, v27.4S, v31.s[0] -sub v27.4s, v29.4s, v10.4s -sqrdmulh v25.4S, v12.4S, v23.s[0] -add v29.4s, v29.4s, v10.4s -mla v11.4S, v28.4S, v31.s[0] -sub v28.4s, v15.4s, v16.4s -sqrdmulh v10.4S, v22.4S, v23.s[1] -add v15.4s, v15.4s, v16.4s -mla v17.4S, v14.4S, v31.s[0] -sub v14.4s, v21.4s, v11.4s -sqrdmulh v16.4S, v29.4S, v23.s[2] -add v21.4s, v21.4s, v11.4s -mla v8.4S, v3.4S, v31.s[0] -sub v3.4s, v19.4s, v17.4s -sqrdmulh v11.4S, v27.4S, v23.s[3] -add v19.4s, v19.4s, v17.4s -mul v12.4S, v12.4S,v24.s[0] -sub v17.4s, v20.4s, v8.4s -mul v22.4S, v22.4S,v24.s[1] -add v20.4s, v20.4s, v8.4s -mla v12.4S, v25.4S, v31.s[0] -str q28, [x0, #336] -mla v22.4S, v10.4S, v31.s[0] -str q15, [x0, #272] -mul v29.4S, v29.4S,v24.s[2] -str q14, [x0, #464] -mul v27.4S, v27.4S,v24.s[3] -str q21, [x0, #400] -mla v29.4S, v16.4S, v31.s[0] -str q3, [x0, #208] -mla v27.4S, v11.4S, v31.s[0] -str q19, [x0, #144] -str q17, [x0, #80] -str q20, [x0, #16] -sub v20.4s, v7.4s, v12.4s -str q20, [x0, #592] -add v7.4s, v7.4s, v12.4s -sub v12.4s, v13.4s, v22.4s -str q7, [x0, #528] -add v13.4s, v13.4s, v22.4s -sub v22.4s, v6.4s, v29.4s -str q12, [x0, #720] -add v6.4s, v6.4s, v29.4s -sub v29.4s, v0.4s, v27.4s -str q13, [x0, #656] -add v0.4s, v0.4s, v27.4s -str q22, [x0, #848] -str q6, [x0, #784] -str q29, [x0, #976] -str q0, [x0, #912] -ldr q4, [x0, #224] -ldr q5, [x0, #160] -ldr q18, [x0, #32] -ldr q2, [x17, #+128] -ldr q9, [x17, #+144] -sqrdmulh v30.4S, v18.4S, v9.s[0] -mul v18.4S, v18.4S,v2.s[0] -ldr q1, [x0, #48] -sqrdmulh v26.4S, v1.4S, v9.s[0] -mul v1.4S, v1.4S,v2.s[0] -ldr q8, [x17, #+160] -ldr q25, [x17, #+176] -ldr q28, [x0, #96] -sqrdmulh v10.4S, v28.4S, v25.s[0] -mul v28.4S, v28.4S,v8.s[0] -ldr q15, [x0, #112] -sqrdmulh v14.4S, v15.4S, v25.s[0] -mul v15.4S, v15.4S,v8.s[0] -ldr q21, [x17, #+192] -ldr q16, [x17, #+208] -mla v18.4S, v30.4S, v31.s[0] -sqrdmulh v30.4S, v5.4S, v16.s[0] -ldr q3, [x0, #176] -mla v1.4S, v26.4S, v31.s[0] -sqrdmulh v26.4S, v3.4S, v16.s[0] -ldr q11, [x17, #+224] -ldr q19, [x17, #+240] -mla v28.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v4.4S, v19.s[0] -ldr q24, [x0, #240] -mla v15.4S, v14.4S, v31.s[0] -sqrdmulh v14.4S, v24.4S, v19.s[0] -ldr q23, [x0, #0] -ldr q17, [x0, #128] -mul v5.4S, v5.4S,v21.s[0] -sub v20.4s, v23.4s, v18.4s -ldr q7, [x0, #16] -mul v3.4S, v3.4S,v21.s[0] -add v23.4s, v23.4s, v18.4s -ldr q18, [x0, #144] -mla v5.4S, v30.4S, v31.s[0] -sub v30.4s, v7.4s, v1.4s -ldr q12, [x0, #64] -mla v3.4S, v26.4S, v31.s[0] -add v7.4s, v7.4s, v1.4s -ldr q1, [x0, #192] -mul v4.4S, v4.4S,v11.s[0] -sub v26.4s, v12.4s, v28.4s -ldr q13, [x0, #80] -mul v24.4S, v24.4S,v11.s[0] -add v12.4s, v12.4s, v28.4s -ldr q28, [x0, #208] -mla v4.4S, v10.4S, v31.s[0] -mla v24.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v15.4s -sqrdmulh v10.4S, v7.4S, v9.s[1] -add v13.4s, v13.4s, v15.4s -mul v7.4S, v7.4S,v2.s[1] -sqrdmulh v15.4S, v30.4S, v9.s[2] -sub v27.4s, v17.4s, v5.4s -mul v30.4S, v30.4S,v2.s[2] -add v17.4s, v17.4s, v5.4s -sqrdmulh v9.4S, v13.4S, v25.s[1] -sub v2.4s, v18.4s, v3.4s -mul v13.4S, v13.4S,v8.s[1] -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v25.s[2] -sub v5.4s, v1.4s, v4.4s -mul v14.4S, v14.4S,v8.s[2] -add v1.4s, v1.4s, v4.4s -mla v7.4S, v10.4S, v31.s[0] -sub v10.4s, v28.4s, v24.4s -ldr q25, [x0, #480] -sqrdmulh v8.4S, v18.4S, v16.s[1] -add v28.4s, v28.4s, v24.4s -mla v30.4S, v15.4S, v31.s[0] -ldr q15, [x0, #416] -sqrdmulh v24.4S, v2.4S, v16.s[2] -sub v4.4s, v23.4s, v7.4s -mla v13.4S, v9.4S, v31.s[0] -ldr q9, [x0, #288] -sqrdmulh v22.4S, v28.4S, v19.s[1] -add v23.4s, v23.4s, v7.4s -str q4, [x0, #16] -mla v14.4S, v3.4S, v31.s[0] -ldr q3, [x17, #+256] -ldr q4, [x17, #+272] -sqrdmulh v7.4S, v10.4S, v19.s[2] -sub v6.4s, v20.4s, v30.4s -str q23, [x0, #0] -mul v18.4S, v18.4S,v21.s[1] -add v20.4s, v20.4s, v30.4s -mul v2.4S, v2.4S,v21.s[2] -str q6, [x0, #48] -mla v18.4S, v8.4S, v31.s[0] -sub v8.4s, v12.4s, v13.4s -mla v2.4S, v24.4S, v31.s[0] -str q20, [x0, #32] -mul v28.4S, v28.4S,v11.s[1] -str q8, [x0, #80] -mul v10.4S, v10.4S,v11.s[2] -add v12.4s, v12.4s, v13.4s -str q12, [x0, #64] -mla v28.4S, v22.4S, v31.s[0] -sub v22.4s, v26.4s, v14.4s -str q22, [x0, #112] -mla v10.4S, v7.4S, v31.s[0] -add v26.4s, v26.4s, v14.4s -str q26, [x0, #96] -sqrdmulh v19.4S, v9.4S, v4.s[0] -sub v11.4s, v17.4s, v18.4s -mul v9.4S, v9.4S,v3.s[0] -str q11, [x0, #144] -ldr q11, [x0, #304] -sqrdmulh v26.4S, v11.4S, v4.s[0] -add v17.4s, v17.4s, v18.4s -mul v11.4S, v11.4S,v3.s[0] -str q17, [x0, #128] -ldr q17, [x17, #+288] -ldr q18, [x17, #+304] -ldr q14, [x0, #352] -sqrdmulh v7.4S, v14.4S, v18.s[0] -sub v22.4s, v27.4s, v2.4s -mul v14.4S, v14.4S,v17.s[0] -str q22, [x0, #176] -ldr q22, [x0, #368] -sqrdmulh v12.4S, v22.4S, v18.s[0] -add v27.4s, v27.4s, v2.4s -mul v22.4S, v22.4S,v17.s[0] -str q27, [x0, #160] -ldr q27, [x17, #+320] -ldr q2, [x17, #+336] -mla v9.4S, v19.4S, v31.s[0] -sub v19.4s, v1.4s, v28.4s -sqrdmulh v13.4S, v15.4S, v2.s[0] -str q19, [x0, #208] -ldr q19, [x0, #432] -mla v11.4S, v26.4S, v31.s[0] -add v1.4s, v1.4s, v28.4s -sqrdmulh v28.4S, v19.4S, v2.s[0] -str q1, [x0, #192] -ldr q1, [x17, #+352] -ldr q26, [x17, #+368] -mla v14.4S, v7.4S, v31.s[0] -sub v7.4s, v5.4s, v10.4s -sqrdmulh v8.4S, v25.4S, v26.s[0] -str q7, [x0, #240] -ldr q7, [x0, #496] -mla v22.4S, v12.4S, v31.s[0] -add v5.4s, v5.4s, v10.4s -sqrdmulh v10.4S, v7.4S, v26.s[0] -str q5, [x0, #224] -ldr q5, [x0, #256] -ldr q12, [x0, #384] -mul v15.4S, v15.4S,v27.s[0] -sub v16.4s, v5.4s, v9.4s -ldr q21, [x0, #272] -mul v19.4S, v19.4S,v27.s[0] -add v5.4s, v5.4s, v9.4s -ldr q9, [x0, #400] -mla v15.4S, v13.4S, v31.s[0] -sub v13.4s, v21.4s, v11.4s -ldr q20, [x0, #320] -mla v19.4S, v28.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -ldr q11, [x0, #448] -mul v25.4S, v25.4S,v1.s[0] -sub v28.4s, v20.4s, v14.4s -ldr q24, [x0, #336] -mul v7.4S, v7.4S,v1.s[0] -add v20.4s, v20.4s, v14.4s -ldr q14, [x0, #464] -mla v25.4S, v8.4S, v31.s[0] -mla v7.4S, v10.4S, v31.s[0] -sub v10.4s, v24.4s, v22.4s -sqrdmulh v8.4S, v21.4S, v4.s[1] -add v24.4s, v24.4s, v22.4s -mul v21.4S, v21.4S,v3.s[1] -sqrdmulh v22.4S, v13.4S, v4.s[2] -sub v6.4s, v12.4s, v15.4s -mul v13.4S, v13.4S,v3.s[2] -add v12.4s, v12.4s, v15.4s -sqrdmulh v4.4S, v24.4S, v18.s[1] -sub v3.4s, v9.4s, v19.4s -mul v24.4S, v24.4S,v17.s[1] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v10.4S, v18.s[2] -sub v15.4s, v11.4s, v25.4s -mul v10.4S, v10.4S,v17.s[2] -add v11.4s, v11.4s, v25.4s -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v14.4s, v7.4s -ldr q18, [x0, #736] -sqrdmulh v17.4S, v9.4S, v2.s[1] -add v14.4s, v14.4s, v7.4s -mla v13.4S, v22.4S, v31.s[0] -ldr q22, [x0, #672] -sqrdmulh v7.4S, v3.4S, v2.s[2] -sub v25.4s, v5.4s, v21.4s -mla v24.4S, v4.4S, v31.s[0] -ldr q4, [x0, #544] -sqrdmulh v30.4S, v14.4S, v26.s[1] -add v5.4s, v5.4s, v21.4s -str q25, [x0, #272] -mla v10.4S, v19.4S, v31.s[0] -ldr q19, [x17, #+384] -ldr q25, [x17, #+400] -sqrdmulh v21.4S, v8.4S, v26.s[2] -sub v23.4s, v16.4s, v13.4s -str q5, [x0, #256] -mul v9.4S, v9.4S,v27.s[1] -add v16.4s, v16.4s, v13.4s -mul v3.4S, v3.4S,v27.s[2] -str q23, [x0, #304] -mla v9.4S, v17.4S, v31.s[0] -sub v17.4s, v20.4s, v24.4s -mla v3.4S, v7.4S, v31.s[0] -str q16, [x0, #288] -mul v14.4S, v14.4S,v1.s[1] -str q17, [x0, #336] -mul v8.4S, v8.4S,v1.s[2] -add v20.4s, v20.4s, v24.4s -str q20, [x0, #320] -mla v14.4S, v30.4S, v31.s[0] -sub v30.4s, v28.4s, v10.4s -str q30, [x0, #368] -mla v8.4S, v21.4S, v31.s[0] -add v28.4s, v28.4s, v10.4s -str q28, [x0, #352] -sqrdmulh v26.4S, v4.4S, v25.s[0] -sub v1.4s, v12.4s, v9.4s -mul v4.4S, v4.4S,v19.s[0] -str q1, [x0, #400] -ldr q1, [x0, #560] -sqrdmulh v28.4S, v1.4S, v25.s[0] -add v12.4s, v12.4s, v9.4s -mul v1.4S, v1.4S,v19.s[0] -str q12, [x0, #384] -ldr q12, [x17, #+416] -ldr q9, [x17, #+432] -ldr q10, [x0, #608] -sqrdmulh v21.4S, v10.4S, v9.s[0] -sub v30.4s, v6.4s, v3.4s -mul v10.4S, v10.4S,v12.s[0] -str q30, [x0, #432] -ldr q30, [x0, #624] -sqrdmulh v20.4S, v30.4S, v9.s[0] -add v6.4s, v6.4s, v3.4s -mul v30.4S, v30.4S,v12.s[0] -str q6, [x0, #416] -ldr q6, [x17, #+448] -ldr q3, [x17, #+464] -mla v4.4S, v26.4S, v31.s[0] -sub v26.4s, v11.4s, v14.4s -sqrdmulh v24.4S, v22.4S, v3.s[0] -str q26, [x0, #464] -ldr q26, [x0, #688] -mla v1.4S, v28.4S, v31.s[0] -add v11.4s, v11.4s, v14.4s -sqrdmulh v14.4S, v26.4S, v3.s[0] -str q11, [x0, #448] -ldr q11, [x17, #+480] -ldr q28, [x17, #+496] -mla v10.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v8.4s -sqrdmulh v17.4S, v18.4S, v28.s[0] -str q21, [x0, #496] -ldr q21, [x0, #752] -mla v30.4S, v20.4S, v31.s[0] -add v15.4s, v15.4s, v8.4s -sqrdmulh v8.4S, v21.4S, v28.s[0] -str q15, [x0, #480] -ldr q15, [x0, #512] -ldr q20, [x0, #640] -mul v22.4S, v22.4S,v6.s[0] -sub v2.4s, v15.4s, v4.4s -ldr q27, [x0, #528] -mul v26.4S, v26.4S,v6.s[0] -add v15.4s, v15.4s, v4.4s -ldr q4, [x0, #656] -mla v22.4S, v24.4S, v31.s[0] -sub v24.4s, v27.4s, v1.4s -ldr q16, [x0, #576] -mla v26.4S, v14.4S, v31.s[0] -add v27.4s, v27.4s, v1.4s -ldr q1, [x0, #704] -mul v18.4S, v18.4S,v11.s[0] -sub v14.4s, v16.4s, v10.4s -ldr q7, [x0, #592] -mul v21.4S, v21.4S,v11.s[0] -add v16.4s, v16.4s, v10.4s -ldr q10, [x0, #720] -mla v18.4S, v17.4S, v31.s[0] -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v7.4s, v30.4s -sqrdmulh v17.4S, v27.4S, v25.s[1] -add v7.4s, v7.4s, v30.4s -mul v27.4S, v27.4S,v19.s[1] -sqrdmulh v30.4S, v24.4S, v25.s[2] -sub v23.4s, v20.4s, v22.4s -mul v24.4S, v24.4S,v19.s[2] -add v20.4s, v20.4s, v22.4s -sqrdmulh v25.4S, v7.4S, v9.s[1] -sub v19.4s, v4.4s, v26.4s -mul v7.4S, v7.4S,v12.s[1] -add v4.4s, v4.4s, v26.4s -sqrdmulh v26.4S, v8.4S, v9.s[2] -sub v22.4s, v1.4s, v18.4s -mul v8.4S, v8.4S,v12.s[2] -add v1.4s, v1.4s, v18.4s -mla v27.4S, v17.4S, v31.s[0] -sub v17.4s, v10.4s, v21.4s -ldr q9, [x0, #992] -sqrdmulh v12.4S, v4.4S, v3.s[1] -add v10.4s, v10.4s, v21.4s -mla v24.4S, v30.4S, v31.s[0] -ldr q30, [x0, #928] -sqrdmulh v21.4S, v19.4S, v3.s[2] -sub v18.4s, v15.4s, v27.4s -mla v7.4S, v25.4S, v31.s[0] -ldr q25, [x0, #800] -sqrdmulh v13.4S, v10.4S, v28.s[1] -add v15.4s, v15.4s, v27.4s -str q18, [x0, #528] -mla v8.4S, v26.4S, v31.s[0] -ldr q26, [x17, #+512] -ldr q18, [x17, #+528] -sqrdmulh v27.4S, v17.4S, v28.s[2] -sub v5.4s, v2.4s, v24.4s -str q15, [x0, #512] -mul v4.4S, v4.4S,v6.s[1] -add v2.4s, v2.4s, v24.4s -mul v19.4S, v19.4S,v6.s[2] -str q5, [x0, #560] -mla v4.4S, v12.4S, v31.s[0] -sub v12.4s, v16.4s, v7.4s -mla v19.4S, v21.4S, v31.s[0] -str q2, [x0, #544] -mul v10.4S, v10.4S,v11.s[1] -str q12, [x0, #592] -mul v17.4S, v17.4S,v11.s[2] -add v16.4s, v16.4s, v7.4s -str q16, [x0, #576] -mla v10.4S, v13.4S, v31.s[0] -sub v13.4s, v14.4s, v8.4s -str q13, [x0, #624] -mla v17.4S, v27.4S, v31.s[0] -add v14.4s, v14.4s, v8.4s -str q14, [x0, #608] -sqrdmulh v28.4S, v25.4S, v18.s[0] -sub v11.4s, v20.4s, v4.4s -mul v25.4S, v25.4S,v26.s[0] -str q11, [x0, #656] -ldr q11, [x0, #816] -sqrdmulh v14.4S, v11.4S, v18.s[0] -add v20.4s, v20.4s, v4.4s -mul v11.4S, v11.4S,v26.s[0] -str q20, [x0, #640] -ldr q20, [x17, #+544] -ldr q4, [x17, #+560] -ldr q8, [x0, #864] -sqrdmulh v27.4S, v8.4S, v4.s[0] -sub v13.4s, v23.4s, v19.4s -mul v8.4S, v8.4S,v20.s[0] -str q13, [x0, #688] -ldr q13, [x0, #880] -sqrdmulh v16.4S, v13.4S, v4.s[0] -add v23.4s, v23.4s, v19.4s -mul v13.4S, v13.4S,v20.s[0] -str q23, [x0, #672] -ldr q23, [x17, #+576] -ldr q19, [x17, #+592] -mla v25.4S, v28.4S, v31.s[0] -sub v28.4s, v1.4s, v10.4s -sqrdmulh v7.4S, v30.4S, v19.s[0] -str q28, [x0, #720] -ldr q28, [x0, #944] -mla v11.4S, v14.4S, v31.s[0] -add v1.4s, v1.4s, v10.4s -sqrdmulh v10.4S, v28.4S, v19.s[0] -str q1, [x0, #704] -ldr q1, [x17, #+608] -ldr q14, [x17, #+624] -mla v8.4S, v27.4S, v31.s[0] -sub v27.4s, v22.4s, v17.4s -sqrdmulh v12.4S, v9.4S, v14.s[0] -str q27, [x0, #752] -ldr q27, [x0, #1008] -mla v13.4S, v16.4S, v31.s[0] -add v22.4s, v22.4s, v17.4s -sqrdmulh v17.4S, v27.4S, v14.s[0] -str q22, [x0, #736] -ldr q22, [x0, #768] -ldr q16, [x0, #896] -mul v30.4S, v30.4S,v23.s[0] -sub v3.4s, v22.4s, v25.4s -ldr q6, [x0, #784] -mul v28.4S, v28.4S,v23.s[0] -add v22.4s, v22.4s, v25.4s -ldr q25, [x0, #912] -mla v30.4S, v7.4S, v31.s[0] -sub v7.4s, v6.4s, v11.4s -ldr q2, [x0, #832] -mla v28.4S, v10.4S, v31.s[0] -add v6.4s, v6.4s, v11.4s -ldr q11, [x0, #960] -mul v9.4S, v9.4S,v1.s[0] -sub v10.4s, v2.4s, v8.4s -ldr q21, [x0, #848] -mul v27.4S, v27.4S,v1.s[0] -add v2.4s, v2.4s, v8.4s -ldr q8, [x0, #976] -mla v9.4S, v12.4S, v31.s[0] -mla v27.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v13.4s -sqrdmulh v12.4S, v6.4S, v18.s[1] -add v21.4s, v21.4s, v13.4s -mul v6.4S, v6.4S,v26.s[1] -sqrdmulh v13.4S, v7.4S, v18.s[2] -sub v5.4s, v16.4s, v30.4s -mul v7.4S, v7.4S,v26.s[2] -add v16.4s, v16.4s, v30.4s -sqrdmulh v18.4S, v21.4S, v4.s[1] -sub v26.4s, v25.4s, v28.4s -mul v21.4S, v21.4S,v20.s[1] -add v25.4s, v25.4s, v28.4s -sqrdmulh v28.4S, v17.4S, v4.s[2] -sub v30.4s, v11.4s, v9.4s -mul v17.4S, v17.4S,v20.s[2] -add v11.4s, v11.4s, v9.4s -mla v6.4S, v12.4S, v31.s[0] -sub v12.4s, v8.4s, v27.4s -sqrdmulh v4.4S, v25.4S, v19.s[1] -add v8.4s, v8.4s, v27.4s -mla v7.4S, v13.4S, v31.s[0] -sqrdmulh v13.4S, v26.4S, v19.s[2] -sub v27.4s, v22.4s, v6.4s -mla v21.4S, v18.4S, v31.s[0] -sqrdmulh v18.4S, v8.4S, v14.s[1] -add v22.4s, v22.4s, v6.4s -str q27, [x0, #784] -mla v17.4S, v28.4S, v31.s[0] -sqrdmulh v28.4S, v12.4S, v14.s[2] -sub v27.4s, v3.4s, v7.4s -str q22, [x0, #768] -mul v25.4S, v25.4S,v23.s[1] -add v3.4s, v3.4s, v7.4s -mul v26.4S, v26.4S,v23.s[2] -str q27, [x0, #816] -mla v25.4S, v4.4S, v31.s[0] -sub v4.4s, v2.4s, v21.4s -mla v26.4S, v13.4S, v31.s[0] -str q3, [x0, #800] -mul v8.4S, v8.4S,v1.s[1] -str q4, [x0, #848] -mul v12.4S, v12.4S,v1.s[2] -add v2.4s, v2.4s, v21.4s -str q2, [x0, #832] -mla v8.4S, v18.4S, v31.s[0] -sub v18.4s, v10.4s, v17.4s -str q18, [x0, #880] -mla v12.4S, v28.4S, v31.s[0] -add v10.4s, v10.4s, v17.4s -str q10, [x0, #864] -sub v14.4s, v16.4s, v25.4s -str q14, [x0, #912] -add v16.4s, v16.4s, v25.4s -str q16, [x0, #896] -sub v16.4s, v5.4s, v26.4s -str q16, [x0, #944] -add v5.4s, v5.4s, v26.4s -str q5, [x0, #928] -sub v5.4s, v11.4s, v8.4s -str q5, [x0, #976] -add v11.4s, v11.4s, v8.4s -str q11, [x0, #960] -sub v11.4s, v30.4s, v12.4s -str q11, [x0, #1008] -add v30.4s, v30.4s, v12.4s -str q30, [x0, #992] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1464 -// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_14_z4_7.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_14_z4_7.s deleted file mode 100644 index 2cf09d4..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_14_z4_7.s +++ /dev/null @@ -1,1578 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_14_z4_7 -.global _ntt_u32_incomplete_neon_asm_var_4_2_14_z4_7 -ntt_u32_incomplete_neon_asm_var_4_2_14_z4_7: -_ntt_u32_incomplete_neon_asm_var_4_2_14_z4_7: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x0, #928] -ldr q29, [x17, #+0] -ldr q28, [x17, #+16] -sqrdmulh v27.4S, v30.4S, v28.s[0] -mul v30.4S, v30.4S,v29.s[0] -ldr q26, [x0, #992] -sqrdmulh v25.4S, v26.4S, v28.s[0] -mul v26.4S, v26.4S,v29.s[0] -ldr q24, [x0, #800] -sqrdmulh v23.4S, v24.4S, v28.s[0] -mul v24.4S, v24.4S,v29.s[0] -ldr q22, [x0, #864] -sqrdmulh v21.4S, v22.4S, v28.s[0] -mul v22.4S, v22.4S,v29.s[0] -ldr q20, [x0, #544] -mla v30.4S, v27.4S, v31.s[0] -sqrdmulh v27.4S, v20.4S, v28.s[0] -ldr q19, [x0, #608] -mla v26.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v19.4S, v28.s[0] -nop -ldr q18, [x0, #672] -mla v24.4S, v23.4S, v31.s[0] -sqrdmulh v23.4S, v18.4S, v28.s[0] -nop -ldr q17, [x0, #736] -mla v22.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v17.4S, v28.s[0] -nop -ldr q16, [x0, #416] -ldr q3, [x0, #480] -mul v20.4S, v20.4S,v29.s[0] -sub v2.4s, v16.4s, v30.4s -mul v19.4S, v19.4S,v29.s[0] -add v16.4s, v16.4s, v30.4s -ldr q30, [x0, #288] -ldr q1, [x0, #352] -mla v20.4S, v27.4S, v31.s[0] -sub v27.4s, v3.4s, v26.4s -mla v19.4S, v25.4S, v31.s[0] -add v3.4s, v3.4s, v26.4s -ldr q26, [x0, #32] -ldr q25, [x0, #96] -mul v18.4S, v18.4S,v29.s[0] -sub v0.4s, v30.4s, v24.4s -mul v17.4S, v17.4S,v29.s[0] -add v30.4s, v30.4s, v24.4s -ldr q24, [x0, #160] -ldr q15, [x0, #224] -mla v18.4S, v23.4S, v31.s[0] -sub v23.4s, v1.4s, v22.4s -mla v17.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v16.4S, v28.s[1] -nop -mul v16.4S, v16.4S,v29.s[1] -nop -sqrdmulh v21.4S, v3.4S, v28.s[1] -sub v14.4s, v26.4s, v20.4s -mul v3.4S, v3.4S,v29.s[1] -add v26.4s, v26.4s, v20.4s -sqrdmulh v20.4S, v30.4S, v28.s[1] -sub v13.4s, v25.4s, v19.4s -mul v30.4S, v30.4S,v29.s[1] -add v25.4s, v25.4s, v19.4s -sqrdmulh v19.4S, v1.4S, v28.s[1] -sub v12.4s, v24.4s, v18.4s -mul v1.4S, v1.4S,v29.s[1] -add v24.4s, v24.4s, v18.4s -mla v16.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v17.4s -sqrdmulh v18.4S, v2.4S, v28.s[2] -add v15.4s, v15.4s, v17.4s -mla v3.4S, v21.4S, v31.s[0] -nop -sqrdmulh v21.4S, v27.4S, v28.s[2] -nop -mla v30.4S, v20.4S, v31.s[0] -nop -sqrdmulh v20.4S, v0.4S, v28.s[2] -nop -mla v1.4S, v19.4S, v31.s[0] -nop -sqrdmulh v19.4S, v23.4S, v28.s[2] -nop -ldr q17, [x17, #+32] -ldr q11, [x17, #+48] -mul v2.4S, v2.4S,v29.s[2] -sub v10.4s, v24.4s, v16.4s -mul v27.4S, v27.4S,v29.s[2] -add v24.4s, v24.4s, v16.4s -mla v2.4S, v18.4S, v31.s[0] -sub v18.4s, v15.4s, v3.4s -mla v27.4S, v21.4S, v31.s[0] -add v15.4s, v15.4s, v3.4s -mul v0.4S, v0.4S,v29.s[2] -sub v3.4s, v26.4s, v30.4s -mul v23.4S, v23.4S,v29.s[2] -add v26.4s, v26.4s, v30.4s -mla v0.4S, v20.4S, v31.s[0] -sub v20.4s, v25.4s, v1.4s -mla v23.4S, v19.4S, v31.s[0] -add v25.4s, v25.4s, v1.4s -sqrdmulh v1.4S, v10.4S, v11.s[1] -nop -mul v10.4S, v10.4S,v17.s[1] -nop -sqrdmulh v19.4S, v18.4S, v11.s[1] -sub v30.4s, v12.4s, v2.4s -mul v18.4S, v18.4S,v17.s[1] -add v12.4s, v12.4s, v2.4s -sqrdmulh v2.4S, v24.4S, v11.s[0] -sub v21.4s, v22.4s, v27.4s -mul v24.4S, v24.4S,v17.s[0] -add v22.4s, v22.4s, v27.4s -sqrdmulh v27.4S, v15.4S, v11.s[0] -sub v16.4s, v14.4s, v0.4s -mul v15.4S, v15.4S,v17.s[0] -add v14.4s, v14.4s, v0.4s -ldr q0, [x17, #+64] -ldr q9, [x17, #+80] -mla v10.4S, v1.4S, v31.s[0] -sub v1.4s, v13.4s, v23.4s -sqrdmulh v8.4S, v12.4S, v11.s[2] -add v13.4s, v13.4s, v23.4s -mla v18.4S, v19.4S, v31.s[0] -nop -sqrdmulh v19.4S, v22.4S, v11.s[2] -nop -mla v24.4S, v2.4S, v31.s[0] -nop -sqrdmulh v2.4S, v30.4S, v11.s[3] -nop -mla v15.4S, v27.4S, v31.s[0] -nop -sqrdmulh v27.4S, v21.4S, v11.s[3] -nop -ldr q23, [x17, #+96] -ldr q7, [x17, #+112] -mul v12.4S, v12.4S,v17.s[2] -sub v6.4s, v3.4s, v10.4s -mul v22.4S, v22.4S,v17.s[2] -add v3.4s, v3.4s, v10.4s -mla v12.4S, v8.4S, v31.s[0] -sub v8.4s, v20.4s, v18.4s -mla v22.4S, v19.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -mul v30.4S, v30.4S,v17.s[3] -sub v18.4s, v26.4s, v24.4s -mul v21.4S, v21.4S,v17.s[3] -add v26.4s, v26.4s, v24.4s -mla v30.4S, v2.4S, v31.s[0] -sub v2.4s, v25.4s, v15.4s -mla v21.4S, v27.4S, v31.s[0] -add v25.4s, v25.4s, v15.4s -sqrdmulh v15.4S, v8.4S, v9.s[3] -nop -mul v8.4S, v8.4S,v0.s[3] -nop -sqrdmulh v27.4S, v20.4S, v9.s[2] -sub v24.4s, v14.4s, v12.4s -mul v20.4S, v20.4S,v0.s[2] -add v14.4s, v14.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v9.s[1] -sub v19.4s, v13.4s, v22.4s -mul v2.4S, v2.4S,v0.s[1] -add v13.4s, v13.4s, v22.4s -sqrdmulh v22.4S, v25.4S, v9.s[0] -sub v10.4s, v16.4s, v30.4s -mul v25.4S, v25.4S,v0.s[0] -add v16.4s, v16.4s, v30.4s -mla v8.4S, v15.4S, v31.s[0] -sub v15.4s, v1.4s, v21.4s -sqrdmulh v30.4S, v13.4S, v7.s[0] -add v1.4s, v1.4s, v21.4s -mla v20.4S, v27.4S, v31.s[0] -sub v27.4s, v6.4s, v8.4s -sqrdmulh v21.4S, v19.4S, v7.s[1] -add v6.4s, v6.4s, v8.4s -mla v2.4S, v12.4S, v31.s[0] -sub v12.4s, v3.4s, v20.4s -sqrdmulh v8.4S, v1.4S, v7.s[2] -add v3.4s, v3.4s, v20.4s -mla v25.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v2.4s -sqrdmulh v20.4S, v15.4S, v7.s[3] -add v18.4s, v18.4s, v2.4s -mul v13.4S, v13.4S,v23.s[0] -sub v2.4s, v26.4s, v25.4s -mul v19.4S, v19.4S,v23.s[1] -add v26.4s, v26.4s, v25.4s -mla v13.4S, v30.4S, v31.s[0] -str q12, [x0, #352] -mla v19.4S, v21.4S, v31.s[0] -str q3, [x0, #288] -mul v1.4S, v1.4S,v23.s[2] -str q27, [x0, #480] -mul v15.4S, v15.4S,v23.s[3] -str q6, [x0, #416] -mla v1.4S, v8.4S, v31.s[0] -str q22, [x0, #224] -mla v15.4S, v20.4S, v31.s[0] -str q18, [x0, #160] -ldr q18, [x0, #944] -sqrdmulh v20.4S, v18.4S, v28.s[0] -str q2, [x0, #96] -mul v18.4S, v18.4S,v29.s[0] -str q26, [x0, #32] -ldr q26, [x0, #1008] -sqrdmulh v2.4S, v26.4S, v28.s[0] -sub v22.4s, v14.4s, v13.4s -str q22, [x0, #608] -mul v26.4S, v26.4S,v29.s[0] -add v14.4s, v14.4s, v13.4s -ldr q13, [x0, #816] -sqrdmulh v22.4S, v13.4S, v28.s[0] -sub v8.4s, v24.4s, v19.4s -str q14, [x0, #544] -mul v13.4S, v13.4S,v29.s[0] -add v24.4s, v24.4s, v19.4s -ldr q19, [x0, #880] -sqrdmulh v14.4S, v19.4S, v28.s[0] -sub v6.4s, v16.4s, v1.4s -str q8, [x0, #736] -mul v19.4S, v19.4S,v29.s[0] -add v16.4s, v16.4s, v1.4s -ldr q1, [x0, #560] -mla v18.4S, v20.4S, v31.s[0] -sub v20.4s, v10.4s, v15.4s -str q24, [x0, #672] -sqrdmulh v24.4S, v1.4S, v28.s[0] -add v10.4s, v10.4s, v15.4s -ldr q15, [x0, #624] -mla v26.4S, v2.4S, v31.s[0] -str q6, [x0, #864] -sqrdmulh v6.4S, v15.4S, v28.s[0] -nop -ldr q2, [x0, #688] -mla v13.4S, v22.4S, v31.s[0] -str q16, [x0, #800] -sqrdmulh v16.4S, v2.4S, v28.s[0] -nop -ldr q22, [x0, #752] -mla v19.4S, v14.4S, v31.s[0] -str q20, [x0, #992] -sqrdmulh v20.4S, v22.4S, v28.s[0] -nop -ldr q14, [x0, #432] -ldr q8, [x0, #496] -mul v1.4S, v1.4S,v29.s[0] -sub v27.4s, v14.4s, v18.4s -str q10, [x0, #928] -mul v15.4S, v15.4S,v29.s[0] -add v14.4s, v14.4s, v18.4s -ldr q18, [x0, #304] -ldr q10, [x0, #368] -mla v1.4S, v24.4S, v31.s[0] -sub v24.4s, v8.4s, v26.4s -mla v15.4S, v6.4S, v31.s[0] -add v8.4s, v8.4s, v26.4s -ldr q26, [x0, #48] -ldr q6, [x0, #112] -mul v2.4S, v2.4S,v29.s[0] -sub v3.4s, v18.4s, v13.4s -mul v22.4S, v22.4S,v29.s[0] -add v18.4s, v18.4s, v13.4s -ldr q13, [x0, #176] -ldr q21, [x0, #240] -mla v2.4S, v16.4S, v31.s[0] -sub v16.4s, v10.4s, v19.4s -mla v22.4S, v20.4S, v31.s[0] -add v10.4s, v10.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v28.s[1] -nop -mul v14.4S, v14.4S,v29.s[1] -nop -sqrdmulh v20.4S, v8.4S, v28.s[1] -sub v12.4s, v26.4s, v1.4s -mul v8.4S, v8.4S,v29.s[1] -add v26.4s, v26.4s, v1.4s -sqrdmulh v1.4S, v18.4S, v28.s[1] -sub v30.4s, v6.4s, v15.4s -mul v18.4S, v18.4S,v29.s[1] -add v6.4s, v6.4s, v15.4s -sqrdmulh v15.4S, v10.4S, v28.s[1] -sub v25.4s, v13.4s, v2.4s -mul v10.4S, v10.4S,v29.s[1] -add v13.4s, v13.4s, v2.4s -mla v14.4S, v19.4S, v31.s[0] -sub v19.4s, v21.4s, v22.4s -sqrdmulh v2.4S, v27.4S, v28.s[2] -add v21.4s, v21.4s, v22.4s -mla v8.4S, v20.4S, v31.s[0] -nop -sqrdmulh v20.4S, v24.4S, v28.s[2] -nop -mla v18.4S, v1.4S, v31.s[0] -nop -sqrdmulh v1.4S, v3.4S, v28.s[2] -nop -mla v10.4S, v15.4S, v31.s[0] -nop -sqrdmulh v15.4S, v16.4S, v28.s[2] -nop -mul v27.4S, v27.4S,v29.s[2] -sub v22.4s, v13.4s, v14.4s -mul v24.4S, v24.4S,v29.s[2] -add v13.4s, v13.4s, v14.4s -mla v27.4S, v2.4S, v31.s[0] -sub v2.4s, v21.4s, v8.4s -mla v24.4S, v20.4S, v31.s[0] -add v21.4s, v21.4s, v8.4s -mul v3.4S, v3.4S,v29.s[2] -sub v8.4s, v26.4s, v18.4s -mul v16.4S, v16.4S,v29.s[2] -add v26.4s, v26.4s, v18.4s -mla v3.4S, v1.4S, v31.s[0] -sub v1.4s, v6.4s, v10.4s -mla v16.4S, v15.4S, v31.s[0] -add v6.4s, v6.4s, v10.4s -sqrdmulh v10.4S, v22.4S, v11.s[1] -nop -mul v22.4S, v22.4S,v17.s[1] -nop -sqrdmulh v15.4S, v2.4S, v11.s[1] -sub v18.4s, v25.4s, v27.4s -mul v2.4S, v2.4S,v17.s[1] -add v25.4s, v25.4s, v27.4s -sqrdmulh v27.4S, v13.4S, v11.s[0] -sub v20.4s, v19.4s, v24.4s -mul v13.4S, v13.4S,v17.s[0] -add v19.4s, v19.4s, v24.4s -sqrdmulh v24.4S, v21.4S, v11.s[0] -sub v14.4s, v12.4s, v3.4s -mul v21.4S, v21.4S,v17.s[0] -add v12.4s, v12.4s, v3.4s -mla v22.4S, v10.4S, v31.s[0] -sub v10.4s, v30.4s, v16.4s -sqrdmulh v3.4S, v25.4S, v11.s[2] -add v30.4s, v30.4s, v16.4s -mla v2.4S, v15.4S, v31.s[0] -nop -sqrdmulh v15.4S, v19.4S, v11.s[2] -nop -mla v13.4S, v27.4S, v31.s[0] -nop -sqrdmulh v27.4S, v18.4S, v11.s[3] -nop -mla v21.4S, v24.4S, v31.s[0] -nop -sqrdmulh v24.4S, v20.4S, v11.s[3] -nop -mul v25.4S, v25.4S,v17.s[2] -sub v16.4s, v8.4s, v22.4s -mul v19.4S, v19.4S,v17.s[2] -add v8.4s, v8.4s, v22.4s -mla v25.4S, v3.4S, v31.s[0] -sub v3.4s, v1.4s, v2.4s -mla v19.4S, v15.4S, v31.s[0] -add v1.4s, v1.4s, v2.4s -mul v18.4S, v18.4S,v17.s[3] -sub v2.4s, v26.4s, v13.4s -mul v20.4S, v20.4S,v17.s[3] -add v26.4s, v26.4s, v13.4s -mla v18.4S, v27.4S, v31.s[0] -sub v27.4s, v6.4s, v21.4s -mla v20.4S, v24.4S, v31.s[0] -add v6.4s, v6.4s, v21.4s -sqrdmulh v21.4S, v3.4S, v9.s[3] -nop -mul v3.4S, v3.4S,v0.s[3] -nop -sqrdmulh v24.4S, v1.4S, v9.s[2] -sub v13.4s, v12.4s, v25.4s -mul v1.4S, v1.4S,v0.s[2] -add v12.4s, v12.4s, v25.4s -sqrdmulh v25.4S, v27.4S, v9.s[1] -sub v15.4s, v30.4s, v19.4s -mul v27.4S, v27.4S,v0.s[1] -add v30.4s, v30.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v9.s[0] -sub v22.4s, v14.4s, v18.4s -mul v6.4S, v6.4S,v0.s[0] -add v14.4s, v14.4s, v18.4s -mla v3.4S, v21.4S, v31.s[0] -sub v21.4s, v10.4s, v20.4s -sqrdmulh v18.4S, v30.4S, v7.s[0] -add v10.4s, v10.4s, v20.4s -mla v1.4S, v24.4S, v31.s[0] -sub v24.4s, v16.4s, v3.4s -sqrdmulh v20.4S, v15.4S, v7.s[1] -add v16.4s, v16.4s, v3.4s -mla v27.4S, v25.4S, v31.s[0] -sub v25.4s, v8.4s, v1.4s -sqrdmulh v3.4S, v10.4S, v7.s[2] -add v8.4s, v8.4s, v1.4s -mla v6.4S, v19.4S, v31.s[0] -sub v19.4s, v2.4s, v27.4s -sqrdmulh v1.4S, v21.4S, v7.s[3] -add v2.4s, v2.4s, v27.4s -mul v30.4S, v30.4S,v23.s[0] -sub v27.4s, v26.4s, v6.4s -mul v15.4S, v15.4S,v23.s[1] -add v26.4s, v26.4s, v6.4s -mla v30.4S, v18.4S, v31.s[0] -str q25, [x0, #368] -mla v15.4S, v20.4S, v31.s[0] -str q8, [x0, #304] -mul v10.4S, v10.4S,v23.s[2] -str q24, [x0, #496] -mul v21.4S, v21.4S,v23.s[3] -str q16, [x0, #432] -mla v10.4S, v3.4S, v31.s[0] -str q19, [x0, #240] -mla v21.4S, v1.4S, v31.s[0] -str q2, [x0, #176] -ldr q2, [x0, #896] -sqrdmulh v1.4S, v2.4S, v28.s[0] -str q27, [x0, #112] -mul v2.4S, v2.4S,v29.s[0] -str q26, [x0, #48] -ldr q26, [x0, #960] -sqrdmulh v27.4S, v26.4S, v28.s[0] -sub v19.4s, v12.4s, v30.4s -str q19, [x0, #624] -mul v26.4S, v26.4S,v29.s[0] -add v12.4s, v12.4s, v30.4s -ldr q30, [x0, #768] -sqrdmulh v19.4S, v30.4S, v28.s[0] -sub v3.4s, v13.4s, v15.4s -str q12, [x0, #560] -mul v30.4S, v30.4S,v29.s[0] -add v13.4s, v13.4s, v15.4s -ldr q15, [x0, #832] -sqrdmulh v12.4S, v15.4S, v28.s[0] -sub v16.4s, v14.4s, v10.4s -str q3, [x0, #752] -mul v15.4S, v15.4S,v29.s[0] -add v14.4s, v14.4s, v10.4s -ldr q10, [x0, #512] -mla v2.4S, v1.4S, v31.s[0] -sub v1.4s, v22.4s, v21.4s -str q13, [x0, #688] -sqrdmulh v13.4S, v10.4S, v28.s[0] -add v22.4s, v22.4s, v21.4s -ldr q21, [x0, #576] -mla v26.4S, v27.4S, v31.s[0] -str q16, [x0, #880] -sqrdmulh v16.4S, v21.4S, v28.s[0] -nop -ldr q27, [x0, #640] -mla v30.4S, v19.4S, v31.s[0] -str q14, [x0, #816] -sqrdmulh v14.4S, v27.4S, v28.s[0] -nop -ldr q19, [x0, #704] -mla v15.4S, v12.4S, v31.s[0] -str q1, [x0, #1008] -sqrdmulh v1.4S, v19.4S, v28.s[0] -nop -ldr q12, [x0, #384] -ldr q3, [x0, #448] -mul v10.4S, v10.4S,v29.s[0] -sub v24.4s, v12.4s, v2.4s -str q22, [x0, #944] -mul v21.4S, v21.4S,v29.s[0] -add v12.4s, v12.4s, v2.4s -ldr q2, [x0, #256] -ldr q22, [x0, #320] -mla v10.4S, v13.4S, v31.s[0] -sub v13.4s, v3.4s, v26.4s -mla v21.4S, v16.4S, v31.s[0] -add v3.4s, v3.4s, v26.4s -ldr q26, [x0, #0] -ldr q16, [x0, #64] -mul v27.4S, v27.4S,v29.s[0] -sub v8.4s, v2.4s, v30.4s -mul v19.4S, v19.4S,v29.s[0] -add v2.4s, v2.4s, v30.4s -ldr q30, [x0, #128] -ldr q20, [x0, #192] -mla v27.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v15.4s -mla v19.4S, v1.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -sqrdmulh v15.4S, v12.4S, v28.s[1] -nop -mul v12.4S, v12.4S,v29.s[1] -nop -sqrdmulh v1.4S, v3.4S, v28.s[1] -sub v25.4s, v26.4s, v10.4s -mul v3.4S, v3.4S,v29.s[1] -add v26.4s, v26.4s, v10.4s -sqrdmulh v10.4S, v2.4S, v28.s[1] -sub v18.4s, v16.4s, v21.4s -mul v2.4S, v2.4S,v29.s[1] -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v28.s[1] -sub v6.4s, v30.4s, v27.4s -mul v22.4S, v22.4S,v29.s[1] -add v30.4s, v30.4s, v27.4s -mla v12.4S, v15.4S, v31.s[0] -sub v15.4s, v20.4s, v19.4s -sqrdmulh v27.4S, v24.4S, v28.s[2] -add v20.4s, v20.4s, v19.4s -mla v3.4S, v1.4S, v31.s[0] -nop -sqrdmulh v1.4S, v13.4S, v28.s[2] -nop -mla v2.4S, v10.4S, v31.s[0] -nop -sqrdmulh v10.4S, v8.4S, v28.s[2] -nop -mla v22.4S, v21.4S, v31.s[0] -nop -sqrdmulh v21.4S, v14.4S, v28.s[2] -nop -mul v24.4S, v24.4S,v29.s[2] -sub v19.4s, v30.4s, v12.4s -mul v13.4S, v13.4S,v29.s[2] -add v30.4s, v30.4s, v12.4s -mla v24.4S, v27.4S, v31.s[0] -sub v27.4s, v20.4s, v3.4s -mla v13.4S, v1.4S, v31.s[0] -add v20.4s, v20.4s, v3.4s -mul v8.4S, v8.4S,v29.s[2] -sub v3.4s, v26.4s, v2.4s -mul v14.4S, v14.4S,v29.s[2] -add v26.4s, v26.4s, v2.4s -mla v8.4S, v10.4S, v31.s[0] -sub v10.4s, v16.4s, v22.4s -mla v14.4S, v21.4S, v31.s[0] -add v16.4s, v16.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v11.s[1] -nop -mul v19.4S, v19.4S,v17.s[1] -nop -sqrdmulh v21.4S, v27.4S, v11.s[1] -sub v2.4s, v6.4s, v24.4s -mul v27.4S, v27.4S,v17.s[1] -add v6.4s, v6.4s, v24.4s -sqrdmulh v24.4S, v30.4S, v11.s[0] -sub v1.4s, v15.4s, v13.4s -mul v30.4S, v30.4S,v17.s[0] -add v15.4s, v15.4s, v13.4s -sqrdmulh v13.4S, v20.4S, v11.s[0] -sub v12.4s, v25.4s, v8.4s -mul v20.4S, v20.4S,v17.s[0] -add v25.4s, v25.4s, v8.4s -mla v19.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v14.4s -sqrdmulh v8.4S, v6.4S, v11.s[2] -add v18.4s, v18.4s, v14.4s -mla v27.4S, v21.4S, v31.s[0] -nop -sqrdmulh v21.4S, v15.4S, v11.s[2] -nop -mla v30.4S, v24.4S, v31.s[0] -nop -sqrdmulh v24.4S, v2.4S, v11.s[3] -nop -mla v20.4S, v13.4S, v31.s[0] -nop -sqrdmulh v13.4S, v1.4S, v11.s[3] -nop -mul v6.4S, v6.4S,v17.s[2] -sub v14.4s, v3.4s, v19.4s -mul v15.4S, v15.4S,v17.s[2] -add v3.4s, v3.4s, v19.4s -mla v6.4S, v8.4S, v31.s[0] -sub v8.4s, v10.4s, v27.4s -mla v15.4S, v21.4S, v31.s[0] -add v10.4s, v10.4s, v27.4s -mul v2.4S, v2.4S,v17.s[3] -sub v27.4s, v26.4s, v30.4s -mul v1.4S, v1.4S,v17.s[3] -add v26.4s, v26.4s, v30.4s -mla v2.4S, v24.4S, v31.s[0] -sub v24.4s, v16.4s, v20.4s -mla v1.4S, v13.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v9.s[3] -nop -mul v8.4S, v8.4S,v0.s[3] -nop -sqrdmulh v13.4S, v10.4S, v9.s[2] -sub v30.4s, v25.4s, v6.4s -mul v10.4S, v10.4S,v0.s[2] -add v25.4s, v25.4s, v6.4s -sqrdmulh v6.4S, v24.4S, v9.s[1] -sub v21.4s, v18.4s, v15.4s -mul v24.4S, v24.4S,v0.s[1] -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v9.s[0] -sub v19.4s, v12.4s, v2.4s -mul v16.4S, v16.4S,v0.s[0] -add v12.4s, v12.4s, v2.4s -mla v8.4S, v20.4S, v31.s[0] -sub v20.4s, v22.4s, v1.4s -sqrdmulh v2.4S, v18.4S, v7.s[0] -add v22.4s, v22.4s, v1.4s -mla v10.4S, v13.4S, v31.s[0] -sub v13.4s, v14.4s, v8.4s -sqrdmulh v1.4S, v21.4S, v7.s[1] -add v14.4s, v14.4s, v8.4s -mla v24.4S, v6.4S, v31.s[0] -sub v6.4s, v3.4s, v10.4s -sqrdmulh v8.4S, v22.4S, v7.s[2] -add v3.4s, v3.4s, v10.4s -mla v16.4S, v15.4S, v31.s[0] -sub v15.4s, v27.4s, v24.4s -sqrdmulh v10.4S, v20.4S, v7.s[3] -add v27.4s, v27.4s, v24.4s -mul v18.4S, v18.4S,v23.s[0] -sub v24.4s, v26.4s, v16.4s -mul v21.4S, v21.4S,v23.s[1] -add v26.4s, v26.4s, v16.4s -mla v18.4S, v2.4S, v31.s[0] -str q6, [x0, #320] -mla v21.4S, v1.4S, v31.s[0] -str q3, [x0, #256] -mul v22.4S, v22.4S,v23.s[2] -str q13, [x0, #448] -mul v20.4S, v20.4S,v23.s[3] -str q14, [x0, #384] -mla v22.4S, v8.4S, v31.s[0] -str q15, [x0, #192] -mla v20.4S, v10.4S, v31.s[0] -str q27, [x0, #128] -ldr q27, [x0, #912] -sqrdmulh v10.4S, v27.4S, v28.s[0] -str q24, [x0, #64] -mul v27.4S, v27.4S,v29.s[0] -str q26, [x0, #0] -ldr q26, [x0, #976] -sqrdmulh v24.4S, v26.4S, v28.s[0] -sub v15.4s, v25.4s, v18.4s -str q15, [x0, #576] -mul v26.4S, v26.4S,v29.s[0] -add v25.4s, v25.4s, v18.4s -ldr q18, [x0, #784] -sqrdmulh v15.4S, v18.4S, v28.s[0] -sub v8.4s, v30.4s, v21.4s -str q25, [x0, #512] -mul v18.4S, v18.4S,v29.s[0] -add v30.4s, v30.4s, v21.4s -ldr q21, [x0, #848] -sqrdmulh v25.4S, v21.4S, v28.s[0] -sub v14.4s, v12.4s, v22.4s -str q8, [x0, #704] -mul v21.4S, v21.4S,v29.s[0] -add v12.4s, v12.4s, v22.4s -ldr q22, [x0, #528] -mla v27.4S, v10.4S, v31.s[0] -sub v10.4s, v19.4s, v20.4s -str q30, [x0, #640] -sqrdmulh v30.4S, v22.4S, v28.s[0] -add v19.4s, v19.4s, v20.4s -ldr q20, [x0, #592] -mla v26.4S, v24.4S, v31.s[0] -str q14, [x0, #832] -sqrdmulh v14.4S, v20.4S, v28.s[0] -nop -ldr q24, [x0, #656] -mla v18.4S, v15.4S, v31.s[0] -str q12, [x0, #768] -sqrdmulh v12.4S, v24.4S, v28.s[0] -nop -ldr q15, [x0, #720] -mla v21.4S, v25.4S, v31.s[0] -str q10, [x0, #960] -sqrdmulh v10.4S, v15.4S, v28.s[0] -nop -ldr q25, [x0, #400] -ldr q8, [x0, #464] -mul v22.4S, v22.4S,v29.s[0] -sub v13.4s, v25.4s, v27.4s -str q19, [x0, #896] -mul v20.4S, v20.4S,v29.s[0] -add v25.4s, v25.4s, v27.4s -ldr q27, [x0, #272] -ldr q19, [x0, #336] -mla v22.4S, v30.4S, v31.s[0] -sub v30.4s, v8.4s, v26.4s -mla v20.4S, v14.4S, v31.s[0] -add v8.4s, v8.4s, v26.4s -ldr q26, [x0, #16] -ldr q14, [x0, #80] -mul v24.4S, v24.4S,v29.s[0] -sub v3.4s, v27.4s, v18.4s -mul v15.4S, v15.4S,v29.s[0] -add v27.4s, v27.4s, v18.4s -ldr q18, [x0, #144] -ldr q1, [x0, #208] -mla v24.4S, v12.4S, v31.s[0] -sub v12.4s, v19.4s, v21.4s -mla v15.4S, v10.4S, v31.s[0] -add v19.4s, v19.4s, v21.4s -sqrdmulh v21.4S, v25.4S, v28.s[1] -nop -mul v25.4S, v25.4S,v29.s[1] -nop -sqrdmulh v10.4S, v8.4S, v28.s[1] -sub v6.4s, v26.4s, v22.4s -mul v8.4S, v8.4S,v29.s[1] -add v26.4s, v26.4s, v22.4s -sqrdmulh v22.4S, v27.4S, v28.s[1] -sub v2.4s, v14.4s, v20.4s -mul v27.4S, v27.4S,v29.s[1] -add v14.4s, v14.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v28.s[1] -sub v16.4s, v18.4s, v24.4s -mul v19.4S, v19.4S,v29.s[1] -add v18.4s, v18.4s, v24.4s -mla v25.4S, v21.4S, v31.s[0] -sub v21.4s, v1.4s, v15.4s -sqrdmulh v24.4S, v13.4S, v28.s[2] -add v1.4s, v1.4s, v15.4s -mla v8.4S, v10.4S, v31.s[0] -nop -sqrdmulh v10.4S, v30.4S, v28.s[2] -nop -mla v27.4S, v22.4S, v31.s[0] -nop -sqrdmulh v22.4S, v3.4S, v28.s[2] -nop -mla v19.4S, v20.4S, v31.s[0] -nop -sqrdmulh v20.4S, v12.4S, v28.s[2] -nop -mul v13.4S, v13.4S,v29.s[2] -sub v15.4s, v18.4s, v25.4s -mul v30.4S, v30.4S,v29.s[2] -add v18.4s, v18.4s, v25.4s -mla v13.4S, v24.4S, v31.s[0] -sub v24.4s, v1.4s, v8.4s -mla v30.4S, v10.4S, v31.s[0] -add v1.4s, v1.4s, v8.4s -mul v3.4S, v3.4S,v29.s[2] -sub v8.4s, v26.4s, v27.4s -mul v12.4S, v12.4S,v29.s[2] -add v26.4s, v26.4s, v27.4s -mla v3.4S, v22.4S, v31.s[0] -sub v22.4s, v14.4s, v19.4s -mla v12.4S, v20.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -sqrdmulh v28.4S, v15.4S, v11.s[1] -nop -mul v15.4S, v15.4S,v17.s[1] -nop -sqrdmulh v29.4S, v24.4S, v11.s[1] -sub v19.4s, v16.4s, v13.4s -mul v24.4S, v24.4S,v17.s[1] -add v16.4s, v16.4s, v13.4s -sqrdmulh v13.4S, v18.4S, v11.s[0] -sub v20.4s, v21.4s, v30.4s -mul v18.4S, v18.4S,v17.s[0] -add v21.4s, v21.4s, v30.4s -sqrdmulh v30.4S, v1.4S, v11.s[0] -sub v27.4s, v6.4s, v3.4s -mul v1.4S, v1.4S,v17.s[0] -add v6.4s, v6.4s, v3.4s -mla v15.4S, v28.4S, v31.s[0] -sub v28.4s, v2.4s, v12.4s -sqrdmulh v3.4S, v16.4S, v11.s[2] -add v2.4s, v2.4s, v12.4s -mla v24.4S, v29.4S, v31.s[0] -nop -sqrdmulh v29.4S, v21.4S, v11.s[2] -nop -mla v18.4S, v13.4S, v31.s[0] -nop -sqrdmulh v13.4S, v19.4S, v11.s[3] -nop -mla v1.4S, v30.4S, v31.s[0] -nop -sqrdmulh v30.4S, v20.4S, v11.s[3] -nop -mul v16.4S, v16.4S,v17.s[2] -sub v12.4s, v8.4s, v15.4s -mul v21.4S, v21.4S,v17.s[2] -add v8.4s, v8.4s, v15.4s -mla v16.4S, v3.4S, v31.s[0] -sub v3.4s, v22.4s, v24.4s -mla v21.4S, v29.4S, v31.s[0] -add v22.4s, v22.4s, v24.4s -mul v19.4S, v19.4S,v17.s[3] -sub v24.4s, v26.4s, v18.4s -mul v20.4S, v20.4S,v17.s[3] -add v26.4s, v26.4s, v18.4s -mla v19.4S, v13.4S, v31.s[0] -sub v13.4s, v14.4s, v1.4s -mla v20.4S, v30.4S, v31.s[0] -add v14.4s, v14.4s, v1.4s -sqrdmulh v11.4S, v3.4S, v9.s[3] -nop -mul v3.4S, v3.4S,v0.s[3] -nop -sqrdmulh v17.4S, v22.4S, v9.s[2] -sub v1.4s, v6.4s, v16.4s -mul v22.4S, v22.4S,v0.s[2] -add v6.4s, v6.4s, v16.4s -sqrdmulh v16.4S, v13.4S, v9.s[1] -sub v30.4s, v2.4s, v21.4s -mul v13.4S, v13.4S,v0.s[1] -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v9.s[0] -sub v18.4s, v27.4s, v19.4s -mul v14.4S, v14.4S,v0.s[0] -add v27.4s, v27.4s, v19.4s -mla v3.4S, v11.4S, v31.s[0] -sub v11.4s, v28.4s, v20.4s -sqrdmulh v9.4S, v2.4S, v7.s[0] -add v28.4s, v28.4s, v20.4s -mla v22.4S, v17.4S, v31.s[0] -sub v17.4s, v12.4s, v3.4s -sqrdmulh v20.4S, v30.4S, v7.s[1] -add v12.4s, v12.4s, v3.4s -mla v13.4S, v16.4S, v31.s[0] -sub v16.4s, v8.4s, v22.4s -sqrdmulh v3.4S, v28.4S, v7.s[2] -add v8.4s, v8.4s, v22.4s -mla v14.4S, v21.4S, v31.s[0] -sub v21.4s, v24.4s, v13.4s -sqrdmulh v22.4S, v11.4S, v7.s[3] -add v24.4s, v24.4s, v13.4s -mul v2.4S, v2.4S,v23.s[0] -sub v13.4s, v26.4s, v14.4s -mul v30.4S, v30.4S,v23.s[1] -add v26.4s, v26.4s, v14.4s -mla v2.4S, v9.4S, v31.s[0] -str q16, [x0, #336] -mla v30.4S, v20.4S, v31.s[0] -str q8, [x0, #272] -mul v28.4S, v28.4S,v23.s[2] -str q17, [x0, #464] -mul v11.4S, v11.4S,v23.s[3] -str q12, [x0, #400] -mla v28.4S, v3.4S, v31.s[0] -str q21, [x0, #208] -mla v11.4S, v22.4S, v31.s[0] -str q24, [x0, #144] -str q13, [x0, #80] -str q26, [x0, #16] -sub v26.4s, v6.4s, v2.4s -str q26, [x0, #592] -add v6.4s, v6.4s, v2.4s -sub v2.4s, v1.4s, v30.4s -str q6, [x0, #528] -add v1.4s, v1.4s, v30.4s -sub v30.4s, v27.4s, v28.4s -str q2, [x0, #720] -add v27.4s, v27.4s, v28.4s -sub v28.4s, v18.4s, v11.4s -str q1, [x0, #656] -add v18.4s, v18.4s, v11.4s -str q30, [x0, #848] -str q27, [x0, #784] -str q28, [x0, #976] -str q18, [x0, #912] -ldr q4, [x0, #224] -ldr q5, [x0, #160] -ldr q25, [x0, #32] -ldr q10, [x17, #+128] -ldr q15, [x17, #+144] -sqrdmulh v29.4S, v25.4S, v15.s[0] -mul v25.4S, v25.4S,v10.s[0] -ldr q19, [x0, #48] -sqrdmulh v0.4S, v19.4S, v15.s[0] -mul v19.4S, v19.4S,v10.s[0] -ldr q14, [x17, #+160] -ldr q9, [x17, #+176] -ldr q16, [x0, #96] -sqrdmulh v20.4S, v16.4S, v9.s[0] -mul v16.4S, v16.4S,v14.s[0] -ldr q8, [x0, #112] -sqrdmulh v17.4S, v8.4S, v9.s[0] -mul v8.4S, v8.4S,v14.s[0] -ldr q12, [x17, #+192] -ldr q3, [x17, #+208] -mla v25.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v5.4S, v3.s[0] -ldr q21, [x0, #176] -mla v19.4S, v0.4S, v31.s[0] -sqrdmulh v0.4S, v21.4S, v3.s[0] -ldr q22, [x17, #+224] -ldr q24, [x17, #+240] -mla v16.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v4.4S, v24.s[0] -ldr q23, [x0, #240] -mla v8.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v23.4S, v24.s[0] -ldr q7, [x0, #0] -ldr q13, [x0, #128] -mul v5.4S, v5.4S,v12.s[0] -sub v26.4s, v7.4s, v25.4s -ldr q6, [x0, #16] -mul v21.4S, v21.4S,v12.s[0] -add v7.4s, v7.4s, v25.4s -ldr q25, [x0, #144] -mla v5.4S, v29.4S, v31.s[0] -sub v29.4s, v6.4s, v19.4s -ldr q2, [x0, #64] -mla v21.4S, v0.4S, v31.s[0] -add v6.4s, v6.4s, v19.4s -ldr q19, [x0, #192] -mul v4.4S, v4.4S,v22.s[0] -sub v0.4s, v2.4s, v16.4s -ldr q1, [x0, #80] -mul v23.4S, v23.4S,v22.s[0] -add v2.4s, v2.4s, v16.4s -ldr q16, [x0, #208] -mla v4.4S, v20.4S, v31.s[0] -mla v23.4S, v17.4S, v31.s[0] -sub v17.4s, v1.4s, v8.4s -sqrdmulh v20.4S, v6.4S, v15.s[1] -add v1.4s, v1.4s, v8.4s -mul v6.4S, v6.4S,v10.s[1] -sqrdmulh v8.4S, v29.4S, v15.s[2] -sub v11.4s, v13.4s, v5.4s -mul v29.4S, v29.4S,v10.s[2] -add v13.4s, v13.4s, v5.4s -sqrdmulh v15.4S, v1.4S, v9.s[1] -sub v10.4s, v25.4s, v21.4s -mul v1.4S, v1.4S,v14.s[1] -add v25.4s, v25.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v9.s[2] -sub v5.4s, v19.4s, v4.4s -mul v17.4S, v17.4S,v14.s[2] -add v19.4s, v19.4s, v4.4s -mla v6.4S, v20.4S, v31.s[0] -sub v20.4s, v16.4s, v23.4s -ldr q9, [x0, #480] -sqrdmulh v14.4S, v25.4S, v3.s[1] -add v16.4s, v16.4s, v23.4s -mla v29.4S, v8.4S, v31.s[0] -ldr q8, [x0, #416] -sqrdmulh v23.4S, v10.4S, v3.s[2] -sub v4.4s, v7.4s, v6.4s -mla v1.4S, v15.4S, v31.s[0] -ldr q15, [x0, #288] -sqrdmulh v30.4S, v16.4S, v24.s[1] -add v7.4s, v7.4s, v6.4s -str q4, [x0, #16] -mla v17.4S, v21.4S, v31.s[0] -ldr q21, [x17, #+256] -ldr q4, [x17, #+272] -sqrdmulh v6.4S, v20.4S, v24.s[2] -sub v27.4s, v26.4s, v29.4s -str q7, [x0, #0] -mul v25.4S, v25.4S,v12.s[1] -add v26.4s, v26.4s, v29.4s -mul v10.4S, v10.4S,v12.s[2] -str q27, [x0, #48] -mla v25.4S, v14.4S, v31.s[0] -sub v14.4s, v2.4s, v1.4s -mla v10.4S, v23.4S, v31.s[0] -str q26, [x0, #32] -mul v16.4S, v16.4S,v22.s[1] -str q14, [x0, #80] -mul v20.4S, v20.4S,v22.s[2] -add v2.4s, v2.4s, v1.4s -str q2, [x0, #64] -mla v16.4S, v30.4S, v31.s[0] -sub v30.4s, v0.4s, v17.4s -str q30, [x0, #112] -mla v20.4S, v6.4S, v31.s[0] -add v0.4s, v0.4s, v17.4s -str q0, [x0, #96] -sqrdmulh v24.4S, v15.4S, v4.s[0] -sub v22.4s, v13.4s, v25.4s -mul v15.4S, v15.4S,v21.s[0] -str q22, [x0, #144] -ldr q22, [x0, #304] -sqrdmulh v0.4S, v22.4S, v4.s[0] -add v13.4s, v13.4s, v25.4s -mul v22.4S, v22.4S,v21.s[0] -str q13, [x0, #128] -ldr q13, [x17, #+288] -ldr q25, [x17, #+304] -ldr q17, [x0, #352] -sqrdmulh v6.4S, v17.4S, v25.s[0] -sub v30.4s, v11.4s, v10.4s -mul v17.4S, v17.4S,v13.s[0] -str q30, [x0, #176] -ldr q30, [x0, #368] -sqrdmulh v2.4S, v30.4S, v25.s[0] -add v11.4s, v11.4s, v10.4s -mul v30.4S, v30.4S,v13.s[0] -str q11, [x0, #160] -ldr q11, [x17, #+320] -ldr q10, [x17, #+336] -mla v15.4S, v24.4S, v31.s[0] -sub v24.4s, v19.4s, v16.4s -sqrdmulh v1.4S, v8.4S, v10.s[0] -str q24, [x0, #208] -ldr q24, [x0, #432] -mla v22.4S, v0.4S, v31.s[0] -add v19.4s, v19.4s, v16.4s -sqrdmulh v16.4S, v24.4S, v10.s[0] -str q19, [x0, #192] -ldr q19, [x17, #+352] -ldr q0, [x17, #+368] -mla v17.4S, v6.4S, v31.s[0] -sub v6.4s, v5.4s, v20.4s -sqrdmulh v14.4S, v9.4S, v0.s[0] -str q6, [x0, #240] -ldr q6, [x0, #496] -mla v30.4S, v2.4S, v31.s[0] -add v5.4s, v5.4s, v20.4s -sqrdmulh v20.4S, v6.4S, v0.s[0] -str q5, [x0, #224] -ldr q5, [x0, #256] -ldr q2, [x0, #384] -mul v8.4S, v8.4S,v11.s[0] -sub v3.4s, v5.4s, v15.4s -ldr q12, [x0, #272] -mul v24.4S, v24.4S,v11.s[0] -add v5.4s, v5.4s, v15.4s -ldr q15, [x0, #400] -mla v8.4S, v1.4S, v31.s[0] -sub v1.4s, v12.4s, v22.4s -ldr q26, [x0, #320] -mla v24.4S, v16.4S, v31.s[0] -add v12.4s, v12.4s, v22.4s -ldr q22, [x0, #448] -mul v9.4S, v9.4S,v19.s[0] -sub v16.4s, v26.4s, v17.4s -ldr q23, [x0, #336] -mul v6.4S, v6.4S,v19.s[0] -add v26.4s, v26.4s, v17.4s -ldr q17, [x0, #464] -mla v9.4S, v14.4S, v31.s[0] -mla v6.4S, v20.4S, v31.s[0] -sub v20.4s, v23.4s, v30.4s -sqrdmulh v14.4S, v12.4S, v4.s[1] -add v23.4s, v23.4s, v30.4s -mul v12.4S, v12.4S,v21.s[1] -sqrdmulh v30.4S, v1.4S, v4.s[2] -sub v27.4s, v2.4s, v8.4s -mul v1.4S, v1.4S,v21.s[2] -add v2.4s, v2.4s, v8.4s -sqrdmulh v4.4S, v23.4S, v25.s[1] -sub v21.4s, v15.4s, v24.4s -mul v23.4S, v23.4S,v13.s[1] -add v15.4s, v15.4s, v24.4s -sqrdmulh v24.4S, v20.4S, v25.s[2] -sub v8.4s, v22.4s, v9.4s -mul v20.4S, v20.4S,v13.s[2] -add v22.4s, v22.4s, v9.4s -mla v12.4S, v14.4S, v31.s[0] -sub v14.4s, v17.4s, v6.4s -ldr q25, [x0, #736] -sqrdmulh v13.4S, v15.4S, v10.s[1] -add v17.4s, v17.4s, v6.4s -mla v1.4S, v30.4S, v31.s[0] -ldr q30, [x0, #672] -sqrdmulh v6.4S, v21.4S, v10.s[2] -sub v9.4s, v5.4s, v12.4s -mla v23.4S, v4.4S, v31.s[0] -ldr q4, [x0, #544] -sqrdmulh v29.4S, v17.4S, v0.s[1] -add v5.4s, v5.4s, v12.4s -str q9, [x0, #272] -mla v20.4S, v24.4S, v31.s[0] -ldr q24, [x17, #+384] -ldr q9, [x17, #+400] -sqrdmulh v12.4S, v14.4S, v0.s[2] -sub v7.4s, v3.4s, v1.4s -str q5, [x0, #256] -mul v15.4S, v15.4S,v11.s[1] -add v3.4s, v3.4s, v1.4s -mul v21.4S, v21.4S,v11.s[2] -str q7, [x0, #304] -mla v15.4S, v13.4S, v31.s[0] -sub v13.4s, v26.4s, v23.4s -mla v21.4S, v6.4S, v31.s[0] -str q3, [x0, #288] -mul v17.4S, v17.4S,v19.s[1] -str q13, [x0, #336] -mul v14.4S, v14.4S,v19.s[2] -add v26.4s, v26.4s, v23.4s -str q26, [x0, #320] -mla v17.4S, v29.4S, v31.s[0] -sub v29.4s, v16.4s, v20.4s -str q29, [x0, #368] -mla v14.4S, v12.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -str q16, [x0, #352] -sqrdmulh v0.4S, v4.4S, v9.s[0] -sub v19.4s, v2.4s, v15.4s -mul v4.4S, v4.4S,v24.s[0] -str q19, [x0, #400] -ldr q19, [x0, #560] -sqrdmulh v16.4S, v19.4S, v9.s[0] -add v2.4s, v2.4s, v15.4s -mul v19.4S, v19.4S,v24.s[0] -str q2, [x0, #384] -ldr q2, [x17, #+416] -ldr q15, [x17, #+432] -ldr q20, [x0, #608] -sqrdmulh v12.4S, v20.4S, v15.s[0] -sub v29.4s, v27.4s, v21.4s -mul v20.4S, v20.4S,v2.s[0] -str q29, [x0, #432] -ldr q29, [x0, #624] -sqrdmulh v26.4S, v29.4S, v15.s[0] -add v27.4s, v27.4s, v21.4s -mul v29.4S, v29.4S,v2.s[0] -str q27, [x0, #416] -ldr q27, [x17, #+448] -ldr q21, [x17, #+464] -mla v4.4S, v0.4S, v31.s[0] -sub v0.4s, v22.4s, v17.4s -sqrdmulh v23.4S, v30.4S, v21.s[0] -str q0, [x0, #464] -ldr q0, [x0, #688] -mla v19.4S, v16.4S, v31.s[0] -add v22.4s, v22.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v21.s[0] -str q22, [x0, #448] -ldr q22, [x17, #+480] -ldr q16, [x17, #+496] -mla v20.4S, v12.4S, v31.s[0] -sub v12.4s, v8.4s, v14.4s -sqrdmulh v13.4S, v25.4S, v16.s[0] -str q12, [x0, #496] -ldr q12, [x0, #752] -mla v29.4S, v26.4S, v31.s[0] -add v8.4s, v8.4s, v14.4s -sqrdmulh v14.4S, v12.4S, v16.s[0] -str q8, [x0, #480] -ldr q8, [x0, #512] -ldr q26, [x0, #640] -mul v30.4S, v30.4S,v27.s[0] -sub v10.4s, v8.4s, v4.4s -ldr q11, [x0, #528] -mul v0.4S, v0.4S,v27.s[0] -add v8.4s, v8.4s, v4.4s -ldr q4, [x0, #656] -mla v30.4S, v23.4S, v31.s[0] -sub v23.4s, v11.4s, v19.4s -ldr q3, [x0, #576] -mla v0.4S, v17.4S, v31.s[0] -add v11.4s, v11.4s, v19.4s -ldr q19, [x0, #704] -mul v25.4S, v25.4S,v22.s[0] -sub v17.4s, v3.4s, v20.4s -ldr q6, [x0, #592] -mul v12.4S, v12.4S,v22.s[0] -add v3.4s, v3.4s, v20.4s -ldr q20, [x0, #720] -mla v25.4S, v13.4S, v31.s[0] -mla v12.4S, v14.4S, v31.s[0] -sub v14.4s, v6.4s, v29.4s -sqrdmulh v13.4S, v11.4S, v9.s[1] -add v6.4s, v6.4s, v29.4s -mul v11.4S, v11.4S,v24.s[1] -sqrdmulh v29.4S, v23.4S, v9.s[2] -sub v7.4s, v26.4s, v30.4s -mul v23.4S, v23.4S,v24.s[2] -add v26.4s, v26.4s, v30.4s -sqrdmulh v9.4S, v6.4S, v15.s[1] -sub v24.4s, v4.4s, v0.4s -mul v6.4S, v6.4S,v2.s[1] -add v4.4s, v4.4s, v0.4s -sqrdmulh v0.4S, v14.4S, v15.s[2] -sub v30.4s, v19.4s, v25.4s -mul v14.4S, v14.4S,v2.s[2] -add v19.4s, v19.4s, v25.4s -mla v11.4S, v13.4S, v31.s[0] -sub v13.4s, v20.4s, v12.4s -ldr q15, [x0, #992] -sqrdmulh v2.4S, v4.4S, v21.s[1] -add v20.4s, v20.4s, v12.4s -mla v23.4S, v29.4S, v31.s[0] -ldr q29, [x0, #928] -sqrdmulh v12.4S, v24.4S, v21.s[2] -sub v25.4s, v8.4s, v11.4s -mla v6.4S, v9.4S, v31.s[0] -ldr q9, [x0, #800] -sqrdmulh v1.4S, v20.4S, v16.s[1] -add v8.4s, v8.4s, v11.4s -str q25, [x0, #528] -mla v14.4S, v0.4S, v31.s[0] -ldr q0, [x17, #+512] -ldr q25, [x17, #+528] -sqrdmulh v11.4S, v13.4S, v16.s[2] -sub v5.4s, v10.4s, v23.4s -str q8, [x0, #512] -mul v4.4S, v4.4S,v27.s[1] -add v10.4s, v10.4s, v23.4s -mul v24.4S, v24.4S,v27.s[2] -str q5, [x0, #560] -mla v4.4S, v2.4S, v31.s[0] -sub v2.4s, v3.4s, v6.4s -mla v24.4S, v12.4S, v31.s[0] -str q10, [x0, #544] -mul v20.4S, v20.4S,v22.s[1] -str q2, [x0, #592] -mul v13.4S, v13.4S,v22.s[2] -add v3.4s, v3.4s, v6.4s -str q3, [x0, #576] -mla v20.4S, v1.4S, v31.s[0] -sub v1.4s, v17.4s, v14.4s -str q1, [x0, #624] -mla v13.4S, v11.4S, v31.s[0] -add v17.4s, v17.4s, v14.4s -str q17, [x0, #608] -sqrdmulh v16.4S, v9.4S, v25.s[0] -sub v22.4s, v26.4s, v4.4s -mul v9.4S, v9.4S,v0.s[0] -str q22, [x0, #656] -ldr q22, [x0, #816] -sqrdmulh v17.4S, v22.4S, v25.s[0] -add v26.4s, v26.4s, v4.4s -mul v22.4S, v22.4S,v0.s[0] -str q26, [x0, #640] -ldr q26, [x17, #+544] -ldr q4, [x17, #+560] -ldr q14, [x0, #864] -sqrdmulh v11.4S, v14.4S, v4.s[0] -sub v1.4s, v7.4s, v24.4s -mul v14.4S, v14.4S,v26.s[0] -str q1, [x0, #688] -ldr q1, [x0, #880] -sqrdmulh v3.4S, v1.4S, v4.s[0] -add v7.4s, v7.4s, v24.4s -mul v1.4S, v1.4S,v26.s[0] -str q7, [x0, #672] -ldr q7, [x17, #+576] -ldr q24, [x17, #+592] -mla v9.4S, v16.4S, v31.s[0] -sub v16.4s, v19.4s, v20.4s -sqrdmulh v6.4S, v29.4S, v24.s[0] -str q16, [x0, #720] -ldr q16, [x0, #944] -mla v22.4S, v17.4S, v31.s[0] -add v19.4s, v19.4s, v20.4s -sqrdmulh v20.4S, v16.4S, v24.s[0] -str q19, [x0, #704] -ldr q19, [x17, #+608] -ldr q17, [x17, #+624] -mla v14.4S, v11.4S, v31.s[0] -sub v11.4s, v30.4s, v13.4s -sqrdmulh v2.4S, v15.4S, v17.s[0] -str q11, [x0, #752] -ldr q11, [x0, #1008] -mla v1.4S, v3.4S, v31.s[0] -add v30.4s, v30.4s, v13.4s -sqrdmulh v13.4S, v11.4S, v17.s[0] -str q30, [x0, #736] -ldr q30, [x0, #768] -ldr q3, [x0, #896] -mul v29.4S, v29.4S,v7.s[0] -sub v21.4s, v30.4s, v9.4s -ldr q27, [x0, #784] -mul v16.4S, v16.4S,v7.s[0] -add v30.4s, v30.4s, v9.4s -ldr q9, [x0, #912] -mla v29.4S, v6.4S, v31.s[0] -sub v6.4s, v27.4s, v22.4s -ldr q10, [x0, #832] -mla v16.4S, v20.4S, v31.s[0] -add v27.4s, v27.4s, v22.4s -ldr q22, [x0, #960] -mul v15.4S, v15.4S,v19.s[0] -sub v20.4s, v10.4s, v14.4s -ldr q12, [x0, #848] -mul v11.4S, v11.4S,v19.s[0] -add v10.4s, v10.4s, v14.4s -ldr q14, [x0, #976] -mla v15.4S, v2.4S, v31.s[0] -mla v11.4S, v13.4S, v31.s[0] -sub v13.4s, v12.4s, v1.4s -sqrdmulh v2.4S, v27.4S, v25.s[1] -add v12.4s, v12.4s, v1.4s -mul v27.4S, v27.4S,v0.s[1] -sqrdmulh v1.4S, v6.4S, v25.s[2] -sub v5.4s, v3.4s, v29.4s -mul v6.4S, v6.4S,v0.s[2] -add v3.4s, v3.4s, v29.4s -sqrdmulh v25.4S, v12.4S, v4.s[1] -sub v0.4s, v9.4s, v16.4s -mul v12.4S, v12.4S,v26.s[1] -add v9.4s, v9.4s, v16.4s -sqrdmulh v16.4S, v13.4S, v4.s[2] -sub v29.4s, v22.4s, v15.4s -mul v13.4S, v13.4S,v26.s[2] -add v22.4s, v22.4s, v15.4s -mla v27.4S, v2.4S, v31.s[0] -sub v2.4s, v14.4s, v11.4s -sqrdmulh v4.4S, v9.4S, v24.s[1] -add v14.4s, v14.4s, v11.4s -mla v6.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v0.4S, v24.s[2] -sub v11.4s, v30.4s, v27.4s -mla v12.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v14.4S, v17.s[1] -add v30.4s, v30.4s, v27.4s -str q11, [x0, #784] -mla v13.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v2.4S, v17.s[2] -sub v11.4s, v21.4s, v6.4s -str q30, [x0, #768] -mul v9.4S, v9.4S,v7.s[1] -add v21.4s, v21.4s, v6.4s -mul v0.4S, v0.4S,v7.s[2] -str q11, [x0, #816] -mla v9.4S, v4.4S, v31.s[0] -sub v4.4s, v10.4s, v12.4s -mla v0.4S, v1.4S, v31.s[0] -str q21, [x0, #800] -mul v14.4S, v14.4S,v19.s[1] -str q4, [x0, #848] -mul v2.4S, v2.4S,v19.s[2] -add v10.4s, v10.4s, v12.4s -str q10, [x0, #832] -mla v14.4S, v25.4S, v31.s[0] -sub v25.4s, v20.4s, v13.4s -str q25, [x0, #880] -mla v2.4S, v16.4S, v31.s[0] -add v20.4s, v20.4s, v13.4s -str q20, [x0, #864] -sub v17.4s, v3.4s, v9.4s -str q17, [x0, #912] -add v3.4s, v3.4s, v9.4s -str q3, [x0, #896] -sub v3.4s, v5.4s, v0.4s -str q3, [x0, #944] -add v5.4s, v5.4s, v0.4s -str q5, [x0, #928] -sub v5.4s, v22.4s, v14.4s -str q5, [x0, #976] -add v22.4s, v22.4s, v14.4s -str q22, [x0, #960] -sub v22.4s, v29.4s, v2.4s -str q22, [x0, #1008] -add v29.4s, v29.4s, v2.4s -str q29, [x0, #992] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1548 -// Instruction count: 1544 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_15_z4_7.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_15_z4_7.s deleted file mode 100644 index cbb2ab0..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_15_z4_7.s +++ /dev/null @@ -1,1578 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_15_z4_7 -.global _ntt_u32_incomplete_neon_asm_var_4_2_15_z4_7 -ntt_u32_incomplete_neon_asm_var_4_2_15_z4_7: -_ntt_u32_incomplete_neon_asm_var_4_2_15_z4_7: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x0, #992] -ldr q29, [x17, #+0] -ldr q28, [x17, #+16] -sqrdmulh v27.4S, v30.4S, v28.s[0] -mul v30.4S, v30.4S,v29.s[0] -ldr q26, [x0, #928] -sqrdmulh v25.4S, v26.4S, v28.s[0] -mul v26.4S, v26.4S,v29.s[0] -ldr q24, [x0, #864] -sqrdmulh v23.4S, v24.4S, v28.s[0] -mul v24.4S, v24.4S,v29.s[0] -ldr q22, [x0, #800] -sqrdmulh v21.4S, v22.4S, v28.s[0] -mul v22.4S, v22.4S,v29.s[0] -ldr q20, [x0, #736] -mla v30.4S, v27.4S, v31.s[0] -sqrdmulh v27.4S, v20.4S, v28.s[0] -ldr q19, [x0, #672] -mla v26.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v19.4S, v28.s[0] -nop -ldr q18, [x0, #608] -mla v24.4S, v23.4S, v31.s[0] -sqrdmulh v23.4S, v18.4S, v28.s[0] -nop -ldr q17, [x0, #544] -mla v22.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v17.4S, v28.s[0] -nop -ldr q16, [x0, #480] -ldr q3, [x0, #416] -mul v20.4S, v20.4S,v29.s[0] -sub v2.4s, v16.4s, v30.4s -mul v19.4S, v19.4S,v29.s[0] -add v16.4s, v16.4s, v30.4s -ldr q30, [x0, #352] -ldr q1, [x0, #288] -mla v20.4S, v27.4S, v31.s[0] -sub v27.4s, v3.4s, v26.4s -mla v19.4S, v25.4S, v31.s[0] -add v3.4s, v3.4s, v26.4s -ldr q26, [x0, #224] -ldr q25, [x0, #160] -mul v18.4S, v18.4S,v29.s[0] -sub v0.4s, v30.4s, v24.4s -mul v17.4S, v17.4S,v29.s[0] -add v30.4s, v30.4s, v24.4s -ldr q24, [x0, #96] -ldr q15, [x0, #32] -mla v18.4S, v23.4S, v31.s[0] -sub v23.4s, v1.4s, v22.4s -mla v17.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v2.4S, v28.s[2] -nop -mul v2.4S, v2.4S,v29.s[2] -nop -sqrdmulh v21.4S, v27.4S, v28.s[2] -sub v14.4s, v26.4s, v20.4s -mul v27.4S, v27.4S,v29.s[2] -add v26.4s, v26.4s, v20.4s -sqrdmulh v20.4S, v0.4S, v28.s[2] -sub v13.4s, v25.4s, v19.4s -mul v0.4S, v0.4S,v29.s[2] -add v25.4s, v25.4s, v19.4s -sqrdmulh v19.4S, v23.4S, v28.s[2] -sub v12.4s, v24.4s, v18.4s -mul v23.4S, v23.4S,v29.s[2] -add v24.4s, v24.4s, v18.4s -mla v2.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v17.4s -sqrdmulh v18.4S, v16.4S, v28.s[1] -add v15.4s, v15.4s, v17.4s -mla v27.4S, v21.4S, v31.s[0] -nop -sqrdmulh v21.4S, v3.4S, v28.s[1] -nop -mla v0.4S, v20.4S, v31.s[0] -nop -sqrdmulh v20.4S, v30.4S, v28.s[1] -nop -mla v23.4S, v19.4S, v31.s[0] -nop -sqrdmulh v19.4S, v1.4S, v28.s[1] -nop -ldr q17, [x17, #+32] -ldr q11, [x17, #+48] -mul v16.4S, v16.4S,v29.s[1] -sub v10.4s, v14.4s, v2.4s -mul v3.4S, v3.4S,v29.s[1] -add v14.4s, v14.4s, v2.4s -mla v16.4S, v18.4S, v31.s[0] -sub v18.4s, v13.4s, v27.4s -mla v3.4S, v21.4S, v31.s[0] -add v13.4s, v13.4s, v27.4s -mul v30.4S, v30.4S,v29.s[1] -sub v27.4s, v12.4s, v0.4s -mul v1.4S, v1.4S,v29.s[1] -add v12.4s, v12.4s, v0.4s -mla v30.4S, v20.4S, v31.s[0] -sub v20.4s, v22.4s, v23.4s -mla v1.4S, v19.4S, v31.s[0] -add v22.4s, v22.4s, v23.4s -sqrdmulh v23.4S, v10.4S, v11.s[3] -nop -mul v10.4S, v10.4S,v17.s[3] -nop -sqrdmulh v19.4S, v18.4S, v11.s[3] -sub v0.4s, v26.4s, v16.4s -mul v18.4S, v18.4S,v17.s[3] -add v26.4s, v26.4s, v16.4s -sqrdmulh v16.4S, v14.4S, v11.s[2] -sub v21.4s, v25.4s, v3.4s -mul v14.4S, v14.4S,v17.s[2] -add v25.4s, v25.4s, v3.4s -sqrdmulh v3.4S, v13.4S, v11.s[2] -sub v2.4s, v24.4s, v30.4s -mul v13.4S, v13.4S,v17.s[2] -add v24.4s, v24.4s, v30.4s -ldr q30, [x17, #+96] -ldr q9, [x17, #+112] -mla v10.4S, v23.4S, v31.s[0] -sub v23.4s, v15.4s, v1.4s -sqrdmulh v8.4S, v0.4S, v11.s[1] -add v15.4s, v15.4s, v1.4s -mla v18.4S, v19.4S, v31.s[0] -nop -sqrdmulh v19.4S, v21.4S, v11.s[1] -nop -mla v14.4S, v16.4S, v31.s[0] -nop -sqrdmulh v16.4S, v26.4S, v11.s[0] -nop -mla v13.4S, v3.4S, v31.s[0] -nop -sqrdmulh v3.4S, v25.4S, v11.s[0] -nop -ldr q1, [x17, #+64] -ldr q7, [x17, #+80] -mul v0.4S, v0.4S,v17.s[1] -sub v6.4s, v27.4s, v10.4s -mul v21.4S, v21.4S,v17.s[1] -add v27.4s, v27.4s, v10.4s -mla v0.4S, v8.4S, v31.s[0] -sub v8.4s, v20.4s, v18.4s -mla v21.4S, v19.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -mul v26.4S, v26.4S,v17.s[0] -sub v18.4s, v12.4s, v14.4s -mul v25.4S, v25.4S,v17.s[0] -add v12.4s, v12.4s, v14.4s -mla v26.4S, v16.4S, v31.s[0] -sub v16.4s, v22.4s, v13.4s -mla v25.4S, v3.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v6.4S, v9.s[3] -nop -mul v6.4S, v6.4S,v30.s[3] -nop -sqrdmulh v3.4S, v27.4S, v9.s[2] -sub v14.4s, v2.4s, v0.4s -mul v27.4S, v27.4S,v30.s[2] -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v18.4S, v9.s[1] -sub v19.4s, v23.4s, v21.4s -mul v18.4S, v18.4S,v30.s[1] -add v23.4s, v23.4s, v21.4s -sqrdmulh v21.4S, v12.4S, v9.s[0] -sub v10.4s, v24.4s, v26.4s -mul v12.4S, v12.4S,v30.s[0] -add v24.4s, v24.4s, v26.4s -mla v6.4S, v13.4S, v31.s[0] -sub v13.4s, v15.4s, v25.4s -sqrdmulh v26.4S, v14.4S, v7.s[3] -add v15.4s, v15.4s, v25.4s -mla v27.4S, v3.4S, v31.s[0] -sub v3.4s, v8.4s, v6.4s -sqrdmulh v25.4S, v2.4S, v7.s[2] -add v8.4s, v8.4s, v6.4s -mla v18.4S, v0.4S, v31.s[0] -sub v0.4s, v20.4s, v27.4s -sqrdmulh v6.4S, v10.4S, v7.s[1] -add v20.4s, v20.4s, v27.4s -mla v12.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v18.4s -sqrdmulh v27.4S, v24.4S, v7.s[0] -add v16.4s, v16.4s, v18.4s -mul v14.4S, v14.4S,v1.s[3] -sub v18.4s, v22.4s, v12.4s -mul v2.4S, v2.4S,v1.s[2] -add v22.4s, v22.4s, v12.4s -mla v14.4S, v26.4S, v31.s[0] -str q3, [x0, #992] -mla v2.4S, v25.4S, v31.s[0] -str q8, [x0, #928] -mul v10.4S, v10.4S,v1.s[1] -str q0, [x0, #864] -mul v24.4S, v24.4S,v1.s[0] -str q20, [x0, #800] -mla v10.4S, v6.4S, v31.s[0] -str q21, [x0, #736] -mla v24.4S, v27.4S, v31.s[0] -str q16, [x0, #672] -ldr q16, [x0, #1008] -sqrdmulh v27.4S, v16.4S, v28.s[0] -str q18, [x0, #608] -mul v16.4S, v16.4S,v29.s[0] -str q22, [x0, #544] -ldr q22, [x0, #944] -sqrdmulh v18.4S, v22.4S, v28.s[0] -sub v21.4s, v19.4s, v14.4s -str q21, [x0, #480] -mul v22.4S, v22.4S,v29.s[0] -add v19.4s, v19.4s, v14.4s -ldr q14, [x0, #880] -sqrdmulh v21.4S, v14.4S, v28.s[0] -sub v6.4s, v23.4s, v2.4s -str q19, [x0, #416] -mul v14.4S, v14.4S,v29.s[0] -add v23.4s, v23.4s, v2.4s -ldr q2, [x0, #816] -sqrdmulh v19.4S, v2.4S, v28.s[0] -sub v20.4s, v13.4s, v10.4s -str q6, [x0, #352] -mul v2.4S, v2.4S,v29.s[0] -add v13.4s, v13.4s, v10.4s -ldr q10, [x0, #752] -mla v16.4S, v27.4S, v31.s[0] -sub v27.4s, v15.4s, v24.4s -str q23, [x0, #288] -sqrdmulh v23.4S, v10.4S, v28.s[0] -add v15.4s, v15.4s, v24.4s -ldr q24, [x0, #688] -mla v22.4S, v18.4S, v31.s[0] -str q20, [x0, #224] -sqrdmulh v20.4S, v24.4S, v28.s[0] -nop -ldr q18, [x0, #624] -mla v14.4S, v21.4S, v31.s[0] -str q13, [x0, #160] -sqrdmulh v13.4S, v18.4S, v28.s[0] -nop -ldr q21, [x0, #560] -mla v2.4S, v19.4S, v31.s[0] -str q27, [x0, #96] -sqrdmulh v27.4S, v21.4S, v28.s[0] -nop -ldr q19, [x0, #496] -ldr q6, [x0, #432] -mul v10.4S, v10.4S,v29.s[0] -sub v0.4s, v19.4s, v16.4s -str q15, [x0, #32] -mul v24.4S, v24.4S,v29.s[0] -add v19.4s, v19.4s, v16.4s -ldr q16, [x0, #368] -ldr q15, [x0, #304] -mla v10.4S, v23.4S, v31.s[0] -sub v23.4s, v6.4s, v22.4s -mla v24.4S, v20.4S, v31.s[0] -add v6.4s, v6.4s, v22.4s -ldr q22, [x0, #240] -ldr q20, [x0, #176] -mul v18.4S, v18.4S,v29.s[0] -sub v8.4s, v16.4s, v14.4s -mul v21.4S, v21.4S,v29.s[0] -add v16.4s, v16.4s, v14.4s -ldr q14, [x0, #112] -ldr q25, [x0, #48] -mla v18.4S, v13.4S, v31.s[0] -sub v13.4s, v15.4s, v2.4s -mla v21.4S, v27.4S, v31.s[0] -add v15.4s, v15.4s, v2.4s -sqrdmulh v2.4S, v0.4S, v28.s[2] -nop -mul v0.4S, v0.4S,v29.s[2] -nop -sqrdmulh v27.4S, v23.4S, v28.s[2] -sub v3.4s, v22.4s, v10.4s -mul v23.4S, v23.4S,v29.s[2] -add v22.4s, v22.4s, v10.4s -sqrdmulh v10.4S, v8.4S, v28.s[2] -sub v26.4s, v20.4s, v24.4s -mul v8.4S, v8.4S,v29.s[2] -add v20.4s, v20.4s, v24.4s -sqrdmulh v24.4S, v13.4S, v28.s[2] -sub v12.4s, v14.4s, v18.4s -mul v13.4S, v13.4S,v29.s[2] -add v14.4s, v14.4s, v18.4s -mla v0.4S, v2.4S, v31.s[0] -sub v2.4s, v25.4s, v21.4s -sqrdmulh v18.4S, v19.4S, v28.s[1] -add v25.4s, v25.4s, v21.4s -mla v23.4S, v27.4S, v31.s[0] -nop -sqrdmulh v27.4S, v6.4S, v28.s[1] -nop -mla v8.4S, v10.4S, v31.s[0] -nop -sqrdmulh v10.4S, v16.4S, v28.s[1] -nop -mla v13.4S, v24.4S, v31.s[0] -nop -sqrdmulh v24.4S, v15.4S, v28.s[1] -nop -mul v19.4S, v19.4S,v29.s[1] -sub v21.4s, v3.4s, v0.4s -mul v6.4S, v6.4S,v29.s[1] -add v3.4s, v3.4s, v0.4s -mla v19.4S, v18.4S, v31.s[0] -sub v18.4s, v26.4s, v23.4s -mla v6.4S, v27.4S, v31.s[0] -add v26.4s, v26.4s, v23.4s -mul v16.4S, v16.4S,v29.s[1] -sub v23.4s, v12.4s, v8.4s -mul v15.4S, v15.4S,v29.s[1] -add v12.4s, v12.4s, v8.4s -mla v16.4S, v10.4S, v31.s[0] -sub v10.4s, v2.4s, v13.4s -mla v15.4S, v24.4S, v31.s[0] -add v2.4s, v2.4s, v13.4s -sqrdmulh v13.4S, v21.4S, v11.s[3] -nop -mul v21.4S, v21.4S,v17.s[3] -nop -sqrdmulh v24.4S, v18.4S, v11.s[3] -sub v8.4s, v22.4s, v19.4s -mul v18.4S, v18.4S,v17.s[3] -add v22.4s, v22.4s, v19.4s -sqrdmulh v19.4S, v3.4S, v11.s[2] -sub v27.4s, v20.4s, v6.4s -mul v3.4S, v3.4S,v17.s[2] -add v20.4s, v20.4s, v6.4s -sqrdmulh v6.4S, v26.4S, v11.s[2] -sub v0.4s, v14.4s, v16.4s -mul v26.4S, v26.4S,v17.s[2] -add v14.4s, v14.4s, v16.4s -mla v21.4S, v13.4S, v31.s[0] -sub v13.4s, v25.4s, v15.4s -sqrdmulh v16.4S, v8.4S, v11.s[1] -add v25.4s, v25.4s, v15.4s -mla v18.4S, v24.4S, v31.s[0] -nop -sqrdmulh v24.4S, v27.4S, v11.s[1] -nop -mla v3.4S, v19.4S, v31.s[0] -nop -sqrdmulh v19.4S, v22.4S, v11.s[0] -nop -mla v26.4S, v6.4S, v31.s[0] -nop -sqrdmulh v6.4S, v20.4S, v11.s[0] -nop -mul v8.4S, v8.4S,v17.s[1] -sub v15.4s, v23.4s, v21.4s -mul v27.4S, v27.4S,v17.s[1] -add v23.4s, v23.4s, v21.4s -mla v8.4S, v16.4S, v31.s[0] -sub v16.4s, v10.4s, v18.4s -mla v27.4S, v24.4S, v31.s[0] -add v10.4s, v10.4s, v18.4s -mul v22.4S, v22.4S,v17.s[0] -sub v18.4s, v12.4s, v3.4s -mul v20.4S, v20.4S,v17.s[0] -add v12.4s, v12.4s, v3.4s -mla v22.4S, v19.4S, v31.s[0] -sub v19.4s, v2.4s, v26.4s -mla v20.4S, v6.4S, v31.s[0] -add v2.4s, v2.4s, v26.4s -sqrdmulh v26.4S, v15.4S, v9.s[3] -nop -mul v15.4S, v15.4S,v30.s[3] -nop -sqrdmulh v6.4S, v23.4S, v9.s[2] -sub v3.4s, v0.4s, v8.4s -mul v23.4S, v23.4S,v30.s[2] -add v0.4s, v0.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v9.s[1] -sub v24.4s, v13.4s, v27.4s -mul v18.4S, v18.4S,v30.s[1] -add v13.4s, v13.4s, v27.4s -sqrdmulh v27.4S, v12.4S, v9.s[0] -sub v21.4s, v14.4s, v22.4s -mul v12.4S, v12.4S,v30.s[0] -add v14.4s, v14.4s, v22.4s -mla v15.4S, v26.4S, v31.s[0] -sub v26.4s, v25.4s, v20.4s -sqrdmulh v22.4S, v3.4S, v7.s[3] -add v25.4s, v25.4s, v20.4s -mla v23.4S, v6.4S, v31.s[0] -sub v6.4s, v16.4s, v15.4s -sqrdmulh v20.4S, v0.4S, v7.s[2] -add v16.4s, v16.4s, v15.4s -mla v18.4S, v8.4S, v31.s[0] -sub v8.4s, v10.4s, v23.4s -sqrdmulh v15.4S, v21.4S, v7.s[1] -add v10.4s, v10.4s, v23.4s -mla v12.4S, v27.4S, v31.s[0] -sub v27.4s, v19.4s, v18.4s -sqrdmulh v23.4S, v14.4S, v7.s[0] -add v19.4s, v19.4s, v18.4s -mul v3.4S, v3.4S,v1.s[3] -sub v18.4s, v2.4s, v12.4s -mul v0.4S, v0.4S,v1.s[2] -add v2.4s, v2.4s, v12.4s -mla v3.4S, v22.4S, v31.s[0] -str q6, [x0, #1008] -mla v0.4S, v20.4S, v31.s[0] -str q16, [x0, #944] -mul v21.4S, v21.4S,v1.s[1] -str q8, [x0, #880] -mul v14.4S, v14.4S,v1.s[0] -str q10, [x0, #816] -mla v21.4S, v15.4S, v31.s[0] -str q27, [x0, #752] -mla v14.4S, v23.4S, v31.s[0] -str q19, [x0, #688] -ldr q19, [x0, #960] -sqrdmulh v23.4S, v19.4S, v28.s[0] -str q18, [x0, #624] -mul v19.4S, v19.4S,v29.s[0] -str q2, [x0, #560] -ldr q2, [x0, #896] -sqrdmulh v18.4S, v2.4S, v28.s[0] -sub v27.4s, v24.4s, v3.4s -str q27, [x0, #496] -mul v2.4S, v2.4S,v29.s[0] -add v24.4s, v24.4s, v3.4s -ldr q3, [x0, #832] -sqrdmulh v27.4S, v3.4S, v28.s[0] -sub v15.4s, v13.4s, v0.4s -str q24, [x0, #432] -mul v3.4S, v3.4S,v29.s[0] -add v13.4s, v13.4s, v0.4s -ldr q0, [x0, #768] -sqrdmulh v24.4S, v0.4S, v28.s[0] -sub v10.4s, v26.4s, v21.4s -str q15, [x0, #368] -mul v0.4S, v0.4S,v29.s[0] -add v26.4s, v26.4s, v21.4s -ldr q21, [x0, #704] -mla v19.4S, v23.4S, v31.s[0] -sub v23.4s, v25.4s, v14.4s -str q13, [x0, #304] -sqrdmulh v13.4S, v21.4S, v28.s[0] -add v25.4s, v25.4s, v14.4s -ldr q14, [x0, #640] -mla v2.4S, v18.4S, v31.s[0] -str q10, [x0, #240] -sqrdmulh v10.4S, v14.4S, v28.s[0] -nop -ldr q18, [x0, #576] -mla v3.4S, v27.4S, v31.s[0] -str q26, [x0, #176] -sqrdmulh v26.4S, v18.4S, v28.s[0] -nop -ldr q27, [x0, #512] -mla v0.4S, v24.4S, v31.s[0] -str q23, [x0, #112] -sqrdmulh v23.4S, v27.4S, v28.s[0] -nop -ldr q24, [x0, #448] -ldr q15, [x0, #384] -mul v21.4S, v21.4S,v29.s[0] -sub v8.4s, v24.4s, v19.4s -str q25, [x0, #48] -mul v14.4S, v14.4S,v29.s[0] -add v24.4s, v24.4s, v19.4s -ldr q19, [x0, #320] -ldr q25, [x0, #256] -mla v21.4S, v13.4S, v31.s[0] -sub v13.4s, v15.4s, v2.4s -mla v14.4S, v10.4S, v31.s[0] -add v15.4s, v15.4s, v2.4s -ldr q2, [x0, #192] -ldr q10, [x0, #128] -mul v18.4S, v18.4S,v29.s[0] -sub v16.4s, v19.4s, v3.4s -mul v27.4S, v27.4S,v29.s[0] -add v19.4s, v19.4s, v3.4s -ldr q3, [x0, #64] -ldr q20, [x0, #0] -mla v18.4S, v26.4S, v31.s[0] -sub v26.4s, v25.4s, v0.4s -mla v27.4S, v23.4S, v31.s[0] -add v25.4s, v25.4s, v0.4s -sqrdmulh v0.4S, v8.4S, v28.s[2] -nop -mul v8.4S, v8.4S,v29.s[2] -nop -sqrdmulh v23.4S, v13.4S, v28.s[2] -sub v6.4s, v2.4s, v21.4s -mul v13.4S, v13.4S,v29.s[2] -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v16.4S, v28.s[2] -sub v22.4s, v10.4s, v14.4s -mul v16.4S, v16.4S,v29.s[2] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v26.4S, v28.s[2] -sub v12.4s, v3.4s, v18.4s -mul v26.4S, v26.4S,v29.s[2] -add v3.4s, v3.4s, v18.4s -mla v8.4S, v0.4S, v31.s[0] -sub v0.4s, v20.4s, v27.4s -sqrdmulh v18.4S, v24.4S, v28.s[1] -add v20.4s, v20.4s, v27.4s -mla v13.4S, v23.4S, v31.s[0] -nop -sqrdmulh v23.4S, v15.4S, v28.s[1] -nop -mla v16.4S, v21.4S, v31.s[0] -nop -sqrdmulh v21.4S, v19.4S, v28.s[1] -nop -mla v26.4S, v14.4S, v31.s[0] -nop -sqrdmulh v14.4S, v25.4S, v28.s[1] -nop -mul v24.4S, v24.4S,v29.s[1] -sub v27.4s, v6.4s, v8.4s -mul v15.4S, v15.4S,v29.s[1] -add v6.4s, v6.4s, v8.4s -mla v24.4S, v18.4S, v31.s[0] -sub v18.4s, v22.4s, v13.4s -mla v15.4S, v23.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -mul v19.4S, v19.4S,v29.s[1] -sub v13.4s, v12.4s, v16.4s -mul v25.4S, v25.4S,v29.s[1] -add v12.4s, v12.4s, v16.4s -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v26.4s -mla v25.4S, v14.4S, v31.s[0] -add v0.4s, v0.4s, v26.4s -sqrdmulh v26.4S, v27.4S, v11.s[3] -nop -mul v27.4S, v27.4S,v17.s[3] -nop -sqrdmulh v14.4S, v18.4S, v11.s[3] -sub v16.4s, v2.4s, v24.4s -mul v18.4S, v18.4S,v17.s[3] -add v2.4s, v2.4s, v24.4s -sqrdmulh v24.4S, v6.4S, v11.s[2] -sub v23.4s, v10.4s, v15.4s -mul v6.4S, v6.4S,v17.s[2] -add v10.4s, v10.4s, v15.4s -sqrdmulh v15.4S, v22.4S, v11.s[2] -sub v8.4s, v3.4s, v19.4s -mul v22.4S, v22.4S,v17.s[2] -add v3.4s, v3.4s, v19.4s -mla v27.4S, v26.4S, v31.s[0] -sub v26.4s, v20.4s, v25.4s -sqrdmulh v19.4S, v16.4S, v11.s[1] -add v20.4s, v20.4s, v25.4s -mla v18.4S, v14.4S, v31.s[0] -nop -sqrdmulh v14.4S, v23.4S, v11.s[1] -nop -mla v6.4S, v24.4S, v31.s[0] -nop -sqrdmulh v24.4S, v2.4S, v11.s[0] -nop -mla v22.4S, v15.4S, v31.s[0] -nop -sqrdmulh v15.4S, v10.4S, v11.s[0] -nop -mul v16.4S, v16.4S,v17.s[1] -sub v25.4s, v13.4s, v27.4s -mul v23.4S, v23.4S,v17.s[1] -add v13.4s, v13.4s, v27.4s -mla v16.4S, v19.4S, v31.s[0] -sub v19.4s, v21.4s, v18.4s -mla v23.4S, v14.4S, v31.s[0] -add v21.4s, v21.4s, v18.4s -mul v2.4S, v2.4S,v17.s[0] -sub v18.4s, v12.4s, v6.4s -mul v10.4S, v10.4S,v17.s[0] -add v12.4s, v12.4s, v6.4s -mla v2.4S, v24.4S, v31.s[0] -sub v24.4s, v0.4s, v22.4s -mla v10.4S, v15.4S, v31.s[0] -add v0.4s, v0.4s, v22.4s -sqrdmulh v22.4S, v25.4S, v9.s[3] -nop -mul v25.4S, v25.4S,v30.s[3] -nop -sqrdmulh v15.4S, v13.4S, v9.s[2] -sub v6.4s, v8.4s, v16.4s -mul v13.4S, v13.4S,v30.s[2] -add v8.4s, v8.4s, v16.4s -sqrdmulh v16.4S, v18.4S, v9.s[1] -sub v14.4s, v26.4s, v23.4s -mul v18.4S, v18.4S,v30.s[1] -add v26.4s, v26.4s, v23.4s -sqrdmulh v23.4S, v12.4S, v9.s[0] -sub v27.4s, v3.4s, v2.4s -mul v12.4S, v12.4S,v30.s[0] -add v3.4s, v3.4s, v2.4s -mla v25.4S, v22.4S, v31.s[0] -sub v22.4s, v20.4s, v10.4s -sqrdmulh v2.4S, v6.4S, v7.s[3] -add v20.4s, v20.4s, v10.4s -mla v13.4S, v15.4S, v31.s[0] -sub v15.4s, v19.4s, v25.4s -sqrdmulh v10.4S, v8.4S, v7.s[2] -add v19.4s, v19.4s, v25.4s -mla v18.4S, v16.4S, v31.s[0] -sub v16.4s, v21.4s, v13.4s -sqrdmulh v25.4S, v27.4S, v7.s[1] -add v21.4s, v21.4s, v13.4s -mla v12.4S, v23.4S, v31.s[0] -sub v23.4s, v24.4s, v18.4s -sqrdmulh v13.4S, v3.4S, v7.s[0] -add v24.4s, v24.4s, v18.4s -mul v6.4S, v6.4S,v1.s[3] -sub v18.4s, v0.4s, v12.4s -mul v8.4S, v8.4S,v1.s[2] -add v0.4s, v0.4s, v12.4s -mla v6.4S, v2.4S, v31.s[0] -str q15, [x0, #960] -mla v8.4S, v10.4S, v31.s[0] -str q19, [x0, #896] -mul v27.4S, v27.4S,v1.s[1] -str q16, [x0, #832] -mul v3.4S, v3.4S,v1.s[0] -str q21, [x0, #768] -mla v27.4S, v25.4S, v31.s[0] -str q23, [x0, #704] -mla v3.4S, v13.4S, v31.s[0] -str q24, [x0, #640] -ldr q24, [x0, #976] -sqrdmulh v13.4S, v24.4S, v28.s[0] -str q18, [x0, #576] -mul v24.4S, v24.4S,v29.s[0] -str q0, [x0, #512] -ldr q0, [x0, #912] -sqrdmulh v18.4S, v0.4S, v28.s[0] -sub v23.4s, v14.4s, v6.4s -str q23, [x0, #448] -mul v0.4S, v0.4S,v29.s[0] -add v14.4s, v14.4s, v6.4s -ldr q6, [x0, #848] -sqrdmulh v23.4S, v6.4S, v28.s[0] -sub v25.4s, v26.4s, v8.4s -str q14, [x0, #384] -mul v6.4S, v6.4S,v29.s[0] -add v26.4s, v26.4s, v8.4s -ldr q8, [x0, #784] -sqrdmulh v14.4S, v8.4S, v28.s[0] -sub v21.4s, v22.4s, v27.4s -str q25, [x0, #320] -mul v8.4S, v8.4S,v29.s[0] -add v22.4s, v22.4s, v27.4s -ldr q27, [x0, #720] -mla v24.4S, v13.4S, v31.s[0] -sub v13.4s, v20.4s, v3.4s -str q26, [x0, #256] -sqrdmulh v26.4S, v27.4S, v28.s[0] -add v20.4s, v20.4s, v3.4s -ldr q3, [x0, #656] -mla v0.4S, v18.4S, v31.s[0] -str q21, [x0, #192] -sqrdmulh v21.4S, v3.4S, v28.s[0] -nop -ldr q18, [x0, #592] -mla v6.4S, v23.4S, v31.s[0] -str q22, [x0, #128] -sqrdmulh v22.4S, v18.4S, v28.s[0] -nop -ldr q23, [x0, #528] -mla v8.4S, v14.4S, v31.s[0] -str q13, [x0, #64] -sqrdmulh v13.4S, v23.4S, v28.s[0] -nop -ldr q14, [x0, #464] -ldr q25, [x0, #400] -mul v27.4S, v27.4S,v29.s[0] -sub v16.4s, v14.4s, v24.4s -str q20, [x0, #0] -mul v3.4S, v3.4S,v29.s[0] -add v14.4s, v14.4s, v24.4s -ldr q24, [x0, #336] -ldr q20, [x0, #272] -mla v27.4S, v26.4S, v31.s[0] -sub v26.4s, v25.4s, v0.4s -mla v3.4S, v21.4S, v31.s[0] -add v25.4s, v25.4s, v0.4s -ldr q0, [x0, #208] -ldr q21, [x0, #144] -mul v18.4S, v18.4S,v29.s[0] -sub v19.4s, v24.4s, v6.4s -mul v23.4S, v23.4S,v29.s[0] -add v24.4s, v24.4s, v6.4s -ldr q6, [x0, #80] -ldr q10, [x0, #16] -mla v18.4S, v22.4S, v31.s[0] -sub v22.4s, v20.4s, v8.4s -mla v23.4S, v13.4S, v31.s[0] -add v20.4s, v20.4s, v8.4s -sqrdmulh v8.4S, v16.4S, v28.s[2] -nop -mul v16.4S, v16.4S,v29.s[2] -nop -sqrdmulh v13.4S, v26.4S, v28.s[2] -sub v15.4s, v0.4s, v27.4s -mul v26.4S, v26.4S,v29.s[2] -add v0.4s, v0.4s, v27.4s -sqrdmulh v27.4S, v19.4S, v28.s[2] -sub v2.4s, v21.4s, v3.4s -mul v19.4S, v19.4S,v29.s[2] -add v21.4s, v21.4s, v3.4s -sqrdmulh v3.4S, v22.4S, v28.s[2] -sub v12.4s, v6.4s, v18.4s -mul v22.4S, v22.4S,v29.s[2] -add v6.4s, v6.4s, v18.4s -mla v16.4S, v8.4S, v31.s[0] -sub v8.4s, v10.4s, v23.4s -sqrdmulh v18.4S, v14.4S, v28.s[1] -add v10.4s, v10.4s, v23.4s -mla v26.4S, v13.4S, v31.s[0] -nop -sqrdmulh v13.4S, v25.4S, v28.s[1] -nop -mla v19.4S, v27.4S, v31.s[0] -nop -sqrdmulh v27.4S, v24.4S, v28.s[1] -nop -mla v22.4S, v3.4S, v31.s[0] -nop -sqrdmulh v3.4S, v20.4S, v28.s[1] -nop -mul v14.4S, v14.4S,v29.s[1] -sub v23.4s, v15.4s, v16.4s -mul v25.4S, v25.4S,v29.s[1] -add v15.4s, v15.4s, v16.4s -mla v14.4S, v18.4S, v31.s[0] -sub v18.4s, v2.4s, v26.4s -mla v25.4S, v13.4S, v31.s[0] -add v2.4s, v2.4s, v26.4s -mul v24.4S, v24.4S,v29.s[1] -sub v26.4s, v12.4s, v19.4s -mul v20.4S, v20.4S,v29.s[1] -add v12.4s, v12.4s, v19.4s -mla v24.4S, v27.4S, v31.s[0] -sub v27.4s, v8.4s, v22.4s -mla v20.4S, v3.4S, v31.s[0] -add v8.4s, v8.4s, v22.4s -sqrdmulh v28.4S, v23.4S, v11.s[3] -nop -mul v23.4S, v23.4S,v17.s[3] -nop -sqrdmulh v29.4S, v18.4S, v11.s[3] -sub v22.4s, v0.4s, v14.4s -mul v18.4S, v18.4S,v17.s[3] -add v0.4s, v0.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v11.s[2] -sub v3.4s, v21.4s, v25.4s -mul v15.4S, v15.4S,v17.s[2] -add v21.4s, v21.4s, v25.4s -sqrdmulh v25.4S, v2.4S, v11.s[2] -sub v19.4s, v6.4s, v24.4s -mul v2.4S, v2.4S,v17.s[2] -add v6.4s, v6.4s, v24.4s -mla v23.4S, v28.4S, v31.s[0] -sub v28.4s, v10.4s, v20.4s -sqrdmulh v24.4S, v22.4S, v11.s[1] -add v10.4s, v10.4s, v20.4s -mla v18.4S, v29.4S, v31.s[0] -nop -sqrdmulh v29.4S, v3.4S, v11.s[1] -nop -mla v15.4S, v14.4S, v31.s[0] -nop -sqrdmulh v14.4S, v0.4S, v11.s[0] -nop -mla v2.4S, v25.4S, v31.s[0] -nop -sqrdmulh v25.4S, v21.4S, v11.s[0] -nop -mul v22.4S, v22.4S,v17.s[1] -sub v20.4s, v26.4s, v23.4s -mul v3.4S, v3.4S,v17.s[1] -add v26.4s, v26.4s, v23.4s -mla v22.4S, v24.4S, v31.s[0] -sub v24.4s, v27.4s, v18.4s -mla v3.4S, v29.4S, v31.s[0] -add v27.4s, v27.4s, v18.4s -mul v0.4S, v0.4S,v17.s[0] -sub v18.4s, v12.4s, v15.4s -mul v21.4S, v21.4S,v17.s[0] -add v12.4s, v12.4s, v15.4s -mla v0.4S, v14.4S, v31.s[0] -sub v14.4s, v8.4s, v2.4s -mla v21.4S, v25.4S, v31.s[0] -add v8.4s, v8.4s, v2.4s -sqrdmulh v11.4S, v20.4S, v9.s[3] -nop -mul v20.4S, v20.4S,v30.s[3] -nop -sqrdmulh v17.4S, v26.4S, v9.s[2] -sub v2.4s, v19.4s, v22.4s -mul v26.4S, v26.4S,v30.s[2] -add v19.4s, v19.4s, v22.4s -sqrdmulh v22.4S, v18.4S, v9.s[1] -sub v25.4s, v28.4s, v3.4s -mul v18.4S, v18.4S,v30.s[1] -add v28.4s, v28.4s, v3.4s -sqrdmulh v3.4S, v12.4S, v9.s[0] -sub v15.4s, v6.4s, v0.4s -mul v12.4S, v12.4S,v30.s[0] -add v6.4s, v6.4s, v0.4s -mla v20.4S, v11.4S, v31.s[0] -sub v11.4s, v10.4s, v21.4s -sqrdmulh v9.4S, v2.4S, v7.s[3] -add v10.4s, v10.4s, v21.4s -mla v26.4S, v17.4S, v31.s[0] -sub v17.4s, v24.4s, v20.4s -sqrdmulh v21.4S, v19.4S, v7.s[2] -add v24.4s, v24.4s, v20.4s -mla v18.4S, v22.4S, v31.s[0] -sub v22.4s, v27.4s, v26.4s -sqrdmulh v20.4S, v15.4S, v7.s[1] -add v27.4s, v27.4s, v26.4s -mla v12.4S, v3.4S, v31.s[0] -sub v3.4s, v14.4s, v18.4s -sqrdmulh v26.4S, v6.4S, v7.s[0] -add v14.4s, v14.4s, v18.4s -mul v2.4S, v2.4S,v1.s[3] -sub v18.4s, v8.4s, v12.4s -mul v19.4S, v19.4S,v1.s[2] -add v8.4s, v8.4s, v12.4s -mla v2.4S, v9.4S, v31.s[0] -str q17, [x0, #976] -mla v19.4S, v21.4S, v31.s[0] -str q24, [x0, #912] -mul v15.4S, v15.4S,v1.s[1] -str q22, [x0, #848] -mul v6.4S, v6.4S,v1.s[0] -str q27, [x0, #784] -mla v15.4S, v20.4S, v31.s[0] -str q3, [x0, #720] -mla v6.4S, v26.4S, v31.s[0] -str q14, [x0, #656] -str q18, [x0, #592] -str q8, [x0, #528] -sub v8.4s, v25.4s, v2.4s -str q8, [x0, #464] -add v25.4s, v25.4s, v2.4s -sub v2.4s, v28.4s, v19.4s -str q25, [x0, #400] -add v28.4s, v28.4s, v19.4s -sub v19.4s, v11.4s, v15.4s -str q2, [x0, #336] -add v11.4s, v11.4s, v15.4s -sub v15.4s, v10.4s, v6.4s -str q28, [x0, #272] -add v10.4s, v10.4s, v6.4s -str q19, [x0, #208] -str q11, [x0, #144] -str q15, [x0, #80] -str q10, [x0, #16] -ldr q4, [x0, #224] -ldr q5, [x0, #160] -ldr q16, [x0, #32] -ldr q13, [x17, #+128] -ldr q23, [x17, #+144] -sqrdmulh v29.4S, v16.4S, v23.s[0] -mul v16.4S, v16.4S,v13.s[0] -ldr q0, [x0, #48] -sqrdmulh v30.4S, v0.4S, v23.s[0] -mul v0.4S, v0.4S,v13.s[0] -ldr q12, [x17, #+160] -ldr q9, [x17, #+176] -ldr q17, [x0, #96] -sqrdmulh v21.4S, v17.4S, v9.s[0] -mul v17.4S, v17.4S,v12.s[0] -ldr q24, [x0, #112] -sqrdmulh v22.4S, v24.4S, v9.s[0] -mul v24.4S, v24.4S,v12.s[0] -ldr q27, [x17, #+192] -ldr q20, [x17, #+208] -mla v16.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v5.4S, v20.s[0] -ldr q3, [x0, #176] -mla v0.4S, v30.4S, v31.s[0] -sqrdmulh v30.4S, v3.4S, v20.s[0] -ldr q26, [x17, #+224] -ldr q14, [x17, #+240] -mla v17.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v4.4S, v14.s[0] -ldr q1, [x0, #240] -mla v24.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v1.4S, v14.s[0] -ldr q7, [x0, #0] -ldr q18, [x0, #128] -mul v5.4S, v5.4S,v27.s[0] -sub v8.4s, v7.4s, v16.4s -ldr q25, [x0, #16] -mul v3.4S, v3.4S,v27.s[0] -add v7.4s, v7.4s, v16.4s -ldr q16, [x0, #144] -mla v5.4S, v29.4S, v31.s[0] -sub v29.4s, v25.4s, v0.4s -ldr q2, [x0, #64] -mla v3.4S, v30.4S, v31.s[0] -add v25.4s, v25.4s, v0.4s -ldr q0, [x0, #192] -mul v4.4S, v4.4S,v26.s[0] -sub v30.4s, v2.4s, v17.4s -ldr q28, [x0, #80] -mul v1.4S, v1.4S,v26.s[0] -add v2.4s, v2.4s, v17.4s -ldr q17, [x0, #208] -mla v4.4S, v21.4S, v31.s[0] -mla v1.4S, v22.4S, v31.s[0] -sub v22.4s, v28.4s, v24.4s -sqrdmulh v21.4S, v25.4S, v23.s[1] -add v28.4s, v28.4s, v24.4s -mul v25.4S, v25.4S,v13.s[1] -sqrdmulh v24.4S, v29.4S, v23.s[2] -sub v6.4s, v18.4s, v5.4s -mul v29.4S, v29.4S,v13.s[2] -add v18.4s, v18.4s, v5.4s -sqrdmulh v23.4S, v28.4S, v9.s[1] -sub v13.4s, v16.4s, v3.4s -mul v28.4S, v28.4S,v12.s[1] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v22.4S, v9.s[2] -sub v5.4s, v0.4s, v4.4s -mul v22.4S, v22.4S,v12.s[2] -add v0.4s, v0.4s, v4.4s -mla v25.4S, v21.4S, v31.s[0] -sub v21.4s, v17.4s, v1.4s -ldr q9, [x0, #480] -sqrdmulh v12.4S, v16.4S, v20.s[1] -add v17.4s, v17.4s, v1.4s -mla v29.4S, v24.4S, v31.s[0] -ldr q24, [x0, #416] -sqrdmulh v1.4S, v13.4S, v20.s[2] -sub v4.4s, v7.4s, v25.4s -mla v28.4S, v23.4S, v31.s[0] -ldr q23, [x0, #288] -sqrdmulh v19.4S, v17.4S, v14.s[1] -add v7.4s, v7.4s, v25.4s -str q4, [x0, #16] -mla v22.4S, v3.4S, v31.s[0] -ldr q3, [x17, #+256] -ldr q4, [x17, #+272] -sqrdmulh v25.4S, v21.4S, v14.s[2] -sub v11.4s, v8.4s, v29.4s -str q7, [x0, #0] -mul v16.4S, v16.4S,v27.s[1] -add v8.4s, v8.4s, v29.4s -mul v13.4S, v13.4S,v27.s[2] -str q11, [x0, #48] -mla v16.4S, v12.4S, v31.s[0] -sub v12.4s, v2.4s, v28.4s -mla v13.4S, v1.4S, v31.s[0] -str q8, [x0, #32] -mul v17.4S, v17.4S,v26.s[1] -str q12, [x0, #80] -mul v21.4S, v21.4S,v26.s[2] -add v2.4s, v2.4s, v28.4s -str q2, [x0, #64] -mla v17.4S, v19.4S, v31.s[0] -sub v19.4s, v30.4s, v22.4s -str q19, [x0, #112] -mla v21.4S, v25.4S, v31.s[0] -add v30.4s, v30.4s, v22.4s -str q30, [x0, #96] -sqrdmulh v14.4S, v23.4S, v4.s[0] -sub v26.4s, v18.4s, v16.4s -mul v23.4S, v23.4S,v3.s[0] -str q26, [x0, #144] -ldr q26, [x0, #304] -sqrdmulh v30.4S, v26.4S, v4.s[0] -add v18.4s, v18.4s, v16.4s -mul v26.4S, v26.4S,v3.s[0] -str q18, [x0, #128] -ldr q18, [x17, #+288] -ldr q16, [x17, #+304] -ldr q22, [x0, #352] -sqrdmulh v25.4S, v22.4S, v16.s[0] -sub v19.4s, v6.4s, v13.4s -mul v22.4S, v22.4S,v18.s[0] -str q19, [x0, #176] -ldr q19, [x0, #368] -sqrdmulh v2.4S, v19.4S, v16.s[0] -add v6.4s, v6.4s, v13.4s -mul v19.4S, v19.4S,v18.s[0] -str q6, [x0, #160] -ldr q6, [x17, #+320] -ldr q13, [x17, #+336] -mla v23.4S, v14.4S, v31.s[0] -sub v14.4s, v0.4s, v17.4s -sqrdmulh v28.4S, v24.4S, v13.s[0] -str q14, [x0, #208] -ldr q14, [x0, #432] -mla v26.4S, v30.4S, v31.s[0] -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v14.4S, v13.s[0] -str q0, [x0, #192] -ldr q0, [x17, #+352] -ldr q30, [x17, #+368] -mla v22.4S, v25.4S, v31.s[0] -sub v25.4s, v5.4s, v21.4s -sqrdmulh v12.4S, v9.4S, v30.s[0] -str q25, [x0, #240] -ldr q25, [x0, #496] -mla v19.4S, v2.4S, v31.s[0] -add v5.4s, v5.4s, v21.4s -sqrdmulh v21.4S, v25.4S, v30.s[0] -str q5, [x0, #224] -ldr q5, [x0, #256] -ldr q2, [x0, #384] -mul v24.4S, v24.4S,v6.s[0] -sub v20.4s, v5.4s, v23.4s -ldr q27, [x0, #272] -mul v14.4S, v14.4S,v6.s[0] -add v5.4s, v5.4s, v23.4s -ldr q23, [x0, #400] -mla v24.4S, v28.4S, v31.s[0] -sub v28.4s, v27.4s, v26.4s -ldr q8, [x0, #320] -mla v14.4S, v17.4S, v31.s[0] -add v27.4s, v27.4s, v26.4s -ldr q26, [x0, #448] -mul v9.4S, v9.4S,v0.s[0] -sub v17.4s, v8.4s, v22.4s -ldr q1, [x0, #336] -mul v25.4S, v25.4S,v0.s[0] -add v8.4s, v8.4s, v22.4s -ldr q22, [x0, #464] -mla v9.4S, v12.4S, v31.s[0] -mla v25.4S, v21.4S, v31.s[0] -sub v21.4s, v1.4s, v19.4s -sqrdmulh v12.4S, v27.4S, v4.s[1] -add v1.4s, v1.4s, v19.4s -mul v27.4S, v27.4S,v3.s[1] -sqrdmulh v19.4S, v28.4S, v4.s[2] -sub v11.4s, v2.4s, v24.4s -mul v28.4S, v28.4S,v3.s[2] -add v2.4s, v2.4s, v24.4s -sqrdmulh v4.4S, v1.4S, v16.s[1] -sub v3.4s, v23.4s, v14.4s -mul v1.4S, v1.4S,v18.s[1] -add v23.4s, v23.4s, v14.4s -sqrdmulh v14.4S, v21.4S, v16.s[2] -sub v24.4s, v26.4s, v9.4s -mul v21.4S, v21.4S,v18.s[2] -add v26.4s, v26.4s, v9.4s -mla v27.4S, v12.4S, v31.s[0] -sub v12.4s, v22.4s, v25.4s -ldr q16, [x0, #736] -sqrdmulh v18.4S, v23.4S, v13.s[1] -add v22.4s, v22.4s, v25.4s -mla v28.4S, v19.4S, v31.s[0] -ldr q19, [x0, #672] -sqrdmulh v25.4S, v3.4S, v13.s[2] -sub v9.4s, v5.4s, v27.4s -mla v1.4S, v4.4S, v31.s[0] -ldr q4, [x0, #544] -sqrdmulh v29.4S, v22.4S, v30.s[1] -add v5.4s, v5.4s, v27.4s -str q9, [x0, #272] -mla v21.4S, v14.4S, v31.s[0] -ldr q14, [x17, #+384] -ldr q9, [x17, #+400] -sqrdmulh v27.4S, v12.4S, v30.s[2] -sub v7.4s, v20.4s, v28.4s -str q5, [x0, #256] -mul v23.4S, v23.4S,v6.s[1] -add v20.4s, v20.4s, v28.4s -mul v3.4S, v3.4S,v6.s[2] -str q7, [x0, #304] -mla v23.4S, v18.4S, v31.s[0] -sub v18.4s, v8.4s, v1.4s -mla v3.4S, v25.4S, v31.s[0] -str q20, [x0, #288] -mul v22.4S, v22.4S,v0.s[1] -str q18, [x0, #336] -mul v12.4S, v12.4S,v0.s[2] -add v8.4s, v8.4s, v1.4s -str q8, [x0, #320] -mla v22.4S, v29.4S, v31.s[0] -sub v29.4s, v17.4s, v21.4s -str q29, [x0, #368] -mla v12.4S, v27.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -str q17, [x0, #352] -sqrdmulh v30.4S, v4.4S, v9.s[0] -sub v0.4s, v2.4s, v23.4s -mul v4.4S, v4.4S,v14.s[0] -str q0, [x0, #400] -ldr q0, [x0, #560] -sqrdmulh v17.4S, v0.4S, v9.s[0] -add v2.4s, v2.4s, v23.4s -mul v0.4S, v0.4S,v14.s[0] -str q2, [x0, #384] -ldr q2, [x17, #+416] -ldr q23, [x17, #+432] -ldr q21, [x0, #608] -sqrdmulh v27.4S, v21.4S, v23.s[0] -sub v29.4s, v11.4s, v3.4s -mul v21.4S, v21.4S,v2.s[0] -str q29, [x0, #432] -ldr q29, [x0, #624] -sqrdmulh v8.4S, v29.4S, v23.s[0] -add v11.4s, v11.4s, v3.4s -mul v29.4S, v29.4S,v2.s[0] -str q11, [x0, #416] -ldr q11, [x17, #+448] -ldr q3, [x17, #+464] -mla v4.4S, v30.4S, v31.s[0] -sub v30.4s, v26.4s, v22.4s -sqrdmulh v1.4S, v19.4S, v3.s[0] -str q30, [x0, #464] -ldr q30, [x0, #688] -mla v0.4S, v17.4S, v31.s[0] -add v26.4s, v26.4s, v22.4s -sqrdmulh v22.4S, v30.4S, v3.s[0] -str q26, [x0, #448] -ldr q26, [x17, #+480] -ldr q17, [x17, #+496] -mla v21.4S, v27.4S, v31.s[0] -sub v27.4s, v24.4s, v12.4s -sqrdmulh v18.4S, v16.4S, v17.s[0] -str q27, [x0, #496] -ldr q27, [x0, #752] -mla v29.4S, v8.4S, v31.s[0] -add v24.4s, v24.4s, v12.4s -sqrdmulh v12.4S, v27.4S, v17.s[0] -str q24, [x0, #480] -ldr q24, [x0, #512] -ldr q8, [x0, #640] -mul v19.4S, v19.4S,v11.s[0] -sub v13.4s, v24.4s, v4.4s -ldr q6, [x0, #528] -mul v30.4S, v30.4S,v11.s[0] -add v24.4s, v24.4s, v4.4s -ldr q4, [x0, #656] -mla v19.4S, v1.4S, v31.s[0] -sub v1.4s, v6.4s, v0.4s -ldr q20, [x0, #576] -mla v30.4S, v22.4S, v31.s[0] -add v6.4s, v6.4s, v0.4s -ldr q0, [x0, #704] -mul v16.4S, v16.4S,v26.s[0] -sub v22.4s, v20.4s, v21.4s -ldr q25, [x0, #592] -mul v27.4S, v27.4S,v26.s[0] -add v20.4s, v20.4s, v21.4s -ldr q21, [x0, #720] -mla v16.4S, v18.4S, v31.s[0] -mla v27.4S, v12.4S, v31.s[0] -sub v12.4s, v25.4s, v29.4s -sqrdmulh v18.4S, v6.4S, v9.s[1] -add v25.4s, v25.4s, v29.4s -mul v6.4S, v6.4S,v14.s[1] -sqrdmulh v29.4S, v1.4S, v9.s[2] -sub v7.4s, v8.4s, v19.4s -mul v1.4S, v1.4S,v14.s[2] -add v8.4s, v8.4s, v19.4s -sqrdmulh v9.4S, v25.4S, v23.s[1] -sub v14.4s, v4.4s, v30.4s -mul v25.4S, v25.4S,v2.s[1] -add v4.4s, v4.4s, v30.4s -sqrdmulh v30.4S, v12.4S, v23.s[2] -sub v19.4s, v0.4s, v16.4s -mul v12.4S, v12.4S,v2.s[2] -add v0.4s, v0.4s, v16.4s -mla v6.4S, v18.4S, v31.s[0] -sub v18.4s, v21.4s, v27.4s -ldr q23, [x0, #992] -sqrdmulh v2.4S, v4.4S, v3.s[1] -add v21.4s, v21.4s, v27.4s -mla v1.4S, v29.4S, v31.s[0] -ldr q29, [x0, #928] -sqrdmulh v27.4S, v14.4S, v3.s[2] -sub v16.4s, v24.4s, v6.4s -mla v25.4S, v9.4S, v31.s[0] -ldr q9, [x0, #800] -sqrdmulh v28.4S, v21.4S, v17.s[1] -add v24.4s, v24.4s, v6.4s -str q16, [x0, #528] -mla v12.4S, v30.4S, v31.s[0] -ldr q30, [x17, #+512] -ldr q16, [x17, #+528] -sqrdmulh v6.4S, v18.4S, v17.s[2] -sub v5.4s, v13.4s, v1.4s -str q24, [x0, #512] -mul v4.4S, v4.4S,v11.s[1] -add v13.4s, v13.4s, v1.4s -mul v14.4S, v14.4S,v11.s[2] -str q5, [x0, #560] -mla v4.4S, v2.4S, v31.s[0] -sub v2.4s, v20.4s, v25.4s -mla v14.4S, v27.4S, v31.s[0] -str q13, [x0, #544] -mul v21.4S, v21.4S,v26.s[1] -str q2, [x0, #592] -mul v18.4S, v18.4S,v26.s[2] -add v20.4s, v20.4s, v25.4s -str q20, [x0, #576] -mla v21.4S, v28.4S, v31.s[0] -sub v28.4s, v22.4s, v12.4s -str q28, [x0, #624] -mla v18.4S, v6.4S, v31.s[0] -add v22.4s, v22.4s, v12.4s -str q22, [x0, #608] -sqrdmulh v17.4S, v9.4S, v16.s[0] -sub v26.4s, v8.4s, v4.4s -mul v9.4S, v9.4S,v30.s[0] -str q26, [x0, #656] -ldr q26, [x0, #816] -sqrdmulh v22.4S, v26.4S, v16.s[0] -add v8.4s, v8.4s, v4.4s -mul v26.4S, v26.4S,v30.s[0] -str q8, [x0, #640] -ldr q8, [x17, #+544] -ldr q4, [x17, #+560] -ldr q12, [x0, #864] -sqrdmulh v6.4S, v12.4S, v4.s[0] -sub v28.4s, v7.4s, v14.4s -mul v12.4S, v12.4S,v8.s[0] -str q28, [x0, #688] -ldr q28, [x0, #880] -sqrdmulh v20.4S, v28.4S, v4.s[0] -add v7.4s, v7.4s, v14.4s -mul v28.4S, v28.4S,v8.s[0] -str q7, [x0, #672] -ldr q7, [x17, #+576] -ldr q14, [x17, #+592] -mla v9.4S, v17.4S, v31.s[0] -sub v17.4s, v0.4s, v21.4s -sqrdmulh v25.4S, v29.4S, v14.s[0] -str q17, [x0, #720] -ldr q17, [x0, #944] -mla v26.4S, v22.4S, v31.s[0] -add v0.4s, v0.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v14.s[0] -str q0, [x0, #704] -ldr q0, [x17, #+608] -ldr q22, [x17, #+624] -mla v12.4S, v6.4S, v31.s[0] -sub v6.4s, v19.4s, v18.4s -sqrdmulh v2.4S, v23.4S, v22.s[0] -str q6, [x0, #752] -ldr q6, [x0, #1008] -mla v28.4S, v20.4S, v31.s[0] -add v19.4s, v19.4s, v18.4s -sqrdmulh v18.4S, v6.4S, v22.s[0] -str q19, [x0, #736] -ldr q19, [x0, #768] -ldr q20, [x0, #896] -mul v29.4S, v29.4S,v7.s[0] -sub v3.4s, v19.4s, v9.4s -ldr q11, [x0, #784] -mul v17.4S, v17.4S,v7.s[0] -add v19.4s, v19.4s, v9.4s -ldr q9, [x0, #912] -mla v29.4S, v25.4S, v31.s[0] -sub v25.4s, v11.4s, v26.4s -ldr q13, [x0, #832] -mla v17.4S, v21.4S, v31.s[0] -add v11.4s, v11.4s, v26.4s -ldr q26, [x0, #960] -mul v23.4S, v23.4S,v0.s[0] -sub v21.4s, v13.4s, v12.4s -ldr q27, [x0, #848] -mul v6.4S, v6.4S,v0.s[0] -add v13.4s, v13.4s, v12.4s -ldr q12, [x0, #976] -mla v23.4S, v2.4S, v31.s[0] -mla v6.4S, v18.4S, v31.s[0] -sub v18.4s, v27.4s, v28.4s -sqrdmulh v2.4S, v11.4S, v16.s[1] -add v27.4s, v27.4s, v28.4s -mul v11.4S, v11.4S,v30.s[1] -sqrdmulh v28.4S, v25.4S, v16.s[2] -sub v5.4s, v20.4s, v29.4s -mul v25.4S, v25.4S,v30.s[2] -add v20.4s, v20.4s, v29.4s -sqrdmulh v16.4S, v27.4S, v4.s[1] -sub v30.4s, v9.4s, v17.4s -mul v27.4S, v27.4S,v8.s[1] -add v9.4s, v9.4s, v17.4s -sqrdmulh v17.4S, v18.4S, v4.s[2] -sub v29.4s, v26.4s, v23.4s -mul v18.4S, v18.4S,v8.s[2] -add v26.4s, v26.4s, v23.4s -mla v11.4S, v2.4S, v31.s[0] -sub v2.4s, v12.4s, v6.4s -sqrdmulh v4.4S, v9.4S, v14.s[1] -add v12.4s, v12.4s, v6.4s -mla v25.4S, v28.4S, v31.s[0] -sqrdmulh v28.4S, v30.4S, v14.s[2] -sub v6.4s, v19.4s, v11.4s -mla v27.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v12.4S, v22.s[1] -add v19.4s, v19.4s, v11.4s -str q6, [x0, #784] -mla v18.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v2.4S, v22.s[2] -sub v6.4s, v3.4s, v25.4s -str q19, [x0, #768] -mul v9.4S, v9.4S,v7.s[1] -add v3.4s, v3.4s, v25.4s -mul v30.4S, v30.4S,v7.s[2] -str q6, [x0, #816] -mla v9.4S, v4.4S, v31.s[0] -sub v4.4s, v13.4s, v27.4s -mla v30.4S, v28.4S, v31.s[0] -str q3, [x0, #800] -mul v12.4S, v12.4S,v0.s[1] -str q4, [x0, #848] -mul v2.4S, v2.4S,v0.s[2] -add v13.4s, v13.4s, v27.4s -str q13, [x0, #832] -mla v12.4S, v16.4S, v31.s[0] -sub v16.4s, v21.4s, v18.4s -str q16, [x0, #880] -mla v2.4S, v17.4S, v31.s[0] -add v21.4s, v21.4s, v18.4s -str q21, [x0, #864] -sub v22.4s, v20.4s, v9.4s -str q22, [x0, #912] -add v20.4s, v20.4s, v9.4s -str q20, [x0, #896] -sub v20.4s, v5.4s, v30.4s -str q20, [x0, #944] -add v5.4s, v5.4s, v30.4s -str q5, [x0, #928] -sub v5.4s, v26.4s, v12.4s -str q5, [x0, #976] -add v26.4s, v26.4s, v12.4s -str q26, [x0, #960] -sub v26.4s, v29.4s, v2.4s -str q26, [x0, #1008] -add v29.4s, v29.4s, v2.4s -str q29, [x0, #992] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1548 -// Instruction count: 1544 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_16_z4_7.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_16_z4_7.s deleted file mode 100644 index d7db1d0..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_16_z4_7.s +++ /dev/null @@ -1,1578 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_16_z4_7 -.global _ntt_u32_incomplete_neon_asm_var_4_2_16_z4_7 -ntt_u32_incomplete_neon_asm_var_4_2_16_z4_7: -_ntt_u32_incomplete_neon_asm_var_4_2_16_z4_7: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x0, #992] -ldr q29, [x17, #+0] -ldr q28, [x17, #+16] -sqrdmulh v27.4S, v30.4S, v28.s[0] -mul v30.4S, v30.4S,v29.s[0] -ldr q26, [x0, #928] -sqrdmulh v25.4S, v26.4S, v28.s[0] -mul v26.4S, v26.4S,v29.s[0] -ldr q24, [x0, #864] -sqrdmulh v23.4S, v24.4S, v28.s[0] -mul v24.4S, v24.4S,v29.s[0] -ldr q22, [x0, #800] -sqrdmulh v21.4S, v22.4S, v28.s[0] -mul v22.4S, v22.4S,v29.s[0] -ldr q20, [x0, #736] -mla v30.4S, v27.4S, v31.s[0] -sqrdmulh v27.4S, v20.4S, v28.s[0] -ldr q19, [x0, #672] -mla v26.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v19.4S, v28.s[0] -nop -ldr q18, [x0, #608] -mla v24.4S, v23.4S, v31.s[0] -sqrdmulh v23.4S, v18.4S, v28.s[0] -nop -ldr q17, [x0, #544] -mla v22.4S, v21.4S, v31.s[0] -nop -sqrdmulh v21.4S, v17.4S, v28.s[0] -ldr q16, [x0, #480] -ldr q3, [x0, #416] -mul v20.4S, v20.4S,v29.s[0] -sub v2.4s, v16.4s, v30.4s -mul v19.4S, v19.4S,v29.s[0] -add v16.4s, v16.4s, v30.4s -ldr q30, [x0, #352] -ldr q1, [x0, #288] -mla v20.4S, v27.4S, v31.s[0] -sub v27.4s, v3.4s, v26.4s -mla v19.4S, v25.4S, v31.s[0] -add v3.4s, v3.4s, v26.4s -ldr q26, [x0, #224] -ldr q25, [x0, #160] -mul v18.4S, v18.4S,v29.s[0] -sub v0.4s, v30.4s, v24.4s -mul v17.4S, v17.4S,v29.s[0] -add v30.4s, v30.4s, v24.4s -ldr q24, [x0, #96] -ldr q15, [x0, #32] -mla v18.4S, v23.4S, v31.s[0] -sub v23.4s, v1.4s, v22.4s -mla v17.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v2.4S, v28.s[2] -nop -mul v2.4S, v2.4S,v29.s[2] -nop -sqrdmulh v21.4S, v27.4S, v28.s[2] -sub v14.4s, v26.4s, v20.4s -mul v27.4S, v27.4S,v29.s[2] -add v26.4s, v26.4s, v20.4s -sqrdmulh v20.4S, v16.4S, v28.s[1] -sub v13.4s, v25.4s, v19.4s -mul v16.4S, v16.4S,v29.s[1] -add v25.4s, v25.4s, v19.4s -sqrdmulh v19.4S, v3.4S, v28.s[1] -sub v12.4s, v24.4s, v18.4s -mul v3.4S, v3.4S,v29.s[1] -add v24.4s, v24.4s, v18.4s -mla v2.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v17.4s -sqrdmulh v18.4S, v0.4S, v28.s[2] -add v15.4s, v15.4s, v17.4s -mla v27.4S, v21.4S, v31.s[0] -nop -sqrdmulh v21.4S, v23.4S, v28.s[2] -nop -mla v16.4S, v20.4S, v31.s[0] -nop -sqrdmulh v20.4S, v30.4S, v28.s[1] -nop -mla v3.4S, v19.4S, v31.s[0] -nop -sqrdmulh v19.4S, v1.4S, v28.s[1] -nop -ldr q17, [x17, #+32] -ldr q11, [x17, #+48] -mul v0.4S, v0.4S,v29.s[2] -sub v10.4s, v14.4s, v2.4s -mul v23.4S, v23.4S,v29.s[2] -add v14.4s, v14.4s, v2.4s -mla v0.4S, v18.4S, v31.s[0] -sub v18.4s, v13.4s, v27.4s -mla v23.4S, v21.4S, v31.s[0] -add v13.4s, v13.4s, v27.4s -mul v30.4S, v30.4S,v29.s[1] -sub v27.4s, v26.4s, v16.4s -mul v1.4S, v1.4S,v29.s[1] -add v26.4s, v26.4s, v16.4s -mla v30.4S, v20.4S, v31.s[0] -sub v20.4s, v25.4s, v3.4s -mla v1.4S, v19.4S, v31.s[0] -add v25.4s, v25.4s, v3.4s -sqrdmulh v3.4S, v10.4S, v11.s[3] -nop -mul v10.4S, v10.4S,v17.s[3] -nop -sqrdmulh v19.4S, v14.4S, v11.s[2] -sub v16.4s, v12.4s, v0.4s -mul v14.4S, v14.4S,v17.s[2] -add v12.4s, v12.4s, v0.4s -sqrdmulh v0.4S, v27.4S, v11.s[1] -sub v21.4s, v22.4s, v23.4s -mul v27.4S, v27.4S,v17.s[1] -add v22.4s, v22.4s, v23.4s -sqrdmulh v23.4S, v26.4S, v11.s[0] -sub v2.4s, v24.4s, v30.4s -mul v26.4S, v26.4S,v17.s[0] -add v24.4s, v24.4s, v30.4s -ldr q30, [x17, #+96] -ldr q9, [x17, #+112] -mla v10.4S, v3.4S, v31.s[0] -sub v3.4s, v15.4s, v1.4s -sqrdmulh v8.4S, v18.4S, v11.s[3] -add v15.4s, v15.4s, v1.4s -mla v14.4S, v19.4S, v31.s[0] -nop -sqrdmulh v19.4S, v13.4S, v11.s[2] -nop -mla v27.4S, v0.4S, v31.s[0] -nop -sqrdmulh v0.4S, v20.4S, v11.s[1] -nop -mla v26.4S, v23.4S, v31.s[0] -nop -sqrdmulh v23.4S, v25.4S, v11.s[0] -nop -ldr q1, [x17, #+64] -ldr q7, [x17, #+80] -mul v18.4S, v18.4S,v17.s[3] -sub v6.4s, v16.4s, v10.4s -mul v13.4S, v13.4S,v17.s[2] -add v16.4s, v16.4s, v10.4s -mla v18.4S, v8.4S, v31.s[0] -sub v8.4s, v12.4s, v14.4s -mla v13.4S, v19.4S, v31.s[0] -add v12.4s, v12.4s, v14.4s -mul v20.4S, v20.4S,v17.s[1] -sub v14.4s, v2.4s, v27.4s -mul v25.4S, v25.4S,v17.s[0] -add v2.4s, v2.4s, v27.4s -mla v20.4S, v0.4S, v31.s[0] -sub v0.4s, v24.4s, v26.4s -mla v25.4S, v23.4S, v31.s[0] -add v24.4s, v24.4s, v26.4s -sqrdmulh v26.4S, v6.4S, v9.s[3] -nop -mul v6.4S, v6.4S,v30.s[3] -nop -sqrdmulh v23.4S, v16.4S, v9.s[2] -sub v27.4s, v21.4s, v18.4s -mul v16.4S, v16.4S,v30.s[2] -add v21.4s, v21.4s, v18.4s -sqrdmulh v18.4S, v8.4S, v9.s[1] -sub v19.4s, v22.4s, v13.4s -mul v8.4S, v8.4S,v30.s[1] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v12.4S, v9.s[0] -sub v10.4s, v3.4s, v20.4s -mul v12.4S, v12.4S,v30.s[0] -add v3.4s, v3.4s, v20.4s -mla v6.4S, v26.4S, v31.s[0] -sub v26.4s, v15.4s, v25.4s -sqrdmulh v20.4S, v14.4S, v7.s[3] -add v15.4s, v15.4s, v25.4s -mla v16.4S, v23.4S, v31.s[0] -sub v23.4s, v27.4s, v6.4s -sqrdmulh v25.4S, v2.4S, v7.s[2] -add v27.4s, v27.4s, v6.4s -mla v8.4S, v18.4S, v31.s[0] -sub v18.4s, v21.4s, v16.4s -sqrdmulh v6.4S, v0.4S, v7.s[1] -add v21.4s, v21.4s, v16.4s -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v19.4s, v8.4s -sqrdmulh v16.4S, v24.4S, v7.s[0] -add v19.4s, v19.4s, v8.4s -mul v14.4S, v14.4S,v1.s[3] -sub v8.4s, v22.4s, v12.4s -mul v2.4S, v2.4S,v1.s[2] -add v22.4s, v22.4s, v12.4s -mla v14.4S, v20.4S, v31.s[0] -str q23, [x0, #992] -mla v2.4S, v25.4S, v31.s[0] -str q27, [x0, #928] -mul v0.4S, v0.4S,v1.s[1] -str q18, [x0, #864] -mul v24.4S, v24.4S,v1.s[0] -str q21, [x0, #800] -mla v0.4S, v6.4S, v31.s[0] -str q13, [x0, #736] -mla v24.4S, v16.4S, v31.s[0] -str q19, [x0, #672] -ldr q19, [x0, #1008] -sqrdmulh v16.4S, v19.4S, v28.s[0] -str q8, [x0, #608] -mul v19.4S, v19.4S,v29.s[0] -str q22, [x0, #544] -ldr q22, [x0, #944] -sqrdmulh v8.4S, v22.4S, v28.s[0] -sub v13.4s, v10.4s, v14.4s -str q13, [x0, #480] -mul v22.4S, v22.4S,v29.s[0] -add v10.4s, v10.4s, v14.4s -ldr q14, [x0, #880] -sqrdmulh v13.4S, v14.4S, v28.s[0] -sub v6.4s, v3.4s, v2.4s -str q10, [x0, #416] -mul v14.4S, v14.4S,v29.s[0] -add v3.4s, v3.4s, v2.4s -ldr q2, [x0, #816] -sqrdmulh v10.4S, v2.4S, v28.s[0] -sub v21.4s, v26.4s, v0.4s -str q6, [x0, #352] -mul v2.4S, v2.4S,v29.s[0] -add v26.4s, v26.4s, v0.4s -ldr q0, [x0, #752] -mla v19.4S, v16.4S, v31.s[0] -sub v16.4s, v15.4s, v24.4s -str q3, [x0, #288] -sqrdmulh v3.4S, v0.4S, v28.s[0] -add v15.4s, v15.4s, v24.4s -ldr q24, [x0, #688] -mla v22.4S, v8.4S, v31.s[0] -str q21, [x0, #224] -sqrdmulh v21.4S, v24.4S, v28.s[0] -nop -ldr q8, [x0, #624] -mla v14.4S, v13.4S, v31.s[0] -str q26, [x0, #160] -sqrdmulh v26.4S, v8.4S, v28.s[0] -nop -ldr q13, [x0, #560] -mla v2.4S, v10.4S, v31.s[0] -nop -sqrdmulh v10.4S, v13.4S, v28.s[0] -str q16, [x0, #96] -ldr q16, [x0, #496] -ldr q6, [x0, #432] -mul v0.4S, v0.4S,v29.s[0] -sub v18.4s, v16.4s, v19.4s -str q15, [x0, #32] -mul v24.4S, v24.4S,v29.s[0] -add v16.4s, v16.4s, v19.4s -ldr q19, [x0, #368] -ldr q15, [x0, #304] -mla v0.4S, v3.4S, v31.s[0] -sub v3.4s, v6.4s, v22.4s -mla v24.4S, v21.4S, v31.s[0] -add v6.4s, v6.4s, v22.4s -ldr q22, [x0, #240] -ldr q21, [x0, #176] -mul v8.4S, v8.4S,v29.s[0] -sub v27.4s, v19.4s, v14.4s -mul v13.4S, v13.4S,v29.s[0] -add v19.4s, v19.4s, v14.4s -ldr q14, [x0, #112] -ldr q25, [x0, #48] -mla v8.4S, v26.4S, v31.s[0] -sub v26.4s, v15.4s, v2.4s -mla v13.4S, v10.4S, v31.s[0] -add v15.4s, v15.4s, v2.4s -sqrdmulh v2.4S, v18.4S, v28.s[2] -nop -mul v18.4S, v18.4S,v29.s[2] -nop -sqrdmulh v10.4S, v3.4S, v28.s[2] -sub v23.4s, v22.4s, v0.4s -mul v3.4S, v3.4S,v29.s[2] -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v28.s[1] -sub v20.4s, v21.4s, v24.4s -mul v16.4S, v16.4S,v29.s[1] -add v21.4s, v21.4s, v24.4s -sqrdmulh v24.4S, v6.4S, v28.s[1] -sub v12.4s, v14.4s, v8.4s -mul v6.4S, v6.4S,v29.s[1] -add v14.4s, v14.4s, v8.4s -mla v18.4S, v2.4S, v31.s[0] -sub v2.4s, v25.4s, v13.4s -sqrdmulh v8.4S, v27.4S, v28.s[2] -add v25.4s, v25.4s, v13.4s -mla v3.4S, v10.4S, v31.s[0] -nop -sqrdmulh v10.4S, v26.4S, v28.s[2] -nop -mla v16.4S, v0.4S, v31.s[0] -nop -sqrdmulh v0.4S, v19.4S, v28.s[1] -nop -mla v6.4S, v24.4S, v31.s[0] -nop -sqrdmulh v24.4S, v15.4S, v28.s[1] -nop -mul v27.4S, v27.4S,v29.s[2] -sub v13.4s, v23.4s, v18.4s -mul v26.4S, v26.4S,v29.s[2] -add v23.4s, v23.4s, v18.4s -mla v27.4S, v8.4S, v31.s[0] -sub v8.4s, v20.4s, v3.4s -mla v26.4S, v10.4S, v31.s[0] -add v20.4s, v20.4s, v3.4s -mul v19.4S, v19.4S,v29.s[1] -sub v3.4s, v22.4s, v16.4s -mul v15.4S, v15.4S,v29.s[1] -add v22.4s, v22.4s, v16.4s -mla v19.4S, v0.4S, v31.s[0] -sub v0.4s, v21.4s, v6.4s -mla v15.4S, v24.4S, v31.s[0] -add v21.4s, v21.4s, v6.4s -sqrdmulh v6.4S, v13.4S, v11.s[3] -nop -mul v13.4S, v13.4S,v17.s[3] -nop -sqrdmulh v24.4S, v23.4S, v11.s[2] -sub v16.4s, v12.4s, v27.4s -mul v23.4S, v23.4S,v17.s[2] -add v12.4s, v12.4s, v27.4s -sqrdmulh v27.4S, v3.4S, v11.s[1] -sub v10.4s, v2.4s, v26.4s -mul v3.4S, v3.4S,v17.s[1] -add v2.4s, v2.4s, v26.4s -sqrdmulh v26.4S, v22.4S, v11.s[0] -sub v18.4s, v14.4s, v19.4s -mul v22.4S, v22.4S,v17.s[0] -add v14.4s, v14.4s, v19.4s -mla v13.4S, v6.4S, v31.s[0] -sub v6.4s, v25.4s, v15.4s -sqrdmulh v19.4S, v8.4S, v11.s[3] -add v25.4s, v25.4s, v15.4s -mla v23.4S, v24.4S, v31.s[0] -nop -sqrdmulh v24.4S, v20.4S, v11.s[2] -nop -mla v3.4S, v27.4S, v31.s[0] -nop -sqrdmulh v27.4S, v0.4S, v11.s[1] -nop -mla v22.4S, v26.4S, v31.s[0] -nop -sqrdmulh v26.4S, v21.4S, v11.s[0] -nop -mul v8.4S, v8.4S,v17.s[3] -sub v15.4s, v16.4s, v13.4s -mul v20.4S, v20.4S,v17.s[2] -add v16.4s, v16.4s, v13.4s -mla v8.4S, v19.4S, v31.s[0] -sub v19.4s, v12.4s, v23.4s -mla v20.4S, v24.4S, v31.s[0] -add v12.4s, v12.4s, v23.4s -mul v0.4S, v0.4S,v17.s[1] -sub v23.4s, v18.4s, v3.4s -mul v21.4S, v21.4S,v17.s[0] -add v18.4s, v18.4s, v3.4s -mla v0.4S, v27.4S, v31.s[0] -sub v27.4s, v14.4s, v22.4s -mla v21.4S, v26.4S, v31.s[0] -add v14.4s, v14.4s, v22.4s -sqrdmulh v22.4S, v15.4S, v9.s[3] -nop -mul v15.4S, v15.4S,v30.s[3] -nop -sqrdmulh v26.4S, v16.4S, v9.s[2] -sub v3.4s, v10.4s, v8.4s -mul v16.4S, v16.4S,v30.s[2] -add v10.4s, v10.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v9.s[1] -sub v24.4s, v2.4s, v20.4s -mul v19.4S, v19.4S,v30.s[1] -add v2.4s, v2.4s, v20.4s -sqrdmulh v20.4S, v12.4S, v9.s[0] -sub v13.4s, v6.4s, v0.4s -mul v12.4S, v12.4S,v30.s[0] -add v6.4s, v6.4s, v0.4s -mla v15.4S, v22.4S, v31.s[0] -sub v22.4s, v25.4s, v21.4s -sqrdmulh v0.4S, v23.4S, v7.s[3] -add v25.4s, v25.4s, v21.4s -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v3.4s, v15.4s -sqrdmulh v21.4S, v18.4S, v7.s[2] -add v3.4s, v3.4s, v15.4s -mla v19.4S, v8.4S, v31.s[0] -sub v8.4s, v10.4s, v16.4s -sqrdmulh v15.4S, v27.4S, v7.s[1] -add v10.4s, v10.4s, v16.4s -mla v12.4S, v20.4S, v31.s[0] -sub v20.4s, v24.4s, v19.4s -sqrdmulh v16.4S, v14.4S, v7.s[0] -add v24.4s, v24.4s, v19.4s -mul v23.4S, v23.4S,v1.s[3] -sub v19.4s, v2.4s, v12.4s -mul v18.4S, v18.4S,v1.s[2] -add v2.4s, v2.4s, v12.4s -mla v23.4S, v0.4S, v31.s[0] -str q26, [x0, #1008] -mla v18.4S, v21.4S, v31.s[0] -str q3, [x0, #944] -mul v27.4S, v27.4S,v1.s[1] -str q8, [x0, #880] -mul v14.4S, v14.4S,v1.s[0] -str q10, [x0, #816] -mla v27.4S, v15.4S, v31.s[0] -str q20, [x0, #752] -mla v14.4S, v16.4S, v31.s[0] -str q24, [x0, #688] -ldr q24, [x0, #960] -sqrdmulh v16.4S, v24.4S, v28.s[0] -str q19, [x0, #624] -mul v24.4S, v24.4S,v29.s[0] -str q2, [x0, #560] -ldr q2, [x0, #896] -sqrdmulh v19.4S, v2.4S, v28.s[0] -sub v20.4s, v13.4s, v23.4s -str q20, [x0, #496] -mul v2.4S, v2.4S,v29.s[0] -add v13.4s, v13.4s, v23.4s -ldr q23, [x0, #832] -sqrdmulh v20.4S, v23.4S, v28.s[0] -sub v15.4s, v6.4s, v18.4s -str q13, [x0, #432] -mul v23.4S, v23.4S,v29.s[0] -add v6.4s, v6.4s, v18.4s -ldr q18, [x0, #768] -sqrdmulh v13.4S, v18.4S, v28.s[0] -sub v10.4s, v22.4s, v27.4s -str q15, [x0, #368] -mul v18.4S, v18.4S,v29.s[0] -add v22.4s, v22.4s, v27.4s -ldr q27, [x0, #704] -mla v24.4S, v16.4S, v31.s[0] -sub v16.4s, v25.4s, v14.4s -str q6, [x0, #304] -sqrdmulh v6.4S, v27.4S, v28.s[0] -add v25.4s, v25.4s, v14.4s -ldr q14, [x0, #640] -mla v2.4S, v19.4S, v31.s[0] -str q10, [x0, #240] -sqrdmulh v10.4S, v14.4S, v28.s[0] -nop -ldr q19, [x0, #576] -mla v23.4S, v20.4S, v31.s[0] -str q22, [x0, #176] -sqrdmulh v22.4S, v19.4S, v28.s[0] -nop -ldr q20, [x0, #512] -mla v18.4S, v13.4S, v31.s[0] -nop -sqrdmulh v13.4S, v20.4S, v28.s[0] -str q16, [x0, #112] -ldr q16, [x0, #448] -ldr q15, [x0, #384] -mul v27.4S, v27.4S,v29.s[0] -sub v8.4s, v16.4s, v24.4s -str q25, [x0, #48] -mul v14.4S, v14.4S,v29.s[0] -add v16.4s, v16.4s, v24.4s -ldr q24, [x0, #320] -ldr q25, [x0, #256] -mla v27.4S, v6.4S, v31.s[0] -sub v6.4s, v15.4s, v2.4s -mla v14.4S, v10.4S, v31.s[0] -add v15.4s, v15.4s, v2.4s -ldr q2, [x0, #192] -ldr q10, [x0, #128] -mul v19.4S, v19.4S,v29.s[0] -sub v3.4s, v24.4s, v23.4s -mul v20.4S, v20.4S,v29.s[0] -add v24.4s, v24.4s, v23.4s -ldr q23, [x0, #64] -ldr q21, [x0, #0] -mla v19.4S, v22.4S, v31.s[0] -sub v22.4s, v25.4s, v18.4s -mla v20.4S, v13.4S, v31.s[0] -add v25.4s, v25.4s, v18.4s -sqrdmulh v18.4S, v8.4S, v28.s[2] -nop -mul v8.4S, v8.4S,v29.s[2] -nop -sqrdmulh v13.4S, v6.4S, v28.s[2] -sub v26.4s, v2.4s, v27.4s -mul v6.4S, v6.4S,v29.s[2] -add v2.4s, v2.4s, v27.4s -sqrdmulh v27.4S, v16.4S, v28.s[1] -sub v0.4s, v10.4s, v14.4s -mul v16.4S, v16.4S,v29.s[1] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v28.s[1] -sub v12.4s, v23.4s, v19.4s -mul v15.4S, v15.4S,v29.s[1] -add v23.4s, v23.4s, v19.4s -mla v8.4S, v18.4S, v31.s[0] -sub v18.4s, v21.4s, v20.4s -sqrdmulh v19.4S, v3.4S, v28.s[2] -add v21.4s, v21.4s, v20.4s -mla v6.4S, v13.4S, v31.s[0] -nop -sqrdmulh v13.4S, v22.4S, v28.s[2] -nop -mla v16.4S, v27.4S, v31.s[0] -nop -sqrdmulh v27.4S, v24.4S, v28.s[1] -nop -mla v15.4S, v14.4S, v31.s[0] -nop -sqrdmulh v14.4S, v25.4S, v28.s[1] -nop -mul v3.4S, v3.4S,v29.s[2] -sub v20.4s, v26.4s, v8.4s -mul v22.4S, v22.4S,v29.s[2] -add v26.4s, v26.4s, v8.4s -mla v3.4S, v19.4S, v31.s[0] -sub v19.4s, v0.4s, v6.4s -mla v22.4S, v13.4S, v31.s[0] -add v0.4s, v0.4s, v6.4s -mul v24.4S, v24.4S,v29.s[1] -sub v6.4s, v2.4s, v16.4s -mul v25.4S, v25.4S,v29.s[1] -add v2.4s, v2.4s, v16.4s -mla v24.4S, v27.4S, v31.s[0] -sub v27.4s, v10.4s, v15.4s -mla v25.4S, v14.4S, v31.s[0] -add v10.4s, v10.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v11.s[3] -nop -mul v20.4S, v20.4S,v17.s[3] -nop -sqrdmulh v14.4S, v26.4S, v11.s[2] -sub v16.4s, v12.4s, v3.4s -mul v26.4S, v26.4S,v17.s[2] -add v12.4s, v12.4s, v3.4s -sqrdmulh v3.4S, v6.4S, v11.s[1] -sub v13.4s, v18.4s, v22.4s -mul v6.4S, v6.4S,v17.s[1] -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v2.4S, v11.s[0] -sub v8.4s, v23.4s, v24.4s -mul v2.4S, v2.4S,v17.s[0] -add v23.4s, v23.4s, v24.4s -mla v20.4S, v15.4S, v31.s[0] -sub v15.4s, v21.4s, v25.4s -sqrdmulh v24.4S, v19.4S, v11.s[3] -add v21.4s, v21.4s, v25.4s -mla v26.4S, v14.4S, v31.s[0] -nop -sqrdmulh v14.4S, v0.4S, v11.s[2] -nop -mla v6.4S, v3.4S, v31.s[0] -nop -sqrdmulh v3.4S, v27.4S, v11.s[1] -nop -mla v2.4S, v22.4S, v31.s[0] -nop -sqrdmulh v22.4S, v10.4S, v11.s[0] -nop -mul v19.4S, v19.4S,v17.s[3] -sub v25.4s, v16.4s, v20.4s -mul v0.4S, v0.4S,v17.s[2] -add v16.4s, v16.4s, v20.4s -mla v19.4S, v24.4S, v31.s[0] -sub v24.4s, v12.4s, v26.4s -mla v0.4S, v14.4S, v31.s[0] -add v12.4s, v12.4s, v26.4s -mul v27.4S, v27.4S,v17.s[1] -sub v26.4s, v8.4s, v6.4s -mul v10.4S, v10.4S,v17.s[0] -add v8.4s, v8.4s, v6.4s -mla v27.4S, v3.4S, v31.s[0] -sub v3.4s, v23.4s, v2.4s -mla v10.4S, v22.4S, v31.s[0] -add v23.4s, v23.4s, v2.4s -sqrdmulh v2.4S, v25.4S, v9.s[3] -nop -mul v25.4S, v25.4S,v30.s[3] -nop -sqrdmulh v22.4S, v16.4S, v9.s[2] -sub v6.4s, v13.4s, v19.4s -mul v16.4S, v16.4S,v30.s[2] -add v13.4s, v13.4s, v19.4s -sqrdmulh v19.4S, v24.4S, v9.s[1] -sub v14.4s, v18.4s, v0.4s -mul v24.4S, v24.4S,v30.s[1] -add v18.4s, v18.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v9.s[0] -sub v20.4s, v15.4s, v27.4s -mul v12.4S, v12.4S,v30.s[0] -add v15.4s, v15.4s, v27.4s -mla v25.4S, v2.4S, v31.s[0] -sub v2.4s, v21.4s, v10.4s -sqrdmulh v27.4S, v26.4S, v7.s[3] -add v21.4s, v21.4s, v10.4s -mla v16.4S, v22.4S, v31.s[0] -sub v22.4s, v6.4s, v25.4s -sqrdmulh v10.4S, v8.4S, v7.s[2] -add v6.4s, v6.4s, v25.4s -mla v24.4S, v19.4S, v31.s[0] -sub v19.4s, v13.4s, v16.4s -sqrdmulh v25.4S, v3.4S, v7.s[1] -add v13.4s, v13.4s, v16.4s -mla v12.4S, v0.4S, v31.s[0] -sub v0.4s, v14.4s, v24.4s -sqrdmulh v16.4S, v23.4S, v7.s[0] -add v14.4s, v14.4s, v24.4s -mul v26.4S, v26.4S,v1.s[3] -sub v24.4s, v18.4s, v12.4s -mul v8.4S, v8.4S,v1.s[2] -add v18.4s, v18.4s, v12.4s -mla v26.4S, v27.4S, v31.s[0] -str q22, [x0, #960] -mla v8.4S, v10.4S, v31.s[0] -str q6, [x0, #896] -mul v3.4S, v3.4S,v1.s[1] -str q19, [x0, #832] -mul v23.4S, v23.4S,v1.s[0] -str q13, [x0, #768] -mla v3.4S, v25.4S, v31.s[0] -str q0, [x0, #704] -mla v23.4S, v16.4S, v31.s[0] -str q14, [x0, #640] -ldr q14, [x0, #976] -sqrdmulh v16.4S, v14.4S, v28.s[0] -str q24, [x0, #576] -mul v14.4S, v14.4S,v29.s[0] -str q18, [x0, #512] -ldr q18, [x0, #912] -sqrdmulh v24.4S, v18.4S, v28.s[0] -sub v0.4s, v20.4s, v26.4s -str q0, [x0, #448] -mul v18.4S, v18.4S,v29.s[0] -add v20.4s, v20.4s, v26.4s -ldr q26, [x0, #848] -sqrdmulh v0.4S, v26.4S, v28.s[0] -sub v25.4s, v15.4s, v8.4s -str q20, [x0, #384] -mul v26.4S, v26.4S,v29.s[0] -add v15.4s, v15.4s, v8.4s -ldr q8, [x0, #784] -sqrdmulh v20.4S, v8.4S, v28.s[0] -sub v13.4s, v2.4s, v3.4s -str q25, [x0, #320] -mul v8.4S, v8.4S,v29.s[0] -add v2.4s, v2.4s, v3.4s -ldr q3, [x0, #720] -mla v14.4S, v16.4S, v31.s[0] -sub v16.4s, v21.4s, v23.4s -str q15, [x0, #256] -sqrdmulh v15.4S, v3.4S, v28.s[0] -add v21.4s, v21.4s, v23.4s -ldr q23, [x0, #656] -mla v18.4S, v24.4S, v31.s[0] -str q13, [x0, #192] -sqrdmulh v13.4S, v23.4S, v28.s[0] -nop -ldr q24, [x0, #592] -mla v26.4S, v0.4S, v31.s[0] -str q2, [x0, #128] -sqrdmulh v2.4S, v24.4S, v28.s[0] -nop -ldr q0, [x0, #528] -mla v8.4S, v20.4S, v31.s[0] -nop -sqrdmulh v20.4S, v0.4S, v28.s[0] -str q16, [x0, #64] -ldr q16, [x0, #464] -ldr q25, [x0, #400] -mul v3.4S, v3.4S,v29.s[0] -sub v19.4s, v16.4s, v14.4s -str q21, [x0, #0] -mul v23.4S, v23.4S,v29.s[0] -add v16.4s, v16.4s, v14.4s -ldr q14, [x0, #336] -ldr q21, [x0, #272] -mla v3.4S, v15.4S, v31.s[0] -sub v15.4s, v25.4s, v18.4s -mla v23.4S, v13.4S, v31.s[0] -add v25.4s, v25.4s, v18.4s -ldr q18, [x0, #208] -ldr q13, [x0, #144] -mul v24.4S, v24.4S,v29.s[0] -sub v6.4s, v14.4s, v26.4s -mul v0.4S, v0.4S,v29.s[0] -add v14.4s, v14.4s, v26.4s -ldr q26, [x0, #80] -ldr q10, [x0, #16] -mla v24.4S, v2.4S, v31.s[0] -sub v2.4s, v21.4s, v8.4s -mla v0.4S, v20.4S, v31.s[0] -add v21.4s, v21.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v28.s[2] -nop -mul v19.4S, v19.4S,v29.s[2] -nop -sqrdmulh v20.4S, v15.4S, v28.s[2] -sub v22.4s, v18.4s, v3.4s -mul v15.4S, v15.4S,v29.s[2] -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v16.4S, v28.s[1] -sub v27.4s, v13.4s, v23.4s -mul v16.4S, v16.4S,v29.s[1] -add v13.4s, v13.4s, v23.4s -sqrdmulh v23.4S, v25.4S, v28.s[1] -sub v12.4s, v26.4s, v24.4s -mul v25.4S, v25.4S,v29.s[1] -add v26.4s, v26.4s, v24.4s -mla v19.4S, v8.4S, v31.s[0] -sub v8.4s, v10.4s, v0.4s -sqrdmulh v24.4S, v6.4S, v28.s[2] -add v10.4s, v10.4s, v0.4s -mla v15.4S, v20.4S, v31.s[0] -nop -sqrdmulh v20.4S, v2.4S, v28.s[2] -nop -mla v16.4S, v3.4S, v31.s[0] -nop -sqrdmulh v3.4S, v14.4S, v28.s[1] -nop -mla v25.4S, v23.4S, v31.s[0] -nop -sqrdmulh v23.4S, v21.4S, v28.s[1] -nop -mul v6.4S, v6.4S,v29.s[2] -sub v0.4s, v22.4s, v19.4s -mul v2.4S, v2.4S,v29.s[2] -add v22.4s, v22.4s, v19.4s -mla v6.4S, v24.4S, v31.s[0] -sub v24.4s, v27.4s, v15.4s -mla v2.4S, v20.4S, v31.s[0] -add v27.4s, v27.4s, v15.4s -mul v14.4S, v14.4S,v29.s[1] -sub v15.4s, v18.4s, v16.4s -mul v21.4S, v21.4S,v29.s[1] -add v18.4s, v18.4s, v16.4s -mla v14.4S, v3.4S, v31.s[0] -sub v3.4s, v13.4s, v25.4s -mla v21.4S, v23.4S, v31.s[0] -add v13.4s, v13.4s, v25.4s -sqrdmulh v28.4S, v0.4S, v11.s[3] -nop -mul v0.4S, v0.4S,v17.s[3] -nop -sqrdmulh v29.4S, v22.4S, v11.s[2] -sub v25.4s, v12.4s, v6.4s -mul v22.4S, v22.4S,v17.s[2] -add v12.4s, v12.4s, v6.4s -sqrdmulh v6.4S, v15.4S, v11.s[1] -sub v23.4s, v8.4s, v2.4s -mul v15.4S, v15.4S,v17.s[1] -add v8.4s, v8.4s, v2.4s -sqrdmulh v2.4S, v18.4S, v11.s[0] -sub v16.4s, v26.4s, v14.4s -mul v18.4S, v18.4S,v17.s[0] -add v26.4s, v26.4s, v14.4s -mla v0.4S, v28.4S, v31.s[0] -sub v28.4s, v10.4s, v21.4s -sqrdmulh v14.4S, v24.4S, v11.s[3] -add v10.4s, v10.4s, v21.4s -mla v22.4S, v29.4S, v31.s[0] -nop -sqrdmulh v29.4S, v27.4S, v11.s[2] -nop -mla v15.4S, v6.4S, v31.s[0] -nop -sqrdmulh v6.4S, v3.4S, v11.s[1] -nop -mla v18.4S, v2.4S, v31.s[0] -nop -sqrdmulh v2.4S, v13.4S, v11.s[0] -nop -mul v24.4S, v24.4S,v17.s[3] -sub v21.4s, v25.4s, v0.4s -mul v27.4S, v27.4S,v17.s[2] -add v25.4s, v25.4s, v0.4s -mla v24.4S, v14.4S, v31.s[0] -sub v14.4s, v12.4s, v22.4s -mla v27.4S, v29.4S, v31.s[0] -add v12.4s, v12.4s, v22.4s -mul v3.4S, v3.4S,v17.s[1] -sub v22.4s, v16.4s, v15.4s -mul v13.4S, v13.4S,v17.s[0] -add v16.4s, v16.4s, v15.4s -mla v3.4S, v6.4S, v31.s[0] -sub v6.4s, v26.4s, v18.4s -mla v13.4S, v2.4S, v31.s[0] -add v26.4s, v26.4s, v18.4s -sqrdmulh v11.4S, v21.4S, v9.s[3] -nop -mul v21.4S, v21.4S,v30.s[3] -nop -sqrdmulh v17.4S, v25.4S, v9.s[2] -sub v18.4s, v23.4s, v24.4s -mul v25.4S, v25.4S,v30.s[2] -add v23.4s, v23.4s, v24.4s -sqrdmulh v24.4S, v14.4S, v9.s[1] -sub v2.4s, v8.4s, v27.4s -mul v14.4S, v14.4S,v30.s[1] -add v8.4s, v8.4s, v27.4s -sqrdmulh v27.4S, v12.4S, v9.s[0] -sub v15.4s, v28.4s, v3.4s -mul v12.4S, v12.4S,v30.s[0] -add v28.4s, v28.4s, v3.4s -mla v21.4S, v11.4S, v31.s[0] -sub v11.4s, v10.4s, v13.4s -sqrdmulh v9.4S, v22.4S, v7.s[3] -add v10.4s, v10.4s, v13.4s -mla v25.4S, v17.4S, v31.s[0] -sub v17.4s, v18.4s, v21.4s -sqrdmulh v13.4S, v16.4S, v7.s[2] -add v18.4s, v18.4s, v21.4s -mla v14.4S, v24.4S, v31.s[0] -sub v24.4s, v23.4s, v25.4s -sqrdmulh v21.4S, v6.4S, v7.s[1] -add v23.4s, v23.4s, v25.4s -mla v12.4S, v27.4S, v31.s[0] -sub v27.4s, v2.4s, v14.4s -sqrdmulh v25.4S, v26.4S, v7.s[0] -add v2.4s, v2.4s, v14.4s -mul v22.4S, v22.4S,v1.s[3] -sub v14.4s, v8.4s, v12.4s -mul v16.4S, v16.4S,v1.s[2] -add v8.4s, v8.4s, v12.4s -mla v22.4S, v9.4S, v31.s[0] -str q17, [x0, #976] -mla v16.4S, v13.4S, v31.s[0] -str q18, [x0, #912] -mul v6.4S, v6.4S,v1.s[1] -str q24, [x0, #848] -mul v26.4S, v26.4S,v1.s[0] -str q23, [x0, #784] -mla v6.4S, v21.4S, v31.s[0] -str q27, [x0, #720] -mla v26.4S, v25.4S, v31.s[0] -str q2, [x0, #656] -str q14, [x0, #592] -str q8, [x0, #528] -sub v8.4s, v15.4s, v22.4s -str q8, [x0, #464] -add v15.4s, v15.4s, v22.4s -sub v22.4s, v28.4s, v16.4s -str q15, [x0, #400] -add v28.4s, v28.4s, v16.4s -sub v16.4s, v11.4s, v6.4s -str q22, [x0, #336] -add v11.4s, v11.4s, v6.4s -sub v6.4s, v10.4s, v26.4s -str q28, [x0, #272] -add v10.4s, v10.4s, v26.4s -str q16, [x0, #208] -str q11, [x0, #144] -str q6, [x0, #80] -str q10, [x0, #16] -ldr q4, [x0, #224] -ldr q5, [x0, #160] -ldr q19, [x0, #32] -ldr q20, [x17, #+128] -ldr q0, [x17, #+144] -sqrdmulh v29.4S, v19.4S, v0.s[0] -mul v19.4S, v19.4S,v20.s[0] -ldr q3, [x0, #48] -sqrdmulh v30.4S, v3.4S, v0.s[0] -mul v3.4S, v3.4S,v20.s[0] -ldr q12, [x17, #+160] -ldr q9, [x17, #+176] -ldr q17, [x0, #96] -sqrdmulh v13.4S, v17.4S, v9.s[0] -mul v17.4S, v17.4S,v12.s[0] -ldr q18, [x0, #112] -sqrdmulh v24.4S, v18.4S, v9.s[0] -mul v18.4S, v18.4S,v12.s[0] -ldr q23, [x17, #+192] -ldr q21, [x17, #+208] -mla v19.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v5.4S, v21.s[0] -ldr q27, [x0, #176] -mla v3.4S, v30.4S, v31.s[0] -sqrdmulh v30.4S, v27.4S, v21.s[0] -ldr q25, [x17, #+224] -ldr q2, [x17, #+240] -mla v17.4S, v13.4S, v31.s[0] -sqrdmulh v13.4S, v4.4S, v2.s[0] -ldr q1, [x0, #240] -mla v18.4S, v24.4S, v31.s[0] -sqrdmulh v24.4S, v1.4S, v2.s[0] -ldr q7, [x0, #0] -ldr q14, [x0, #128] -mul v5.4S, v5.4S,v23.s[0] -sub v8.4s, v7.4s, v19.4s -ldr q15, [x0, #16] -mul v27.4S, v27.4S,v23.s[0] -add v7.4s, v7.4s, v19.4s -ldr q19, [x0, #144] -mla v5.4S, v29.4S, v31.s[0] -sub v29.4s, v15.4s, v3.4s -ldr q22, [x0, #64] -mla v27.4S, v30.4S, v31.s[0] -add v15.4s, v15.4s, v3.4s -ldr q3, [x0, #192] -mul v4.4S, v4.4S,v25.s[0] -sub v30.4s, v22.4s, v17.4s -ldr q28, [x0, #80] -mul v1.4S, v1.4S,v25.s[0] -add v22.4s, v22.4s, v17.4s -ldr q17, [x0, #208] -mla v4.4S, v13.4S, v31.s[0] -mla v1.4S, v24.4S, v31.s[0] -sub v24.4s, v28.4s, v18.4s -sqrdmulh v13.4S, v15.4S, v0.s[1] -add v28.4s, v28.4s, v18.4s -mul v15.4S, v15.4S,v20.s[1] -sqrdmulh v18.4S, v29.4S, v0.s[2] -sub v26.4s, v14.4s, v5.4s -mul v29.4S, v29.4S,v20.s[2] -add v14.4s, v14.4s, v5.4s -sqrdmulh v0.4S, v28.4S, v9.s[1] -sub v20.4s, v19.4s, v27.4s -mul v28.4S, v28.4S,v12.s[1] -add v19.4s, v19.4s, v27.4s -sqrdmulh v27.4S, v24.4S, v9.s[2] -sub v5.4s, v3.4s, v4.4s -mul v24.4S, v24.4S,v12.s[2] -add v3.4s, v3.4s, v4.4s -mla v15.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v1.4s -ldr q9, [x0, #480] -sqrdmulh v12.4S, v19.4S, v21.s[1] -add v17.4s, v17.4s, v1.4s -mla v29.4S, v18.4S, v31.s[0] -ldr q18, [x0, #416] -sqrdmulh v1.4S, v20.4S, v21.s[2] -sub v4.4s, v7.4s, v15.4s -mla v28.4S, v0.4S, v31.s[0] -ldr q0, [x0, #288] -sqrdmulh v16.4S, v17.4S, v2.s[1] -add v7.4s, v7.4s, v15.4s -str q4, [x0, #16] -mla v24.4S, v27.4S, v31.s[0] -ldr q27, [x17, #+256] -ldr q4, [x17, #+272] -sqrdmulh v15.4S, v13.4S, v2.s[2] -sub v11.4s, v8.4s, v29.4s -str q7, [x0, #0] -mul v19.4S, v19.4S,v23.s[1] -add v8.4s, v8.4s, v29.4s -mul v20.4S, v20.4S,v23.s[2] -str q11, [x0, #48] -mla v19.4S, v12.4S, v31.s[0] -sub v12.4s, v22.4s, v28.4s -mla v20.4S, v1.4S, v31.s[0] -str q8, [x0, #32] -mul v17.4S, v17.4S,v25.s[1] -str q12, [x0, #80] -mul v13.4S, v13.4S,v25.s[2] -add v22.4s, v22.4s, v28.4s -str q22, [x0, #64] -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v30.4s, v24.4s -str q16, [x0, #112] -mla v13.4S, v15.4S, v31.s[0] -add v30.4s, v30.4s, v24.4s -str q30, [x0, #96] -sqrdmulh v2.4S, v0.4S, v4.s[0] -sub v25.4s, v14.4s, v19.4s -mul v0.4S, v0.4S,v27.s[0] -str q25, [x0, #144] -ldr q25, [x0, #304] -sqrdmulh v30.4S, v25.4S, v4.s[0] -add v14.4s, v14.4s, v19.4s -mul v25.4S, v25.4S,v27.s[0] -str q14, [x0, #128] -ldr q14, [x17, #+288] -ldr q19, [x17, #+304] -ldr q24, [x0, #352] -sqrdmulh v15.4S, v24.4S, v19.s[0] -sub v16.4s, v26.4s, v20.4s -mul v24.4S, v24.4S,v14.s[0] -str q16, [x0, #176] -ldr q16, [x0, #368] -sqrdmulh v22.4S, v16.4S, v19.s[0] -add v26.4s, v26.4s, v20.4s -mul v16.4S, v16.4S,v14.s[0] -str q26, [x0, #160] -ldr q26, [x17, #+320] -ldr q20, [x17, #+336] -mla v0.4S, v2.4S, v31.s[0] -sub v2.4s, v3.4s, v17.4s -sqrdmulh v28.4S, v18.4S, v20.s[0] -str q2, [x0, #208] -ldr q2, [x0, #432] -mla v25.4S, v30.4S, v31.s[0] -add v3.4s, v3.4s, v17.4s -sqrdmulh v17.4S, v2.4S, v20.s[0] -str q3, [x0, #192] -ldr q3, [x17, #+352] -ldr q30, [x17, #+368] -mla v24.4S, v15.4S, v31.s[0] -sub v15.4s, v5.4s, v13.4s -sqrdmulh v12.4S, v9.4S, v30.s[0] -str q15, [x0, #240] -ldr q15, [x0, #496] -mla v16.4S, v22.4S, v31.s[0] -add v5.4s, v5.4s, v13.4s -sqrdmulh v13.4S, v15.4S, v30.s[0] -str q5, [x0, #224] -ldr q5, [x0, #256] -ldr q22, [x0, #384] -mul v18.4S, v18.4S,v26.s[0] -sub v21.4s, v5.4s, v0.4s -ldr q23, [x0, #272] -mul v2.4S, v2.4S,v26.s[0] -add v5.4s, v5.4s, v0.4s -ldr q0, [x0, #400] -mla v18.4S, v28.4S, v31.s[0] -sub v28.4s, v23.4s, v25.4s -ldr q8, [x0, #320] -mla v2.4S, v17.4S, v31.s[0] -add v23.4s, v23.4s, v25.4s -ldr q25, [x0, #448] -mul v9.4S, v9.4S,v3.s[0] -sub v17.4s, v8.4s, v24.4s -ldr q1, [x0, #336] -mul v15.4S, v15.4S,v3.s[0] -add v8.4s, v8.4s, v24.4s -ldr q24, [x0, #464] -mla v9.4S, v12.4S, v31.s[0] -mla v15.4S, v13.4S, v31.s[0] -sub v13.4s, v1.4s, v16.4s -sqrdmulh v12.4S, v23.4S, v4.s[1] -add v1.4s, v1.4s, v16.4s -mul v23.4S, v23.4S,v27.s[1] -sqrdmulh v16.4S, v28.4S, v4.s[2] -sub v11.4s, v22.4s, v18.4s -mul v28.4S, v28.4S,v27.s[2] -add v22.4s, v22.4s, v18.4s -sqrdmulh v4.4S, v1.4S, v19.s[1] -sub v27.4s, v0.4s, v2.4s -mul v1.4S, v1.4S,v14.s[1] -add v0.4s, v0.4s, v2.4s -sqrdmulh v2.4S, v13.4S, v19.s[2] -sub v18.4s, v25.4s, v9.4s -mul v13.4S, v13.4S,v14.s[2] -add v25.4s, v25.4s, v9.4s -mla v23.4S, v12.4S, v31.s[0] -sub v12.4s, v24.4s, v15.4s -ldr q19, [x0, #736] -sqrdmulh v14.4S, v0.4S, v20.s[1] -add v24.4s, v24.4s, v15.4s -mla v28.4S, v16.4S, v31.s[0] -ldr q16, [x0, #672] -sqrdmulh v15.4S, v27.4S, v20.s[2] -sub v9.4s, v5.4s, v23.4s -mla v1.4S, v4.4S, v31.s[0] -ldr q4, [x0, #544] -sqrdmulh v29.4S, v24.4S, v30.s[1] -add v5.4s, v5.4s, v23.4s -str q9, [x0, #272] -mla v13.4S, v2.4S, v31.s[0] -ldr q2, [x17, #+384] -ldr q9, [x17, #+400] -sqrdmulh v23.4S, v12.4S, v30.s[2] -sub v7.4s, v21.4s, v28.4s -str q5, [x0, #256] -mul v0.4S, v0.4S,v26.s[1] -add v21.4s, v21.4s, v28.4s -mul v27.4S, v27.4S,v26.s[2] -str q7, [x0, #304] -mla v0.4S, v14.4S, v31.s[0] -sub v14.4s, v8.4s, v1.4s -mla v27.4S, v15.4S, v31.s[0] -str q21, [x0, #288] -mul v24.4S, v24.4S,v3.s[1] -str q14, [x0, #336] -mul v12.4S, v12.4S,v3.s[2] -add v8.4s, v8.4s, v1.4s -str q8, [x0, #320] -mla v24.4S, v29.4S, v31.s[0] -sub v29.4s, v17.4s, v13.4s -str q29, [x0, #368] -mla v12.4S, v23.4S, v31.s[0] -add v17.4s, v17.4s, v13.4s -str q17, [x0, #352] -sqrdmulh v30.4S, v4.4S, v9.s[0] -sub v3.4s, v22.4s, v0.4s -mul v4.4S, v4.4S,v2.s[0] -str q3, [x0, #400] -ldr q3, [x0, #560] -sqrdmulh v17.4S, v3.4S, v9.s[0] -add v22.4s, v22.4s, v0.4s -mul v3.4S, v3.4S,v2.s[0] -str q22, [x0, #384] -ldr q22, [x17, #+416] -ldr q0, [x17, #+432] -ldr q13, [x0, #608] -sqrdmulh v23.4S, v13.4S, v0.s[0] -sub v29.4s, v11.4s, v27.4s -mul v13.4S, v13.4S,v22.s[0] -str q29, [x0, #432] -ldr q29, [x0, #624] -sqrdmulh v8.4S, v29.4S, v0.s[0] -add v11.4s, v11.4s, v27.4s -mul v29.4S, v29.4S,v22.s[0] -str q11, [x0, #416] -ldr q11, [x17, #+448] -ldr q27, [x17, #+464] -mla v4.4S, v30.4S, v31.s[0] -sub v30.4s, v25.4s, v24.4s -sqrdmulh v1.4S, v16.4S, v27.s[0] -str q30, [x0, #464] -ldr q30, [x0, #688] -mla v3.4S, v17.4S, v31.s[0] -add v25.4s, v25.4s, v24.4s -sqrdmulh v24.4S, v30.4S, v27.s[0] -str q25, [x0, #448] -ldr q25, [x17, #+480] -ldr q17, [x17, #+496] -mla v13.4S, v23.4S, v31.s[0] -sub v23.4s, v18.4s, v12.4s -sqrdmulh v14.4S, v19.4S, v17.s[0] -str q23, [x0, #496] -ldr q23, [x0, #752] -mla v29.4S, v8.4S, v31.s[0] -add v18.4s, v18.4s, v12.4s -sqrdmulh v12.4S, v23.4S, v17.s[0] -str q18, [x0, #480] -ldr q18, [x0, #512] -ldr q8, [x0, #640] -mul v16.4S, v16.4S,v11.s[0] -sub v20.4s, v18.4s, v4.4s -ldr q26, [x0, #528] -mul v30.4S, v30.4S,v11.s[0] -add v18.4s, v18.4s, v4.4s -ldr q4, [x0, #656] -mla v16.4S, v1.4S, v31.s[0] -sub v1.4s, v26.4s, v3.4s -ldr q21, [x0, #576] -mla v30.4S, v24.4S, v31.s[0] -add v26.4s, v26.4s, v3.4s -ldr q3, [x0, #704] -mul v19.4S, v19.4S,v25.s[0] -sub v24.4s, v21.4s, v13.4s -ldr q15, [x0, #592] -mul v23.4S, v23.4S,v25.s[0] -add v21.4s, v21.4s, v13.4s -ldr q13, [x0, #720] -mla v19.4S, v14.4S, v31.s[0] -mla v23.4S, v12.4S, v31.s[0] -sub v12.4s, v15.4s, v29.4s -sqrdmulh v14.4S, v26.4S, v9.s[1] -add v15.4s, v15.4s, v29.4s -mul v26.4S, v26.4S,v2.s[1] -sqrdmulh v29.4S, v1.4S, v9.s[2] -sub v7.4s, v8.4s, v16.4s -mul v1.4S, v1.4S,v2.s[2] -add v8.4s, v8.4s, v16.4s -sqrdmulh v9.4S, v15.4S, v0.s[1] -sub v2.4s, v4.4s, v30.4s -mul v15.4S, v15.4S,v22.s[1] -add v4.4s, v4.4s, v30.4s -sqrdmulh v30.4S, v12.4S, v0.s[2] -sub v16.4s, v3.4s, v19.4s -mul v12.4S, v12.4S,v22.s[2] -add v3.4s, v3.4s, v19.4s -mla v26.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v23.4s -ldr q0, [x0, #992] -sqrdmulh v22.4S, v4.4S, v27.s[1] -add v13.4s, v13.4s, v23.4s -mla v1.4S, v29.4S, v31.s[0] -ldr q29, [x0, #928] -sqrdmulh v23.4S, v2.4S, v27.s[2] -sub v19.4s, v18.4s, v26.4s -mla v15.4S, v9.4S, v31.s[0] -ldr q9, [x0, #800] -sqrdmulh v28.4S, v13.4S, v17.s[1] -add v18.4s, v18.4s, v26.4s -str q19, [x0, #528] -mla v12.4S, v30.4S, v31.s[0] -ldr q30, [x17, #+512] -ldr q19, [x17, #+528] -sqrdmulh v26.4S, v14.4S, v17.s[2] -sub v5.4s, v20.4s, v1.4s -str q18, [x0, #512] -mul v4.4S, v4.4S,v11.s[1] -add v20.4s, v20.4s, v1.4s -mul v2.4S, v2.4S,v11.s[2] -str q5, [x0, #560] -mla v4.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v15.4s -mla v2.4S, v23.4S, v31.s[0] -str q20, [x0, #544] -mul v13.4S, v13.4S,v25.s[1] -str q22, [x0, #592] -mul v14.4S, v14.4S,v25.s[2] -add v21.4s, v21.4s, v15.4s -str q21, [x0, #576] -mla v13.4S, v28.4S, v31.s[0] -sub v28.4s, v24.4s, v12.4s -str q28, [x0, #624] -mla v14.4S, v26.4S, v31.s[0] -add v24.4s, v24.4s, v12.4s -str q24, [x0, #608] -sqrdmulh v17.4S, v9.4S, v19.s[0] -sub v25.4s, v8.4s, v4.4s -mul v9.4S, v9.4S,v30.s[0] -str q25, [x0, #656] -ldr q25, [x0, #816] -sqrdmulh v24.4S, v25.4S, v19.s[0] -add v8.4s, v8.4s, v4.4s -mul v25.4S, v25.4S,v30.s[0] -str q8, [x0, #640] -ldr q8, [x17, #+544] -ldr q4, [x17, #+560] -ldr q12, [x0, #864] -sqrdmulh v26.4S, v12.4S, v4.s[0] -sub v28.4s, v7.4s, v2.4s -mul v12.4S, v12.4S,v8.s[0] -str q28, [x0, #688] -ldr q28, [x0, #880] -sqrdmulh v21.4S, v28.4S, v4.s[0] -add v7.4s, v7.4s, v2.4s -mul v28.4S, v28.4S,v8.s[0] -str q7, [x0, #672] -ldr q7, [x17, #+576] -ldr q2, [x17, #+592] -mla v9.4S, v17.4S, v31.s[0] -sub v17.4s, v3.4s, v13.4s -sqrdmulh v15.4S, v29.4S, v2.s[0] -str q17, [x0, #720] -ldr q17, [x0, #944] -mla v25.4S, v24.4S, v31.s[0] -add v3.4s, v3.4s, v13.4s -sqrdmulh v13.4S, v17.4S, v2.s[0] -str q3, [x0, #704] -ldr q3, [x17, #+608] -ldr q24, [x17, #+624] -mla v12.4S, v26.4S, v31.s[0] -sub v26.4s, v16.4s, v14.4s -sqrdmulh v22.4S, v0.4S, v24.s[0] -str q26, [x0, #752] -ldr q26, [x0, #1008] -mla v28.4S, v21.4S, v31.s[0] -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v26.4S, v24.s[0] -str q16, [x0, #736] -ldr q16, [x0, #768] -ldr q21, [x0, #896] -mul v29.4S, v29.4S,v7.s[0] -sub v27.4s, v16.4s, v9.4s -ldr q11, [x0, #784] -mul v17.4S, v17.4S,v7.s[0] -add v16.4s, v16.4s, v9.4s -ldr q9, [x0, #912] -mla v29.4S, v15.4S, v31.s[0] -sub v15.4s, v11.4s, v25.4s -ldr q20, [x0, #832] -mla v17.4S, v13.4S, v31.s[0] -add v11.4s, v11.4s, v25.4s -ldr q25, [x0, #960] -mul v0.4S, v0.4S,v3.s[0] -sub v13.4s, v20.4s, v12.4s -ldr q23, [x0, #848] -mul v26.4S, v26.4S,v3.s[0] -add v20.4s, v20.4s, v12.4s -ldr q12, [x0, #976] -mla v0.4S, v22.4S, v31.s[0] -mla v26.4S, v14.4S, v31.s[0] -sub v14.4s, v23.4s, v28.4s -sqrdmulh v22.4S, v11.4S, v19.s[1] -add v23.4s, v23.4s, v28.4s -mul v11.4S, v11.4S,v30.s[1] -sqrdmulh v28.4S, v15.4S, v19.s[2] -sub v5.4s, v21.4s, v29.4s -mul v15.4S, v15.4S,v30.s[2] -add v21.4s, v21.4s, v29.4s -sqrdmulh v19.4S, v23.4S, v4.s[1] -sub v30.4s, v9.4s, v17.4s -mul v23.4S, v23.4S,v8.s[1] -add v9.4s, v9.4s, v17.4s -sqrdmulh v17.4S, v14.4S, v4.s[2] -sub v29.4s, v25.4s, v0.4s -mul v14.4S, v14.4S,v8.s[2] -add v25.4s, v25.4s, v0.4s -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v12.4s, v26.4s -sqrdmulh v4.4S, v9.4S, v2.s[1] -add v12.4s, v12.4s, v26.4s -mla v15.4S, v28.4S, v31.s[0] -sqrdmulh v28.4S, v30.4S, v2.s[2] -sub v26.4s, v16.4s, v11.4s -mla v23.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v12.4S, v24.s[1] -add v16.4s, v16.4s, v11.4s -str q26, [x0, #784] -mla v14.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v22.4S, v24.s[2] -sub v26.4s, v27.4s, v15.4s -str q16, [x0, #768] -mul v9.4S, v9.4S,v7.s[1] -add v27.4s, v27.4s, v15.4s -mul v30.4S, v30.4S,v7.s[2] -str q26, [x0, #816] -mla v9.4S, v4.4S, v31.s[0] -sub v4.4s, v20.4s, v23.4s -mla v30.4S, v28.4S, v31.s[0] -str q27, [x0, #800] -mul v12.4S, v12.4S,v3.s[1] -str q4, [x0, #848] -mul v22.4S, v22.4S,v3.s[2] -add v20.4s, v20.4s, v23.4s -str q20, [x0, #832] -mla v12.4S, v19.4S, v31.s[0] -sub v19.4s, v13.4s, v14.4s -str q19, [x0, #880] -mla v22.4S, v17.4S, v31.s[0] -add v13.4s, v13.4s, v14.4s -str q13, [x0, #864] -sub v24.4s, v21.4s, v9.4s -str q24, [x0, #912] -add v21.4s, v21.4s, v9.4s -str q21, [x0, #896] -sub v21.4s, v5.4s, v30.4s -str q21, [x0, #944] -add v5.4s, v5.4s, v30.4s -str q5, [x0, #928] -sub v5.4s, v25.4s, v12.4s -str q5, [x0, #976] -add v25.4s, v25.4s, v12.4s -str q25, [x0, #960] -sub v25.4s, v29.4s, v22.4s -str q25, [x0, #1008] -add v29.4s, v29.4s, v22.4s -str q29, [x0, #992] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1548 -// Instruction count: 1544 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_17_z4_7.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_17_z4_7.s deleted file mode 100644 index 83076cb..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_17_z4_7.s +++ /dev/null @@ -1,1558 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_17_z4_7 -.global _ntt_u32_incomplete_neon_asm_var_4_2_17_z4_7 -ntt_u32_incomplete_neon_asm_var_4_2_17_z4_7: -_ntt_u32_incomplete_neon_asm_var_4_2_17_z4_7: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x0, #992] -sqrdmulh v27.4S, v28.4S, v29.s[0] -mul v28.4S, v28.4S,v30.s[0] -ldr q26, [x0, #928] -sqrdmulh v25.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -ldr q24, [x0, #864] -sqrdmulh v23.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -ldr q22, [x0, #800] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #736] -mla v28.4S, v27.4S, v31.s[0] -sqrdmulh v27.4S, v20.4S, v29.s[0] -ldr q19, [x0, #672] -mla v26.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v19.4S, v29.s[0] -ldr q18, [x0, #608] -mla v24.4S, v23.4S, v31.s[0] -sqrdmulh v23.4S, v18.4S, v29.s[0] -ldr q17, [x0, #544] -mla v22.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v17.4S, v29.s[0] -ldr q16, [x0, #480] -ldr q3, [x0, #416] -mul v20.4S, v20.4S,v30.s[0] -sub v2.4s, v16.4s, v28.4s -mul v19.4S, v19.4S,v30.s[0] -add v16.4s, v16.4s, v28.4s -ldr q28, [x0, #352] -ldr q1, [x0, #288] -mla v20.4S, v27.4S, v31.s[0] -sub v27.4s, v3.4s, v26.4s -mla v19.4S, v25.4S, v31.s[0] -add v3.4s, v3.4s, v26.4s -ldr q26, [x0, #224] -ldr q25, [x0, #160] -mul v18.4S, v18.4S,v30.s[0] -sub v0.4s, v28.4s, v24.4s -mul v17.4S, v17.4S,v30.s[0] -add v28.4s, v28.4s, v24.4s -ldr q24, [x0, #96] -ldr q15, [x0, #32] -mla v18.4S, v23.4S, v31.s[0] -sub v23.4s, v1.4s, v22.4s -mla v17.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v2.4S, v29.s[2] -nop -mul v2.4S, v2.4S,v30.s[2] -nop -sqrdmulh v21.4S, v27.4S, v29.s[2] -sub v14.4s, v26.4s, v20.4s -mul v27.4S, v27.4S,v30.s[2] -add v26.4s, v26.4s, v20.4s -sqrdmulh v20.4S, v16.4S, v29.s[1] -sub v13.4s, v25.4s, v19.4s -mul v16.4S, v16.4S,v30.s[1] -add v25.4s, v25.4s, v19.4s -sqrdmulh v19.4S, v3.4S, v29.s[1] -sub v12.4s, v24.4s, v18.4s -mul v3.4S, v3.4S,v30.s[1] -add v24.4s, v24.4s, v18.4s -mla v2.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v17.4s -sqrdmulh v18.4S, v0.4S, v29.s[2] -add v15.4s, v15.4s, v17.4s -mla v27.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v23.4S, v29.s[2] -nop -mla v16.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v28.4S, v29.s[1] -nop -mla v3.4S, v19.4S, v31.s[0] -nop -sqrdmulh v19.4S, v1.4S, v29.s[1] -nop -ldr q17, [x17, #+32] -ldr q11, [x17, #+48] -mul v0.4S, v0.4S,v30.s[2] -sub v10.4s, v14.4s, v2.4s -mul v23.4S, v23.4S,v30.s[2] -add v14.4s, v14.4s, v2.4s -mla v0.4S, v18.4S, v31.s[0] -sub v18.4s, v13.4s, v27.4s -mla v23.4S, v21.4S, v31.s[0] -add v13.4s, v13.4s, v27.4s -mul v28.4S, v28.4S,v30.s[1] -sub v27.4s, v26.4s, v16.4s -mul v1.4S, v1.4S,v30.s[1] -add v26.4s, v26.4s, v16.4s -mla v28.4S, v20.4S, v31.s[0] -sub v20.4s, v25.4s, v3.4s -mla v1.4S, v19.4S, v31.s[0] -add v25.4s, v25.4s, v3.4s -sqrdmulh v3.4S, v10.4S, v11.s[3] -nop -mul v10.4S, v10.4S,v17.s[3] -nop -sqrdmulh v19.4S, v14.4S, v11.s[2] -sub v16.4s, v12.4s, v0.4s -mul v14.4S, v14.4S,v17.s[2] -add v12.4s, v12.4s, v0.4s -sqrdmulh v0.4S, v27.4S, v11.s[1] -sub v21.4s, v22.4s, v23.4s -mul v27.4S, v27.4S,v17.s[1] -add v22.4s, v22.4s, v23.4s -sqrdmulh v23.4S, v26.4S, v11.s[0] -sub v2.4s, v24.4s, v28.4s -mul v26.4S, v26.4S,v17.s[0] -add v24.4s, v24.4s, v28.4s -ldr q28, [x17, #+96] -ldr q9, [x17, #+112] -mla v10.4S, v3.4S, v31.s[0] -sub v3.4s, v15.4s, v1.4s -sqrdmulh v8.4S, v18.4S, v11.s[3] -add v15.4s, v15.4s, v1.4s -mla v14.4S, v19.4S, v31.s[0] -nop -sqrdmulh v19.4S, v13.4S, v11.s[2] -nop -mla v27.4S, v0.4S, v31.s[0] -nop -sqrdmulh v0.4S, v20.4S, v11.s[1] -nop -mla v26.4S, v23.4S, v31.s[0] -nop -sqrdmulh v23.4S, v25.4S, v11.s[0] -nop -ldr q1, [x17, #+64] -ldr q7, [x17, #+80] -mul v18.4S, v18.4S,v17.s[3] -sub v6.4s, v16.4s, v10.4s -mul v13.4S, v13.4S,v17.s[2] -add v16.4s, v16.4s, v10.4s -mla v18.4S, v8.4S, v31.s[0] -sub v8.4s, v12.4s, v14.4s -mla v13.4S, v19.4S, v31.s[0] -add v12.4s, v12.4s, v14.4s -mul v20.4S, v20.4S,v17.s[1] -sub v14.4s, v2.4s, v27.4s -mul v25.4S, v25.4S,v17.s[0] -add v2.4s, v2.4s, v27.4s -mla v20.4S, v0.4S, v31.s[0] -sub v0.4s, v24.4s, v26.4s -mla v25.4S, v23.4S, v31.s[0] -add v24.4s, v24.4s, v26.4s -sqrdmulh v26.4S, v6.4S, v9.s[3] -nop -mul v6.4S, v6.4S,v28.s[3] -nop -sqrdmulh v23.4S, v16.4S, v9.s[2] -sub v27.4s, v21.4s, v18.4s -mul v16.4S, v16.4S,v28.s[2] -add v21.4s, v21.4s, v18.4s -sqrdmulh v18.4S, v8.4S, v9.s[1] -sub v19.4s, v22.4s, v13.4s -mul v8.4S, v8.4S,v28.s[1] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v12.4S, v9.s[0] -sub v10.4s, v3.4s, v20.4s -mul v12.4S, v12.4S,v28.s[0] -add v3.4s, v3.4s, v20.4s -mla v6.4S, v26.4S, v31.s[0] -sub v26.4s, v15.4s, v25.4s -sqrdmulh v20.4S, v14.4S, v7.s[3] -add v15.4s, v15.4s, v25.4s -mla v16.4S, v23.4S, v31.s[0] -sub v23.4s, v27.4s, v6.4s -sqrdmulh v25.4S, v2.4S, v7.s[2] -add v27.4s, v27.4s, v6.4s -mla v8.4S, v18.4S, v31.s[0] -sub v18.4s, v21.4s, v16.4s -sqrdmulh v6.4S, v0.4S, v7.s[1] -add v21.4s, v21.4s, v16.4s -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v19.4s, v8.4s -sqrdmulh v16.4S, v24.4S, v7.s[0] -add v19.4s, v19.4s, v8.4s -mul v14.4S, v14.4S,v1.s[3] -sub v8.4s, v22.4s, v12.4s -mul v2.4S, v2.4S,v1.s[2] -add v22.4s, v22.4s, v12.4s -mla v14.4S, v20.4S, v31.s[0] -str q23, [x0, #992] -mla v2.4S, v25.4S, v31.s[0] -str q27, [x0, #928] -mul v0.4S, v0.4S,v1.s[1] -str q18, [x0, #864] -mul v24.4S, v24.4S,v1.s[0] -str q21, [x0, #800] -mla v0.4S, v6.4S, v31.s[0] -str q13, [x0, #736] -mla v24.4S, v16.4S, v31.s[0] -str q19, [x0, #672] -ldr q19, [x0, #1008] -sqrdmulh v16.4S, v19.4S, v29.s[0] -str q8, [x0, #608] -mul v19.4S, v19.4S,v30.s[0] -sub v8.4s, v10.4s, v14.4s -ldr q13, [x0, #944] -sqrdmulh v6.4S, v13.4S, v29.s[0] -str q22, [x0, #544] -mul v13.4S, v13.4S,v30.s[0] -add v10.4s, v10.4s, v14.4s -ldr q14, [x0, #880] -sqrdmulh v22.4S, v14.4S, v29.s[0] -str q8, [x0, #480] -mul v14.4S, v14.4S,v30.s[0] -sub v8.4s, v3.4s, v2.4s -ldr q21, [x0, #816] -sqrdmulh v18.4S, v21.4S, v29.s[0] -str q10, [x0, #416] -mul v21.4S, v21.4S,v30.s[0] -add v3.4s, v3.4s, v2.4s -ldr q2, [x0, #752] -mla v19.4S, v16.4S, v31.s[0] -str q8, [x0, #352] -sqrdmulh v8.4S, v2.4S, v29.s[0] -sub v16.4s, v26.4s, v0.4s -ldr q10, [x0, #688] -mla v13.4S, v6.4S, v31.s[0] -str q3, [x0, #288] -sqrdmulh v3.4S, v10.4S, v29.s[0] -add v26.4s, v26.4s, v0.4s -ldr q0, [x0, #624] -mla v14.4S, v22.4S, v31.s[0] -str q16, [x0, #224] -sqrdmulh v16.4S, v0.4S, v29.s[0] -sub v22.4s, v15.4s, v24.4s -ldr q6, [x0, #560] -mla v21.4S, v18.4S, v31.s[0] -str q26, [x0, #160] -sqrdmulh v26.4S, v6.4S, v29.s[0] -add v15.4s, v15.4s, v24.4s -ldr q24, [x0, #496] -ldr q18, [x0, #432] -mul v2.4S, v2.4S,v30.s[0] -sub v27.4s, v24.4s, v19.4s -mul v10.4S, v10.4S,v30.s[0] -add v24.4s, v24.4s, v19.4s -ldr q19, [x0, #368] -ldr q25, [x0, #304] -mla v2.4S, v8.4S, v31.s[0] -sub v8.4s, v18.4s, v13.4s -mla v10.4S, v3.4S, v31.s[0] -add v18.4s, v18.4s, v13.4s -ldr q13, [x0, #240] -ldr q3, [x0, #176] -mul v0.4S, v0.4S,v30.s[0] -sub v23.4s, v19.4s, v14.4s -mul v6.4S, v6.4S,v30.s[0] -add v19.4s, v19.4s, v14.4s -ldr q14, [x0, #112] -ldr q20, [x0, #48] -mla v0.4S, v16.4S, v31.s[0] -sub v16.4s, v25.4s, v21.4s -mla v6.4S, v26.4S, v31.s[0] -add v25.4s, v25.4s, v21.4s -sqrdmulh v21.4S, v27.4S, v29.s[2] -nop -mul v27.4S, v27.4S,v30.s[2] -nop -sqrdmulh v26.4S, v8.4S, v29.s[2] -sub v12.4s, v13.4s, v2.4s -mul v8.4S, v8.4S,v30.s[2] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v24.4S, v29.s[1] -sub v5.4s, v3.4s, v10.4s -mul v24.4S, v24.4S,v30.s[1] -add v3.4s, v3.4s, v10.4s -sqrdmulh v10.4S, v18.4S, v29.s[1] -sub v4.4s, v14.4s, v0.4s -mul v18.4S, v18.4S,v30.s[1] -add v14.4s, v14.4s, v0.4s -mla v27.4S, v21.4S, v31.s[0] -sub v21.4s, v20.4s, v6.4s -sqrdmulh v0.4S, v23.4S, v29.s[2] -add v20.4s, v20.4s, v6.4s -mla v8.4S, v26.4S, v31.s[0] -str q22, [x0, #96] -sqrdmulh v22.4S, v16.4S, v29.s[2] -nop -mla v24.4S, v2.4S, v31.s[0] -str q15, [x0, #32] -sqrdmulh v15.4S, v19.4S, v29.s[1] -nop -mla v18.4S, v10.4S, v31.s[0] -nop -sqrdmulh v10.4S, v25.4S, v29.s[1] -nop -mul v23.4S, v23.4S,v30.s[2] -sub v2.4s, v12.4s, v27.4s -mul v16.4S, v16.4S,v30.s[2] -add v12.4s, v12.4s, v27.4s -mla v23.4S, v0.4S, v31.s[0] -sub v0.4s, v5.4s, v8.4s -mla v16.4S, v22.4S, v31.s[0] -add v5.4s, v5.4s, v8.4s -mul v19.4S, v19.4S,v30.s[1] -sub v8.4s, v13.4s, v24.4s -mul v25.4S, v25.4S,v30.s[1] -add v13.4s, v13.4s, v24.4s -mla v19.4S, v15.4S, v31.s[0] -sub v15.4s, v3.4s, v18.4s -mla v25.4S, v10.4S, v31.s[0] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v11.s[3] -nop -mul v2.4S, v2.4S,v17.s[3] -nop -sqrdmulh v10.4S, v12.4S, v11.s[2] -sub v24.4s, v4.4s, v23.4s -mul v12.4S, v12.4S,v17.s[2] -add v4.4s, v4.4s, v23.4s -sqrdmulh v23.4S, v8.4S, v11.s[1] -sub v22.4s, v21.4s, v16.4s -mul v8.4S, v8.4S,v17.s[1] -add v21.4s, v21.4s, v16.4s -sqrdmulh v16.4S, v13.4S, v11.s[0] -sub v27.4s, v14.4s, v19.4s -mul v13.4S, v13.4S,v17.s[0] -add v14.4s, v14.4s, v19.4s -mla v2.4S, v18.4S, v31.s[0] -sub v18.4s, v20.4s, v25.4s -sqrdmulh v19.4S, v0.4S, v11.s[3] -add v20.4s, v20.4s, v25.4s -mla v12.4S, v10.4S, v31.s[0] -nop -sqrdmulh v10.4S, v5.4S, v11.s[2] -nop -mla v8.4S, v23.4S, v31.s[0] -nop -sqrdmulh v23.4S, v15.4S, v11.s[1] -nop -mla v13.4S, v16.4S, v31.s[0] -nop -sqrdmulh v16.4S, v3.4S, v11.s[0] -nop -mul v0.4S, v0.4S,v17.s[3] -sub v25.4s, v24.4s, v2.4s -mul v5.4S, v5.4S,v17.s[2] -add v24.4s, v24.4s, v2.4s -mla v0.4S, v19.4S, v31.s[0] -sub v19.4s, v4.4s, v12.4s -mla v5.4S, v10.4S, v31.s[0] -add v4.4s, v4.4s, v12.4s -mul v15.4S, v15.4S,v17.s[1] -sub v12.4s, v27.4s, v8.4s -mul v3.4S, v3.4S,v17.s[0] -add v27.4s, v27.4s, v8.4s -mla v15.4S, v23.4S, v31.s[0] -sub v23.4s, v14.4s, v13.4s -mla v3.4S, v16.4S, v31.s[0] -add v14.4s, v14.4s, v13.4s -sqrdmulh v13.4S, v25.4S, v9.s[3] -nop -mul v25.4S, v25.4S,v28.s[3] -nop -sqrdmulh v16.4S, v24.4S, v9.s[2] -sub v8.4s, v22.4s, v0.4s -mul v24.4S, v24.4S,v28.s[2] -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v19.4S, v9.s[1] -sub v10.4s, v21.4s, v5.4s -mul v19.4S, v19.4S,v28.s[1] -add v21.4s, v21.4s, v5.4s -sqrdmulh v5.4S, v4.4S, v9.s[0] -sub v2.4s, v18.4s, v15.4s -mul v4.4S, v4.4S,v28.s[0] -add v18.4s, v18.4s, v15.4s -mla v25.4S, v13.4S, v31.s[0] -sub v13.4s, v20.4s, v3.4s -sqrdmulh v15.4S, v12.4S, v7.s[3] -add v20.4s, v20.4s, v3.4s -mla v24.4S, v16.4S, v31.s[0] -sub v16.4s, v8.4s, v25.4s -sqrdmulh v3.4S, v27.4S, v7.s[2] -add v8.4s, v8.4s, v25.4s -mla v19.4S, v0.4S, v31.s[0] -sub v0.4s, v22.4s, v24.4s -sqrdmulh v25.4S, v23.4S, v7.s[1] -add v22.4s, v22.4s, v24.4s -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v10.4s, v19.4s -sqrdmulh v24.4S, v14.4S, v7.s[0] -add v10.4s, v10.4s, v19.4s -mul v12.4S, v12.4S,v1.s[3] -sub v19.4s, v21.4s, v4.4s -mul v27.4S, v27.4S,v1.s[2] -add v21.4s, v21.4s, v4.4s -mla v12.4S, v15.4S, v31.s[0] -str q16, [x0, #1008] -mla v27.4S, v3.4S, v31.s[0] -str q8, [x0, #944] -mul v23.4S, v23.4S,v1.s[1] -str q0, [x0, #880] -mul v14.4S, v14.4S,v1.s[0] -str q22, [x0, #816] -mla v23.4S, v25.4S, v31.s[0] -str q5, [x0, #752] -mla v14.4S, v24.4S, v31.s[0] -str q10, [x0, #688] -ldr q10, [x0, #960] -sqrdmulh v24.4S, v10.4S, v29.s[0] -str q19, [x0, #624] -mul v10.4S, v10.4S,v30.s[0] -sub v19.4s, v2.4s, v12.4s -ldr q5, [x0, #896] -sqrdmulh v25.4S, v5.4S, v29.s[0] -str q21, [x0, #560] -mul v5.4S, v5.4S,v30.s[0] -add v2.4s, v2.4s, v12.4s -ldr q12, [x0, #832] -sqrdmulh v21.4S, v12.4S, v29.s[0] -str q19, [x0, #496] -mul v12.4S, v12.4S,v30.s[0] -sub v19.4s, v18.4s, v27.4s -ldr q22, [x0, #768] -sqrdmulh v0.4S, v22.4S, v29.s[0] -str q2, [x0, #432] -mul v22.4S, v22.4S,v30.s[0] -add v18.4s, v18.4s, v27.4s -ldr q27, [x0, #704] -mla v10.4S, v24.4S, v31.s[0] -str q19, [x0, #368] -sqrdmulh v19.4S, v27.4S, v29.s[0] -sub v24.4s, v13.4s, v23.4s -ldr q2, [x0, #640] -mla v5.4S, v25.4S, v31.s[0] -str q18, [x0, #304] -sqrdmulh v18.4S, v2.4S, v29.s[0] -add v13.4s, v13.4s, v23.4s -ldr q23, [x0, #576] -mla v12.4S, v21.4S, v31.s[0] -str q24, [x0, #240] -sqrdmulh v24.4S, v23.4S, v29.s[0] -sub v21.4s, v20.4s, v14.4s -ldr q25, [x0, #512] -mla v22.4S, v0.4S, v31.s[0] -str q13, [x0, #176] -sqrdmulh v13.4S, v25.4S, v29.s[0] -add v20.4s, v20.4s, v14.4s -ldr q14, [x0, #448] -ldr q0, [x0, #384] -mul v27.4S, v27.4S,v30.s[0] -sub v8.4s, v14.4s, v10.4s -mul v2.4S, v2.4S,v30.s[0] -add v14.4s, v14.4s, v10.4s -ldr q10, [x0, #320] -ldr q3, [x0, #256] -mla v27.4S, v19.4S, v31.s[0] -sub v19.4s, v0.4s, v5.4s -mla v2.4S, v18.4S, v31.s[0] -add v0.4s, v0.4s, v5.4s -ldr q5, [x0, #192] -ldr q18, [x0, #128] -mul v23.4S, v23.4S,v30.s[0] -sub v16.4s, v10.4s, v12.4s -mul v25.4S, v25.4S,v30.s[0] -add v10.4s, v10.4s, v12.4s -ldr q12, [x0, #64] -ldr q15, [x0, #0] -mla v23.4S, v24.4S, v31.s[0] -sub v24.4s, v3.4s, v22.4s -mla v25.4S, v13.4S, v31.s[0] -add v3.4s, v3.4s, v22.4s -sqrdmulh v22.4S, v8.4S, v29.s[2] -nop -mul v8.4S, v8.4S,v30.s[2] -nop -sqrdmulh v13.4S, v19.4S, v29.s[2] -sub v4.4s, v5.4s, v27.4s -mul v19.4S, v19.4S,v30.s[2] -add v5.4s, v5.4s, v27.4s -sqrdmulh v27.4S, v14.4S, v29.s[1] -sub v26.4s, v18.4s, v2.4s -mul v14.4S, v14.4S,v30.s[1] -add v18.4s, v18.4s, v2.4s -sqrdmulh v2.4S, v0.4S, v29.s[1] -sub v6.4s, v12.4s, v23.4s -mul v0.4S, v0.4S,v30.s[1] -add v12.4s, v12.4s, v23.4s -mla v8.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v25.4s -sqrdmulh v23.4S, v16.4S, v29.s[2] -add v15.4s, v15.4s, v25.4s -mla v19.4S, v13.4S, v31.s[0] -str q21, [x0, #112] -sqrdmulh v21.4S, v24.4S, v29.s[2] -nop -mla v14.4S, v27.4S, v31.s[0] -str q20, [x0, #48] -sqrdmulh v20.4S, v10.4S, v29.s[1] -nop -mla v0.4S, v2.4S, v31.s[0] -nop -sqrdmulh v2.4S, v3.4S, v29.s[1] -nop -mul v16.4S, v16.4S,v30.s[2] -sub v27.4s, v4.4s, v8.4s -mul v24.4S, v24.4S,v30.s[2] -add v4.4s, v4.4s, v8.4s -mla v16.4S, v23.4S, v31.s[0] -sub v23.4s, v26.4s, v19.4s -mla v24.4S, v21.4S, v31.s[0] -add v26.4s, v26.4s, v19.4s -mul v10.4S, v10.4S,v30.s[1] -sub v19.4s, v5.4s, v14.4s -mul v3.4S, v3.4S,v30.s[1] -add v5.4s, v5.4s, v14.4s -mla v10.4S, v20.4S, v31.s[0] -sub v20.4s, v18.4s, v0.4s -mla v3.4S, v2.4S, v31.s[0] -add v18.4s, v18.4s, v0.4s -sqrdmulh v0.4S, v27.4S, v11.s[3] -nop -mul v27.4S, v27.4S,v17.s[3] -nop -sqrdmulh v2.4S, v4.4S, v11.s[2] -sub v14.4s, v6.4s, v16.4s -mul v4.4S, v4.4S,v17.s[2] -add v6.4s, v6.4s, v16.4s -sqrdmulh v16.4S, v19.4S, v11.s[1] -sub v21.4s, v22.4s, v24.4s -mul v19.4S, v19.4S,v17.s[1] -add v22.4s, v22.4s, v24.4s -sqrdmulh v24.4S, v5.4S, v11.s[0] -sub v8.4s, v12.4s, v10.4s -mul v5.4S, v5.4S,v17.s[0] -add v12.4s, v12.4s, v10.4s -mla v27.4S, v0.4S, v31.s[0] -sub v0.4s, v15.4s, v3.4s -sqrdmulh v10.4S, v23.4S, v11.s[3] -add v15.4s, v15.4s, v3.4s -mla v4.4S, v2.4S, v31.s[0] -nop -sqrdmulh v2.4S, v26.4S, v11.s[2] -nop -mla v19.4S, v16.4S, v31.s[0] -nop -sqrdmulh v16.4S, v20.4S, v11.s[1] -nop -mla v5.4S, v24.4S, v31.s[0] -nop -sqrdmulh v24.4S, v18.4S, v11.s[0] -nop -mul v23.4S, v23.4S,v17.s[3] -sub v3.4s, v14.4s, v27.4s -mul v26.4S, v26.4S,v17.s[2] -add v14.4s, v14.4s, v27.4s -mla v23.4S, v10.4S, v31.s[0] -sub v10.4s, v6.4s, v4.4s -mla v26.4S, v2.4S, v31.s[0] -add v6.4s, v6.4s, v4.4s -mul v20.4S, v20.4S,v17.s[1] -sub v4.4s, v8.4s, v19.4s -mul v18.4S, v18.4S,v17.s[0] -add v8.4s, v8.4s, v19.4s -mla v20.4S, v16.4S, v31.s[0] -sub v16.4s, v12.4s, v5.4s -mla v18.4S, v24.4S, v31.s[0] -add v12.4s, v12.4s, v5.4s -sqrdmulh v5.4S, v3.4S, v9.s[3] -nop -mul v3.4S, v3.4S,v28.s[3] -nop -sqrdmulh v24.4S, v14.4S, v9.s[2] -sub v19.4s, v21.4s, v23.4s -mul v14.4S, v14.4S,v28.s[2] -add v21.4s, v21.4s, v23.4s -sqrdmulh v23.4S, v10.4S, v9.s[1] -sub v2.4s, v22.4s, v26.4s -mul v10.4S, v10.4S,v28.s[1] -add v22.4s, v22.4s, v26.4s -sqrdmulh v26.4S, v6.4S, v9.s[0] -sub v27.4s, v0.4s, v20.4s -mul v6.4S, v6.4S,v28.s[0] -add v0.4s, v0.4s, v20.4s -mla v3.4S, v5.4S, v31.s[0] -sub v5.4s, v15.4s, v18.4s -sqrdmulh v20.4S, v4.4S, v7.s[3] -add v15.4s, v15.4s, v18.4s -mla v14.4S, v24.4S, v31.s[0] -sub v24.4s, v19.4s, v3.4s -sqrdmulh v18.4S, v8.4S, v7.s[2] -add v19.4s, v19.4s, v3.4s -mla v10.4S, v23.4S, v31.s[0] -sub v23.4s, v21.4s, v14.4s -sqrdmulh v3.4S, v16.4S, v7.s[1] -add v21.4s, v21.4s, v14.4s -mla v6.4S, v26.4S, v31.s[0] -sub v26.4s, v2.4s, v10.4s -sqrdmulh v14.4S, v12.4S, v7.s[0] -add v2.4s, v2.4s, v10.4s -mul v4.4S, v4.4S,v1.s[3] -sub v10.4s, v22.4s, v6.4s -mul v8.4S, v8.4S,v1.s[2] -add v22.4s, v22.4s, v6.4s -mla v4.4S, v20.4S, v31.s[0] -str q24, [x0, #960] -mla v8.4S, v18.4S, v31.s[0] -str q19, [x0, #896] -mul v16.4S, v16.4S,v1.s[1] -str q23, [x0, #832] -mul v12.4S, v12.4S,v1.s[0] -str q21, [x0, #768] -mla v16.4S, v3.4S, v31.s[0] -str q26, [x0, #704] -mla v12.4S, v14.4S, v31.s[0] -str q2, [x0, #640] -ldr q2, [x0, #976] -sqrdmulh v14.4S, v2.4S, v29.s[0] -str q10, [x0, #576] -mul v2.4S, v2.4S,v30.s[0] -sub v10.4s, v27.4s, v4.4s -ldr q26, [x0, #912] -sqrdmulh v3.4S, v26.4S, v29.s[0] -str q22, [x0, #512] -mul v26.4S, v26.4S,v30.s[0] -add v27.4s, v27.4s, v4.4s -ldr q4, [x0, #848] -sqrdmulh v22.4S, v4.4S, v29.s[0] -str q10, [x0, #448] -mul v4.4S, v4.4S,v30.s[0] -sub v10.4s, v0.4s, v8.4s -ldr q21, [x0, #784] -sqrdmulh v23.4S, v21.4S, v29.s[0] -str q27, [x0, #384] -mul v21.4S, v21.4S,v30.s[0] -add v0.4s, v0.4s, v8.4s -ldr q8, [x0, #720] -mla v2.4S, v14.4S, v31.s[0] -str q10, [x0, #320] -sqrdmulh v10.4S, v8.4S, v29.s[0] -sub v14.4s, v5.4s, v16.4s -ldr q27, [x0, #656] -mla v26.4S, v3.4S, v31.s[0] -str q0, [x0, #256] -sqrdmulh v0.4S, v27.4S, v29.s[0] -add v5.4s, v5.4s, v16.4s -ldr q16, [x0, #592] -mla v4.4S, v22.4S, v31.s[0] -str q14, [x0, #192] -sqrdmulh v14.4S, v16.4S, v29.s[0] -sub v22.4s, v15.4s, v12.4s -ldr q3, [x0, #528] -mla v21.4S, v23.4S, v31.s[0] -str q5, [x0, #128] -sqrdmulh v5.4S, v3.4S, v29.s[0] -add v15.4s, v15.4s, v12.4s -ldr q12, [x0, #464] -ldr q23, [x0, #400] -mul v8.4S, v8.4S,v30.s[0] -sub v19.4s, v12.4s, v2.4s -mul v27.4S, v27.4S,v30.s[0] -add v12.4s, v12.4s, v2.4s -ldr q2, [x0, #336] -ldr q18, [x0, #272] -mla v8.4S, v10.4S, v31.s[0] -sub v10.4s, v23.4s, v26.4s -mla v27.4S, v0.4S, v31.s[0] -add v23.4s, v23.4s, v26.4s -ldr q26, [x0, #208] -ldr q0, [x0, #144] -mul v16.4S, v16.4S,v30.s[0] -sub v24.4s, v2.4s, v4.4s -mul v3.4S, v3.4S,v30.s[0] -add v2.4s, v2.4s, v4.4s -ldr q4, [x0, #80] -ldr q20, [x0, #16] -mla v16.4S, v14.4S, v31.s[0] -sub v14.4s, v18.4s, v21.4s -mla v3.4S, v5.4S, v31.s[0] -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v19.4S, v29.s[2] -nop -mul v19.4S, v19.4S,v30.s[2] -nop -sqrdmulh v5.4S, v10.4S, v29.s[2] -sub v6.4s, v26.4s, v8.4s -mul v10.4S, v10.4S,v30.s[2] -add v26.4s, v26.4s, v8.4s -sqrdmulh v8.4S, v12.4S, v29.s[1] -sub v13.4s, v0.4s, v27.4s -mul v12.4S, v12.4S,v30.s[1] -add v0.4s, v0.4s, v27.4s -sqrdmulh v27.4S, v23.4S, v29.s[1] -sub v25.4s, v4.4s, v16.4s -mul v23.4S, v23.4S,v30.s[1] -add v4.4s, v4.4s, v16.4s -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v20.4s, v3.4s -sqrdmulh v16.4S, v24.4S, v29.s[2] -add v20.4s, v20.4s, v3.4s -mla v10.4S, v5.4S, v31.s[0] -str q22, [x0, #64] -sqrdmulh v22.4S, v14.4S, v29.s[2] -nop -mla v12.4S, v8.4S, v31.s[0] -str q15, [x0, #0] -sqrdmulh v15.4S, v2.4S, v29.s[1] -nop -mla v23.4S, v27.4S, v31.s[0] -nop -sqrdmulh v27.4S, v18.4S, v29.s[1] -nop -mul v24.4S, v24.4S,v30.s[2] -sub v8.4s, v6.4s, v19.4s -mul v14.4S, v14.4S,v30.s[2] -add v6.4s, v6.4s, v19.4s -mla v24.4S, v16.4S, v31.s[0] -sub v16.4s, v13.4s, v10.4s -mla v14.4S, v22.4S, v31.s[0] -add v13.4s, v13.4s, v10.4s -mul v2.4S, v2.4S,v30.s[1] -sub v10.4s, v26.4s, v12.4s -mul v18.4S, v18.4S,v30.s[1] -add v26.4s, v26.4s, v12.4s -mla v2.4S, v15.4S, v31.s[0] -sub v15.4s, v0.4s, v23.4s -mla v18.4S, v27.4S, v31.s[0] -add v0.4s, v0.4s, v23.4s -sqrdmulh v29.4S, v8.4S, v11.s[3] -nop -mul v8.4S, v8.4S,v17.s[3] -nop -sqrdmulh v30.4S, v6.4S, v11.s[2] -sub v23.4s, v25.4s, v24.4s -mul v6.4S, v6.4S,v17.s[2] -add v25.4s, v25.4s, v24.4s -sqrdmulh v24.4S, v10.4S, v11.s[1] -sub v27.4s, v21.4s, v14.4s -mul v10.4S, v10.4S,v17.s[1] -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v26.4S, v11.s[0] -sub v12.4s, v4.4s, v2.4s -mul v26.4S, v26.4S,v17.s[0] -add v4.4s, v4.4s, v2.4s -mla v8.4S, v29.4S, v31.s[0] -sub v29.4s, v20.4s, v18.4s -sqrdmulh v2.4S, v16.4S, v11.s[3] -add v20.4s, v20.4s, v18.4s -mla v6.4S, v30.4S, v31.s[0] -nop -sqrdmulh v30.4S, v13.4S, v11.s[2] -nop -mla v10.4S, v24.4S, v31.s[0] -nop -sqrdmulh v24.4S, v15.4S, v11.s[1] -nop -mla v26.4S, v14.4S, v31.s[0] -nop -sqrdmulh v14.4S, v0.4S, v11.s[0] -nop -mul v16.4S, v16.4S,v17.s[3] -sub v18.4s, v23.4s, v8.4s -mul v13.4S, v13.4S,v17.s[2] -add v23.4s, v23.4s, v8.4s -mla v16.4S, v2.4S, v31.s[0] -sub v2.4s, v25.4s, v6.4s -mla v13.4S, v30.4S, v31.s[0] -add v25.4s, v25.4s, v6.4s -mul v15.4S, v15.4S,v17.s[1] -sub v6.4s, v12.4s, v10.4s -mul v0.4S, v0.4S,v17.s[0] -add v12.4s, v12.4s, v10.4s -mla v15.4S, v24.4S, v31.s[0] -sub v24.4s, v4.4s, v26.4s -mla v0.4S, v14.4S, v31.s[0] -add v4.4s, v4.4s, v26.4s -sqrdmulh v11.4S, v18.4S, v9.s[3] -nop -mul v18.4S, v18.4S,v28.s[3] -nop -sqrdmulh v17.4S, v23.4S, v9.s[2] -sub v26.4s, v27.4s, v16.4s -mul v23.4S, v23.4S,v28.s[2] -add v27.4s, v27.4s, v16.4s -sqrdmulh v16.4S, v2.4S, v9.s[1] -sub v14.4s, v21.4s, v13.4s -mul v2.4S, v2.4S,v28.s[1] -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v25.4S, v9.s[0] -sub v10.4s, v29.4s, v15.4s -mul v25.4S, v25.4S,v28.s[0] -add v29.4s, v29.4s, v15.4s -mla v18.4S, v11.4S, v31.s[0] -sub v11.4s, v20.4s, v0.4s -sqrdmulh v9.4S, v6.4S, v7.s[3] -add v20.4s, v20.4s, v0.4s -mla v23.4S, v17.4S, v31.s[0] -sub v17.4s, v26.4s, v18.4s -sqrdmulh v0.4S, v12.4S, v7.s[2] -add v26.4s, v26.4s, v18.4s -mla v2.4S, v16.4S, v31.s[0] -sub v16.4s, v27.4s, v23.4s -sqrdmulh v18.4S, v24.4S, v7.s[1] -add v27.4s, v27.4s, v23.4s -mla v25.4S, v13.4S, v31.s[0] -sub v13.4s, v14.4s, v2.4s -sqrdmulh v23.4S, v4.4S, v7.s[0] -add v14.4s, v14.4s, v2.4s -mul v6.4S, v6.4S,v1.s[3] -sub v2.4s, v21.4s, v25.4s -mul v12.4S, v12.4S,v1.s[2] -add v21.4s, v21.4s, v25.4s -mla v6.4S, v9.4S, v31.s[0] -str q17, [x0, #976] -mla v12.4S, v0.4S, v31.s[0] -str q26, [x0, #912] -mul v24.4S, v24.4S,v1.s[1] -str q16, [x0, #848] -mul v4.4S, v4.4S,v1.s[0] -str q27, [x0, #784] -mla v24.4S, v18.4S, v31.s[0] -str q13, [x0, #720] -mla v4.4S, v23.4S, v31.s[0] -str q14, [x0, #656] -str q2, [x0, #592] -sub v2.4s, v10.4s, v6.4s -str q21, [x0, #528] -add v10.4s, v10.4s, v6.4s -str q2, [x0, #464] -sub v2.4s, v29.4s, v12.4s -str q10, [x0, #400] -add v29.4s, v29.4s, v12.4s -str q2, [x0, #336] -sub v2.4s, v11.4s, v24.4s -str q29, [x0, #272] -add v11.4s, v11.4s, v24.4s -str q2, [x0, #208] -sub v2.4s, v20.4s, v4.4s -str q11, [x0, #144] -add v20.4s, v20.4s, v4.4s -str q2, [x0, #80] -str q20, [x0, #16] -ldr q3, [x0, #224] -ldr q5, [x0, #160] -ldr q19, [x0, #32] -ldr q22, [x17, #+128] -ldr q8, [x17, #+144] -sqrdmulh v30.4S, v19.4S, v8.s[0] -mul v19.4S, v19.4S,v22.s[0] -ldr q15, [x0, #48] -sqrdmulh v28.4S, v15.4S, v8.s[0] -mul v15.4S, v15.4S,v22.s[0] -ldr q25, [x17, #+160] -ldr q9, [x17, #+176] -ldr q17, [x0, #96] -sqrdmulh v0.4S, v17.4S, v9.s[0] -mul v17.4S, v17.4S,v25.s[0] -ldr q26, [x0, #112] -sqrdmulh v16.4S, v26.4S, v9.s[0] -mul v26.4S, v26.4S,v25.s[0] -ldr q27, [x17, #+192] -ldr q18, [x17, #+208] -mla v19.4S, v30.4S, v31.s[0] -sqrdmulh v30.4S, v5.4S, v18.s[0] -ldr q13, [x0, #176] -mla v15.4S, v28.4S, v31.s[0] -sqrdmulh v28.4S, v13.4S, v18.s[0] -ldr q23, [x17, #+224] -ldr q14, [x17, #+240] -mla v17.4S, v0.4S, v31.s[0] -sqrdmulh v0.4S, v3.4S, v14.s[0] -ldr q1, [x0, #240] -mla v26.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v1.4S, v14.s[0] -ldr q7, [x0, #0] -ldr q21, [x0, #128] -mul v5.4S, v5.4S,v27.s[0] -sub v6.4s, v7.4s, v19.4s -ldr q10, [x0, #16] -mul v13.4S, v13.4S,v27.s[0] -add v7.4s, v7.4s, v19.4s -ldr q19, [x0, #144] -mla v5.4S, v30.4S, v31.s[0] -sub v30.4s, v10.4s, v15.4s -ldr q12, [x0, #64] -mla v13.4S, v28.4S, v31.s[0] -add v10.4s, v10.4s, v15.4s -ldr q15, [x0, #192] -mul v3.4S, v3.4S,v23.s[0] -sub v28.4s, v12.4s, v17.4s -ldr q29, [x0, #80] -mul v1.4S, v1.4S,v23.s[0] -add v12.4s, v12.4s, v17.4s -ldr q17, [x0, #208] -mla v3.4S, v0.4S, v31.s[0] -mla v1.4S, v16.4S, v31.s[0] -sub v16.4s, v29.4s, v26.4s -sqrdmulh v0.4S, v10.4S, v8.s[1] -add v29.4s, v29.4s, v26.4s -mul v10.4S, v10.4S,v22.s[1] -sqrdmulh v26.4S, v30.4S, v8.s[2] -sub v24.4s, v21.4s, v5.4s -mul v30.4S, v30.4S,v22.s[2] -add v21.4s, v21.4s, v5.4s -sqrdmulh v8.4S, v29.4S, v9.s[1] -sub v22.4s, v19.4s, v13.4s -mul v29.4S, v29.4S,v25.s[1] -add v19.4s, v19.4s, v13.4s -sqrdmulh v13.4S, v16.4S, v9.s[2] -sub v5.4s, v15.4s, v3.4s -mul v16.4S, v16.4S,v25.s[2] -add v15.4s, v15.4s, v3.4s -mla v10.4S, v0.4S, v31.s[0] -sub v0.4s, v17.4s, v1.4s -ldr q9, [x0, #480] -sqrdmulh v25.4S, v19.4S, v18.s[1] -add v17.4s, v17.4s, v1.4s -mla v30.4S, v26.4S, v31.s[0] -ldr q26, [x0, #416] -sqrdmulh v1.4S, v22.4S, v18.s[2] -sub v3.4s, v7.4s, v10.4s -mla v29.4S, v8.4S, v31.s[0] -ldr q8, [x0, #288] -sqrdmulh v11.4S, v17.4S, v14.s[1] -add v7.4s, v7.4s, v10.4s -str q3, [x0, #16] -mla v16.4S, v13.4S, v31.s[0] -ldr q13, [x17, #+256] -ldr q3, [x17, #+272] -sqrdmulh v10.4S, v0.4S, v14.s[2] -sub v4.4s, v6.4s, v30.4s -str q7, [x0, #0] -mul v19.4S, v19.4S,v27.s[1] -add v6.4s, v6.4s, v30.4s -mul v22.4S, v22.4S,v27.s[2] -str q4, [x0, #48] -mla v19.4S, v25.4S, v31.s[0] -sub v25.4s, v12.4s, v29.4s -mla v22.4S, v1.4S, v31.s[0] -str q6, [x0, #32] -mul v17.4S, v17.4S,v23.s[1] -str q25, [x0, #80] -mul v0.4S, v0.4S,v23.s[2] -add v12.4s, v12.4s, v29.4s -str q12, [x0, #64] -mla v17.4S, v11.4S, v31.s[0] -sub v11.4s, v28.4s, v16.4s -str q11, [x0, #112] -mla v0.4S, v10.4S, v31.s[0] -add v28.4s, v28.4s, v16.4s -str q28, [x0, #96] -sqrdmulh v14.4S, v8.4S, v3.s[0] -sub v23.4s, v21.4s, v19.4s -mul v8.4S, v8.4S,v13.s[0] -str q23, [x0, #144] -ldr q23, [x0, #304] -sqrdmulh v28.4S, v23.4S, v3.s[0] -add v21.4s, v21.4s, v19.4s -mul v23.4S, v23.4S,v13.s[0] -str q21, [x0, #128] -ldr q21, [x17, #+288] -ldr q19, [x17, #+304] -ldr q16, [x0, #352] -sqrdmulh v10.4S, v16.4S, v19.s[0] -sub v11.4s, v24.4s, v22.4s -mul v16.4S, v16.4S,v21.s[0] -str q11, [x0, #176] -ldr q11, [x0, #368] -sqrdmulh v12.4S, v11.4S, v19.s[0] -add v24.4s, v24.4s, v22.4s -mul v11.4S, v11.4S,v21.s[0] -str q24, [x0, #160] -ldr q24, [x17, #+320] -ldr q22, [x17, #+336] -mla v8.4S, v14.4S, v31.s[0] -sub v14.4s, v15.4s, v17.4s -sqrdmulh v29.4S, v26.4S, v22.s[0] -str q14, [x0, #208] -ldr q14, [x0, #432] -mla v23.4S, v28.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v14.4S, v22.s[0] -str q15, [x0, #192] -ldr q15, [x17, #+352] -ldr q28, [x17, #+368] -mla v16.4S, v10.4S, v31.s[0] -sub v10.4s, v5.4s, v0.4s -sqrdmulh v25.4S, v9.4S, v28.s[0] -str q10, [x0, #240] -ldr q10, [x0, #496] -mla v11.4S, v12.4S, v31.s[0] -add v5.4s, v5.4s, v0.4s -sqrdmulh v0.4S, v10.4S, v28.s[0] -str q5, [x0, #224] -ldr q5, [x0, #256] -ldr q12, [x0, #384] -mul v26.4S, v26.4S,v24.s[0] -sub v18.4s, v5.4s, v8.4s -ldr q27, [x0, #272] -mul v14.4S, v14.4S,v24.s[0] -add v5.4s, v5.4s, v8.4s -ldr q8, [x0, #400] -mla v26.4S, v29.4S, v31.s[0] -sub v29.4s, v27.4s, v23.4s -ldr q6, [x0, #320] -mla v14.4S, v17.4S, v31.s[0] -add v27.4s, v27.4s, v23.4s -ldr q23, [x0, #448] -mul v9.4S, v9.4S,v15.s[0] -sub v17.4s, v6.4s, v16.4s -ldr q1, [x0, #336] -mul v10.4S, v10.4S,v15.s[0] -add v6.4s, v6.4s, v16.4s -ldr q16, [x0, #464] -mla v9.4S, v25.4S, v31.s[0] -mla v10.4S, v0.4S, v31.s[0] -sub v0.4s, v1.4s, v11.4s -sqrdmulh v25.4S, v27.4S, v3.s[1] -add v1.4s, v1.4s, v11.4s -mul v27.4S, v27.4S,v13.s[1] -sqrdmulh v11.4S, v29.4S, v3.s[2] -sub v4.4s, v12.4s, v26.4s -mul v29.4S, v29.4S,v13.s[2] -add v12.4s, v12.4s, v26.4s -sqrdmulh v3.4S, v1.4S, v19.s[1] -sub v13.4s, v8.4s, v14.4s -mul v1.4S, v1.4S,v21.s[1] -add v8.4s, v8.4s, v14.4s -sqrdmulh v14.4S, v0.4S, v19.s[2] -sub v26.4s, v23.4s, v9.4s -mul v0.4S, v0.4S,v21.s[2] -add v23.4s, v23.4s, v9.4s -mla v27.4S, v25.4S, v31.s[0] -sub v25.4s, v16.4s, v10.4s -ldr q19, [x0, #736] -sqrdmulh v21.4S, v8.4S, v22.s[1] -add v16.4s, v16.4s, v10.4s -mla v29.4S, v11.4S, v31.s[0] -ldr q11, [x0, #672] -sqrdmulh v10.4S, v13.4S, v22.s[2] -sub v9.4s, v5.4s, v27.4s -mla v1.4S, v3.4S, v31.s[0] -ldr q3, [x0, #544] -sqrdmulh v30.4S, v16.4S, v28.s[1] -add v5.4s, v5.4s, v27.4s -str q9, [x0, #272] -mla v0.4S, v14.4S, v31.s[0] -ldr q14, [x17, #+384] -ldr q9, [x17, #+400] -sqrdmulh v27.4S, v25.4S, v28.s[2] -sub v7.4s, v18.4s, v29.4s -str q5, [x0, #256] -mul v8.4S, v8.4S,v24.s[1] -add v18.4s, v18.4s, v29.4s -mul v13.4S, v13.4S,v24.s[2] -str q7, [x0, #304] -mla v8.4S, v21.4S, v31.s[0] -sub v21.4s, v6.4s, v1.4s -mla v13.4S, v10.4S, v31.s[0] -str q18, [x0, #288] -mul v16.4S, v16.4S,v15.s[1] -str q21, [x0, #336] -mul v25.4S, v25.4S,v15.s[2] -add v6.4s, v6.4s, v1.4s -str q6, [x0, #320] -mla v16.4S, v30.4S, v31.s[0] -sub v30.4s, v17.4s, v0.4s -str q30, [x0, #368] -mla v25.4S, v27.4S, v31.s[0] -add v17.4s, v17.4s, v0.4s -str q17, [x0, #352] -sqrdmulh v28.4S, v3.4S, v9.s[0] -sub v15.4s, v12.4s, v8.4s -mul v3.4S, v3.4S,v14.s[0] -str q15, [x0, #400] -ldr q15, [x0, #560] -sqrdmulh v17.4S, v15.4S, v9.s[0] -add v12.4s, v12.4s, v8.4s -mul v15.4S, v15.4S,v14.s[0] -str q12, [x0, #384] -ldr q12, [x17, #+416] -ldr q8, [x17, #+432] -ldr q0, [x0, #608] -sqrdmulh v27.4S, v0.4S, v8.s[0] -sub v30.4s, v4.4s, v13.4s -mul v0.4S, v0.4S,v12.s[0] -str q30, [x0, #432] -ldr q30, [x0, #624] -sqrdmulh v6.4S, v30.4S, v8.s[0] -add v4.4s, v4.4s, v13.4s -mul v30.4S, v30.4S,v12.s[0] -str q4, [x0, #416] -ldr q4, [x17, #+448] -ldr q13, [x17, #+464] -mla v3.4S, v28.4S, v31.s[0] -sub v28.4s, v23.4s, v16.4s -sqrdmulh v1.4S, v11.4S, v13.s[0] -str q28, [x0, #464] -ldr q28, [x0, #688] -mla v15.4S, v17.4S, v31.s[0] -add v23.4s, v23.4s, v16.4s -sqrdmulh v16.4S, v28.4S, v13.s[0] -str q23, [x0, #448] -ldr q23, [x17, #+480] -ldr q17, [x17, #+496] -mla v0.4S, v27.4S, v31.s[0] -sub v27.4s, v26.4s, v25.4s -sqrdmulh v21.4S, v19.4S, v17.s[0] -str q27, [x0, #496] -ldr q27, [x0, #752] -mla v30.4S, v6.4S, v31.s[0] -add v26.4s, v26.4s, v25.4s -sqrdmulh v25.4S, v27.4S, v17.s[0] -str q26, [x0, #480] -ldr q26, [x0, #512] -ldr q6, [x0, #640] -mul v11.4S, v11.4S,v4.s[0] -sub v22.4s, v26.4s, v3.4s -ldr q24, [x0, #528] -mul v28.4S, v28.4S,v4.s[0] -add v26.4s, v26.4s, v3.4s -ldr q3, [x0, #656] -mla v11.4S, v1.4S, v31.s[0] -sub v1.4s, v24.4s, v15.4s -ldr q18, [x0, #576] -mla v28.4S, v16.4S, v31.s[0] -add v24.4s, v24.4s, v15.4s -ldr q15, [x0, #704] -mul v19.4S, v19.4S,v23.s[0] -sub v16.4s, v18.4s, v0.4s -ldr q10, [x0, #592] -mul v27.4S, v27.4S,v23.s[0] -add v18.4s, v18.4s, v0.4s -ldr q0, [x0, #720] -mla v19.4S, v21.4S, v31.s[0] -mla v27.4S, v25.4S, v31.s[0] -sub v25.4s, v10.4s, v30.4s -sqrdmulh v21.4S, v24.4S, v9.s[1] -add v10.4s, v10.4s, v30.4s -mul v24.4S, v24.4S,v14.s[1] -sqrdmulh v30.4S, v1.4S, v9.s[2] -sub v7.4s, v6.4s, v11.4s -mul v1.4S, v1.4S,v14.s[2] -add v6.4s, v6.4s, v11.4s -sqrdmulh v9.4S, v10.4S, v8.s[1] -sub v14.4s, v3.4s, v28.4s -mul v10.4S, v10.4S,v12.s[1] -add v3.4s, v3.4s, v28.4s -sqrdmulh v28.4S, v25.4S, v8.s[2] -sub v11.4s, v15.4s, v19.4s -mul v25.4S, v25.4S,v12.s[2] -add v15.4s, v15.4s, v19.4s -mla v24.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v27.4s -ldr q8, [x0, #992] -sqrdmulh v12.4S, v3.4S, v13.s[1] -add v0.4s, v0.4s, v27.4s -mla v1.4S, v30.4S, v31.s[0] -ldr q30, [x0, #928] -sqrdmulh v27.4S, v14.4S, v13.s[2] -sub v19.4s, v26.4s, v24.4s -mla v10.4S, v9.4S, v31.s[0] -ldr q9, [x0, #800] -sqrdmulh v29.4S, v0.4S, v17.s[1] -add v26.4s, v26.4s, v24.4s -str q19, [x0, #528] -mla v25.4S, v28.4S, v31.s[0] -ldr q28, [x17, #+512] -ldr q19, [x17, #+528] -sqrdmulh v24.4S, v21.4S, v17.s[2] -sub v5.4s, v22.4s, v1.4s -str q26, [x0, #512] -mul v3.4S, v3.4S,v4.s[1] -add v22.4s, v22.4s, v1.4s -mul v14.4S, v14.4S,v4.s[2] -str q5, [x0, #560] -mla v3.4S, v12.4S, v31.s[0] -sub v12.4s, v18.4s, v10.4s -mla v14.4S, v27.4S, v31.s[0] -str q22, [x0, #544] -mul v0.4S, v0.4S,v23.s[1] -str q12, [x0, #592] -mul v21.4S, v21.4S,v23.s[2] -add v18.4s, v18.4s, v10.4s -str q18, [x0, #576] -mla v0.4S, v29.4S, v31.s[0] -sub v29.4s, v16.4s, v25.4s -str q29, [x0, #624] -mla v21.4S, v24.4S, v31.s[0] -add v16.4s, v16.4s, v25.4s -str q16, [x0, #608] -sqrdmulh v17.4S, v9.4S, v19.s[0] -sub v23.4s, v6.4s, v3.4s -mul v9.4S, v9.4S,v28.s[0] -str q23, [x0, #656] -ldr q23, [x0, #816] -sqrdmulh v16.4S, v23.4S, v19.s[0] -add v6.4s, v6.4s, v3.4s -mul v23.4S, v23.4S,v28.s[0] -str q6, [x0, #640] -ldr q6, [x17, #+544] -ldr q3, [x17, #+560] -ldr q25, [x0, #864] -sqrdmulh v24.4S, v25.4S, v3.s[0] -sub v29.4s, v7.4s, v14.4s -mul v25.4S, v25.4S,v6.s[0] -str q29, [x0, #688] -ldr q29, [x0, #880] -sqrdmulh v18.4S, v29.4S, v3.s[0] -add v7.4s, v7.4s, v14.4s -mul v29.4S, v29.4S,v6.s[0] -str q7, [x0, #672] -ldr q7, [x17, #+576] -ldr q14, [x17, #+592] -mla v9.4S, v17.4S, v31.s[0] -sub v17.4s, v15.4s, v0.4s -sqrdmulh v10.4S, v30.4S, v14.s[0] -str q17, [x0, #720] -ldr q17, [x0, #944] -mla v23.4S, v16.4S, v31.s[0] -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v17.4S, v14.s[0] -str q15, [x0, #704] -ldr q15, [x17, #+608] -ldr q16, [x17, #+624] -mla v25.4S, v24.4S, v31.s[0] -sub v24.4s, v11.4s, v21.4s -sqrdmulh v12.4S, v8.4S, v16.s[0] -str q24, [x0, #752] -ldr q24, [x0, #1008] -mla v29.4S, v18.4S, v31.s[0] -add v11.4s, v11.4s, v21.4s -sqrdmulh v21.4S, v24.4S, v16.s[0] -str q11, [x0, #736] -ldr q11, [x0, #768] -ldr q18, [x0, #896] -mul v30.4S, v30.4S,v7.s[0] -sub v13.4s, v11.4s, v9.4s -ldr q4, [x0, #784] -mul v17.4S, v17.4S,v7.s[0] -add v11.4s, v11.4s, v9.4s -ldr q9, [x0, #912] -mla v30.4S, v10.4S, v31.s[0] -sub v10.4s, v4.4s, v23.4s -ldr q22, [x0, #832] -mla v17.4S, v0.4S, v31.s[0] -add v4.4s, v4.4s, v23.4s -ldr q23, [x0, #960] -mul v8.4S, v8.4S,v15.s[0] -sub v0.4s, v22.4s, v25.4s -ldr q27, [x0, #848] -mul v24.4S, v24.4S,v15.s[0] -add v22.4s, v22.4s, v25.4s -ldr q25, [x0, #976] -mla v8.4S, v12.4S, v31.s[0] -mla v24.4S, v21.4S, v31.s[0] -sub v21.4s, v27.4s, v29.4s -sqrdmulh v12.4S, v4.4S, v19.s[1] -add v27.4s, v27.4s, v29.4s -mul v4.4S, v4.4S,v28.s[1] -sqrdmulh v29.4S, v10.4S, v19.s[2] -sub v5.4s, v18.4s, v30.4s -mul v10.4S, v10.4S,v28.s[2] -add v18.4s, v18.4s, v30.4s -sqrdmulh v19.4S, v27.4S, v3.s[1] -sub v28.4s, v9.4s, v17.4s -mul v27.4S, v27.4S,v6.s[1] -add v9.4s, v9.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v3.s[2] -sub v30.4s, v23.4s, v8.4s -mul v21.4S, v21.4S,v6.s[2] -add v23.4s, v23.4s, v8.4s -mla v4.4S, v12.4S, v31.s[0] -sub v12.4s, v25.4s, v24.4s -sqrdmulh v3.4S, v9.4S, v14.s[1] -add v25.4s, v25.4s, v24.4s -mla v10.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v28.4S, v14.s[2] -sub v24.4s, v11.4s, v4.4s -mla v27.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v25.4S, v16.s[1] -add v11.4s, v11.4s, v4.4s -str q24, [x0, #784] -mla v21.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v12.4S, v16.s[2] -sub v24.4s, v13.4s, v10.4s -str q11, [x0, #768] -mul v9.4S, v9.4S,v7.s[1] -add v13.4s, v13.4s, v10.4s -mul v28.4S, v28.4S,v7.s[2] -str q24, [x0, #816] -mla v9.4S, v3.4S, v31.s[0] -sub v3.4s, v22.4s, v27.4s -mla v28.4S, v29.4S, v31.s[0] -str q13, [x0, #800] -mul v25.4S, v25.4S,v15.s[1] -str q3, [x0, #848] -mul v12.4S, v12.4S,v15.s[2] -add v22.4s, v22.4s, v27.4s -str q22, [x0, #832] -mla v25.4S, v19.4S, v31.s[0] -sub v19.4s, v0.4s, v21.4s -str q19, [x0, #880] -mla v12.4S, v17.4S, v31.s[0] -add v0.4s, v0.4s, v21.4s -str q0, [x0, #864] -sub v16.4s, v18.4s, v9.4s -str q16, [x0, #912] -add v18.4s, v18.4s, v9.4s -str q18, [x0, #896] -sub v18.4s, v5.4s, v28.4s -str q18, [x0, #944] -add v5.4s, v5.4s, v28.4s -str q5, [x0, #928] -sub v5.4s, v23.4s, v25.4s -str q5, [x0, #976] -add v23.4s, v23.4s, v25.4s -str q23, [x0, #960] -sub v23.4s, v30.4s, v12.4s -str q23, [x0, #1008] -add v30.4s, v30.4s, v12.4s -str q30, [x0, #992] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1528 -// Instruction count: 1524 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_18_z4_7.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_18_z4_7.s deleted file mode 100644 index 42ecff5..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_18_z4_7.s +++ /dev/null @@ -1,1558 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_18_z4_7 -.global _ntt_u32_incomplete_neon_asm_var_4_2_18_z4_7 -ntt_u32_incomplete_neon_asm_var_4_2_18_z4_7: -_ntt_u32_incomplete_neon_asm_var_4_2_18_z4_7: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x0, #992] -sqrdmulh v27.4S, v28.4S, v29.s[0] -mul v28.4S, v28.4S,v30.s[0] -ldr q26, [x0, #928] -sqrdmulh v25.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -ldr q24, [x0, #864] -sqrdmulh v23.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -ldr q22, [x0, #800] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #736] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mla v28.4S, v27.4S, v31.s[0] -ldr q27, [x0, #672] -sqrdmulh v18.4S, v27.4S, v29.s[0] -mla v26.4S, v25.4S, v31.s[0] -ldr q25, [x0, #608] -sqrdmulh v17.4S, v25.4S, v29.s[0] -mla v24.4S, v23.4S, v31.s[0] -ldr q23, [x0, #544] -sqrdmulh v16.4S, v23.4S, v29.s[0] -mla v22.4S, v21.4S, v31.s[0] -ldr q21, [x0, #480] -ldr q3, [x0, #416] -mul v27.4S, v27.4S,v30.s[0] -mul v20.4S, v20.4S,v30.s[0] -sub v2.4s, v21.4s, v28.4s -add v21.4s, v21.4s, v28.4s -ldr q28, [x0, #352] -ldr q1, [x0, #288] -mla v27.4S, v18.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -sub v19.4s, v3.4s, v26.4s -add v3.4s, v3.4s, v26.4s -ldr q26, [x0, #224] -ldr q18, [x0, #160] -mul v23.4S, v23.4S,v30.s[0] -mul v25.4S, v25.4S,v30.s[0] -sub v0.4s, v28.4s, v24.4s -add v28.4s, v28.4s, v24.4s -ldr q24, [x0, #96] -ldr q15, [x0, #32] -mla v23.4S, v16.4S, v31.s[0] -mla v25.4S, v17.4S, v31.s[0] -sub v17.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v16.4s, v26.4s, v20.4s -nop -sqrdmulh v14.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -add v26.4s, v26.4s, v20.4s -nop -sqrdmulh v20.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v13.4s, v18.4s, v27.4s -add v18.4s, v18.4s, v27.4s -sqrdmulh v27.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v12.4s, v24.4s, v25.4s -add v24.4s, v24.4s, v25.4s -sqrdmulh v25.4S, v0.4S, v29.s[2] -mla v2.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v23.4s -sqrdmulh v11.4S, v17.4S, v29.s[2] -mla v19.4S, v14.4S, v31.s[0] -add v15.4s, v15.4s, v23.4s -nop -sqrdmulh v23.4S, v28.4S, v29.s[1] -mla v21.4S, v20.4S, v31.s[0] -nop -sqrdmulh v20.4S, v1.4S, v29.s[1] -mla v3.4S, v27.4S, v31.s[0] -nop -nop -ldr q27, [x17, #+32] -ldr q14, [x17, #+48] -mul v17.4S, v17.4S,v30.s[2] -mul v0.4S, v0.4S,v30.s[2] -sub v10.4s, v16.4s, v2.4s -add v16.4s, v16.4s, v2.4s -mla v17.4S, v11.4S, v31.s[0] -mla v0.4S, v25.4S, v31.s[0] -sub v25.4s, v13.4s, v19.4s -add v13.4s, v13.4s, v19.4s -mul v1.4S, v1.4S,v30.s[1] -mul v28.4S, v28.4S,v30.s[1] -sub v19.4s, v26.4s, v21.4s -add v26.4s, v26.4s, v21.4s -mla v1.4S, v20.4S, v31.s[0] -mla v28.4S, v23.4S, v31.s[0] -sub v23.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v10.4S, v14.s[3] -mul v10.4S, v10.4S,v27.s[3] -nop -nop -sqrdmulh v20.4S, v16.4S, v14.s[2] -mul v16.4S, v16.4S,v27.s[2] -sub v21.4s, v12.4s, v0.4s -add v12.4s, v12.4s, v0.4s -sqrdmulh v0.4S, v19.4S, v14.s[1] -mul v19.4S, v19.4S,v27.s[1] -sub v11.4s, v22.4s, v17.4s -add v22.4s, v22.4s, v17.4s -sqrdmulh v17.4S, v26.4S, v14.s[0] -mul v26.4S, v26.4S,v27.s[0] -sub v2.4s, v24.4s, v28.4s -add v24.4s, v24.4s, v28.4s -ldr q28, [x17, #+96] -ldr q9, [x17, #+112] -sqrdmulh v8.4S, v25.4S, v14.s[3] -mla v10.4S, v3.4S, v31.s[0] -sub v3.4s, v15.4s, v1.4s -add v15.4s, v15.4s, v1.4s -sqrdmulh v1.4S, v13.4S, v14.s[2] -mla v16.4S, v20.4S, v31.s[0] -nop -nop -sqrdmulh v20.4S, v23.4S, v14.s[1] -mla v19.4S, v0.4S, v31.s[0] -nop -nop -sqrdmulh v0.4S, v18.4S, v14.s[0] -mla v26.4S, v17.4S, v31.s[0] -nop -nop -ldr q17, [x17, #+64] -ldr q7, [x17, #+80] -mul v13.4S, v13.4S,v27.s[2] -mul v25.4S, v25.4S,v27.s[3] -sub v6.4s, v21.4s, v10.4s -add v21.4s, v21.4s, v10.4s -mla v13.4S, v1.4S, v31.s[0] -mla v25.4S, v8.4S, v31.s[0] -sub v8.4s, v12.4s, v16.4s -add v12.4s, v12.4s, v16.4s -mul v18.4S, v18.4S,v27.s[0] -mul v23.4S, v23.4S,v27.s[1] -sub v16.4s, v2.4s, v19.4s -add v2.4s, v2.4s, v19.4s -mla v18.4S, v0.4S, v31.s[0] -mla v23.4S, v20.4S, v31.s[0] -sub v20.4s, v24.4s, v26.4s -add v24.4s, v24.4s, v26.4s -sqrdmulh v26.4S, v6.4S, v9.s[3] -mul v6.4S, v6.4S,v28.s[3] -nop -nop -sqrdmulh v0.4S, v21.4S, v9.s[2] -mul v21.4S, v21.4S,v28.s[2] -sub v19.4s, v11.4s, v25.4s -add v11.4s, v11.4s, v25.4s -sqrdmulh v25.4S, v8.4S, v9.s[1] -mul v8.4S, v8.4S,v28.s[1] -sub v1.4s, v22.4s, v13.4s -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v12.4S, v9.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v10.4s, v3.4s, v23.4s -add v3.4s, v3.4s, v23.4s -sqrdmulh v23.4S, v16.4S, v7.s[3] -mla v6.4S, v26.4S, v31.s[0] -sub v26.4s, v15.4s, v18.4s -add v15.4s, v15.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v7.s[2] -mla v21.4S, v0.4S, v31.s[0] -sub v0.4s, v19.4s, v6.4s -str q0, [x0, #992] -sqrdmulh v0.4S, v20.4S, v7.s[1] -mla v8.4S, v25.4S, v31.s[0] -add v19.4s, v19.4s, v6.4s -str q19, [x0, #928] -sqrdmulh v19.4S, v24.4S, v7.s[0] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v21.4s -str q13, [x0, #864] -mul v2.4S, v2.4S,v17.s[2] -mul v16.4S, v16.4S,v17.s[3] -add v11.4s, v11.4s, v21.4s -sub v21.4s, v1.4s, v8.4s -mla v2.4S, v18.4S, v31.s[0] -mla v16.4S, v23.4S, v31.s[0] -add v1.4s, v1.4s, v8.4s -str q11, [x0, #800] -mul v24.4S, v24.4S,v17.s[0] -mul v20.4S, v20.4S,v17.s[1] -sub v11.4s, v22.4s, v12.4s -str q21, [x0, #736] -mla v24.4S, v19.4S, v31.s[0] -mla v20.4S, v0.4S, v31.s[0] -add v22.4s, v22.4s, v12.4s -str q1, [x0, #672] -ldr q1, [x0, #1008] -sqrdmulh v12.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -str q11, [x0, #608] -sub v11.4s, v10.4s, v16.4s -ldr q0, [x0, #944] -sqrdmulh v19.4S, v0.4S, v29.s[0] -mul v0.4S, v0.4S,v30.s[0] -str q22, [x0, #544] -add v10.4s, v10.4s, v16.4s -ldr q16, [x0, #880] -sqrdmulh v22.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -str q11, [x0, #480] -sub v11.4s, v3.4s, v2.4s -ldr q21, [x0, #816] -sqrdmulh v8.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -str q10, [x0, #416] -add v3.4s, v3.4s, v2.4s -ldr q2, [x0, #752] -sqrdmulh v10.4S, v2.4S, v29.s[0] -mla v1.4S, v12.4S, v31.s[0] -str q11, [x0, #352] -sub v11.4s, v26.4s, v20.4s -ldr q12, [x0, #688] -sqrdmulh v23.4S, v12.4S, v29.s[0] -mla v0.4S, v19.4S, v31.s[0] -str q3, [x0, #288] -add v26.4s, v26.4s, v20.4s -ldr q20, [x0, #624] -sqrdmulh v3.4S, v20.4S, v29.s[0] -mla v16.4S, v22.4S, v31.s[0] -str q11, [x0, #224] -sub v11.4s, v15.4s, v24.4s -ldr q22, [x0, #560] -sqrdmulh v19.4S, v22.4S, v29.s[0] -mla v21.4S, v8.4S, v31.s[0] -str q26, [x0, #160] -add v15.4s, v15.4s, v24.4s -ldr q24, [x0, #496] -ldr q26, [x0, #432] -mul v12.4S, v12.4S,v30.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v8.4s, v24.4s, v1.4s -add v24.4s, v24.4s, v1.4s -ldr q1, [x0, #368] -ldr q18, [x0, #304] -mla v12.4S, v23.4S, v31.s[0] -mla v2.4S, v10.4S, v31.s[0] -sub v10.4s, v26.4s, v0.4s -add v26.4s, v26.4s, v0.4s -ldr q0, [x0, #240] -ldr q23, [x0, #176] -mul v22.4S, v22.4S,v30.s[0] -mul v20.4S, v20.4S,v30.s[0] -sub v13.4s, v1.4s, v16.4s -add v1.4s, v1.4s, v16.4s -ldr q16, [x0, #112] -ldr q6, [x0, #48] -mla v22.4S, v19.4S, v31.s[0] -mla v20.4S, v3.4S, v31.s[0] -sub v3.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v8.4S, v29.s[2] -mul v8.4S, v8.4S,v30.s[2] -sub v19.4s, v0.4s, v2.4s -nop -sqrdmulh v25.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -add v0.4s, v0.4s, v2.4s -nop -sqrdmulh v2.4S, v24.4S, v29.s[1] -mul v24.4S, v24.4S,v30.s[1] -sub v5.4s, v23.4s, v12.4s -add v23.4s, v23.4s, v12.4s -sqrdmulh v12.4S, v26.4S, v29.s[1] -mul v26.4S, v26.4S,v30.s[1] -sub v4.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v13.4S, v29.s[2] -mla v8.4S, v21.4S, v31.s[0] -sub v21.4s, v6.4s, v22.4s -str q11, [x0, #96] -sqrdmulh v11.4S, v3.4S, v29.s[2] -mla v10.4S, v25.4S, v31.s[0] -add v6.4s, v6.4s, v22.4s -nop -sqrdmulh v22.4S, v1.4S, v29.s[1] -mla v24.4S, v2.4S, v31.s[0] -str q15, [x0, #32] -nop -sqrdmulh v15.4S, v18.4S, v29.s[1] -mla v26.4S, v12.4S, v31.s[0] -nop -nop -mul v3.4S, v3.4S,v30.s[2] -mul v13.4S, v13.4S,v30.s[2] -sub v12.4s, v19.4s, v8.4s -add v19.4s, v19.4s, v8.4s -mla v3.4S, v11.4S, v31.s[0] -mla v13.4S, v20.4S, v31.s[0] -sub v20.4s, v5.4s, v10.4s -add v5.4s, v5.4s, v10.4s -mul v18.4S, v18.4S,v30.s[1] -mul v1.4S, v1.4S,v30.s[1] -sub v10.4s, v0.4s, v24.4s -add v0.4s, v0.4s, v24.4s -mla v18.4S, v15.4S, v31.s[0] -mla v1.4S, v22.4S, v31.s[0] -sub v22.4s, v23.4s, v26.4s -add v23.4s, v23.4s, v26.4s -sqrdmulh v26.4S, v12.4S, v14.s[3] -mul v12.4S, v12.4S,v27.s[3] -nop -nop -sqrdmulh v15.4S, v19.4S, v14.s[2] -mul v19.4S, v19.4S,v27.s[2] -sub v24.4s, v4.4s, v13.4s -add v4.4s, v4.4s, v13.4s -sqrdmulh v13.4S, v10.4S, v14.s[1] -mul v10.4S, v10.4S,v27.s[1] -sub v11.4s, v21.4s, v3.4s -add v21.4s, v21.4s, v3.4s -sqrdmulh v3.4S, v0.4S, v14.s[0] -mul v0.4S, v0.4S,v27.s[0] -sub v8.4s, v16.4s, v1.4s -add v16.4s, v16.4s, v1.4s -sqrdmulh v1.4S, v20.4S, v14.s[3] -mla v12.4S, v26.4S, v31.s[0] -sub v26.4s, v6.4s, v18.4s -add v6.4s, v6.4s, v18.4s -sqrdmulh v18.4S, v5.4S, v14.s[2] -mla v19.4S, v15.4S, v31.s[0] -nop -nop -sqrdmulh v15.4S, v22.4S, v14.s[1] -mla v10.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v23.4S, v14.s[0] -mla v0.4S, v3.4S, v31.s[0] -nop -nop -mul v5.4S, v5.4S,v27.s[2] -mul v20.4S, v20.4S,v27.s[3] -sub v3.4s, v24.4s, v12.4s -add v24.4s, v24.4s, v12.4s -mla v5.4S, v18.4S, v31.s[0] -mla v20.4S, v1.4S, v31.s[0] -sub v1.4s, v4.4s, v19.4s -add v4.4s, v4.4s, v19.4s -mul v23.4S, v23.4S,v27.s[0] -mul v22.4S, v22.4S,v27.s[1] -sub v19.4s, v8.4s, v10.4s -add v8.4s, v8.4s, v10.4s -mla v23.4S, v13.4S, v31.s[0] -mla v22.4S, v15.4S, v31.s[0] -sub v15.4s, v16.4s, v0.4s -add v16.4s, v16.4s, v0.4s -sqrdmulh v0.4S, v3.4S, v9.s[3] -mul v3.4S, v3.4S,v28.s[3] -nop -nop -sqrdmulh v13.4S, v24.4S, v9.s[2] -mul v24.4S, v24.4S,v28.s[2] -sub v10.4s, v11.4s, v20.4s -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v1.4S, v9.s[1] -mul v1.4S, v1.4S,v28.s[1] -sub v18.4s, v21.4s, v5.4s -add v21.4s, v21.4s, v5.4s -sqrdmulh v5.4S, v4.4S, v9.s[0] -mul v4.4S, v4.4S,v28.s[0] -sub v12.4s, v26.4s, v22.4s -add v26.4s, v26.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v7.s[3] -mla v3.4S, v0.4S, v31.s[0] -sub v0.4s, v6.4s, v23.4s -add v6.4s, v6.4s, v23.4s -sqrdmulh v23.4S, v8.4S, v7.s[2] -mla v24.4S, v13.4S, v31.s[0] -sub v13.4s, v10.4s, v3.4s -str q13, [x0, #1008] -sqrdmulh v13.4S, v15.4S, v7.s[1] -mla v1.4S, v20.4S, v31.s[0] -add v10.4s, v10.4s, v3.4s -str q10, [x0, #944] -sqrdmulh v10.4S, v16.4S, v7.s[0] -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v11.4s, v24.4s -str q5, [x0, #880] -mul v8.4S, v8.4S,v17.s[2] -mul v19.4S, v19.4S,v17.s[3] -add v11.4s, v11.4s, v24.4s -sub v24.4s, v18.4s, v1.4s -mla v8.4S, v23.4S, v31.s[0] -mla v19.4S, v22.4S, v31.s[0] -add v18.4s, v18.4s, v1.4s -str q11, [x0, #816] -mul v16.4S, v16.4S,v17.s[0] -mul v15.4S, v15.4S,v17.s[1] -sub v11.4s, v21.4s, v4.4s -str q24, [x0, #752] -mla v16.4S, v10.4S, v31.s[0] -mla v15.4S, v13.4S, v31.s[0] -add v21.4s, v21.4s, v4.4s -str q18, [x0, #688] -ldr q18, [x0, #960] -sqrdmulh v4.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -str q11, [x0, #624] -sub v11.4s, v12.4s, v19.4s -ldr q13, [x0, #896] -sqrdmulh v10.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -str q21, [x0, #560] -add v12.4s, v12.4s, v19.4s -ldr q19, [x0, #832] -sqrdmulh v21.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -str q11, [x0, #496] -sub v11.4s, v26.4s, v8.4s -ldr q24, [x0, #768] -sqrdmulh v1.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -str q12, [x0, #432] -add v26.4s, v26.4s, v8.4s -ldr q8, [x0, #704] -sqrdmulh v12.4S, v8.4S, v29.s[0] -mla v18.4S, v4.4S, v31.s[0] -str q11, [x0, #368] -sub v11.4s, v0.4s, v15.4s -ldr q4, [x0, #640] -sqrdmulh v22.4S, v4.4S, v29.s[0] -mla v13.4S, v10.4S, v31.s[0] -str q26, [x0, #304] -add v0.4s, v0.4s, v15.4s -ldr q15, [x0, #576] -sqrdmulh v26.4S, v15.4S, v29.s[0] -mla v19.4S, v21.4S, v31.s[0] -str q11, [x0, #240] -sub v11.4s, v6.4s, v16.4s -ldr q21, [x0, #512] -sqrdmulh v10.4S, v21.4S, v29.s[0] -mla v24.4S, v1.4S, v31.s[0] -str q0, [x0, #176] -add v6.4s, v6.4s, v16.4s -ldr q16, [x0, #448] -ldr q0, [x0, #384] -mul v4.4S, v4.4S,v30.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v1.4s, v16.4s, v18.4s -add v16.4s, v16.4s, v18.4s -ldr q18, [x0, #320] -ldr q23, [x0, #256] -mla v4.4S, v22.4S, v31.s[0] -mla v8.4S, v12.4S, v31.s[0] -sub v12.4s, v0.4s, v13.4s -add v0.4s, v0.4s, v13.4s -ldr q13, [x0, #192] -ldr q22, [x0, #128] -mul v21.4S, v21.4S,v30.s[0] -mul v15.4S, v15.4S,v30.s[0] -sub v5.4s, v18.4s, v19.4s -add v18.4s, v18.4s, v19.4s -ldr q19, [x0, #64] -ldr q3, [x0, #0] -mla v21.4S, v10.4S, v31.s[0] -mla v15.4S, v26.4S, v31.s[0] -sub v26.4s, v23.4s, v24.4s -add v23.4s, v23.4s, v24.4s -sqrdmulh v24.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -sub v10.4s, v13.4s, v8.4s -nop -sqrdmulh v20.4S, v12.4S, v29.s[2] -mul v12.4S, v12.4S,v30.s[2] -add v13.4s, v13.4s, v8.4s -nop -sqrdmulh v8.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v2.4s, v22.4s, v4.4s -add v22.4s, v22.4s, v4.4s -sqrdmulh v4.4S, v0.4S, v29.s[1] -mul v0.4S, v0.4S,v30.s[1] -sub v25.4s, v19.4s, v15.4s -add v19.4s, v19.4s, v15.4s -sqrdmulh v15.4S, v5.4S, v29.s[2] -mla v1.4S, v24.4S, v31.s[0] -sub v24.4s, v3.4s, v21.4s -str q11, [x0, #112] -sqrdmulh v11.4S, v26.4S, v29.s[2] -mla v12.4S, v20.4S, v31.s[0] -add v3.4s, v3.4s, v21.4s -nop -sqrdmulh v21.4S, v18.4S, v29.s[1] -mla v16.4S, v8.4S, v31.s[0] -str q6, [x0, #48] -nop -sqrdmulh v6.4S, v23.4S, v29.s[1] -mla v0.4S, v4.4S, v31.s[0] -nop -nop -mul v26.4S, v26.4S,v30.s[2] -mul v5.4S, v5.4S,v30.s[2] -sub v4.4s, v10.4s, v1.4s -add v10.4s, v10.4s, v1.4s -mla v26.4S, v11.4S, v31.s[0] -mla v5.4S, v15.4S, v31.s[0] -sub v15.4s, v2.4s, v12.4s -add v2.4s, v2.4s, v12.4s -mul v23.4S, v23.4S,v30.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v12.4s, v13.4s, v16.4s -add v13.4s, v13.4s, v16.4s -mla v23.4S, v6.4S, v31.s[0] -mla v18.4S, v21.4S, v31.s[0] -sub v21.4s, v22.4s, v0.4s -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v4.4S, v14.s[3] -mul v4.4S, v4.4S,v27.s[3] -nop -nop -sqrdmulh v6.4S, v10.4S, v14.s[2] -mul v10.4S, v10.4S,v27.s[2] -sub v16.4s, v25.4s, v5.4s -add v25.4s, v25.4s, v5.4s -sqrdmulh v5.4S, v12.4S, v14.s[1] -mul v12.4S, v12.4S,v27.s[1] -sub v11.4s, v24.4s, v26.4s -add v24.4s, v24.4s, v26.4s -sqrdmulh v26.4S, v13.4S, v14.s[0] -mul v13.4S, v13.4S,v27.s[0] -sub v1.4s, v19.4s, v18.4s -add v19.4s, v19.4s, v18.4s -sqrdmulh v18.4S, v15.4S, v14.s[3] -mla v4.4S, v0.4S, v31.s[0] -sub v0.4s, v3.4s, v23.4s -add v3.4s, v3.4s, v23.4s -sqrdmulh v23.4S, v2.4S, v14.s[2] -mla v10.4S, v6.4S, v31.s[0] -nop -nop -sqrdmulh v6.4S, v21.4S, v14.s[1] -mla v12.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v22.4S, v14.s[0] -mla v13.4S, v26.4S, v31.s[0] -nop -nop -mul v2.4S, v2.4S,v27.s[2] -mul v15.4S, v15.4S,v27.s[3] -sub v26.4s, v16.4s, v4.4s -add v16.4s, v16.4s, v4.4s -mla v2.4S, v23.4S, v31.s[0] -mla v15.4S, v18.4S, v31.s[0] -sub v18.4s, v25.4s, v10.4s -add v25.4s, v25.4s, v10.4s -mul v22.4S, v22.4S,v27.s[0] -mul v21.4S, v21.4S,v27.s[1] -sub v10.4s, v1.4s, v12.4s -add v1.4s, v1.4s, v12.4s -mla v22.4S, v5.4S, v31.s[0] -mla v21.4S, v6.4S, v31.s[0] -sub v6.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -sqrdmulh v13.4S, v26.4S, v9.s[3] -mul v26.4S, v26.4S,v28.s[3] -nop -nop -sqrdmulh v5.4S, v16.4S, v9.s[2] -mul v16.4S, v16.4S,v28.s[2] -sub v12.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v18.4S, v9.s[1] -mul v18.4S, v18.4S,v28.s[1] -sub v23.4s, v24.4s, v2.4s -add v24.4s, v24.4s, v2.4s -sqrdmulh v2.4S, v25.4S, v9.s[0] -mul v25.4S, v25.4S,v28.s[0] -sub v4.4s, v0.4s, v21.4s -add v0.4s, v0.4s, v21.4s -sqrdmulh v21.4S, v10.4S, v7.s[3] -mla v26.4S, v13.4S, v31.s[0] -sub v13.4s, v3.4s, v22.4s -add v3.4s, v3.4s, v22.4s -sqrdmulh v22.4S, v1.4S, v7.s[2] -mla v16.4S, v5.4S, v31.s[0] -sub v5.4s, v12.4s, v26.4s -str q5, [x0, #960] -sqrdmulh v5.4S, v6.4S, v7.s[1] -mla v18.4S, v15.4S, v31.s[0] -add v12.4s, v12.4s, v26.4s -str q12, [x0, #896] -sqrdmulh v12.4S, v19.4S, v7.s[0] -mla v25.4S, v2.4S, v31.s[0] -sub v2.4s, v11.4s, v16.4s -str q2, [x0, #832] -mul v1.4S, v1.4S,v17.s[2] -mul v10.4S, v10.4S,v17.s[3] -add v11.4s, v11.4s, v16.4s -sub v16.4s, v23.4s, v18.4s -mla v1.4S, v22.4S, v31.s[0] -mla v10.4S, v21.4S, v31.s[0] -add v23.4s, v23.4s, v18.4s -str q11, [x0, #768] -mul v19.4S, v19.4S,v17.s[0] -mul v6.4S, v6.4S,v17.s[1] -sub v11.4s, v24.4s, v25.4s -str q16, [x0, #704] -mla v19.4S, v12.4S, v31.s[0] -mla v6.4S, v5.4S, v31.s[0] -add v24.4s, v24.4s, v25.4s -str q23, [x0, #640] -ldr q23, [x0, #976] -sqrdmulh v25.4S, v23.4S, v29.s[0] -mul v23.4S, v23.4S,v30.s[0] -str q11, [x0, #576] -sub v11.4s, v4.4s, v10.4s -ldr q5, [x0, #912] -sqrdmulh v12.4S, v5.4S, v29.s[0] -mul v5.4S, v5.4S,v30.s[0] -str q24, [x0, #512] -add v4.4s, v4.4s, v10.4s -ldr q10, [x0, #848] -sqrdmulh v24.4S, v10.4S, v29.s[0] -mul v10.4S, v10.4S,v30.s[0] -str q11, [x0, #448] -sub v11.4s, v0.4s, v1.4s -ldr q16, [x0, #784] -sqrdmulh v18.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -str q4, [x0, #384] -add v0.4s, v0.4s, v1.4s -ldr q1, [x0, #720] -sqrdmulh v4.4S, v1.4S, v29.s[0] -mla v23.4S, v25.4S, v31.s[0] -str q11, [x0, #320] -sub v11.4s, v13.4s, v6.4s -ldr q25, [x0, #656] -sqrdmulh v21.4S, v25.4S, v29.s[0] -mla v5.4S, v12.4S, v31.s[0] -str q0, [x0, #256] -add v13.4s, v13.4s, v6.4s -ldr q6, [x0, #592] -sqrdmulh v0.4S, v6.4S, v29.s[0] -mla v10.4S, v24.4S, v31.s[0] -str q11, [x0, #192] -sub v11.4s, v3.4s, v19.4s -ldr q24, [x0, #528] -sqrdmulh v12.4S, v24.4S, v29.s[0] -mla v16.4S, v18.4S, v31.s[0] -str q13, [x0, #128] -add v3.4s, v3.4s, v19.4s -ldr q19, [x0, #464] -ldr q13, [x0, #400] -mul v25.4S, v25.4S,v30.s[0] -mul v1.4S, v1.4S,v30.s[0] -sub v18.4s, v19.4s, v23.4s -add v19.4s, v19.4s, v23.4s -ldr q23, [x0, #336] -ldr q22, [x0, #272] -mla v25.4S, v21.4S, v31.s[0] -mla v1.4S, v4.4S, v31.s[0] -sub v4.4s, v13.4s, v5.4s -add v13.4s, v13.4s, v5.4s -ldr q5, [x0, #208] -ldr q21, [x0, #144] -mul v24.4S, v24.4S,v30.s[0] -mul v6.4S, v6.4S,v30.s[0] -sub v2.4s, v23.4s, v10.4s -add v23.4s, v23.4s, v10.4s -ldr q10, [x0, #80] -ldr q26, [x0, #16] -mla v24.4S, v12.4S, v31.s[0] -mla v6.4S, v0.4S, v31.s[0] -sub v0.4s, v22.4s, v16.4s -add v22.4s, v22.4s, v16.4s -sqrdmulh v16.4S, v18.4S, v29.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v12.4s, v5.4s, v1.4s -nop -sqrdmulh v15.4S, v4.4S, v29.s[2] -mul v4.4S, v4.4S,v30.s[2] -add v5.4s, v5.4s, v1.4s -nop -sqrdmulh v1.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -sub v8.4s, v21.4s, v25.4s -add v21.4s, v21.4s, v25.4s -sqrdmulh v25.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v20.4s, v10.4s, v6.4s -add v10.4s, v10.4s, v6.4s -sqrdmulh v6.4S, v2.4S, v29.s[2] -mla v18.4S, v16.4S, v31.s[0] -sub v16.4s, v26.4s, v24.4s -str q11, [x0, #64] -sqrdmulh v11.4S, v0.4S, v29.s[2] -mla v4.4S, v15.4S, v31.s[0] -add v26.4s, v26.4s, v24.4s -nop -sqrdmulh v24.4S, v23.4S, v29.s[1] -mla v19.4S, v1.4S, v31.s[0] -str q3, [x0, #0] -nop -sqrdmulh v3.4S, v22.4S, v29.s[1] -mla v13.4S, v25.4S, v31.s[0] -nop -nop -mul v0.4S, v0.4S,v30.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v25.4s, v12.4s, v18.4s -add v12.4s, v12.4s, v18.4s -mla v0.4S, v11.4S, v31.s[0] -mla v2.4S, v6.4S, v31.s[0] -sub v6.4s, v8.4s, v4.4s -add v8.4s, v8.4s, v4.4s -mul v22.4S, v22.4S,v30.s[1] -mul v23.4S, v23.4S,v30.s[1] -sub v4.4s, v5.4s, v19.4s -add v5.4s, v5.4s, v19.4s -mla v22.4S, v3.4S, v31.s[0] -mla v23.4S, v24.4S, v31.s[0] -sub v24.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v29.4S, v25.4S, v14.s[3] -mul v25.4S, v25.4S,v27.s[3] -nop -nop -sqrdmulh v30.4S, v12.4S, v14.s[2] -mul v12.4S, v12.4S,v27.s[2] -sub v13.4s, v20.4s, v2.4s -add v20.4s, v20.4s, v2.4s -sqrdmulh v2.4S, v4.4S, v14.s[1] -mul v4.4S, v4.4S,v27.s[1] -sub v3.4s, v16.4s, v0.4s -add v16.4s, v16.4s, v0.4s -sqrdmulh v0.4S, v5.4S, v14.s[0] -mul v5.4S, v5.4S,v27.s[0] -sub v19.4s, v10.4s, v23.4s -add v10.4s, v10.4s, v23.4s -sqrdmulh v23.4S, v6.4S, v14.s[3] -mla v25.4S, v29.4S, v31.s[0] -sub v29.4s, v26.4s, v22.4s -add v26.4s, v26.4s, v22.4s -sqrdmulh v22.4S, v8.4S, v14.s[2] -mla v12.4S, v30.4S, v31.s[0] -nop -nop -sqrdmulh v30.4S, v24.4S, v14.s[1] -mla v4.4S, v2.4S, v31.s[0] -nop -nop -sqrdmulh v2.4S, v21.4S, v14.s[0] -mla v5.4S, v0.4S, v31.s[0] -nop -nop -mul v8.4S, v8.4S,v27.s[2] -mul v6.4S, v6.4S,v27.s[3] -sub v0.4s, v13.4s, v25.4s -add v13.4s, v13.4s, v25.4s -mla v8.4S, v22.4S, v31.s[0] -mla v6.4S, v23.4S, v31.s[0] -sub v23.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -mul v21.4S, v21.4S,v27.s[0] -mul v24.4S, v24.4S,v27.s[1] -sub v12.4s, v19.4s, v4.4s -add v19.4s, v19.4s, v4.4s -mla v21.4S, v2.4S, v31.s[0] -mla v24.4S, v30.4S, v31.s[0] -sub v30.4s, v10.4s, v5.4s -add v10.4s, v10.4s, v5.4s -sqrdmulh v14.4S, v0.4S, v9.s[3] -mul v0.4S, v0.4S,v28.s[3] -nop -nop -sqrdmulh v27.4S, v13.4S, v9.s[2] -mul v13.4S, v13.4S,v28.s[2] -sub v5.4s, v3.4s, v6.4s -add v3.4s, v3.4s, v6.4s -sqrdmulh v6.4S, v23.4S, v9.s[1] -mul v23.4S, v23.4S,v28.s[1] -sub v2.4s, v16.4s, v8.4s -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v20.4S, v9.s[0] -mul v20.4S, v20.4S,v28.s[0] -sub v4.4s, v29.4s, v24.4s -add v29.4s, v29.4s, v24.4s -sqrdmulh v9.4S, v12.4S, v7.s[3] -mla v0.4S, v14.4S, v31.s[0] -sub v14.4s, v26.4s, v21.4s -add v26.4s, v26.4s, v21.4s -sqrdmulh v21.4S, v19.4S, v7.s[2] -mla v13.4S, v27.4S, v31.s[0] -sub v27.4s, v5.4s, v0.4s -str q27, [x0, #976] -sqrdmulh v27.4S, v30.4S, v7.s[1] -mla v23.4S, v6.4S, v31.s[0] -add v5.4s, v5.4s, v0.4s -str q5, [x0, #912] -sqrdmulh v5.4S, v10.4S, v7.s[0] -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v3.4s, v13.4s -str q8, [x0, #848] -mul v19.4S, v19.4S,v17.s[2] -mul v12.4S, v12.4S,v17.s[3] -add v3.4s, v3.4s, v13.4s -sub v13.4s, v2.4s, v23.4s -mla v19.4S, v21.4S, v31.s[0] -mla v12.4S, v9.4S, v31.s[0] -add v2.4s, v2.4s, v23.4s -str q3, [x0, #784] -mul v10.4S, v10.4S,v17.s[0] -mul v30.4S, v30.4S,v17.s[1] -sub v3.4s, v16.4s, v20.4s -str q13, [x0, #720] -mla v10.4S, v5.4S, v31.s[0] -mla v30.4S, v27.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -str q2, [x0, #656] -str q3, [x0, #592] -sub v3.4s, v4.4s, v12.4s -str q16, [x0, #528] -add v4.4s, v4.4s, v12.4s -str q3, [x0, #464] -sub v3.4s, v29.4s, v19.4s -str q4, [x0, #400] -add v29.4s, v29.4s, v19.4s -str q3, [x0, #336] -sub v3.4s, v14.4s, v30.4s -str q29, [x0, #272] -add v14.4s, v14.4s, v30.4s -str q3, [x0, #208] -sub v3.4s, v26.4s, v10.4s -str q14, [x0, #144] -add v26.4s, v26.4s, v10.4s -str q3, [x0, #80] -str q26, [x0, #16] -ldr q15, [x0, #224] -ldr q1, [x0, #160] -ldr q18, [x0, #32] -ldr q11, [x17, #+128] -ldr q25, [x17, #+144] -sqrdmulh v22.4S, v18.4S, v25.s[0] -mul v18.4S, v18.4S,v11.s[0] -ldr q24, [x0, #48] -sqrdmulh v28.4S, v24.4S, v25.s[0] -mul v24.4S, v24.4S,v11.s[0] -ldr q6, [x17, #+160] -ldr q0, [x17, #+176] -ldr q8, [x0, #96] -sqrdmulh v21.4S, v8.4S, v0.s[0] -mul v8.4S, v8.4S,v6.s[0] -ldr q9, [x0, #112] -sqrdmulh v23.4S, v9.4S, v0.s[0] -mul v9.4S, v9.4S,v6.s[0] -ldr q13, [x17, #+192] -ldr q5, [x17, #+208] -mla v18.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v1.4S, v5.s[0] -ldr q27, [x0, #176] -mla v24.4S, v28.4S, v31.s[0] -sqrdmulh v28.4S, v27.4S, v5.s[0] -ldr q20, [x17, #+224] -ldr q2, [x17, #+240] -mla v8.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v15.4S, v2.s[0] -ldr q17, [x0, #240] -mla v9.4S, v23.4S, v31.s[0] -sqrdmulh v23.4S, v17.4S, v2.s[0] -ldr q7, [x0, #0] -ldr q16, [x0, #128] -mul v1.4S, v1.4S,v13.s[0] -sub v12.4s, v7.4s, v18.4s -ldr q4, [x0, #16] -mul v27.4S, v27.4S,v13.s[0] -add v7.4s, v7.4s, v18.4s -ldr q18, [x0, #144] -mla v1.4S, v22.4S, v31.s[0] -sub v22.4s, v4.4s, v24.4s -ldr q19, [x0, #64] -mla v27.4S, v28.4S, v31.s[0] -add v4.4s, v4.4s, v24.4s -ldr q24, [x0, #192] -mul v15.4S, v15.4S,v20.s[0] -sub v28.4s, v19.4s, v8.4s -ldr q29, [x0, #80] -mul v17.4S, v17.4S,v20.s[0] -add v19.4s, v19.4s, v8.4s -ldr q8, [x0, #208] -mla v15.4S, v21.4S, v31.s[0] -mla v17.4S, v23.4S, v31.s[0] -sub v23.4s, v29.4s, v9.4s -sqrdmulh v21.4S, v4.4S, v25.s[1] -add v29.4s, v29.4s, v9.4s -mul v4.4S, v4.4S,v11.s[1] -sqrdmulh v9.4S, v22.4S, v25.s[2] -sub v30.4s, v16.4s, v1.4s -mul v22.4S, v22.4S,v11.s[2] -add v16.4s, v16.4s, v1.4s -sqrdmulh v25.4S, v29.4S, v0.s[1] -sub v11.4s, v18.4s, v27.4s -mul v29.4S, v29.4S,v6.s[1] -add v18.4s, v18.4s, v27.4s -sqrdmulh v27.4S, v23.4S, v0.s[2] -sub v1.4s, v24.4s, v15.4s -mul v23.4S, v23.4S,v6.s[2] -add v24.4s, v24.4s, v15.4s -mla v4.4S, v21.4S, v31.s[0] -sub v21.4s, v8.4s, v17.4s -ldr q0, [x0, #480] -sqrdmulh v6.4S, v18.4S, v5.s[1] -add v8.4s, v8.4s, v17.4s -mla v22.4S, v9.4S, v31.s[0] -ldr q9, [x0, #416] -sqrdmulh v17.4S, v11.4S, v5.s[2] -sub v15.4s, v7.4s, v4.4s -mla v29.4S, v25.4S, v31.s[0] -ldr q25, [x0, #288] -sqrdmulh v14.4S, v8.4S, v2.s[1] -add v7.4s, v7.4s, v4.4s -str q15, [x0, #16] -mla v23.4S, v27.4S, v31.s[0] -ldr q27, [x17, #+256] -ldr q15, [x17, #+272] -sqrdmulh v4.4S, v21.4S, v2.s[2] -sub v10.4s, v12.4s, v22.4s -str q7, [x0, #0] -mul v18.4S, v18.4S,v13.s[1] -add v12.4s, v12.4s, v22.4s -mul v11.4S, v11.4S,v13.s[2] -str q10, [x0, #48] -mla v18.4S, v6.4S, v31.s[0] -sub v6.4s, v19.4s, v29.4s -mla v11.4S, v17.4S, v31.s[0] -str q12, [x0, #32] -mul v8.4S, v8.4S,v20.s[1] -str q6, [x0, #80] -mul v21.4S, v21.4S,v20.s[2] -add v19.4s, v19.4s, v29.4s -str q19, [x0, #64] -mla v8.4S, v14.4S, v31.s[0] -sub v14.4s, v28.4s, v23.4s -str q14, [x0, #112] -mla v21.4S, v4.4S, v31.s[0] -add v28.4s, v28.4s, v23.4s -str q28, [x0, #96] -sqrdmulh v2.4S, v25.4S, v15.s[0] -sub v20.4s, v16.4s, v18.4s -mul v25.4S, v25.4S,v27.s[0] -str q20, [x0, #144] -ldr q20, [x0, #304] -sqrdmulh v28.4S, v20.4S, v15.s[0] -add v16.4s, v16.4s, v18.4s -mul v20.4S, v20.4S,v27.s[0] -str q16, [x0, #128] -ldr q16, [x17, #+288] -ldr q18, [x17, #+304] -ldr q23, [x0, #352] -sqrdmulh v4.4S, v23.4S, v18.s[0] -sub v14.4s, v30.4s, v11.4s -mul v23.4S, v23.4S,v16.s[0] -str q14, [x0, #176] -ldr q14, [x0, #368] -sqrdmulh v19.4S, v14.4S, v18.s[0] -add v30.4s, v30.4s, v11.4s -mul v14.4S, v14.4S,v16.s[0] -str q30, [x0, #160] -ldr q30, [x17, #+320] -ldr q11, [x17, #+336] -mla v25.4S, v2.4S, v31.s[0] -sub v2.4s, v24.4s, v8.4s -sqrdmulh v29.4S, v9.4S, v11.s[0] -str q2, [x0, #208] -ldr q2, [x0, #432] -mla v20.4S, v28.4S, v31.s[0] -add v24.4s, v24.4s, v8.4s -sqrdmulh v8.4S, v2.4S, v11.s[0] -str q24, [x0, #192] -ldr q24, [x17, #+352] -ldr q28, [x17, #+368] -mla v23.4S, v4.4S, v31.s[0] -sub v4.4s, v1.4s, v21.4s -sqrdmulh v6.4S, v0.4S, v28.s[0] -str q4, [x0, #240] -ldr q4, [x0, #496] -mla v14.4S, v19.4S, v31.s[0] -add v1.4s, v1.4s, v21.4s -sqrdmulh v21.4S, v4.4S, v28.s[0] -str q1, [x0, #224] -ldr q1, [x0, #256] -ldr q19, [x0, #384] -mul v9.4S, v9.4S,v30.s[0] -sub v5.4s, v1.4s, v25.4s -ldr q13, [x0, #272] -mul v2.4S, v2.4S,v30.s[0] -add v1.4s, v1.4s, v25.4s -ldr q25, [x0, #400] -mla v9.4S, v29.4S, v31.s[0] -sub v29.4s, v13.4s, v20.4s -ldr q12, [x0, #320] -mla v2.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v20.4s -ldr q20, [x0, #448] -mul v0.4S, v0.4S,v24.s[0] -sub v8.4s, v12.4s, v23.4s -ldr q17, [x0, #336] -mul v4.4S, v4.4S,v24.s[0] -add v12.4s, v12.4s, v23.4s -ldr q23, [x0, #464] -mla v0.4S, v6.4S, v31.s[0] -mla v4.4S, v21.4S, v31.s[0] -sub v21.4s, v17.4s, v14.4s -sqrdmulh v6.4S, v13.4S, v15.s[1] -add v17.4s, v17.4s, v14.4s -mul v13.4S, v13.4S,v27.s[1] -sqrdmulh v14.4S, v29.4S, v15.s[2] -sub v10.4s, v19.4s, v9.4s -mul v29.4S, v29.4S,v27.s[2] -add v19.4s, v19.4s, v9.4s -sqrdmulh v15.4S, v17.4S, v18.s[1] -sub v27.4s, v25.4s, v2.4s -mul v17.4S, v17.4S,v16.s[1] -add v25.4s, v25.4s, v2.4s -sqrdmulh v2.4S, v21.4S, v18.s[2] -sub v9.4s, v20.4s, v0.4s -mul v21.4S, v21.4S,v16.s[2] -add v20.4s, v20.4s, v0.4s -mla v13.4S, v6.4S, v31.s[0] -sub v6.4s, v23.4s, v4.4s -ldr q18, [x0, #736] -sqrdmulh v16.4S, v25.4S, v11.s[1] -add v23.4s, v23.4s, v4.4s -mla v29.4S, v14.4S, v31.s[0] -ldr q14, [x0, #672] -sqrdmulh v4.4S, v27.4S, v11.s[2] -sub v0.4s, v1.4s, v13.4s -mla v17.4S, v15.4S, v31.s[0] -ldr q15, [x0, #544] -sqrdmulh v22.4S, v23.4S, v28.s[1] -add v1.4s, v1.4s, v13.4s -str q0, [x0, #272] -mla v21.4S, v2.4S, v31.s[0] -ldr q2, [x17, #+384] -ldr q0, [x17, #+400] -sqrdmulh v13.4S, v6.4S, v28.s[2] -sub v7.4s, v5.4s, v29.4s -str q1, [x0, #256] -mul v25.4S, v25.4S,v30.s[1] -add v5.4s, v5.4s, v29.4s -mul v27.4S, v27.4S,v30.s[2] -str q7, [x0, #304] -mla v25.4S, v16.4S, v31.s[0] -sub v16.4s, v12.4s, v17.4s -mla v27.4S, v4.4S, v31.s[0] -str q5, [x0, #288] -mul v23.4S, v23.4S,v24.s[1] -str q16, [x0, #336] -mul v6.4S, v6.4S,v24.s[2] -add v12.4s, v12.4s, v17.4s -str q12, [x0, #320] -mla v23.4S, v22.4S, v31.s[0] -sub v22.4s, v8.4s, v21.4s -str q22, [x0, #368] -mla v6.4S, v13.4S, v31.s[0] -add v8.4s, v8.4s, v21.4s -str q8, [x0, #352] -sqrdmulh v28.4S, v15.4S, v0.s[0] -sub v24.4s, v19.4s, v25.4s -mul v15.4S, v15.4S,v2.s[0] -str q24, [x0, #400] -ldr q24, [x0, #560] -sqrdmulh v8.4S, v24.4S, v0.s[0] -add v19.4s, v19.4s, v25.4s -mul v24.4S, v24.4S,v2.s[0] -str q19, [x0, #384] -ldr q19, [x17, #+416] -ldr q25, [x17, #+432] -ldr q21, [x0, #608] -sqrdmulh v13.4S, v21.4S, v25.s[0] -sub v22.4s, v10.4s, v27.4s -mul v21.4S, v21.4S,v19.s[0] -str q22, [x0, #432] -ldr q22, [x0, #624] -sqrdmulh v12.4S, v22.4S, v25.s[0] -add v10.4s, v10.4s, v27.4s -mul v22.4S, v22.4S,v19.s[0] -str q10, [x0, #416] -ldr q10, [x17, #+448] -ldr q27, [x17, #+464] -mla v15.4S, v28.4S, v31.s[0] -sub v28.4s, v20.4s, v23.4s -sqrdmulh v17.4S, v14.4S, v27.s[0] -str q28, [x0, #464] -ldr q28, [x0, #688] -mla v24.4S, v8.4S, v31.s[0] -add v20.4s, v20.4s, v23.4s -sqrdmulh v23.4S, v28.4S, v27.s[0] -str q20, [x0, #448] -ldr q20, [x17, #+480] -ldr q8, [x17, #+496] -mla v21.4S, v13.4S, v31.s[0] -sub v13.4s, v9.4s, v6.4s -sqrdmulh v16.4S, v18.4S, v8.s[0] -str q13, [x0, #496] -ldr q13, [x0, #752] -mla v22.4S, v12.4S, v31.s[0] -add v9.4s, v9.4s, v6.4s -sqrdmulh v6.4S, v13.4S, v8.s[0] -str q9, [x0, #480] -ldr q9, [x0, #512] -ldr q12, [x0, #640] -mul v14.4S, v14.4S,v10.s[0] -sub v11.4s, v9.4s, v15.4s -ldr q30, [x0, #528] -mul v28.4S, v28.4S,v10.s[0] -add v9.4s, v9.4s, v15.4s -ldr q15, [x0, #656] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v30.4s, v24.4s -ldr q5, [x0, #576] -mla v28.4S, v23.4S, v31.s[0] -add v30.4s, v30.4s, v24.4s -ldr q24, [x0, #704] -mul v18.4S, v18.4S,v20.s[0] -sub v23.4s, v5.4s, v21.4s -ldr q4, [x0, #592] -mul v13.4S, v13.4S,v20.s[0] -add v5.4s, v5.4s, v21.4s -ldr q21, [x0, #720] -mla v18.4S, v16.4S, v31.s[0] -mla v13.4S, v6.4S, v31.s[0] -sub v6.4s, v4.4s, v22.4s -sqrdmulh v16.4S, v30.4S, v0.s[1] -add v4.4s, v4.4s, v22.4s -mul v30.4S, v30.4S,v2.s[1] -sqrdmulh v22.4S, v17.4S, v0.s[2] -sub v7.4s, v12.4s, v14.4s -mul v17.4S, v17.4S,v2.s[2] -add v12.4s, v12.4s, v14.4s -sqrdmulh v0.4S, v4.4S, v25.s[1] -sub v2.4s, v15.4s, v28.4s -mul v4.4S, v4.4S,v19.s[1] -add v15.4s, v15.4s, v28.4s -sqrdmulh v28.4S, v6.4S, v25.s[2] -sub v14.4s, v24.4s, v18.4s -mul v6.4S, v6.4S,v19.s[2] -add v24.4s, v24.4s, v18.4s -mla v30.4S, v16.4S, v31.s[0] -sub v16.4s, v21.4s, v13.4s -ldr q25, [x0, #992] -sqrdmulh v19.4S, v15.4S, v27.s[1] -add v21.4s, v21.4s, v13.4s -mla v17.4S, v22.4S, v31.s[0] -ldr q22, [x0, #928] -sqrdmulh v13.4S, v2.4S, v27.s[2] -sub v18.4s, v9.4s, v30.4s -mla v4.4S, v0.4S, v31.s[0] -ldr q0, [x0, #800] -sqrdmulh v29.4S, v21.4S, v8.s[1] -add v9.4s, v9.4s, v30.4s -str q18, [x0, #528] -mla v6.4S, v28.4S, v31.s[0] -ldr q28, [x17, #+512] -ldr q18, [x17, #+528] -sqrdmulh v30.4S, v16.4S, v8.s[2] -sub v1.4s, v11.4s, v17.4s -str q9, [x0, #512] -mul v15.4S, v15.4S,v10.s[1] -add v11.4s, v11.4s, v17.4s -mul v2.4S, v2.4S,v10.s[2] -str q1, [x0, #560] -mla v15.4S, v19.4S, v31.s[0] -sub v19.4s, v5.4s, v4.4s -mla v2.4S, v13.4S, v31.s[0] -str q11, [x0, #544] -mul v21.4S, v21.4S,v20.s[1] -str q19, [x0, #592] -mul v16.4S, v16.4S,v20.s[2] -add v5.4s, v5.4s, v4.4s -str q5, [x0, #576] -mla v21.4S, v29.4S, v31.s[0] -sub v29.4s, v23.4s, v6.4s -str q29, [x0, #624] -mla v16.4S, v30.4S, v31.s[0] -add v23.4s, v23.4s, v6.4s -str q23, [x0, #608] -sqrdmulh v8.4S, v0.4S, v18.s[0] -sub v20.4s, v12.4s, v15.4s -mul v0.4S, v0.4S,v28.s[0] -str q20, [x0, #656] -ldr q20, [x0, #816] -sqrdmulh v23.4S, v20.4S, v18.s[0] -add v12.4s, v12.4s, v15.4s -mul v20.4S, v20.4S,v28.s[0] -str q12, [x0, #640] -ldr q12, [x17, #+544] -ldr q15, [x17, #+560] -ldr q6, [x0, #864] -sqrdmulh v30.4S, v6.4S, v15.s[0] -sub v29.4s, v7.4s, v2.4s -mul v6.4S, v6.4S,v12.s[0] -str q29, [x0, #688] -ldr q29, [x0, #880] -sqrdmulh v5.4S, v29.4S, v15.s[0] -add v7.4s, v7.4s, v2.4s -mul v29.4S, v29.4S,v12.s[0] -str q7, [x0, #672] -ldr q7, [x17, #+576] -ldr q2, [x17, #+592] -mla v0.4S, v8.4S, v31.s[0] -sub v8.4s, v24.4s, v21.4s -sqrdmulh v4.4S, v22.4S, v2.s[0] -str q8, [x0, #720] -ldr q8, [x0, #944] -mla v20.4S, v23.4S, v31.s[0] -add v24.4s, v24.4s, v21.4s -sqrdmulh v21.4S, v8.4S, v2.s[0] -str q24, [x0, #704] -ldr q24, [x17, #+608] -ldr q23, [x17, #+624] -mla v6.4S, v30.4S, v31.s[0] -sub v30.4s, v14.4s, v16.4s -sqrdmulh v19.4S, v25.4S, v23.s[0] -str q30, [x0, #752] -ldr q30, [x0, #1008] -mla v29.4S, v5.4S, v31.s[0] -add v14.4s, v14.4s, v16.4s -sqrdmulh v16.4S, v30.4S, v23.s[0] -str q14, [x0, #736] -ldr q14, [x0, #768] -ldr q5, [x0, #896] -mul v22.4S, v22.4S,v7.s[0] -sub v27.4s, v14.4s, v0.4s -ldr q10, [x0, #784] -mul v8.4S, v8.4S,v7.s[0] -add v14.4s, v14.4s, v0.4s -ldr q0, [x0, #912] -mla v22.4S, v4.4S, v31.s[0] -sub v4.4s, v10.4s, v20.4s -ldr q11, [x0, #832] -mla v8.4S, v21.4S, v31.s[0] -add v10.4s, v10.4s, v20.4s -ldr q20, [x0, #960] -mul v25.4S, v25.4S,v24.s[0] -sub v21.4s, v11.4s, v6.4s -ldr q13, [x0, #848] -mul v30.4S, v30.4S,v24.s[0] -add v11.4s, v11.4s, v6.4s -ldr q6, [x0, #976] -mla v25.4S, v19.4S, v31.s[0] -mla v30.4S, v16.4S, v31.s[0] -sub v16.4s, v13.4s, v29.4s -sqrdmulh v19.4S, v10.4S, v18.s[1] -add v13.4s, v13.4s, v29.4s -mul v10.4S, v10.4S,v28.s[1] -sqrdmulh v29.4S, v4.4S, v18.s[2] -sub v1.4s, v5.4s, v22.4s -mul v4.4S, v4.4S,v28.s[2] -add v5.4s, v5.4s, v22.4s -sqrdmulh v18.4S, v13.4S, v15.s[1] -sub v28.4s, v0.4s, v8.4s -mul v13.4S, v13.4S,v12.s[1] -add v0.4s, v0.4s, v8.4s -sqrdmulh v8.4S, v16.4S, v15.s[2] -sub v22.4s, v20.4s, v25.4s -mul v16.4S, v16.4S,v12.s[2] -add v20.4s, v20.4s, v25.4s -mla v10.4S, v19.4S, v31.s[0] -sub v19.4s, v6.4s, v30.4s -sqrdmulh v15.4S, v0.4S, v2.s[1] -add v6.4s, v6.4s, v30.4s -mla v4.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v28.4S, v2.s[2] -sub v30.4s, v14.4s, v10.4s -mla v13.4S, v18.4S, v31.s[0] -sqrdmulh v18.4S, v6.4S, v23.s[1] -add v14.4s, v14.4s, v10.4s -str q30, [x0, #784] -mla v16.4S, v8.4S, v31.s[0] -sqrdmulh v8.4S, v19.4S, v23.s[2] -sub v30.4s, v27.4s, v4.4s -str q14, [x0, #768] -mul v0.4S, v0.4S,v7.s[1] -add v27.4s, v27.4s, v4.4s -mul v28.4S, v28.4S,v7.s[2] -str q30, [x0, #816] -mla v0.4S, v15.4S, v31.s[0] -sub v15.4s, v11.4s, v13.4s -mla v28.4S, v29.4S, v31.s[0] -str q27, [x0, #800] -mul v6.4S, v6.4S,v24.s[1] -str q15, [x0, #848] -mul v19.4S, v19.4S,v24.s[2] -add v11.4s, v11.4s, v13.4s -str q11, [x0, #832] -mla v6.4S, v18.4S, v31.s[0] -sub v18.4s, v21.4s, v16.4s -str q18, [x0, #880] -mla v19.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v16.4s -str q21, [x0, #864] -sub v23.4s, v5.4s, v0.4s -str q23, [x0, #912] -add v5.4s, v5.4s, v0.4s -str q5, [x0, #896] -sub v5.4s, v1.4s, v28.4s -str q5, [x0, #944] -add v1.4s, v1.4s, v28.4s -str q1, [x0, #928] -sub v1.4s, v20.4s, v6.4s -str q1, [x0, #976] -add v20.4s, v20.4s, v6.4s -str q20, [x0, #960] -sub v20.4s, v22.4s, v19.4s -str q20, [x0, #1008] -add v22.4s, v22.4s, v19.4s -str q22, [x0, #992] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1528 -// Instruction count: 1524 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_19_z4_7.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_19_z4_7.s deleted file mode 100644 index db8d7f4..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_19_z4_7.s +++ /dev/null @@ -1,1558 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_19_z4_7 -.global _ntt_u32_incomplete_neon_asm_var_4_2_19_z4_7 -ntt_u32_incomplete_neon_asm_var_4_2_19_z4_7: -_ntt_u32_incomplete_neon_asm_var_4_2_19_z4_7: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x0, #992] -sqrdmulh v27.4S, v28.4S, v29.s[0] -mul v28.4S, v28.4S,v30.s[0] -ldr q26, [x0, #928] -sqrdmulh v25.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -ldr q24, [x0, #864] -sqrdmulh v23.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -ldr q22, [x0, #800] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #736] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mla v28.4S, v27.4S, v31.s[0] -ldr q27, [x0, #672] -sqrdmulh v18.4S, v27.4S, v29.s[0] -mla v26.4S, v25.4S, v31.s[0] -ldr q25, [x0, #608] -sqrdmulh v17.4S, v25.4S, v29.s[0] -mla v24.4S, v23.4S, v31.s[0] -ldr q23, [x0, #544] -sqrdmulh v16.4S, v23.4S, v29.s[0] -mla v22.4S, v21.4S, v31.s[0] -ldr q21, [x0, #480] -ldr q3, [x0, #416] -mul v27.4S, v27.4S,v30.s[0] -mul v20.4S, v20.4S,v30.s[0] -sub v2.4s, v21.4s, v28.4s -add v21.4s, v21.4s, v28.4s -ldr q28, [x0, #352] -ldr q1, [x0, #288] -mla v27.4S, v18.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -sub v19.4s, v3.4s, v26.4s -add v3.4s, v3.4s, v26.4s -ldr q26, [x0, #224] -ldr q18, [x0, #160] -mul v23.4S, v23.4S,v30.s[0] -mul v25.4S, v25.4S,v30.s[0] -sub v0.4s, v28.4s, v24.4s -add v28.4s, v28.4s, v24.4s -ldr q24, [x0, #96] -ldr q15, [x0, #32] -mla v23.4S, v16.4S, v31.s[0] -mla v25.4S, v17.4S, v31.s[0] -sub v17.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v16.4s, v26.4s, v20.4s -nop -sqrdmulh v14.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -add v26.4s, v26.4s, v20.4s -nop -sqrdmulh v20.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v13.4s, v18.4s, v27.4s -add v18.4s, v18.4s, v27.4s -sqrdmulh v27.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v12.4s, v24.4s, v25.4s -add v24.4s, v24.4s, v25.4s -sqrdmulh v25.4S, v0.4S, v29.s[2] -mla v2.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v23.4s -sqrdmulh v11.4S, v17.4S, v29.s[2] -mla v19.4S, v14.4S, v31.s[0] -add v15.4s, v15.4s, v23.4s -nop -sqrdmulh v23.4S, v28.4S, v29.s[1] -mla v21.4S, v20.4S, v31.s[0] -nop -sqrdmulh v20.4S, v1.4S, v29.s[1] -mla v3.4S, v27.4S, v31.s[0] -nop -nop -ldr q27, [x17, #+32] -ldr q14, [x17, #+48] -mul v17.4S, v17.4S,v30.s[2] -mul v0.4S, v0.4S,v30.s[2] -sub v10.4s, v16.4s, v2.4s -add v16.4s, v16.4s, v2.4s -mla v17.4S, v11.4S, v31.s[0] -mla v0.4S, v25.4S, v31.s[0] -sub v25.4s, v13.4s, v19.4s -add v13.4s, v13.4s, v19.4s -mul v1.4S, v1.4S,v30.s[1] -mul v28.4S, v28.4S,v30.s[1] -sub v19.4s, v26.4s, v21.4s -add v26.4s, v26.4s, v21.4s -mla v1.4S, v20.4S, v31.s[0] -mla v28.4S, v23.4S, v31.s[0] -sub v23.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v10.4S, v14.s[3] -mul v10.4S, v10.4S,v27.s[3] -sub v20.4s, v12.4s, v0.4s -add v12.4s, v12.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v14.s[2] -mul v16.4S, v16.4S,v27.s[2] -sub v21.4s, v22.4s, v17.4s -add v22.4s, v22.4s, v17.4s -sqrdmulh v17.4S, v19.4S, v14.s[1] -mul v19.4S, v19.4S,v27.s[1] -sub v11.4s, v24.4s, v28.4s -add v24.4s, v24.4s, v28.4s -sqrdmulh v28.4S, v26.4S, v14.s[0] -mul v26.4S, v26.4S,v27.s[0] -sub v2.4s, v15.4s, v1.4s -add v15.4s, v15.4s, v1.4s -ldr q1, [x17, #+96] -ldr q9, [x17, #+112] -sqrdmulh v8.4S, v25.4S, v14.s[3] -mla v10.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v13.4S, v14.s[2] -mla v16.4S, v0.4S, v31.s[0] -nop -nop -sqrdmulh v0.4S, v23.4S, v14.s[1] -mla v19.4S, v17.4S, v31.s[0] -nop -nop -sqrdmulh v17.4S, v18.4S, v14.s[0] -mla v26.4S, v28.4S, v31.s[0] -nop -nop -ldr q28, [x17, #+64] -ldr q7, [x17, #+80] -mul v13.4S, v13.4S,v27.s[2] -mul v25.4S, v25.4S,v27.s[3] -sub v6.4s, v20.4s, v10.4s -add v20.4s, v20.4s, v10.4s -mla v13.4S, v3.4S, v31.s[0] -mla v25.4S, v8.4S, v31.s[0] -sub v8.4s, v12.4s, v16.4s -add v12.4s, v12.4s, v16.4s -mul v18.4S, v18.4S,v27.s[0] -mul v23.4S, v23.4S,v27.s[1] -sub v16.4s, v11.4s, v19.4s -add v11.4s, v11.4s, v19.4s -mla v18.4S, v17.4S, v31.s[0] -mla v23.4S, v0.4S, v31.s[0] -sub v0.4s, v24.4s, v26.4s -add v24.4s, v24.4s, v26.4s -sqrdmulh v26.4S, v6.4S, v9.s[3] -mul v6.4S, v6.4S,v1.s[3] -sub v17.4s, v21.4s, v25.4s -add v21.4s, v21.4s, v25.4s -sqrdmulh v25.4S, v20.4S, v9.s[2] -mul v20.4S, v20.4S,v1.s[2] -sub v19.4s, v22.4s, v13.4s -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v8.4S, v9.s[1] -mul v8.4S, v8.4S,v1.s[1] -sub v3.4s, v2.4s, v23.4s -add v2.4s, v2.4s, v23.4s -sqrdmulh v23.4S, v12.4S, v9.s[0] -mul v12.4S, v12.4S,v1.s[0] -sub v10.4s, v15.4s, v18.4s -add v15.4s, v15.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v7.s[3] -mla v6.4S, v26.4S, v31.s[0] -nop -nop -sqrdmulh v26.4S, v11.4S, v7.s[2] -mla v20.4S, v25.4S, v31.s[0] -sub v25.4s, v17.4s, v6.4s -str q25, [x0, #992] -sqrdmulh v25.4S, v0.4S, v7.s[1] -mla v8.4S, v13.4S, v31.s[0] -add v17.4s, v17.4s, v6.4s -str q17, [x0, #928] -sqrdmulh v17.4S, v24.4S, v7.s[0] -mla v12.4S, v23.4S, v31.s[0] -sub v23.4s, v21.4s, v20.4s -str q23, [x0, #864] -mul v11.4S, v11.4S,v28.s[2] -mul v16.4S, v16.4S,v28.s[3] -add v21.4s, v21.4s, v20.4s -sub v20.4s, v19.4s, v8.4s -mla v11.4S, v26.4S, v31.s[0] -mla v16.4S, v18.4S, v31.s[0] -add v19.4s, v19.4s, v8.4s -str q21, [x0, #800] -mul v24.4S, v24.4S,v28.s[0] -mul v0.4S, v0.4S,v28.s[1] -sub v21.4s, v22.4s, v12.4s -str q20, [x0, #736] -mla v24.4S, v17.4S, v31.s[0] -mla v0.4S, v25.4S, v31.s[0] -add v22.4s, v22.4s, v12.4s -str q19, [x0, #672] -ldr q19, [x0, #1008] -sqrdmulh v12.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -str q21, [x0, #608] -sub v21.4s, v3.4s, v16.4s -ldr q25, [x0, #944] -sqrdmulh v17.4S, v25.4S, v29.s[0] -mul v25.4S, v25.4S,v30.s[0] -str q22, [x0, #544] -add v3.4s, v3.4s, v16.4s -ldr q16, [x0, #880] -sqrdmulh v22.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -str q21, [x0, #480] -sub v21.4s, v2.4s, v11.4s -ldr q20, [x0, #816] -sqrdmulh v8.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -str q3, [x0, #416] -add v2.4s, v2.4s, v11.4s -ldr q11, [x0, #752] -sqrdmulh v3.4S, v11.4S, v29.s[0] -mla v19.4S, v12.4S, v31.s[0] -str q21, [x0, #352] -sub v21.4s, v10.4s, v0.4s -ldr q12, [x0, #688] -sqrdmulh v18.4S, v12.4S, v29.s[0] -mla v25.4S, v17.4S, v31.s[0] -str q2, [x0, #288] -add v10.4s, v10.4s, v0.4s -ldr q0, [x0, #624] -sqrdmulh v2.4S, v0.4S, v29.s[0] -mla v16.4S, v22.4S, v31.s[0] -str q21, [x0, #224] -sub v21.4s, v15.4s, v24.4s -ldr q22, [x0, #560] -sqrdmulh v17.4S, v22.4S, v29.s[0] -mla v20.4S, v8.4S, v31.s[0] -str q10, [x0, #160] -add v15.4s, v15.4s, v24.4s -ldr q24, [x0, #496] -ldr q10, [x0, #432] -mul v12.4S, v12.4S,v30.s[0] -mul v11.4S, v11.4S,v30.s[0] -sub v8.4s, v24.4s, v19.4s -add v24.4s, v24.4s, v19.4s -ldr q19, [x0, #368] -ldr q26, [x0, #304] -mla v12.4S, v18.4S, v31.s[0] -mla v11.4S, v3.4S, v31.s[0] -sub v3.4s, v10.4s, v25.4s -add v10.4s, v10.4s, v25.4s -ldr q25, [x0, #240] -ldr q18, [x0, #176] -mul v22.4S, v22.4S,v30.s[0] -mul v0.4S, v0.4S,v30.s[0] -sub v23.4s, v19.4s, v16.4s -add v19.4s, v19.4s, v16.4s -ldr q16, [x0, #112] -ldr q6, [x0, #48] -mla v22.4S, v17.4S, v31.s[0] -mla v0.4S, v2.4S, v31.s[0] -sub v2.4s, v26.4s, v20.4s -add v26.4s, v26.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v29.s[2] -mul v8.4S, v8.4S,v30.s[2] -sub v17.4s, v25.4s, v11.4s -nop -sqrdmulh v13.4S, v3.4S, v29.s[2] -mul v3.4S, v3.4S,v30.s[2] -add v25.4s, v25.4s, v11.4s -nop -sqrdmulh v11.4S, v24.4S, v29.s[1] -mul v24.4S, v24.4S,v30.s[1] -sub v5.4s, v18.4s, v12.4s -add v18.4s, v18.4s, v12.4s -sqrdmulh v12.4S, v10.4S, v29.s[1] -mul v10.4S, v10.4S,v30.s[1] -sub v4.4s, v16.4s, v0.4s -add v16.4s, v16.4s, v0.4s -sqrdmulh v0.4S, v23.4S, v29.s[2] -mla v8.4S, v20.4S, v31.s[0] -sub v20.4s, v6.4s, v22.4s -str q21, [x0, #96] -sqrdmulh v21.4S, v2.4S, v29.s[2] -mla v3.4S, v13.4S, v31.s[0] -add v6.4s, v6.4s, v22.4s -nop -sqrdmulh v22.4S, v19.4S, v29.s[1] -mla v24.4S, v11.4S, v31.s[0] -str q15, [x0, #32] -nop -sqrdmulh v15.4S, v26.4S, v29.s[1] -mla v10.4S, v12.4S, v31.s[0] -nop -nop -mul v2.4S, v2.4S,v30.s[2] -mul v23.4S, v23.4S,v30.s[2] -sub v12.4s, v17.4s, v8.4s -add v17.4s, v17.4s, v8.4s -mla v2.4S, v21.4S, v31.s[0] -mla v23.4S, v0.4S, v31.s[0] -sub v0.4s, v5.4s, v3.4s -add v5.4s, v5.4s, v3.4s -mul v26.4S, v26.4S,v30.s[1] -mul v19.4S, v19.4S,v30.s[1] -sub v3.4s, v25.4s, v24.4s -add v25.4s, v25.4s, v24.4s -mla v26.4S, v15.4S, v31.s[0] -mla v19.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v10.4s -add v18.4s, v18.4s, v10.4s -sqrdmulh v10.4S, v12.4S, v14.s[3] -mul v12.4S, v12.4S,v27.s[3] -sub v15.4s, v4.4s, v23.4s -add v4.4s, v4.4s, v23.4s -sqrdmulh v23.4S, v17.4S, v14.s[2] -mul v17.4S, v17.4S,v27.s[2] -sub v24.4s, v20.4s, v2.4s -add v20.4s, v20.4s, v2.4s -sqrdmulh v2.4S, v3.4S, v14.s[1] -mul v3.4S, v3.4S,v27.s[1] -sub v21.4s, v16.4s, v19.4s -add v16.4s, v16.4s, v19.4s -sqrdmulh v19.4S, v25.4S, v14.s[0] -mul v25.4S, v25.4S,v27.s[0] -sub v8.4s, v6.4s, v26.4s -add v6.4s, v6.4s, v26.4s -sqrdmulh v26.4S, v0.4S, v14.s[3] -mla v12.4S, v10.4S, v31.s[0] -nop -nop -sqrdmulh v10.4S, v5.4S, v14.s[2] -mla v17.4S, v23.4S, v31.s[0] -nop -nop -sqrdmulh v23.4S, v22.4S, v14.s[1] -mla v3.4S, v2.4S, v31.s[0] -nop -nop -sqrdmulh v2.4S, v18.4S, v14.s[0] -mla v25.4S, v19.4S, v31.s[0] -nop -nop -mul v5.4S, v5.4S,v27.s[2] -mul v0.4S, v0.4S,v27.s[3] -sub v19.4s, v15.4s, v12.4s -add v15.4s, v15.4s, v12.4s -mla v5.4S, v10.4S, v31.s[0] -mla v0.4S, v26.4S, v31.s[0] -sub v26.4s, v4.4s, v17.4s -add v4.4s, v4.4s, v17.4s -mul v18.4S, v18.4S,v27.s[0] -mul v22.4S, v22.4S,v27.s[1] -sub v17.4s, v21.4s, v3.4s -add v21.4s, v21.4s, v3.4s -mla v18.4S, v2.4S, v31.s[0] -mla v22.4S, v23.4S, v31.s[0] -sub v23.4s, v16.4s, v25.4s -add v16.4s, v16.4s, v25.4s -sqrdmulh v25.4S, v19.4S, v9.s[3] -mul v19.4S, v19.4S,v1.s[3] -sub v2.4s, v24.4s, v0.4s -add v24.4s, v24.4s, v0.4s -sqrdmulh v0.4S, v15.4S, v9.s[2] -mul v15.4S, v15.4S,v1.s[2] -sub v3.4s, v20.4s, v5.4s -add v20.4s, v20.4s, v5.4s -sqrdmulh v5.4S, v26.4S, v9.s[1] -mul v26.4S, v26.4S,v1.s[1] -sub v10.4s, v8.4s, v22.4s -add v8.4s, v8.4s, v22.4s -sqrdmulh v22.4S, v4.4S, v9.s[0] -mul v4.4S, v4.4S,v1.s[0] -sub v12.4s, v6.4s, v18.4s -add v6.4s, v6.4s, v18.4s -sqrdmulh v18.4S, v17.4S, v7.s[3] -mla v19.4S, v25.4S, v31.s[0] -nop -nop -sqrdmulh v25.4S, v21.4S, v7.s[2] -mla v15.4S, v0.4S, v31.s[0] -sub v0.4s, v2.4s, v19.4s -str q0, [x0, #1008] -sqrdmulh v0.4S, v23.4S, v7.s[1] -mla v26.4S, v5.4S, v31.s[0] -add v2.4s, v2.4s, v19.4s -str q2, [x0, #944] -sqrdmulh v2.4S, v16.4S, v7.s[0] -mla v4.4S, v22.4S, v31.s[0] -sub v22.4s, v24.4s, v15.4s -str q22, [x0, #880] -mul v21.4S, v21.4S,v28.s[2] -mul v17.4S, v17.4S,v28.s[3] -add v24.4s, v24.4s, v15.4s -sub v15.4s, v3.4s, v26.4s -mla v21.4S, v25.4S, v31.s[0] -mla v17.4S, v18.4S, v31.s[0] -add v3.4s, v3.4s, v26.4s -str q24, [x0, #816] -mul v16.4S, v16.4S,v28.s[0] -mul v23.4S, v23.4S,v28.s[1] -sub v24.4s, v20.4s, v4.4s -str q15, [x0, #752] -mla v16.4S, v2.4S, v31.s[0] -mla v23.4S, v0.4S, v31.s[0] -add v20.4s, v20.4s, v4.4s -str q3, [x0, #688] -ldr q3, [x0, #960] -sqrdmulh v4.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -str q24, [x0, #624] -sub v24.4s, v10.4s, v17.4s -ldr q0, [x0, #896] -sqrdmulh v2.4S, v0.4S, v29.s[0] -mul v0.4S, v0.4S,v30.s[0] -str q20, [x0, #560] -add v10.4s, v10.4s, v17.4s -ldr q17, [x0, #832] -sqrdmulh v20.4S, v17.4S, v29.s[0] -mul v17.4S, v17.4S,v30.s[0] -str q24, [x0, #496] -sub v24.4s, v8.4s, v21.4s -ldr q15, [x0, #768] -sqrdmulh v26.4S, v15.4S, v29.s[0] -mul v15.4S, v15.4S,v30.s[0] -str q10, [x0, #432] -add v8.4s, v8.4s, v21.4s -ldr q21, [x0, #704] -sqrdmulh v10.4S, v21.4S, v29.s[0] -mla v3.4S, v4.4S, v31.s[0] -str q24, [x0, #368] -sub v24.4s, v12.4s, v23.4s -ldr q4, [x0, #640] -sqrdmulh v18.4S, v4.4S, v29.s[0] -mla v0.4S, v2.4S, v31.s[0] -str q8, [x0, #304] -add v12.4s, v12.4s, v23.4s -ldr q23, [x0, #576] -sqrdmulh v8.4S, v23.4S, v29.s[0] -mla v17.4S, v20.4S, v31.s[0] -str q24, [x0, #240] -sub v24.4s, v6.4s, v16.4s -ldr q20, [x0, #512] -sqrdmulh v2.4S, v20.4S, v29.s[0] -mla v15.4S, v26.4S, v31.s[0] -str q12, [x0, #176] -add v6.4s, v6.4s, v16.4s -ldr q16, [x0, #448] -ldr q12, [x0, #384] -mul v4.4S, v4.4S,v30.s[0] -mul v21.4S, v21.4S,v30.s[0] -sub v26.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -ldr q3, [x0, #320] -ldr q25, [x0, #256] -mla v4.4S, v18.4S, v31.s[0] -mla v21.4S, v10.4S, v31.s[0] -sub v10.4s, v12.4s, v0.4s -add v12.4s, v12.4s, v0.4s -ldr q0, [x0, #192] -ldr q18, [x0, #128] -mul v20.4S, v20.4S,v30.s[0] -mul v23.4S, v23.4S,v30.s[0] -sub v22.4s, v3.4s, v17.4s -add v3.4s, v3.4s, v17.4s -ldr q17, [x0, #64] -ldr q19, [x0, #0] -mla v20.4S, v2.4S, v31.s[0] -mla v23.4S, v8.4S, v31.s[0] -sub v8.4s, v25.4s, v15.4s -add v25.4s, v25.4s, v15.4s -sqrdmulh v15.4S, v26.4S, v29.s[2] -mul v26.4S, v26.4S,v30.s[2] -sub v2.4s, v0.4s, v21.4s -nop -sqrdmulh v5.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -add v0.4s, v0.4s, v21.4s -nop -sqrdmulh v21.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v11.4s, v18.4s, v4.4s -add v18.4s, v18.4s, v4.4s -sqrdmulh v4.4S, v12.4S, v29.s[1] -mul v12.4S, v12.4S,v30.s[1] -sub v13.4s, v17.4s, v23.4s -add v17.4s, v17.4s, v23.4s -sqrdmulh v23.4S, v22.4S, v29.s[2] -mla v26.4S, v15.4S, v31.s[0] -sub v15.4s, v19.4s, v20.4s -str q24, [x0, #112] -sqrdmulh v24.4S, v8.4S, v29.s[2] -mla v10.4S, v5.4S, v31.s[0] -add v19.4s, v19.4s, v20.4s -nop -sqrdmulh v20.4S, v3.4S, v29.s[1] -mla v16.4S, v21.4S, v31.s[0] -str q6, [x0, #48] -nop -sqrdmulh v6.4S, v25.4S, v29.s[1] -mla v12.4S, v4.4S, v31.s[0] -nop -nop -mul v8.4S, v8.4S,v30.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v4.4s, v2.4s, v26.4s -add v2.4s, v2.4s, v26.4s -mla v8.4S, v24.4S, v31.s[0] -mla v22.4S, v23.4S, v31.s[0] -sub v23.4s, v11.4s, v10.4s -add v11.4s, v11.4s, v10.4s -mul v25.4S, v25.4S,v30.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v10.4s, v0.4s, v16.4s -add v0.4s, v0.4s, v16.4s -mla v25.4S, v6.4S, v31.s[0] -mla v3.4S, v20.4S, v31.s[0] -sub v20.4s, v18.4s, v12.4s -add v18.4s, v18.4s, v12.4s -sqrdmulh v12.4S, v4.4S, v14.s[3] -mul v4.4S, v4.4S,v27.s[3] -sub v6.4s, v13.4s, v22.4s -add v13.4s, v13.4s, v22.4s -sqrdmulh v22.4S, v2.4S, v14.s[2] -mul v2.4S, v2.4S,v27.s[2] -sub v16.4s, v15.4s, v8.4s -add v15.4s, v15.4s, v8.4s -sqrdmulh v8.4S, v10.4S, v14.s[1] -mul v10.4S, v10.4S,v27.s[1] -sub v24.4s, v17.4s, v3.4s -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v0.4S, v14.s[0] -mul v0.4S, v0.4S,v27.s[0] -sub v26.4s, v19.4s, v25.4s -add v19.4s, v19.4s, v25.4s -sqrdmulh v25.4S, v23.4S, v14.s[3] -mla v4.4S, v12.4S, v31.s[0] -nop -nop -sqrdmulh v12.4S, v11.4S, v14.s[2] -mla v2.4S, v22.4S, v31.s[0] -nop -nop -sqrdmulh v22.4S, v20.4S, v14.s[1] -mla v10.4S, v8.4S, v31.s[0] -nop -nop -sqrdmulh v8.4S, v18.4S, v14.s[0] -mla v0.4S, v3.4S, v31.s[0] -nop -nop -mul v11.4S, v11.4S,v27.s[2] -mul v23.4S, v23.4S,v27.s[3] -sub v3.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -mla v11.4S, v12.4S, v31.s[0] -mla v23.4S, v25.4S, v31.s[0] -sub v25.4s, v13.4s, v2.4s -add v13.4s, v13.4s, v2.4s -mul v18.4S, v18.4S,v27.s[0] -mul v20.4S, v20.4S,v27.s[1] -sub v2.4s, v24.4s, v10.4s -add v24.4s, v24.4s, v10.4s -mla v18.4S, v8.4S, v31.s[0] -mla v20.4S, v22.4S, v31.s[0] -sub v22.4s, v17.4s, v0.4s -add v17.4s, v17.4s, v0.4s -sqrdmulh v0.4S, v3.4S, v9.s[3] -mul v3.4S, v3.4S,v1.s[3] -sub v8.4s, v16.4s, v23.4s -add v16.4s, v16.4s, v23.4s -sqrdmulh v23.4S, v6.4S, v9.s[2] -mul v6.4S, v6.4S,v1.s[2] -sub v10.4s, v15.4s, v11.4s -add v15.4s, v15.4s, v11.4s -sqrdmulh v11.4S, v25.4S, v9.s[1] -mul v25.4S, v25.4S,v1.s[1] -sub v12.4s, v26.4s, v20.4s -add v26.4s, v26.4s, v20.4s -sqrdmulh v20.4S, v13.4S, v9.s[0] -mul v13.4S, v13.4S,v1.s[0] -sub v4.4s, v19.4s, v18.4s -add v19.4s, v19.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v7.s[3] -mla v3.4S, v0.4S, v31.s[0] -nop -nop -sqrdmulh v0.4S, v24.4S, v7.s[2] -mla v6.4S, v23.4S, v31.s[0] -sub v23.4s, v8.4s, v3.4s -str q23, [x0, #960] -sqrdmulh v23.4S, v22.4S, v7.s[1] -mla v25.4S, v11.4S, v31.s[0] -add v8.4s, v8.4s, v3.4s -str q8, [x0, #896] -sqrdmulh v8.4S, v17.4S, v7.s[0] -mla v13.4S, v20.4S, v31.s[0] -sub v20.4s, v16.4s, v6.4s -str q20, [x0, #832] -mul v24.4S, v24.4S,v28.s[2] -mul v2.4S, v2.4S,v28.s[3] -add v16.4s, v16.4s, v6.4s -sub v6.4s, v10.4s, v25.4s -mla v24.4S, v0.4S, v31.s[0] -mla v2.4S, v18.4S, v31.s[0] -add v10.4s, v10.4s, v25.4s -str q16, [x0, #768] -mul v17.4S, v17.4S,v28.s[0] -mul v22.4S, v22.4S,v28.s[1] -sub v16.4s, v15.4s, v13.4s -str q6, [x0, #704] -mla v17.4S, v8.4S, v31.s[0] -mla v22.4S, v23.4S, v31.s[0] -add v15.4s, v15.4s, v13.4s -str q10, [x0, #640] -ldr q10, [x0, #976] -sqrdmulh v13.4S, v10.4S, v29.s[0] -mul v10.4S, v10.4S,v30.s[0] -str q16, [x0, #576] -sub v16.4s, v12.4s, v2.4s -ldr q23, [x0, #912] -sqrdmulh v8.4S, v23.4S, v29.s[0] -mul v23.4S, v23.4S,v30.s[0] -str q15, [x0, #512] -add v12.4s, v12.4s, v2.4s -ldr q2, [x0, #848] -sqrdmulh v15.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -str q16, [x0, #448] -sub v16.4s, v26.4s, v24.4s -ldr q6, [x0, #784] -sqrdmulh v25.4S, v6.4S, v29.s[0] -mul v6.4S, v6.4S,v30.s[0] -str q12, [x0, #384] -add v26.4s, v26.4s, v24.4s -ldr q24, [x0, #720] -sqrdmulh v12.4S, v24.4S, v29.s[0] -mla v10.4S, v13.4S, v31.s[0] -str q16, [x0, #320] -sub v16.4s, v4.4s, v22.4s -ldr q13, [x0, #656] -sqrdmulh v18.4S, v13.4S, v29.s[0] -mla v23.4S, v8.4S, v31.s[0] -str q26, [x0, #256] -add v4.4s, v4.4s, v22.4s -ldr q22, [x0, #592] -sqrdmulh v26.4S, v22.4S, v29.s[0] -mla v2.4S, v15.4S, v31.s[0] -str q16, [x0, #192] -sub v16.4s, v19.4s, v17.4s -ldr q15, [x0, #528] -sqrdmulh v8.4S, v15.4S, v29.s[0] -mla v6.4S, v25.4S, v31.s[0] -str q4, [x0, #128] -add v19.4s, v19.4s, v17.4s -ldr q17, [x0, #464] -ldr q4, [x0, #400] -mul v13.4S, v13.4S,v30.s[0] -mul v24.4S, v24.4S,v30.s[0] -sub v25.4s, v17.4s, v10.4s -add v17.4s, v17.4s, v10.4s -ldr q10, [x0, #336] -ldr q0, [x0, #272] -mla v13.4S, v18.4S, v31.s[0] -mla v24.4S, v12.4S, v31.s[0] -sub v12.4s, v4.4s, v23.4s -add v4.4s, v4.4s, v23.4s -ldr q23, [x0, #208] -ldr q18, [x0, #144] -mul v15.4S, v15.4S,v30.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v20.4s, v10.4s, v2.4s -add v10.4s, v10.4s, v2.4s -ldr q2, [x0, #80] -ldr q3, [x0, #16] -mla v15.4S, v8.4S, v31.s[0] -mla v22.4S, v26.4S, v31.s[0] -sub v26.4s, v0.4s, v6.4s -add v0.4s, v0.4s, v6.4s -sqrdmulh v6.4S, v25.4S, v29.s[2] -mul v25.4S, v25.4S,v30.s[2] -sub v8.4s, v23.4s, v24.4s -nop -sqrdmulh v11.4S, v12.4S, v29.s[2] -mul v12.4S, v12.4S,v30.s[2] -add v23.4s, v23.4s, v24.4s -nop -sqrdmulh v24.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v21.4s, v18.4s, v13.4s -add v18.4s, v18.4s, v13.4s -sqrdmulh v13.4S, v4.4S, v29.s[1] -mul v4.4S, v4.4S,v30.s[1] -sub v5.4s, v2.4s, v22.4s -add v2.4s, v2.4s, v22.4s -sqrdmulh v22.4S, v20.4S, v29.s[2] -mla v25.4S, v6.4S, v31.s[0] -sub v6.4s, v3.4s, v15.4s -str q16, [x0, #64] -sqrdmulh v16.4S, v26.4S, v29.s[2] -mla v12.4S, v11.4S, v31.s[0] -add v3.4s, v3.4s, v15.4s -nop -sqrdmulh v15.4S, v10.4S, v29.s[1] -mla v17.4S, v24.4S, v31.s[0] -str q19, [x0, #0] -nop -sqrdmulh v19.4S, v0.4S, v29.s[1] -mla v4.4S, v13.4S, v31.s[0] -nop -nop -mul v26.4S, v26.4S,v30.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v13.4s, v8.4s, v25.4s -add v8.4s, v8.4s, v25.4s -mla v26.4S, v16.4S, v31.s[0] -mla v20.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -mul v0.4S, v0.4S,v30.s[1] -mul v10.4S, v10.4S,v30.s[1] -sub v12.4s, v23.4s, v17.4s -add v23.4s, v23.4s, v17.4s -mla v0.4S, v19.4S, v31.s[0] -mla v10.4S, v15.4S, v31.s[0] -sub v15.4s, v18.4s, v4.4s -add v18.4s, v18.4s, v4.4s -sqrdmulh v29.4S, v13.4S, v14.s[3] -mul v13.4S, v13.4S,v27.s[3] -sub v30.4s, v5.4s, v20.4s -add v5.4s, v5.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v14.s[2] -mul v8.4S, v8.4S,v27.s[2] -sub v4.4s, v6.4s, v26.4s -add v6.4s, v6.4s, v26.4s -sqrdmulh v26.4S, v12.4S, v14.s[1] -mul v12.4S, v12.4S,v27.s[1] -sub v19.4s, v2.4s, v10.4s -add v2.4s, v2.4s, v10.4s -sqrdmulh v10.4S, v23.4S, v14.s[0] -mul v23.4S, v23.4S,v27.s[0] -sub v17.4s, v3.4s, v0.4s -add v3.4s, v3.4s, v0.4s -sqrdmulh v0.4S, v22.4S, v14.s[3] -mla v13.4S, v29.4S, v31.s[0] -nop -nop -sqrdmulh v29.4S, v21.4S, v14.s[2] -mla v8.4S, v20.4S, v31.s[0] -nop -nop -sqrdmulh v20.4S, v15.4S, v14.s[1] -mla v12.4S, v26.4S, v31.s[0] -nop -nop -sqrdmulh v26.4S, v18.4S, v14.s[0] -mla v23.4S, v10.4S, v31.s[0] -nop -nop -mul v21.4S, v21.4S,v27.s[2] -mul v22.4S, v22.4S,v27.s[3] -sub v10.4s, v30.4s, v13.4s -add v30.4s, v30.4s, v13.4s -mla v21.4S, v29.4S, v31.s[0] -mla v22.4S, v0.4S, v31.s[0] -sub v0.4s, v5.4s, v8.4s -add v5.4s, v5.4s, v8.4s -mul v18.4S, v18.4S,v27.s[0] -mul v15.4S, v15.4S,v27.s[1] -sub v8.4s, v19.4s, v12.4s -add v19.4s, v19.4s, v12.4s -mla v18.4S, v26.4S, v31.s[0] -mla v15.4S, v20.4S, v31.s[0] -sub v20.4s, v2.4s, v23.4s -add v2.4s, v2.4s, v23.4s -sqrdmulh v14.4S, v10.4S, v9.s[3] -mul v10.4S, v10.4S,v1.s[3] -sub v27.4s, v4.4s, v22.4s -add v4.4s, v4.4s, v22.4s -sqrdmulh v22.4S, v30.4S, v9.s[2] -mul v30.4S, v30.4S,v1.s[2] -sub v23.4s, v6.4s, v21.4s -add v6.4s, v6.4s, v21.4s -sqrdmulh v21.4S, v0.4S, v9.s[1] -mul v0.4S, v0.4S,v1.s[1] -sub v26.4s, v17.4s, v15.4s -add v17.4s, v17.4s, v15.4s -sqrdmulh v15.4S, v5.4S, v9.s[0] -mul v5.4S, v5.4S,v1.s[0] -sub v12.4s, v3.4s, v18.4s -add v3.4s, v3.4s, v18.4s -sqrdmulh v9.4S, v8.4S, v7.s[3] -mla v10.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v19.4S, v7.s[2] -mla v30.4S, v22.4S, v31.s[0] -sub v22.4s, v27.4s, v10.4s -str q22, [x0, #976] -sqrdmulh v22.4S, v20.4S, v7.s[1] -mla v0.4S, v21.4S, v31.s[0] -add v27.4s, v27.4s, v10.4s -str q27, [x0, #912] -sqrdmulh v27.4S, v2.4S, v7.s[0] -mla v5.4S, v15.4S, v31.s[0] -sub v15.4s, v4.4s, v30.4s -str q15, [x0, #848] -mul v19.4S, v19.4S,v28.s[2] -mul v8.4S, v8.4S,v28.s[3] -add v4.4s, v4.4s, v30.4s -sub v30.4s, v23.4s, v0.4s -mla v19.4S, v14.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -add v23.4s, v23.4s, v0.4s -str q4, [x0, #784] -mul v2.4S, v2.4S,v28.s[0] -mul v20.4S, v20.4S,v28.s[1] -sub v4.4s, v6.4s, v5.4s -str q30, [x0, #720] -mla v2.4S, v27.4S, v31.s[0] -mla v20.4S, v22.4S, v31.s[0] -add v6.4s, v6.4s, v5.4s -str q23, [x0, #656] -str q4, [x0, #592] -sub v4.4s, v26.4s, v8.4s -str q6, [x0, #528] -add v26.4s, v26.4s, v8.4s -str q4, [x0, #464] -sub v4.4s, v17.4s, v19.4s -str q26, [x0, #400] -add v17.4s, v17.4s, v19.4s -str q4, [x0, #336] -sub v4.4s, v12.4s, v20.4s -str q17, [x0, #272] -add v12.4s, v12.4s, v20.4s -str q4, [x0, #208] -sub v4.4s, v3.4s, v2.4s -str q12, [x0, #144] -add v3.4s, v3.4s, v2.4s -str q4, [x0, #80] -str q3, [x0, #16] -ldr q11, [x0, #224] -ldr q24, [x0, #160] -ldr q25, [x0, #32] -ldr q16, [x17, #+128] -ldr q13, [x17, #+144] -sqrdmulh v29.4S, v25.4S, v13.s[0] -mul v25.4S, v25.4S,v16.s[0] -ldr q18, [x0, #48] -sqrdmulh v1.4S, v18.4S, v13.s[0] -mul v18.4S, v18.4S,v16.s[0] -ldr q21, [x17, #+160] -ldr q10, [x17, #+176] -ldr q15, [x0, #96] -sqrdmulh v14.4S, v15.4S, v10.s[0] -mul v15.4S, v15.4S,v21.s[0] -ldr q9, [x0, #112] -sqrdmulh v0.4S, v9.4S, v10.s[0] -mul v9.4S, v9.4S,v21.s[0] -ldr q30, [x17, #+192] -ldr q27, [x17, #+208] -mla v25.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v24.4S, v27.s[0] -ldr q22, [x0, #176] -mla v18.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v22.4S, v27.s[0] -ldr q5, [x17, #+224] -ldr q23, [x17, #+240] -mla v15.4S, v14.4S, v31.s[0] -sqrdmulh v14.4S, v11.4S, v23.s[0] -ldr q28, [x0, #240] -mla v9.4S, v0.4S, v31.s[0] -sqrdmulh v0.4S, v28.4S, v23.s[0] -ldr q7, [x0, #0] -ldr q6, [x0, #128] -mul v24.4S, v24.4S,v30.s[0] -sub v8.4s, v7.4s, v25.4s -ldr q26, [x0, #16] -mul v22.4S, v22.4S,v30.s[0] -add v7.4s, v7.4s, v25.4s -ldr q25, [x0, #144] -mla v24.4S, v29.4S, v31.s[0] -sub v29.4s, v26.4s, v18.4s -ldr q19, [x0, #64] -mla v22.4S, v1.4S, v31.s[0] -add v26.4s, v26.4s, v18.4s -ldr q18, [x0, #192] -mul v11.4S, v11.4S,v5.s[0] -sub v1.4s, v19.4s, v15.4s -ldr q17, [x0, #80] -mul v28.4S, v28.4S,v5.s[0] -add v19.4s, v19.4s, v15.4s -ldr q15, [x0, #208] -mla v11.4S, v14.4S, v31.s[0] -mla v28.4S, v0.4S, v31.s[0] -sub v0.4s, v17.4s, v9.4s -sqrdmulh v14.4S, v26.4S, v13.s[1] -add v17.4s, v17.4s, v9.4s -mul v26.4S, v26.4S,v16.s[1] -sqrdmulh v9.4S, v29.4S, v13.s[2] -sub v20.4s, v6.4s, v24.4s -mul v29.4S, v29.4S,v16.s[2] -add v6.4s, v6.4s, v24.4s -sqrdmulh v13.4S, v17.4S, v10.s[1] -sub v16.4s, v25.4s, v22.4s -mul v17.4S, v17.4S,v21.s[1] -add v25.4s, v25.4s, v22.4s -sqrdmulh v22.4S, v0.4S, v10.s[2] -sub v24.4s, v18.4s, v11.4s -mul v0.4S, v0.4S,v21.s[2] -add v18.4s, v18.4s, v11.4s -mla v26.4S, v14.4S, v31.s[0] -sub v14.4s, v15.4s, v28.4s -ldr q10, [x0, #480] -sqrdmulh v21.4S, v25.4S, v27.s[1] -add v15.4s, v15.4s, v28.4s -mla v29.4S, v9.4S, v31.s[0] -ldr q9, [x0, #416] -sqrdmulh v28.4S, v16.4S, v27.s[2] -sub v11.4s, v7.4s, v26.4s -mla v17.4S, v13.4S, v31.s[0] -ldr q13, [x0, #288] -sqrdmulh v12.4S, v15.4S, v23.s[1] -add v7.4s, v7.4s, v26.4s -str q11, [x0, #16] -mla v0.4S, v22.4S, v31.s[0] -ldr q22, [x17, #+256] -ldr q11, [x17, #+272] -sqrdmulh v26.4S, v14.4S, v23.s[2] -sub v2.4s, v8.4s, v29.4s -str q7, [x0, #0] -mul v25.4S, v25.4S,v30.s[1] -add v8.4s, v8.4s, v29.4s -mul v16.4S, v16.4S,v30.s[2] -str q2, [x0, #48] -mla v25.4S, v21.4S, v31.s[0] -sub v21.4s, v19.4s, v17.4s -mla v16.4S, v28.4S, v31.s[0] -str q8, [x0, #32] -mul v15.4S, v15.4S,v5.s[1] -str q21, [x0, #80] -mul v14.4S, v14.4S,v5.s[2] -add v19.4s, v19.4s, v17.4s -str q19, [x0, #64] -mla v15.4S, v12.4S, v31.s[0] -sub v12.4s, v1.4s, v0.4s -str q12, [x0, #112] -mla v14.4S, v26.4S, v31.s[0] -add v1.4s, v1.4s, v0.4s -str q1, [x0, #96] -sqrdmulh v23.4S, v13.4S, v11.s[0] -sub v5.4s, v6.4s, v25.4s -mul v13.4S, v13.4S,v22.s[0] -str q5, [x0, #144] -ldr q5, [x0, #304] -sqrdmulh v1.4S, v5.4S, v11.s[0] -add v6.4s, v6.4s, v25.4s -mul v5.4S, v5.4S,v22.s[0] -str q6, [x0, #128] -ldr q6, [x17, #+288] -ldr q25, [x17, #+304] -ldr q0, [x0, #352] -sqrdmulh v26.4S, v0.4S, v25.s[0] -sub v12.4s, v20.4s, v16.4s -mul v0.4S, v0.4S,v6.s[0] -str q12, [x0, #176] -ldr q12, [x0, #368] -sqrdmulh v19.4S, v12.4S, v25.s[0] -add v20.4s, v20.4s, v16.4s -mul v12.4S, v12.4S,v6.s[0] -str q20, [x0, #160] -ldr q20, [x17, #+320] -ldr q16, [x17, #+336] -mla v13.4S, v23.4S, v31.s[0] -sub v23.4s, v18.4s, v15.4s -sqrdmulh v17.4S, v9.4S, v16.s[0] -str q23, [x0, #208] -ldr q23, [x0, #432] -mla v5.4S, v1.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v23.4S, v16.s[0] -str q18, [x0, #192] -ldr q18, [x17, #+352] -ldr q1, [x17, #+368] -mla v0.4S, v26.4S, v31.s[0] -sub v26.4s, v24.4s, v14.4s -sqrdmulh v21.4S, v10.4S, v1.s[0] -str q26, [x0, #240] -ldr q26, [x0, #496] -mla v12.4S, v19.4S, v31.s[0] -add v24.4s, v24.4s, v14.4s -sqrdmulh v14.4S, v26.4S, v1.s[0] -str q24, [x0, #224] -ldr q24, [x0, #256] -ldr q19, [x0, #384] -mul v9.4S, v9.4S,v20.s[0] -sub v27.4s, v24.4s, v13.4s -ldr q30, [x0, #272] -mul v23.4S, v23.4S,v20.s[0] -add v24.4s, v24.4s, v13.4s -ldr q13, [x0, #400] -mla v9.4S, v17.4S, v31.s[0] -sub v17.4s, v30.4s, v5.4s -ldr q8, [x0, #320] -mla v23.4S, v15.4S, v31.s[0] -add v30.4s, v30.4s, v5.4s -ldr q5, [x0, #448] -mul v10.4S, v10.4S,v18.s[0] -sub v15.4s, v8.4s, v0.4s -ldr q28, [x0, #336] -mul v26.4S, v26.4S,v18.s[0] -add v8.4s, v8.4s, v0.4s -ldr q0, [x0, #464] -mla v10.4S, v21.4S, v31.s[0] -mla v26.4S, v14.4S, v31.s[0] -sub v14.4s, v28.4s, v12.4s -sqrdmulh v21.4S, v30.4S, v11.s[1] -add v28.4s, v28.4s, v12.4s -mul v30.4S, v30.4S,v22.s[1] -sqrdmulh v12.4S, v17.4S, v11.s[2] -sub v2.4s, v19.4s, v9.4s -mul v17.4S, v17.4S,v22.s[2] -add v19.4s, v19.4s, v9.4s -sqrdmulh v11.4S, v28.4S, v25.s[1] -sub v22.4s, v13.4s, v23.4s -mul v28.4S, v28.4S,v6.s[1] -add v13.4s, v13.4s, v23.4s -sqrdmulh v23.4S, v14.4S, v25.s[2] -sub v9.4s, v5.4s, v10.4s -mul v14.4S, v14.4S,v6.s[2] -add v5.4s, v5.4s, v10.4s -mla v30.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v26.4s -ldr q25, [x0, #736] -sqrdmulh v6.4S, v13.4S, v16.s[1] -add v0.4s, v0.4s, v26.4s -mla v17.4S, v12.4S, v31.s[0] -ldr q12, [x0, #672] -sqrdmulh v26.4S, v22.4S, v16.s[2] -sub v10.4s, v24.4s, v30.4s -mla v28.4S, v11.4S, v31.s[0] -ldr q11, [x0, #544] -sqrdmulh v29.4S, v0.4S, v1.s[1] -add v24.4s, v24.4s, v30.4s -str q10, [x0, #272] -mla v14.4S, v23.4S, v31.s[0] -ldr q23, [x17, #+384] -ldr q10, [x17, #+400] -sqrdmulh v30.4S, v21.4S, v1.s[2] -sub v7.4s, v27.4s, v17.4s -str q24, [x0, #256] -mul v13.4S, v13.4S,v20.s[1] -add v27.4s, v27.4s, v17.4s -mul v22.4S, v22.4S,v20.s[2] -str q7, [x0, #304] -mla v13.4S, v6.4S, v31.s[0] -sub v6.4s, v8.4s, v28.4s -mla v22.4S, v26.4S, v31.s[0] -str q27, [x0, #288] -mul v0.4S, v0.4S,v18.s[1] -str q6, [x0, #336] -mul v21.4S, v21.4S,v18.s[2] -add v8.4s, v8.4s, v28.4s -str q8, [x0, #320] -mla v0.4S, v29.4S, v31.s[0] -sub v29.4s, v15.4s, v14.4s -str q29, [x0, #368] -mla v21.4S, v30.4S, v31.s[0] -add v15.4s, v15.4s, v14.4s -str q15, [x0, #352] -sqrdmulh v1.4S, v11.4S, v10.s[0] -sub v18.4s, v19.4s, v13.4s -mul v11.4S, v11.4S,v23.s[0] -str q18, [x0, #400] -ldr q18, [x0, #560] -sqrdmulh v15.4S, v18.4S, v10.s[0] -add v19.4s, v19.4s, v13.4s -mul v18.4S, v18.4S,v23.s[0] -str q19, [x0, #384] -ldr q19, [x17, #+416] -ldr q13, [x17, #+432] -ldr q14, [x0, #608] -sqrdmulh v30.4S, v14.4S, v13.s[0] -sub v29.4s, v2.4s, v22.4s -mul v14.4S, v14.4S,v19.s[0] -str q29, [x0, #432] -ldr q29, [x0, #624] -sqrdmulh v8.4S, v29.4S, v13.s[0] -add v2.4s, v2.4s, v22.4s -mul v29.4S, v29.4S,v19.s[0] -str q2, [x0, #416] -ldr q2, [x17, #+448] -ldr q22, [x17, #+464] -mla v11.4S, v1.4S, v31.s[0] -sub v1.4s, v5.4s, v0.4s -sqrdmulh v28.4S, v12.4S, v22.s[0] -str q1, [x0, #464] -ldr q1, [x0, #688] -mla v18.4S, v15.4S, v31.s[0] -add v5.4s, v5.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v22.s[0] -str q5, [x0, #448] -ldr q5, [x17, #+480] -ldr q15, [x17, #+496] -mla v14.4S, v30.4S, v31.s[0] -sub v30.4s, v9.4s, v21.4s -sqrdmulh v6.4S, v25.4S, v15.s[0] -str q30, [x0, #496] -ldr q30, [x0, #752] -mla v29.4S, v8.4S, v31.s[0] -add v9.4s, v9.4s, v21.4s -sqrdmulh v21.4S, v30.4S, v15.s[0] -str q9, [x0, #480] -ldr q9, [x0, #512] -ldr q8, [x0, #640] -mul v12.4S, v12.4S,v2.s[0] -sub v16.4s, v9.4s, v11.4s -ldr q20, [x0, #528] -mul v1.4S, v1.4S,v2.s[0] -add v9.4s, v9.4s, v11.4s -ldr q11, [x0, #656] -mla v12.4S, v28.4S, v31.s[0] -sub v28.4s, v20.4s, v18.4s -ldr q27, [x0, #576] -mla v1.4S, v0.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -ldr q18, [x0, #704] -mul v25.4S, v25.4S,v5.s[0] -sub v0.4s, v27.4s, v14.4s -ldr q26, [x0, #592] -mul v30.4S, v30.4S,v5.s[0] -add v27.4s, v27.4s, v14.4s -ldr q14, [x0, #720] -mla v25.4S, v6.4S, v31.s[0] -mla v30.4S, v21.4S, v31.s[0] -sub v21.4s, v26.4s, v29.4s -sqrdmulh v6.4S, v20.4S, v10.s[1] -add v26.4s, v26.4s, v29.4s -mul v20.4S, v20.4S,v23.s[1] -sqrdmulh v29.4S, v28.4S, v10.s[2] -sub v7.4s, v8.4s, v12.4s -mul v28.4S, v28.4S,v23.s[2] -add v8.4s, v8.4s, v12.4s -sqrdmulh v10.4S, v26.4S, v13.s[1] -sub v23.4s, v11.4s, v1.4s -mul v26.4S, v26.4S,v19.s[1] -add v11.4s, v11.4s, v1.4s -sqrdmulh v1.4S, v21.4S, v13.s[2] -sub v12.4s, v18.4s, v25.4s -mul v21.4S, v21.4S,v19.s[2] -add v18.4s, v18.4s, v25.4s -mla v20.4S, v6.4S, v31.s[0] -sub v6.4s, v14.4s, v30.4s -ldr q13, [x0, #992] -sqrdmulh v19.4S, v11.4S, v22.s[1] -add v14.4s, v14.4s, v30.4s -mla v28.4S, v29.4S, v31.s[0] -ldr q29, [x0, #928] -sqrdmulh v30.4S, v23.4S, v22.s[2] -sub v25.4s, v9.4s, v20.4s -mla v26.4S, v10.4S, v31.s[0] -ldr q10, [x0, #800] -sqrdmulh v17.4S, v14.4S, v15.s[1] -add v9.4s, v9.4s, v20.4s -str q25, [x0, #528] -mla v21.4S, v1.4S, v31.s[0] -ldr q1, [x17, #+512] -ldr q25, [x17, #+528] -sqrdmulh v20.4S, v6.4S, v15.s[2] -sub v24.4s, v16.4s, v28.4s -str q9, [x0, #512] -mul v11.4S, v11.4S,v2.s[1] -add v16.4s, v16.4s, v28.4s -mul v23.4S, v23.4S,v2.s[2] -str q24, [x0, #560] -mla v11.4S, v19.4S, v31.s[0] -sub v19.4s, v27.4s, v26.4s -mla v23.4S, v30.4S, v31.s[0] -str q16, [x0, #544] -mul v14.4S, v14.4S,v5.s[1] -str q19, [x0, #592] -mul v6.4S, v6.4S,v5.s[2] -add v27.4s, v27.4s, v26.4s -str q27, [x0, #576] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v0.4s, v21.4s -str q17, [x0, #624] -mla v6.4S, v20.4S, v31.s[0] -add v0.4s, v0.4s, v21.4s -str q0, [x0, #608] -sqrdmulh v15.4S, v10.4S, v25.s[0] -sub v5.4s, v8.4s, v11.4s -mul v10.4S, v10.4S,v1.s[0] -str q5, [x0, #656] -ldr q5, [x0, #816] -sqrdmulh v0.4S, v5.4S, v25.s[0] -add v8.4s, v8.4s, v11.4s -mul v5.4S, v5.4S,v1.s[0] -str q8, [x0, #640] -ldr q8, [x17, #+544] -ldr q11, [x17, #+560] -ldr q21, [x0, #864] -sqrdmulh v20.4S, v21.4S, v11.s[0] -sub v17.4s, v7.4s, v23.4s -mul v21.4S, v21.4S,v8.s[0] -str q17, [x0, #688] -ldr q17, [x0, #880] -sqrdmulh v27.4S, v17.4S, v11.s[0] -add v7.4s, v7.4s, v23.4s -mul v17.4S, v17.4S,v8.s[0] -str q7, [x0, #672] -ldr q7, [x17, #+576] -ldr q23, [x17, #+592] -mla v10.4S, v15.4S, v31.s[0] -sub v15.4s, v18.4s, v14.4s -sqrdmulh v26.4S, v29.4S, v23.s[0] -str q15, [x0, #720] -ldr q15, [x0, #944] -mla v5.4S, v0.4S, v31.s[0] -add v18.4s, v18.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v23.s[0] -str q18, [x0, #704] -ldr q18, [x17, #+608] -ldr q0, [x17, #+624] -mla v21.4S, v20.4S, v31.s[0] -sub v20.4s, v12.4s, v6.4s -sqrdmulh v19.4S, v13.4S, v0.s[0] -str q20, [x0, #752] -ldr q20, [x0, #1008] -mla v17.4S, v27.4S, v31.s[0] -add v12.4s, v12.4s, v6.4s -sqrdmulh v6.4S, v20.4S, v0.s[0] -str q12, [x0, #736] -ldr q12, [x0, #768] -ldr q27, [x0, #896] -mul v29.4S, v29.4S,v7.s[0] -sub v22.4s, v12.4s, v10.4s -ldr q2, [x0, #784] -mul v15.4S, v15.4S,v7.s[0] -add v12.4s, v12.4s, v10.4s -ldr q10, [x0, #912] -mla v29.4S, v26.4S, v31.s[0] -sub v26.4s, v2.4s, v5.4s -ldr q16, [x0, #832] -mla v15.4S, v14.4S, v31.s[0] -add v2.4s, v2.4s, v5.4s -ldr q5, [x0, #960] -mul v13.4S, v13.4S,v18.s[0] -sub v14.4s, v16.4s, v21.4s -ldr q30, [x0, #848] -mul v20.4S, v20.4S,v18.s[0] -add v16.4s, v16.4s, v21.4s -ldr q21, [x0, #976] -mla v13.4S, v19.4S, v31.s[0] -mla v20.4S, v6.4S, v31.s[0] -sub v6.4s, v30.4s, v17.4s -sqrdmulh v19.4S, v2.4S, v25.s[1] -add v30.4s, v30.4s, v17.4s -mul v2.4S, v2.4S,v1.s[1] -sqrdmulh v17.4S, v26.4S, v25.s[2] -sub v24.4s, v27.4s, v29.4s -mul v26.4S, v26.4S,v1.s[2] -add v27.4s, v27.4s, v29.4s -sqrdmulh v25.4S, v30.4S, v11.s[1] -sub v1.4s, v10.4s, v15.4s -mul v30.4S, v30.4S,v8.s[1] -add v10.4s, v10.4s, v15.4s -sqrdmulh v15.4S, v6.4S, v11.s[2] -sub v29.4s, v5.4s, v13.4s -mul v6.4S, v6.4S,v8.s[2] -add v5.4s, v5.4s, v13.4s -mla v2.4S, v19.4S, v31.s[0] -sub v19.4s, v21.4s, v20.4s -sqrdmulh v11.4S, v10.4S, v23.s[1] -add v21.4s, v21.4s, v20.4s -mla v26.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v1.4S, v23.s[2] -sub v20.4s, v12.4s, v2.4s -mla v30.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v21.4S, v0.s[1] -add v12.4s, v12.4s, v2.4s -str q20, [x0, #784] -mla v6.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v19.4S, v0.s[2] -sub v20.4s, v22.4s, v26.4s -str q12, [x0, #768] -mul v10.4S, v10.4S,v7.s[1] -add v22.4s, v22.4s, v26.4s -mul v1.4S, v1.4S,v7.s[2] -str q20, [x0, #816] -mla v10.4S, v11.4S, v31.s[0] -sub v11.4s, v16.4s, v30.4s -mla v1.4S, v17.4S, v31.s[0] -str q22, [x0, #800] -mul v21.4S, v21.4S,v18.s[1] -str q11, [x0, #848] -mul v19.4S, v19.4S,v18.s[2] -add v16.4s, v16.4s, v30.4s -str q16, [x0, #832] -mla v21.4S, v25.4S, v31.s[0] -sub v25.4s, v14.4s, v6.4s -str q25, [x0, #880] -mla v19.4S, v15.4S, v31.s[0] -add v14.4s, v14.4s, v6.4s -str q14, [x0, #864] -sub v0.4s, v27.4s, v10.4s -str q0, [x0, #912] -add v27.4s, v27.4s, v10.4s -str q27, [x0, #896] -sub v27.4s, v24.4s, v1.4s -str q27, [x0, #944] -add v24.4s, v24.4s, v1.4s -str q24, [x0, #928] -sub v24.4s, v5.4s, v21.4s -str q24, [x0, #976] -add v5.4s, v5.4s, v21.4s -str q5, [x0, #960] -sub v5.4s, v29.4s, v19.4s -str q5, [x0, #1008] -add v29.4s, v29.4s, v19.4s -str q29, [x0, #992] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1528 -// Instruction count: 1524 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_20_z4_7.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_20_z4_7.s deleted file mode 100644 index 203055c..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_20_z4_7.s +++ /dev/null @@ -1,1558 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_20_z4_7 -.global _ntt_u32_incomplete_neon_asm_var_4_2_20_z4_7 -ntt_u32_incomplete_neon_asm_var_4_2_20_z4_7: -_ntt_u32_incomplete_neon_asm_var_4_2_20_z4_7: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x0, #992] -sqrdmulh v27.4S, v28.4S, v29.s[0] -mul v28.4S, v28.4S,v30.s[0] -ldr q26, [x0, #928] -sqrdmulh v25.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -ldr q24, [x0, #864] -sqrdmulh v23.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -ldr q22, [x0, #800] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #736] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mla v28.4S, v27.4S, v31.s[0] -ldr q27, [x0, #672] -sqrdmulh v18.4S, v27.4S, v29.s[0] -mla v26.4S, v25.4S, v31.s[0] -ldr q25, [x0, #608] -sqrdmulh v17.4S, v25.4S, v29.s[0] -mla v24.4S, v23.4S, v31.s[0] -ldr q23, [x0, #544] -sqrdmulh v16.4S, v23.4S, v29.s[0] -mla v22.4S, v21.4S, v31.s[0] -ldr q21, [x0, #480] -ldr q3, [x0, #416] -mul v27.4S, v27.4S,v30.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q2, [x0, #352] -ldr q1, [x0, #288] -mla v27.4S, v18.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -ldr q19, [x0, #224] -ldr q18, [x0, #160] -mul v23.4S, v23.4S,v30.s[0] -mul v25.4S, v25.4S,v30.s[0] -ldr q0, [x0, #96] -ldr q15, [x0, #32] -mla v23.4S, v16.4S, v31.s[0] -mla v25.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v28.4s -add v21.4s, v21.4s, v28.4s -sqrdmulh v28.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v16.4s, v3.4s, v26.4s -add v3.4s, v3.4s, v26.4s -sqrdmulh v26.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -sub v14.4s, v2.4s, v24.4s -add v2.4s, v2.4s, v24.4s -sqrdmulh v24.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v13.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v12.4s, v19.4s, v20.4s -add v19.4s, v19.4s, v20.4s -sqrdmulh v20.4S, v14.4S, v29.s[2] -mla v17.4S, v28.4S, v31.s[0] -sub v28.4s, v18.4s, v27.4s -add v18.4s, v18.4s, v27.4s -sqrdmulh v27.4S, v13.4S, v29.s[2] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v0.4s, v25.4s -add v0.4s, v0.4s, v25.4s -sqrdmulh v25.4S, v2.4S, v29.s[1] -mla v21.4S, v24.4S, v31.s[0] -sub v24.4s, v15.4s, v23.4s -sqrdmulh v11.4S, v1.4S, v29.s[1] -mla v3.4S, v22.4S, v31.s[0] -add v15.4s, v15.4s, v23.4s -ldr q23, [x17, #+32] -ldr q22, [x17, #+48] -mul v13.4S, v13.4S,v30.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v10.4s, v12.4s, v17.4s -add v12.4s, v12.4s, v17.4s -mla v13.4S, v27.4S, v31.s[0] -mla v14.4S, v20.4S, v31.s[0] -sub v20.4s, v28.4s, v16.4s -add v28.4s, v28.4s, v16.4s -mul v1.4S, v1.4S,v30.s[1] -mul v2.4S, v2.4S,v30.s[1] -sub v16.4s, v19.4s, v21.4s -add v19.4s, v19.4s, v21.4s -mla v1.4S, v11.4S, v31.s[0] -mla v2.4S, v25.4S, v31.s[0] -sub v25.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v10.4S, v22.s[3] -mul v10.4S, v10.4S,v23.s[3] -sub v11.4s, v26.4s, v14.4s -add v26.4s, v26.4s, v14.4s -sqrdmulh v14.4S, v12.4S, v22.s[2] -mul v12.4S, v12.4S,v23.s[2] -sub v21.4s, v24.4s, v13.4s -add v24.4s, v24.4s, v13.4s -sqrdmulh v13.4S, v16.4S, v22.s[1] -mul v16.4S, v16.4S,v23.s[1] -sub v27.4s, v0.4s, v2.4s -add v0.4s, v0.4s, v2.4s -sqrdmulh v2.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v17.4s, v15.4s, v1.4s -add v15.4s, v15.4s, v1.4s -ldr q1, [x17, #+96] -ldr q9, [x17, #+112] -sqrdmulh v8.4S, v20.4S, v22.s[3] -mla v10.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v28.4S, v22.s[2] -mla v12.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v25.4S, v22.s[1] -mla v16.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v18.4S, v22.s[0] -mla v19.4S, v2.4S, v31.s[0] -nop -nop -ldr q2, [x17, #+64] -ldr q7, [x17, #+80] -mul v28.4S, v28.4S,v23.s[2] -mul v20.4S, v20.4S,v23.s[3] -sub v6.4s, v11.4s, v10.4s -add v11.4s, v11.4s, v10.4s -mla v28.4S, v3.4S, v31.s[0] -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v26.4s, v12.4s -add v26.4s, v26.4s, v12.4s -mul v18.4S, v18.4S,v23.s[0] -mul v25.4S, v25.4S,v23.s[1] -sub v12.4s, v27.4s, v16.4s -add v27.4s, v27.4s, v16.4s -mla v18.4S, v13.4S, v31.s[0] -mla v25.4S, v14.4S, v31.s[0] -sub v14.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v9.s[3] -mul v6.4S, v6.4S,v1.s[3] -sub v13.4s, v21.4s, v20.4s -add v21.4s, v21.4s, v20.4s -sqrdmulh v20.4S, v11.4S, v9.s[2] -mul v11.4S, v11.4S,v1.s[2] -sub v16.4s, v24.4s, v28.4s -add v24.4s, v24.4s, v28.4s -sqrdmulh v28.4S, v8.4S, v9.s[1] -mul v8.4S, v8.4S,v1.s[1] -sub v3.4s, v17.4s, v25.4s -add v17.4s, v17.4s, v25.4s -sqrdmulh v25.4S, v26.4S, v9.s[0] -mul v26.4S, v26.4S,v1.s[0] -sub v10.4s, v15.4s, v18.4s -add v15.4s, v15.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v7.s[3] -mla v6.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v27.4S, v7.s[2] -mla v11.4S, v20.4S, v31.s[0] -nop -nop -sqrdmulh v20.4S, v14.4S, v7.s[1] -mla v8.4S, v28.4S, v31.s[0] -nop -nop -sqrdmulh v28.4S, v0.4S, v7.s[0] -mla v26.4S, v25.4S, v31.s[0] -nop -nop -mul v27.4S, v27.4S,v2.s[2] -mul v12.4S, v12.4S,v2.s[3] -sub v25.4s, v13.4s, v6.4s -str q25, [x0, #992] -mla v27.4S, v19.4S, v31.s[0] -mla v12.4S, v18.4S, v31.s[0] -add v13.4s, v13.4s, v6.4s -str q13, [x0, #928] -mul v0.4S, v0.4S,v2.s[0] -mul v14.4S, v14.4S,v2.s[1] -sub v13.4s, v21.4s, v11.4s -str q13, [x0, #864] -mla v0.4S, v28.4S, v31.s[0] -mla v14.4S, v20.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -sub v11.4s, v16.4s, v8.4s -ldr q20, [x0, #1008] -sqrdmulh v28.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v16.4s, v16.4s, v8.4s -str q21, [x0, #800] -ldr q21, [x0, #944] -sqrdmulh v8.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -sub v13.4s, v24.4s, v26.4s -str q11, [x0, #736] -ldr q11, [x0, #880] -sqrdmulh v6.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -add v24.4s, v24.4s, v26.4s -str q16, [x0, #672] -ldr q16, [x0, #816] -sqrdmulh v26.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -str q13, [x0, #608] -sub v13.4s, v3.4s, v12.4s -ldr q18, [x0, #752] -sqrdmulh v19.4S, v18.4S, v29.s[0] -mla v20.4S, v28.4S, v31.s[0] -str q24, [x0, #544] -add v3.4s, v3.4s, v12.4s -ldr q12, [x0, #688] -sqrdmulh v24.4S, v12.4S, v29.s[0] -mla v21.4S, v8.4S, v31.s[0] -str q13, [x0, #480] -sub v13.4s, v17.4s, v27.4s -ldr q8, [x0, #624] -sqrdmulh v28.4S, v8.4S, v29.s[0] -mla v11.4S, v6.4S, v31.s[0] -str q3, [x0, #416] -add v17.4s, v17.4s, v27.4s -ldr q27, [x0, #560] -sqrdmulh v3.4S, v27.4S, v29.s[0] -mla v16.4S, v26.4S, v31.s[0] -str q13, [x0, #352] -sub v13.4s, v10.4s, v14.4s -ldr q26, [x0, #496] -ldr q6, [x0, #432] -mul v12.4S, v12.4S,v30.s[0] -mul v18.4S, v18.4S,v30.s[0] -str q17, [x0, #288] -add v10.4s, v10.4s, v14.4s -ldr q14, [x0, #368] -ldr q17, [x0, #304] -mla v12.4S, v24.4S, v31.s[0] -mla v18.4S, v19.4S, v31.s[0] -str q13, [x0, #224] -sub v13.4s, v15.4s, v0.4s -ldr q19, [x0, #240] -ldr q24, [x0, #176] -mul v27.4S, v27.4S,v30.s[0] -mul v8.4S, v8.4S,v30.s[0] -str q10, [x0, #160] -add v15.4s, v15.4s, v0.4s -ldr q0, [x0, #112] -ldr q10, [x0, #48] -mla v27.4S, v3.4S, v31.s[0] -mla v8.4S, v28.4S, v31.s[0] -sub v28.4s, v26.4s, v20.4s -add v26.4s, v26.4s, v20.4s -sqrdmulh v20.4S, v28.4S, v29.s[2] -mul v28.4S, v28.4S,v30.s[2] -sub v3.4s, v6.4s, v21.4s -add v6.4s, v6.4s, v21.4s -sqrdmulh v21.4S, v3.4S, v29.s[2] -mul v3.4S, v3.4S,v30.s[2] -sub v25.4s, v14.4s, v11.4s -add v14.4s, v14.4s, v11.4s -sqrdmulh v11.4S, v26.4S, v29.s[1] -mul v26.4S, v26.4S,v30.s[1] -sub v5.4s, v17.4s, v16.4s -add v17.4s, v17.4s, v16.4s -sqrdmulh v16.4S, v6.4S, v29.s[1] -mul v6.4S, v6.4S,v30.s[1] -sub v4.4s, v19.4s, v18.4s -add v19.4s, v19.4s, v18.4s -sqrdmulh v18.4S, v25.4S, v29.s[2] -mla v28.4S, v20.4S, v31.s[0] -sub v20.4s, v24.4s, v12.4s -add v24.4s, v24.4s, v12.4s -sqrdmulh v12.4S, v5.4S, v29.s[2] -mla v3.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v8.4s -add v0.4s, v0.4s, v8.4s -sqrdmulh v8.4S, v14.4S, v29.s[1] -mla v26.4S, v11.4S, v31.s[0] -sub v11.4s, v10.4s, v27.4s -str q13, [x0, #96] -sqrdmulh v13.4S, v17.4S, v29.s[1] -mla v6.4S, v16.4S, v31.s[0] -add v10.4s, v10.4s, v27.4s -str q15, [x0, #32] -mul v5.4S, v5.4S,v30.s[2] -mul v25.4S, v25.4S,v30.s[2] -sub v15.4s, v4.4s, v28.4s -add v4.4s, v4.4s, v28.4s -mla v5.4S, v12.4S, v31.s[0] -mla v25.4S, v18.4S, v31.s[0] -sub v18.4s, v20.4s, v3.4s -add v20.4s, v20.4s, v3.4s -mul v17.4S, v17.4S,v30.s[1] -mul v14.4S, v14.4S,v30.s[1] -sub v3.4s, v19.4s, v26.4s -add v19.4s, v19.4s, v26.4s -mla v17.4S, v13.4S, v31.s[0] -mla v14.4S, v8.4S, v31.s[0] -sub v8.4s, v24.4s, v6.4s -add v24.4s, v24.4s, v6.4s -sqrdmulh v6.4S, v15.4S, v22.s[3] -mul v15.4S, v15.4S,v23.s[3] -sub v13.4s, v21.4s, v25.4s -add v21.4s, v21.4s, v25.4s -sqrdmulh v25.4S, v4.4S, v22.s[2] -mul v4.4S, v4.4S,v23.s[2] -sub v26.4s, v11.4s, v5.4s -add v11.4s, v11.4s, v5.4s -sqrdmulh v5.4S, v3.4S, v22.s[1] -mul v3.4S, v3.4S,v23.s[1] -sub v12.4s, v0.4s, v14.4s -add v0.4s, v0.4s, v14.4s -sqrdmulh v14.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v28.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -sqrdmulh v17.4S, v18.4S, v22.s[3] -mla v15.4S, v6.4S, v31.s[0] -nop -nop -sqrdmulh v6.4S, v20.4S, v22.s[2] -mla v4.4S, v25.4S, v31.s[0] -nop -nop -sqrdmulh v25.4S, v8.4S, v22.s[1] -mla v3.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v24.4S, v22.s[0] -mla v19.4S, v14.4S, v31.s[0] -nop -nop -mul v20.4S, v20.4S,v23.s[2] -mul v18.4S, v18.4S,v23.s[3] -sub v14.4s, v13.4s, v15.4s -add v13.4s, v13.4s, v15.4s -mla v20.4S, v6.4S, v31.s[0] -mla v18.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v4.4s -add v21.4s, v21.4s, v4.4s -mul v24.4S, v24.4S,v23.s[0] -mul v8.4S, v8.4S,v23.s[1] -sub v4.4s, v12.4s, v3.4s -add v12.4s, v12.4s, v3.4s -mla v24.4S, v5.4S, v31.s[0] -mla v8.4S, v25.4S, v31.s[0] -sub v25.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v9.s[3] -mul v14.4S, v14.4S,v1.s[3] -sub v5.4s, v26.4s, v18.4s -add v26.4s, v26.4s, v18.4s -sqrdmulh v18.4S, v13.4S, v9.s[2] -mul v13.4S, v13.4S,v1.s[2] -sub v3.4s, v11.4s, v20.4s -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v9.s[1] -mul v17.4S, v17.4S,v1.s[1] -sub v6.4s, v28.4s, v8.4s -add v28.4s, v28.4s, v8.4s -sqrdmulh v8.4S, v21.4S, v9.s[0] -mul v21.4S, v21.4S,v1.s[0] -sub v15.4s, v10.4s, v24.4s -add v10.4s, v10.4s, v24.4s -sqrdmulh v24.4S, v4.4S, v7.s[3] -mla v14.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v12.4S, v7.s[2] -mla v13.4S, v18.4S, v31.s[0] -nop -nop -sqrdmulh v18.4S, v25.4S, v7.s[1] -mla v17.4S, v20.4S, v31.s[0] -nop -nop -sqrdmulh v20.4S, v0.4S, v7.s[0] -mla v21.4S, v8.4S, v31.s[0] -nop -nop -mul v12.4S, v12.4S,v2.s[2] -mul v4.4S, v4.4S,v2.s[3] -sub v8.4s, v5.4s, v14.4s -str q8, [x0, #1008] -mla v12.4S, v19.4S, v31.s[0] -mla v4.4S, v24.4S, v31.s[0] -add v5.4s, v5.4s, v14.4s -str q5, [x0, #944] -mul v0.4S, v0.4S,v2.s[0] -mul v25.4S, v25.4S,v2.s[1] -sub v5.4s, v26.4s, v13.4s -str q5, [x0, #880] -mla v0.4S, v20.4S, v31.s[0] -mla v25.4S, v18.4S, v31.s[0] -add v26.4s, v26.4s, v13.4s -sub v13.4s, v3.4s, v17.4s -ldr q18, [x0, #960] -sqrdmulh v20.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -add v3.4s, v3.4s, v17.4s -str q26, [x0, #816] -ldr q26, [x0, #896] -sqrdmulh v17.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -sub v5.4s, v11.4s, v21.4s -str q13, [x0, #752] -ldr q13, [x0, #832] -sqrdmulh v14.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -add v11.4s, v11.4s, v21.4s -str q3, [x0, #688] -ldr q3, [x0, #768] -sqrdmulh v21.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -str q5, [x0, #624] -sub v5.4s, v6.4s, v4.4s -ldr q24, [x0, #704] -sqrdmulh v19.4S, v24.4S, v29.s[0] -mla v18.4S, v20.4S, v31.s[0] -str q11, [x0, #560] -add v6.4s, v6.4s, v4.4s -ldr q4, [x0, #640] -sqrdmulh v11.4S, v4.4S, v29.s[0] -mla v26.4S, v17.4S, v31.s[0] -str q5, [x0, #496] -sub v5.4s, v28.4s, v12.4s -ldr q17, [x0, #576] -sqrdmulh v20.4S, v17.4S, v29.s[0] -mla v13.4S, v14.4S, v31.s[0] -str q6, [x0, #432] -add v28.4s, v28.4s, v12.4s -ldr q12, [x0, #512] -sqrdmulh v6.4S, v12.4S, v29.s[0] -mla v3.4S, v21.4S, v31.s[0] -str q5, [x0, #368] -sub v5.4s, v15.4s, v25.4s -ldr q21, [x0, #448] -ldr q14, [x0, #384] -mul v4.4S, v4.4S,v30.s[0] -mul v24.4S, v24.4S,v30.s[0] -str q28, [x0, #304] -add v15.4s, v15.4s, v25.4s -ldr q25, [x0, #320] -ldr q28, [x0, #256] -mla v4.4S, v11.4S, v31.s[0] -mla v24.4S, v19.4S, v31.s[0] -str q5, [x0, #240] -sub v5.4s, v10.4s, v0.4s -ldr q19, [x0, #192] -ldr q11, [x0, #128] -mul v12.4S, v12.4S,v30.s[0] -mul v17.4S, v17.4S,v30.s[0] -str q15, [x0, #176] -add v10.4s, v10.4s, v0.4s -ldr q0, [x0, #64] -ldr q15, [x0, #0] -mla v12.4S, v6.4S, v31.s[0] -mla v17.4S, v20.4S, v31.s[0] -sub v20.4s, v21.4s, v18.4s -add v21.4s, v21.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v6.4s, v14.4s, v26.4s -add v14.4s, v14.4s, v26.4s -sqrdmulh v26.4S, v6.4S, v29.s[2] -mul v6.4S, v6.4S,v30.s[2] -sub v8.4s, v25.4s, v13.4s -add v25.4s, v25.4s, v13.4s -sqrdmulh v13.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v27.4s, v28.4s, v3.4s -add v28.4s, v28.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -sub v16.4s, v19.4s, v24.4s -add v19.4s, v19.4s, v24.4s -sqrdmulh v24.4S, v8.4S, v29.s[2] -mla v20.4S, v18.4S, v31.s[0] -sub v18.4s, v11.4s, v4.4s -add v11.4s, v11.4s, v4.4s -sqrdmulh v4.4S, v27.4S, v29.s[2] -mla v6.4S, v26.4S, v31.s[0] -sub v26.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v25.4S, v29.s[1] -mla v21.4S, v13.4S, v31.s[0] -sub v13.4s, v15.4s, v12.4s -str q5, [x0, #112] -sqrdmulh v5.4S, v28.4S, v29.s[1] -mla v14.4S, v3.4S, v31.s[0] -add v15.4s, v15.4s, v12.4s -str q10, [x0, #48] -mul v27.4S, v27.4S,v30.s[2] -mul v8.4S, v8.4S,v30.s[2] -sub v10.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -mla v27.4S, v4.4S, v31.s[0] -mla v8.4S, v24.4S, v31.s[0] -sub v24.4s, v18.4s, v6.4s -add v18.4s, v18.4s, v6.4s -mul v28.4S, v28.4S,v30.s[1] -mul v25.4S, v25.4S,v30.s[1] -sub v6.4s, v19.4s, v21.4s -add v19.4s, v19.4s, v21.4s -mla v28.4S, v5.4S, v31.s[0] -mla v25.4S, v17.4S, v31.s[0] -sub v17.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -sqrdmulh v14.4S, v10.4S, v22.s[3] -mul v10.4S, v10.4S,v23.s[3] -sub v5.4s, v26.4s, v8.4s -add v26.4s, v26.4s, v8.4s -sqrdmulh v8.4S, v16.4S, v22.s[2] -mul v16.4S, v16.4S,v23.s[2] -sub v21.4s, v13.4s, v27.4s -add v13.4s, v13.4s, v27.4s -sqrdmulh v27.4S, v6.4S, v22.s[1] -mul v6.4S, v6.4S,v23.s[1] -sub v4.4s, v0.4s, v25.4s -add v0.4s, v0.4s, v25.4s -sqrdmulh v25.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v20.4s, v15.4s, v28.4s -add v15.4s, v15.4s, v28.4s -sqrdmulh v28.4S, v24.4S, v22.s[3] -mla v10.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v18.4S, v22.s[2] -mla v16.4S, v8.4S, v31.s[0] -nop -nop -sqrdmulh v8.4S, v17.4S, v22.s[1] -mla v6.4S, v27.4S, v31.s[0] -nop -nop -sqrdmulh v27.4S, v11.4S, v22.s[0] -mla v19.4S, v25.4S, v31.s[0] -nop -nop -mul v18.4S, v18.4S,v23.s[2] -mul v24.4S, v24.4S,v23.s[3] -sub v25.4s, v5.4s, v10.4s -add v5.4s, v5.4s, v10.4s -mla v18.4S, v14.4S, v31.s[0] -mla v24.4S, v28.4S, v31.s[0] -sub v28.4s, v26.4s, v16.4s -add v26.4s, v26.4s, v16.4s -mul v11.4S, v11.4S,v23.s[0] -mul v17.4S, v17.4S,v23.s[1] -sub v16.4s, v4.4s, v6.4s -add v4.4s, v4.4s, v6.4s -mla v11.4S, v27.4S, v31.s[0] -mla v17.4S, v8.4S, v31.s[0] -sub v8.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v25.4S, v9.s[3] -mul v25.4S, v25.4S,v1.s[3] -sub v27.4s, v21.4s, v24.4s -add v21.4s, v21.4s, v24.4s -sqrdmulh v24.4S, v5.4S, v9.s[2] -mul v5.4S, v5.4S,v1.s[2] -sub v6.4s, v13.4s, v18.4s -add v13.4s, v13.4s, v18.4s -sqrdmulh v18.4S, v28.4S, v9.s[1] -mul v28.4S, v28.4S,v1.s[1] -sub v14.4s, v20.4s, v17.4s -add v20.4s, v20.4s, v17.4s -sqrdmulh v17.4S, v26.4S, v9.s[0] -mul v26.4S, v26.4S,v1.s[0] -sub v10.4s, v15.4s, v11.4s -add v15.4s, v15.4s, v11.4s -sqrdmulh v11.4S, v16.4S, v7.s[3] -mla v25.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v4.4S, v7.s[2] -mla v5.4S, v24.4S, v31.s[0] -nop -nop -sqrdmulh v24.4S, v8.4S, v7.s[1] -mla v28.4S, v18.4S, v31.s[0] -nop -nop -sqrdmulh v18.4S, v0.4S, v7.s[0] -mla v26.4S, v17.4S, v31.s[0] -nop -nop -mul v4.4S, v4.4S,v2.s[2] -mul v16.4S, v16.4S,v2.s[3] -sub v17.4s, v27.4s, v25.4s -str q17, [x0, #960] -mla v4.4S, v19.4S, v31.s[0] -mla v16.4S, v11.4S, v31.s[0] -add v27.4s, v27.4s, v25.4s -str q27, [x0, #896] -mul v0.4S, v0.4S,v2.s[0] -mul v8.4S, v8.4S,v2.s[1] -sub v27.4s, v21.4s, v5.4s -str q27, [x0, #832] -mla v0.4S, v18.4S, v31.s[0] -mla v8.4S, v24.4S, v31.s[0] -add v21.4s, v21.4s, v5.4s -sub v5.4s, v6.4s, v28.4s -ldr q24, [x0, #976] -sqrdmulh v18.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -add v6.4s, v6.4s, v28.4s -str q21, [x0, #768] -ldr q21, [x0, #912] -sqrdmulh v28.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -sub v27.4s, v13.4s, v26.4s -str q5, [x0, #704] -ldr q5, [x0, #848] -sqrdmulh v25.4S, v5.4S, v29.s[0] -mul v5.4S, v5.4S,v30.s[0] -add v13.4s, v13.4s, v26.4s -str q6, [x0, #640] -ldr q6, [x0, #784] -sqrdmulh v26.4S, v6.4S, v29.s[0] -mul v6.4S, v6.4S,v30.s[0] -str q27, [x0, #576] -sub v27.4s, v14.4s, v16.4s -ldr q11, [x0, #720] -sqrdmulh v19.4S, v11.4S, v29.s[0] -mla v24.4S, v18.4S, v31.s[0] -str q13, [x0, #512] -add v14.4s, v14.4s, v16.4s -ldr q16, [x0, #656] -sqrdmulh v13.4S, v16.4S, v29.s[0] -mla v21.4S, v28.4S, v31.s[0] -str q27, [x0, #448] -sub v27.4s, v20.4s, v4.4s -ldr q28, [x0, #592] -sqrdmulh v18.4S, v28.4S, v29.s[0] -mla v5.4S, v25.4S, v31.s[0] -str q14, [x0, #384] -add v20.4s, v20.4s, v4.4s -ldr q4, [x0, #528] -sqrdmulh v14.4S, v4.4S, v29.s[0] -mla v6.4S, v26.4S, v31.s[0] -str q27, [x0, #320] -sub v27.4s, v10.4s, v8.4s -ldr q26, [x0, #464] -ldr q25, [x0, #400] -mul v16.4S, v16.4S,v30.s[0] -mul v11.4S, v11.4S,v30.s[0] -str q20, [x0, #256] -add v10.4s, v10.4s, v8.4s -ldr q8, [x0, #336] -ldr q20, [x0, #272] -mla v16.4S, v13.4S, v31.s[0] -mla v11.4S, v19.4S, v31.s[0] -str q27, [x0, #192] -sub v27.4s, v15.4s, v0.4s -ldr q19, [x0, #208] -ldr q13, [x0, #144] -mul v4.4S, v4.4S,v30.s[0] -mul v28.4S, v28.4S,v30.s[0] -str q10, [x0, #128] -add v15.4s, v15.4s, v0.4s -ldr q0, [x0, #80] -ldr q10, [x0, #16] -mla v4.4S, v14.4S, v31.s[0] -mla v28.4S, v18.4S, v31.s[0] -sub v18.4s, v26.4s, v24.4s -add v26.4s, v26.4s, v24.4s -sqrdmulh v24.4S, v18.4S, v29.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v14.4s, v25.4s, v21.4s -add v25.4s, v25.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v17.4s, v8.4s, v5.4s -add v8.4s, v8.4s, v5.4s -sqrdmulh v5.4S, v26.4S, v29.s[1] -mul v26.4S, v26.4S,v30.s[1] -sub v12.4s, v20.4s, v6.4s -add v20.4s, v20.4s, v6.4s -sqrdmulh v6.4S, v25.4S, v29.s[1] -mul v25.4S, v25.4S,v30.s[1] -sub v3.4s, v19.4s, v11.4s -add v19.4s, v19.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v29.s[2] -mla v18.4S, v24.4S, v31.s[0] -sub v24.4s, v13.4s, v16.4s -add v13.4s, v13.4s, v16.4s -sqrdmulh v16.4S, v12.4S, v29.s[2] -mla v14.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v28.4s -add v0.4s, v0.4s, v28.4s -sqrdmulh v28.4S, v8.4S, v29.s[1] -mla v26.4S, v5.4S, v31.s[0] -sub v5.4s, v10.4s, v4.4s -str q27, [x0, #64] -sqrdmulh v27.4S, v20.4S, v29.s[1] -mla v25.4S, v6.4S, v31.s[0] -add v10.4s, v10.4s, v4.4s -str q15, [x0, #0] -mul v12.4S, v12.4S,v30.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v15.4s, v3.4s, v18.4s -add v3.4s, v3.4s, v18.4s -mla v12.4S, v16.4S, v31.s[0] -mla v17.4S, v11.4S, v31.s[0] -sub v11.4s, v24.4s, v14.4s -add v24.4s, v24.4s, v14.4s -mul v20.4S, v20.4S,v30.s[1] -mul v8.4S, v8.4S,v30.s[1] -sub v14.4s, v19.4s, v26.4s -add v19.4s, v19.4s, v26.4s -mla v20.4S, v27.4S, v31.s[0] -mla v8.4S, v28.4S, v31.s[0] -sub v28.4s, v13.4s, v25.4s -add v13.4s, v13.4s, v25.4s -sqrdmulh v29.4S, v15.4S, v22.s[3] -mul v15.4S, v15.4S,v23.s[3] -sub v30.4s, v21.4s, v17.4s -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v3.4S, v22.s[2] -mul v3.4S, v3.4S,v23.s[2] -sub v25.4s, v5.4s, v12.4s -add v5.4s, v5.4s, v12.4s -sqrdmulh v12.4S, v14.4S, v22.s[1] -mul v14.4S, v14.4S,v23.s[1] -sub v27.4s, v0.4s, v8.4s -add v0.4s, v0.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v26.4s, v10.4s, v20.4s -add v10.4s, v10.4s, v20.4s -sqrdmulh v20.4S, v11.4S, v22.s[3] -mla v15.4S, v29.4S, v31.s[0] -nop -nop -sqrdmulh v29.4S, v24.4S, v22.s[2] -mla v3.4S, v17.4S, v31.s[0] -nop -nop -sqrdmulh v17.4S, v28.4S, v22.s[1] -mla v14.4S, v12.4S, v31.s[0] -nop -nop -sqrdmulh v12.4S, v13.4S, v22.s[0] -mla v19.4S, v8.4S, v31.s[0] -nop -nop -mul v24.4S, v24.4S,v23.s[2] -mul v11.4S, v11.4S,v23.s[3] -sub v8.4s, v30.4s, v15.4s -add v30.4s, v30.4s, v15.4s -mla v24.4S, v29.4S, v31.s[0] -mla v11.4S, v20.4S, v31.s[0] -sub v20.4s, v21.4s, v3.4s -add v21.4s, v21.4s, v3.4s -mul v13.4S, v13.4S,v23.s[0] -mul v28.4S, v28.4S,v23.s[1] -sub v3.4s, v27.4s, v14.4s -add v27.4s, v27.4s, v14.4s -mla v13.4S, v12.4S, v31.s[0] -mla v28.4S, v17.4S, v31.s[0] -sub v17.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v22.4S, v8.4S, v9.s[3] -mul v8.4S, v8.4S,v1.s[3] -sub v23.4s, v25.4s, v11.4s -add v25.4s, v25.4s, v11.4s -sqrdmulh v11.4S, v30.4S, v9.s[2] -mul v30.4S, v30.4S,v1.s[2] -sub v19.4s, v5.4s, v24.4s -add v5.4s, v5.4s, v24.4s -sqrdmulh v24.4S, v20.4S, v9.s[1] -mul v20.4S, v20.4S,v1.s[1] -sub v12.4s, v26.4s, v28.4s -add v26.4s, v26.4s, v28.4s -sqrdmulh v28.4S, v21.4S, v9.s[0] -mul v21.4S, v21.4S,v1.s[0] -sub v14.4s, v10.4s, v13.4s -add v10.4s, v10.4s, v13.4s -sqrdmulh v9.4S, v3.4S, v7.s[3] -mla v8.4S, v22.4S, v31.s[0] -nop -nop -sqrdmulh v22.4S, v27.4S, v7.s[2] -mla v30.4S, v11.4S, v31.s[0] -nop -nop -sqrdmulh v11.4S, v17.4S, v7.s[1] -mla v20.4S, v24.4S, v31.s[0] -nop -nop -sqrdmulh v24.4S, v0.4S, v7.s[0] -mla v21.4S, v28.4S, v31.s[0] -nop -nop -mul v27.4S, v27.4S,v2.s[2] -mul v3.4S, v3.4S,v2.s[3] -sub v28.4s, v23.4s, v8.4s -str q28, [x0, #976] -mla v27.4S, v22.4S, v31.s[0] -mla v3.4S, v9.4S, v31.s[0] -add v23.4s, v23.4s, v8.4s -str q23, [x0, #912] -mul v0.4S, v0.4S,v2.s[0] -mul v17.4S, v17.4S,v2.s[1] -sub v23.4s, v25.4s, v30.4s -str q23, [x0, #848] -mla v0.4S, v24.4S, v31.s[0] -mla v17.4S, v11.4S, v31.s[0] -add v25.4s, v25.4s, v30.4s -sub v30.4s, v19.4s, v20.4s -add v19.4s, v19.4s, v20.4s -str q25, [x0, #784] -sub v25.4s, v5.4s, v21.4s -str q30, [x0, #720] -add v5.4s, v5.4s, v21.4s -str q19, [x0, #656] -str q25, [x0, #592] -sub v25.4s, v12.4s, v3.4s -str q5, [x0, #528] -add v12.4s, v12.4s, v3.4s -str q25, [x0, #464] -sub v25.4s, v26.4s, v27.4s -str q12, [x0, #400] -add v26.4s, v26.4s, v27.4s -str q25, [x0, #336] -sub v25.4s, v14.4s, v17.4s -str q26, [x0, #272] -add v14.4s, v14.4s, v17.4s -str q25, [x0, #208] -sub v25.4s, v10.4s, v0.4s -str q14, [x0, #144] -add v10.4s, v10.4s, v0.4s -str q25, [x0, #80] -str q10, [x0, #16] -ldr q6, [x0, #224] -ldr q4, [x0, #160] -ldr q18, [x0, #32] -ldr q16, [x17, #+128] -ldr q15, [x17, #+144] -sqrdmulh v29.4S, v18.4S, v15.s[0] -mul v18.4S, v18.4S,v16.s[0] -ldr q13, [x0, #48] -sqrdmulh v1.4S, v13.4S, v15.s[0] -mul v13.4S, v13.4S,v16.s[0] -ldr q28, [x17, #+160] -ldr q22, [x17, #+176] -ldr q9, [x0, #96] -sqrdmulh v8.4S, v9.4S, v22.s[0] -mul v9.4S, v9.4S,v28.s[0] -ldr q23, [x0, #112] -sqrdmulh v24.4S, v23.4S, v22.s[0] -mul v23.4S, v23.4S,v28.s[0] -ldr q11, [x17, #+192] -ldr q2, [x17, #+208] -mla v18.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v4.4S, v2.s[0] -ldr q7, [x0, #176] -mla v13.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v7.4S, v2.s[0] -ldr q20, [x17, #+224] -ldr q30, [x17, #+240] -mla v9.4S, v8.4S, v31.s[0] -sqrdmulh v8.4S, v6.4S, v30.s[0] -ldr q21, [x0, #240] -mla v23.4S, v24.4S, v31.s[0] -sqrdmulh v24.4S, v21.4S, v30.s[0] -ldr q19, [x0, #0] -ldr q5, [x0, #128] -mul v4.4S, v4.4S,v11.s[0] -sub v3.4s, v19.4s, v18.4s -ldr q12, [x0, #16] -mul v7.4S, v7.4S,v11.s[0] -add v19.4s, v19.4s, v18.4s -ldr q18, [x0, #144] -mla v4.4S, v29.4S, v31.s[0] -sub v29.4s, v12.4s, v13.4s -ldr q27, [x0, #64] -mla v7.4S, v1.4S, v31.s[0] -add v12.4s, v12.4s, v13.4s -ldr q13, [x0, #192] -mul v6.4S, v6.4S,v20.s[0] -sub v1.4s, v27.4s, v9.4s -ldr q26, [x0, #80] -mul v21.4S, v21.4S,v20.s[0] -add v27.4s, v27.4s, v9.4s -ldr q9, [x0, #208] -mla v6.4S, v8.4S, v31.s[0] -mla v21.4S, v24.4S, v31.s[0] -sub v24.4s, v26.4s, v23.4s -sqrdmulh v8.4S, v12.4S, v15.s[1] -add v26.4s, v26.4s, v23.4s -mul v12.4S, v12.4S,v16.s[1] -sqrdmulh v23.4S, v29.4S, v15.s[2] -sub v17.4s, v5.4s, v4.4s -mul v29.4S, v29.4S,v16.s[2] -add v5.4s, v5.4s, v4.4s -sqrdmulh v15.4S, v26.4S, v22.s[1] -sub v16.4s, v18.4s, v7.4s -mul v26.4S, v26.4S,v28.s[1] -add v18.4s, v18.4s, v7.4s -sqrdmulh v7.4S, v24.4S, v22.s[2] -sub v4.4s, v13.4s, v6.4s -mul v24.4S, v24.4S,v28.s[2] -add v13.4s, v13.4s, v6.4s -mla v12.4S, v8.4S, v31.s[0] -sub v8.4s, v9.4s, v21.4s -ldr q22, [x0, #480] -sqrdmulh v28.4S, v18.4S, v2.s[1] -add v9.4s, v9.4s, v21.4s -mla v29.4S, v23.4S, v31.s[0] -ldr q23, [x0, #416] -sqrdmulh v21.4S, v16.4S, v2.s[2] -sub v6.4s, v19.4s, v12.4s -mla v26.4S, v15.4S, v31.s[0] -ldr q15, [x0, #288] -sqrdmulh v14.4S, v9.4S, v30.s[1] -add v19.4s, v19.4s, v12.4s -str q6, [x0, #16] -mla v24.4S, v7.4S, v31.s[0] -ldr q7, [x17, #+256] -ldr q6, [x17, #+272] -sqrdmulh v12.4S, v8.4S, v30.s[2] -sub v0.4s, v3.4s, v29.4s -str q19, [x0, #0] -mul v18.4S, v18.4S,v11.s[1] -add v3.4s, v3.4s, v29.4s -mul v16.4S, v16.4S,v11.s[2] -str q0, [x0, #48] -mla v18.4S, v28.4S, v31.s[0] -sub v28.4s, v27.4s, v26.4s -mla v16.4S, v21.4S, v31.s[0] -str q3, [x0, #32] -mul v9.4S, v9.4S,v20.s[1] -str q28, [x0, #80] -mul v8.4S, v8.4S,v20.s[2] -add v27.4s, v27.4s, v26.4s -str q27, [x0, #64] -mla v9.4S, v14.4S, v31.s[0] -sub v14.4s, v1.4s, v24.4s -str q14, [x0, #112] -mla v8.4S, v12.4S, v31.s[0] -add v1.4s, v1.4s, v24.4s -str q1, [x0, #96] -sqrdmulh v30.4S, v15.4S, v6.s[0] -sub v20.4s, v5.4s, v18.4s -mul v15.4S, v15.4S,v7.s[0] -str q20, [x0, #144] -ldr q20, [x0, #304] -sqrdmulh v1.4S, v20.4S, v6.s[0] -add v5.4s, v5.4s, v18.4s -mul v20.4S, v20.4S,v7.s[0] -str q5, [x0, #128] -ldr q5, [x17, #+288] -ldr q18, [x17, #+304] -ldr q24, [x0, #352] -sqrdmulh v12.4S, v24.4S, v18.s[0] -sub v14.4s, v17.4s, v16.4s -mul v24.4S, v24.4S,v5.s[0] -str q14, [x0, #176] -ldr q14, [x0, #368] -sqrdmulh v27.4S, v14.4S, v18.s[0] -add v17.4s, v17.4s, v16.4s -mul v14.4S, v14.4S,v5.s[0] -str q17, [x0, #160] -ldr q17, [x17, #+320] -ldr q16, [x17, #+336] -mla v15.4S, v30.4S, v31.s[0] -sub v30.4s, v13.4s, v9.4s -sqrdmulh v26.4S, v23.4S, v16.s[0] -str q30, [x0, #208] -ldr q30, [x0, #432] -mla v20.4S, v1.4S, v31.s[0] -add v13.4s, v13.4s, v9.4s -sqrdmulh v9.4S, v30.4S, v16.s[0] -str q13, [x0, #192] -ldr q13, [x17, #+352] -ldr q1, [x17, #+368] -mla v24.4S, v12.4S, v31.s[0] -sub v12.4s, v4.4s, v8.4s -sqrdmulh v28.4S, v22.4S, v1.s[0] -str q12, [x0, #240] -ldr q12, [x0, #496] -mla v14.4S, v27.4S, v31.s[0] -add v4.4s, v4.4s, v8.4s -sqrdmulh v8.4S, v12.4S, v1.s[0] -str q4, [x0, #224] -ldr q4, [x0, #256] -ldr q27, [x0, #384] -mul v23.4S, v23.4S,v17.s[0] -sub v2.4s, v4.4s, v15.4s -ldr q11, [x0, #272] -mul v30.4S, v30.4S,v17.s[0] -add v4.4s, v4.4s, v15.4s -ldr q15, [x0, #400] -mla v23.4S, v26.4S, v31.s[0] -sub v26.4s, v11.4s, v20.4s -ldr q3, [x0, #320] -mla v30.4S, v9.4S, v31.s[0] -add v11.4s, v11.4s, v20.4s -ldr q20, [x0, #448] -mul v22.4S, v22.4S,v13.s[0] -sub v9.4s, v3.4s, v24.4s -ldr q21, [x0, #336] -mul v12.4S, v12.4S,v13.s[0] -add v3.4s, v3.4s, v24.4s -ldr q24, [x0, #464] -mla v22.4S, v28.4S, v31.s[0] -mla v12.4S, v8.4S, v31.s[0] -sub v8.4s, v21.4s, v14.4s -sqrdmulh v28.4S, v11.4S, v6.s[1] -add v21.4s, v21.4s, v14.4s -mul v11.4S, v11.4S,v7.s[1] -sqrdmulh v14.4S, v26.4S, v6.s[2] -sub v0.4s, v27.4s, v23.4s -mul v26.4S, v26.4S,v7.s[2] -add v27.4s, v27.4s, v23.4s -sqrdmulh v6.4S, v21.4S, v18.s[1] -sub v7.4s, v15.4s, v30.4s -mul v21.4S, v21.4S,v5.s[1] -add v15.4s, v15.4s, v30.4s -sqrdmulh v30.4S, v8.4S, v18.s[2] -sub v23.4s, v20.4s, v22.4s -mul v8.4S, v8.4S,v5.s[2] -add v20.4s, v20.4s, v22.4s -mla v11.4S, v28.4S, v31.s[0] -sub v28.4s, v24.4s, v12.4s -ldr q18, [x0, #736] -sqrdmulh v5.4S, v15.4S, v16.s[1] -add v24.4s, v24.4s, v12.4s -mla v26.4S, v14.4S, v31.s[0] -ldr q14, [x0, #672] -sqrdmulh v12.4S, v7.4S, v16.s[2] -sub v22.4s, v4.4s, v11.4s -mla v21.4S, v6.4S, v31.s[0] -ldr q6, [x0, #544] -sqrdmulh v29.4S, v24.4S, v1.s[1] -add v4.4s, v4.4s, v11.4s -str q22, [x0, #272] -mla v8.4S, v30.4S, v31.s[0] -ldr q30, [x17, #+384] -ldr q22, [x17, #+400] -sqrdmulh v11.4S, v28.4S, v1.s[2] -sub v19.4s, v2.4s, v26.4s -str q4, [x0, #256] -mul v15.4S, v15.4S,v17.s[1] -add v2.4s, v2.4s, v26.4s -mul v7.4S, v7.4S,v17.s[2] -str q19, [x0, #304] -mla v15.4S, v5.4S, v31.s[0] -sub v5.4s, v3.4s, v21.4s -mla v7.4S, v12.4S, v31.s[0] -str q2, [x0, #288] -mul v24.4S, v24.4S,v13.s[1] -str q5, [x0, #336] -mul v28.4S, v28.4S,v13.s[2] -add v3.4s, v3.4s, v21.4s -str q3, [x0, #320] -mla v24.4S, v29.4S, v31.s[0] -sub v29.4s, v9.4s, v8.4s -str q29, [x0, #368] -mla v28.4S, v11.4S, v31.s[0] -add v9.4s, v9.4s, v8.4s -str q9, [x0, #352] -sqrdmulh v1.4S, v6.4S, v22.s[0] -sub v13.4s, v27.4s, v15.4s -mul v6.4S, v6.4S,v30.s[0] -str q13, [x0, #400] -ldr q13, [x0, #560] -sqrdmulh v9.4S, v13.4S, v22.s[0] -add v27.4s, v27.4s, v15.4s -mul v13.4S, v13.4S,v30.s[0] -str q27, [x0, #384] -ldr q27, [x17, #+416] -ldr q15, [x17, #+432] -ldr q8, [x0, #608] -sqrdmulh v11.4S, v8.4S, v15.s[0] -sub v29.4s, v0.4s, v7.4s -mul v8.4S, v8.4S,v27.s[0] -str q29, [x0, #432] -ldr q29, [x0, #624] -sqrdmulh v3.4S, v29.4S, v15.s[0] -add v0.4s, v0.4s, v7.4s -mul v29.4S, v29.4S,v27.s[0] -str q0, [x0, #416] -ldr q0, [x17, #+448] -ldr q7, [x17, #+464] -mla v6.4S, v1.4S, v31.s[0] -sub v1.4s, v20.4s, v24.4s -sqrdmulh v21.4S, v14.4S, v7.s[0] -str q1, [x0, #464] -ldr q1, [x0, #688] -mla v13.4S, v9.4S, v31.s[0] -add v20.4s, v20.4s, v24.4s -sqrdmulh v24.4S, v1.4S, v7.s[0] -str q20, [x0, #448] -ldr q20, [x17, #+480] -ldr q9, [x17, #+496] -mla v8.4S, v11.4S, v31.s[0] -sub v11.4s, v23.4s, v28.4s -sqrdmulh v5.4S, v18.4S, v9.s[0] -str q11, [x0, #496] -ldr q11, [x0, #752] -mla v29.4S, v3.4S, v31.s[0] -add v23.4s, v23.4s, v28.4s -sqrdmulh v28.4S, v11.4S, v9.s[0] -str q23, [x0, #480] -ldr q23, [x0, #512] -ldr q3, [x0, #640] -mul v14.4S, v14.4S,v0.s[0] -sub v16.4s, v23.4s, v6.4s -ldr q17, [x0, #528] -mul v1.4S, v1.4S,v0.s[0] -add v23.4s, v23.4s, v6.4s -ldr q6, [x0, #656] -mla v14.4S, v21.4S, v31.s[0] -sub v21.4s, v17.4s, v13.4s -ldr q2, [x0, #576] -mla v1.4S, v24.4S, v31.s[0] -add v17.4s, v17.4s, v13.4s -ldr q13, [x0, #704] -mul v18.4S, v18.4S,v20.s[0] -sub v24.4s, v2.4s, v8.4s -ldr q12, [x0, #592] -mul v11.4S, v11.4S,v20.s[0] -add v2.4s, v2.4s, v8.4s -ldr q8, [x0, #720] -mla v18.4S, v5.4S, v31.s[0] -mla v11.4S, v28.4S, v31.s[0] -sub v28.4s, v12.4s, v29.4s -sqrdmulh v5.4S, v17.4S, v22.s[1] -add v12.4s, v12.4s, v29.4s -mul v17.4S, v17.4S,v30.s[1] -sqrdmulh v29.4S, v21.4S, v22.s[2] -sub v19.4s, v3.4s, v14.4s -mul v21.4S, v21.4S,v30.s[2] -add v3.4s, v3.4s, v14.4s -sqrdmulh v22.4S, v12.4S, v15.s[1] -sub v30.4s, v6.4s, v1.4s -mul v12.4S, v12.4S,v27.s[1] -add v6.4s, v6.4s, v1.4s -sqrdmulh v1.4S, v28.4S, v15.s[2] -sub v14.4s, v13.4s, v18.4s -mul v28.4S, v28.4S,v27.s[2] -add v13.4s, v13.4s, v18.4s -mla v17.4S, v5.4S, v31.s[0] -sub v5.4s, v8.4s, v11.4s -ldr q15, [x0, #992] -sqrdmulh v27.4S, v6.4S, v7.s[1] -add v8.4s, v8.4s, v11.4s -mla v21.4S, v29.4S, v31.s[0] -ldr q29, [x0, #928] -sqrdmulh v11.4S, v30.4S, v7.s[2] -sub v18.4s, v23.4s, v17.4s -mla v12.4S, v22.4S, v31.s[0] -ldr q22, [x0, #800] -sqrdmulh v26.4S, v8.4S, v9.s[1] -add v23.4s, v23.4s, v17.4s -str q18, [x0, #528] -mla v28.4S, v1.4S, v31.s[0] -ldr q1, [x17, #+512] -ldr q18, [x17, #+528] -sqrdmulh v17.4S, v5.4S, v9.s[2] -sub v4.4s, v16.4s, v21.4s -str q23, [x0, #512] -mul v6.4S, v6.4S,v0.s[1] -add v16.4s, v16.4s, v21.4s -mul v30.4S, v30.4S,v0.s[2] -str q4, [x0, #560] -mla v6.4S, v27.4S, v31.s[0] -sub v27.4s, v2.4s, v12.4s -mla v30.4S, v11.4S, v31.s[0] -str q16, [x0, #544] -mul v8.4S, v8.4S,v20.s[1] -str q27, [x0, #592] -mul v5.4S, v5.4S,v20.s[2] -add v2.4s, v2.4s, v12.4s -str q2, [x0, #576] -mla v8.4S, v26.4S, v31.s[0] -sub v26.4s, v24.4s, v28.4s -str q26, [x0, #624] -mla v5.4S, v17.4S, v31.s[0] -add v24.4s, v24.4s, v28.4s -str q24, [x0, #608] -sqrdmulh v9.4S, v22.4S, v18.s[0] -sub v20.4s, v3.4s, v6.4s -mul v22.4S, v22.4S,v1.s[0] -str q20, [x0, #656] -ldr q20, [x0, #816] -sqrdmulh v24.4S, v20.4S, v18.s[0] -add v3.4s, v3.4s, v6.4s -mul v20.4S, v20.4S,v1.s[0] -str q3, [x0, #640] -ldr q3, [x17, #+544] -ldr q6, [x17, #+560] -ldr q28, [x0, #864] -sqrdmulh v17.4S, v28.4S, v6.s[0] -sub v26.4s, v19.4s, v30.4s -mul v28.4S, v28.4S,v3.s[0] -str q26, [x0, #688] -ldr q26, [x0, #880] -sqrdmulh v2.4S, v26.4S, v6.s[0] -add v19.4s, v19.4s, v30.4s -mul v26.4S, v26.4S,v3.s[0] -str q19, [x0, #672] -ldr q19, [x17, #+576] -ldr q30, [x17, #+592] -mla v22.4S, v9.4S, v31.s[0] -sub v9.4s, v13.4s, v8.4s -sqrdmulh v12.4S, v29.4S, v30.s[0] -str q9, [x0, #720] -ldr q9, [x0, #944] -mla v20.4S, v24.4S, v31.s[0] -add v13.4s, v13.4s, v8.4s -sqrdmulh v8.4S, v9.4S, v30.s[0] -str q13, [x0, #704] -ldr q13, [x17, #+608] -ldr q24, [x17, #+624] -mla v28.4S, v17.4S, v31.s[0] -sub v17.4s, v14.4s, v5.4s -sqrdmulh v27.4S, v15.4S, v24.s[0] -str q17, [x0, #752] -ldr q17, [x0, #1008] -mla v26.4S, v2.4S, v31.s[0] -add v14.4s, v14.4s, v5.4s -sqrdmulh v5.4S, v17.4S, v24.s[0] -str q14, [x0, #736] -ldr q14, [x0, #768] -ldr q2, [x0, #896] -mul v29.4S, v29.4S,v19.s[0] -sub v7.4s, v14.4s, v22.4s -ldr q0, [x0, #784] -mul v9.4S, v9.4S,v19.s[0] -add v14.4s, v14.4s, v22.4s -ldr q22, [x0, #912] -mla v29.4S, v12.4S, v31.s[0] -sub v12.4s, v0.4s, v20.4s -ldr q16, [x0, #832] -mla v9.4S, v8.4S, v31.s[0] -add v0.4s, v0.4s, v20.4s -ldr q20, [x0, #960] -mul v15.4S, v15.4S,v13.s[0] -sub v8.4s, v16.4s, v28.4s -ldr q11, [x0, #848] -mul v17.4S, v17.4S,v13.s[0] -add v16.4s, v16.4s, v28.4s -ldr q28, [x0, #976] -mla v15.4S, v27.4S, v31.s[0] -mla v17.4S, v5.4S, v31.s[0] -sub v5.4s, v11.4s, v26.4s -sqrdmulh v27.4S, v0.4S, v18.s[1] -add v11.4s, v11.4s, v26.4s -mul v0.4S, v0.4S,v1.s[1] -sqrdmulh v26.4S, v12.4S, v18.s[2] -sub v4.4s, v2.4s, v29.4s -mul v12.4S, v12.4S,v1.s[2] -add v2.4s, v2.4s, v29.4s -sqrdmulh v18.4S, v11.4S, v6.s[1] -sub v1.4s, v22.4s, v9.4s -mul v11.4S, v11.4S,v3.s[1] -add v22.4s, v22.4s, v9.4s -sqrdmulh v9.4S, v5.4S, v6.s[2] -sub v29.4s, v20.4s, v15.4s -mul v5.4S, v5.4S,v3.s[2] -add v20.4s, v20.4s, v15.4s -mla v0.4S, v27.4S, v31.s[0] -sub v27.4s, v28.4s, v17.4s -sqrdmulh v6.4S, v22.4S, v30.s[1] -add v28.4s, v28.4s, v17.4s -mla v12.4S, v26.4S, v31.s[0] -sqrdmulh v26.4S, v1.4S, v30.s[2] -sub v17.4s, v14.4s, v0.4s -mla v11.4S, v18.4S, v31.s[0] -sqrdmulh v18.4S, v28.4S, v24.s[1] -add v14.4s, v14.4s, v0.4s -str q17, [x0, #784] -mla v5.4S, v9.4S, v31.s[0] -sqrdmulh v9.4S, v27.4S, v24.s[2] -sub v17.4s, v7.4s, v12.4s -str q14, [x0, #768] -mul v22.4S, v22.4S,v19.s[1] -add v7.4s, v7.4s, v12.4s -mul v1.4S, v1.4S,v19.s[2] -str q17, [x0, #816] -mla v22.4S, v6.4S, v31.s[0] -sub v6.4s, v16.4s, v11.4s -mla v1.4S, v26.4S, v31.s[0] -str q7, [x0, #800] -mul v28.4S, v28.4S,v13.s[1] -str q6, [x0, #848] -mul v27.4S, v27.4S,v13.s[2] -add v16.4s, v16.4s, v11.4s -str q16, [x0, #832] -mla v28.4S, v18.4S, v31.s[0] -sub v18.4s, v8.4s, v5.4s -str q18, [x0, #880] -mla v27.4S, v9.4S, v31.s[0] -add v8.4s, v8.4s, v5.4s -str q8, [x0, #864] -sub v24.4s, v2.4s, v22.4s -str q24, [x0, #912] -add v2.4s, v2.4s, v22.4s -str q2, [x0, #896] -sub v2.4s, v4.4s, v1.4s -str q2, [x0, #944] -add v4.4s, v4.4s, v1.4s -str q4, [x0, #928] -sub v4.4s, v20.4s, v28.4s -str q4, [x0, #976] -add v20.4s, v20.4s, v28.4s -str q20, [x0, #960] -sub v20.4s, v29.4s, v27.4s -str q20, [x0, #1008] -add v29.4s, v29.4s, v27.4s -str q29, [x0, #992] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1528 -// Instruction count: 1524 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_21_z4_7.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_21_z4_7.s deleted file mode 100644 index 3e72693..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_21_z4_7.s +++ /dev/null @@ -1,1558 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_21_z4_7 -.global _ntt_u32_incomplete_neon_asm_var_4_2_21_z4_7 -ntt_u32_incomplete_neon_asm_var_4_2_21_z4_7: -_ntt_u32_incomplete_neon_asm_var_4_2_21_z4_7: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x0, #992] -sqrdmulh v27.4S, v28.4S, v29.s[0] -mul v28.4S, v28.4S,v30.s[0] -ldr q26, [x0, #928] -sqrdmulh v25.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -ldr q24, [x0, #864] -sqrdmulh v23.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -ldr q22, [x0, #800] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #736] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mla v28.4S, v27.4S, v31.s[0] -ldr q27, [x0, #672] -sqrdmulh v18.4S, v27.4S, v29.s[0] -mla v26.4S, v25.4S, v31.s[0] -ldr q25, [x0, #608] -sqrdmulh v17.4S, v25.4S, v29.s[0] -mla v24.4S, v23.4S, v31.s[0] -ldr q23, [x0, #544] -sqrdmulh v16.4S, v23.4S, v29.s[0] -mla v22.4S, v21.4S, v31.s[0] -ldr q21, [x0, #480] -ldr q3, [x0, #416] -mul v27.4S, v27.4S,v30.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q2, [x0, #352] -ldr q1, [x0, #288] -mla v27.4S, v18.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -ldr q19, [x0, #224] -ldr q18, [x0, #160] -mul v23.4S, v23.4S,v30.s[0] -mul v25.4S, v25.4S,v30.s[0] -ldr q0, [x0, #96] -ldr q15, [x0, #32] -mla v23.4S, v16.4S, v31.s[0] -mla v25.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v28.4s -add v21.4s, v21.4s, v28.4s -sqrdmulh v28.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v16.4s, v3.4s, v26.4s -add v3.4s, v3.4s, v26.4s -sqrdmulh v26.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -sub v14.4s, v2.4s, v24.4s -add v2.4s, v2.4s, v24.4s -sqrdmulh v24.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v13.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v12.4s, v19.4s, v20.4s -add v19.4s, v19.4s, v20.4s -sqrdmulh v20.4S, v14.4S, v29.s[2] -mla v17.4S, v28.4S, v31.s[0] -sub v28.4s, v18.4s, v27.4s -add v18.4s, v18.4s, v27.4s -sqrdmulh v27.4S, v13.4S, v29.s[2] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v0.4s, v25.4s -add v0.4s, v0.4s, v25.4s -sqrdmulh v25.4S, v2.4S, v29.s[1] -mla v21.4S, v24.4S, v31.s[0] -sub v24.4s, v15.4s, v23.4s -sqrdmulh v11.4S, v1.4S, v29.s[1] -mla v3.4S, v22.4S, v31.s[0] -add v15.4s, v15.4s, v23.4s -ldr q23, [x17, #+32] -ldr q22, [x17, #+48] -mul v13.4S, v13.4S,v30.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v10.4s, v12.4s, v17.4s -add v12.4s, v12.4s, v17.4s -mla v13.4S, v27.4S, v31.s[0] -mla v14.4S, v20.4S, v31.s[0] -sub v20.4s, v28.4s, v16.4s -add v28.4s, v28.4s, v16.4s -mul v1.4S, v1.4S,v30.s[1] -mul v2.4S, v2.4S,v30.s[1] -sub v16.4s, v19.4s, v21.4s -add v19.4s, v19.4s, v21.4s -mla v1.4S, v11.4S, v31.s[0] -mla v2.4S, v25.4S, v31.s[0] -sub v25.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v10.4S, v22.s[3] -mul v10.4S, v10.4S,v23.s[3] -sub v11.4s, v26.4s, v14.4s -add v26.4s, v26.4s, v14.4s -sqrdmulh v14.4S, v12.4S, v22.s[2] -mul v12.4S, v12.4S,v23.s[2] -sub v21.4s, v24.4s, v13.4s -add v24.4s, v24.4s, v13.4s -sqrdmulh v13.4S, v16.4S, v22.s[1] -mul v16.4S, v16.4S,v23.s[1] -sub v27.4s, v0.4s, v2.4s -add v0.4s, v0.4s, v2.4s -sqrdmulh v2.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v17.4s, v15.4s, v1.4s -add v15.4s, v15.4s, v1.4s -ldr q1, [x17, #+96] -ldr q9, [x17, #+112] -sqrdmulh v8.4S, v20.4S, v22.s[3] -mla v10.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v28.4S, v22.s[2] -mla v12.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v25.4S, v22.s[1] -mla v16.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v18.4S, v22.s[0] -mla v19.4S, v2.4S, v31.s[0] -nop -nop -ldr q2, [x17, #+64] -ldr q7, [x17, #+80] -mul v28.4S, v28.4S,v23.s[2] -mul v20.4S, v20.4S,v23.s[3] -sub v6.4s, v11.4s, v10.4s -add v11.4s, v11.4s, v10.4s -mla v28.4S, v3.4S, v31.s[0] -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v26.4s, v12.4s -add v26.4s, v26.4s, v12.4s -mul v18.4S, v18.4S,v23.s[0] -mul v25.4S, v25.4S,v23.s[1] -sub v12.4s, v27.4s, v16.4s -add v27.4s, v27.4s, v16.4s -mla v18.4S, v13.4S, v31.s[0] -mla v25.4S, v14.4S, v31.s[0] -sub v14.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v9.s[3] -mul v6.4S, v6.4S,v1.s[3] -sub v13.4s, v21.4s, v20.4s -add v21.4s, v21.4s, v20.4s -sqrdmulh v20.4S, v11.4S, v9.s[2] -mul v11.4S, v11.4S,v1.s[2] -sub v16.4s, v24.4s, v28.4s -add v24.4s, v24.4s, v28.4s -sqrdmulh v28.4S, v8.4S, v9.s[1] -mul v8.4S, v8.4S,v1.s[1] -sub v3.4s, v17.4s, v25.4s -add v17.4s, v17.4s, v25.4s -sqrdmulh v25.4S, v26.4S, v9.s[0] -mul v26.4S, v26.4S,v1.s[0] -sub v10.4s, v15.4s, v18.4s -add v15.4s, v15.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v7.s[3] -mla v6.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v27.4S, v7.s[2] -mla v11.4S, v20.4S, v31.s[0] -nop -nop -sqrdmulh v20.4S, v14.4S, v7.s[1] -mla v8.4S, v28.4S, v31.s[0] -nop -nop -sqrdmulh v28.4S, v0.4S, v7.s[0] -mla v26.4S, v25.4S, v31.s[0] -nop -nop -mul v27.4S, v27.4S,v2.s[2] -mul v12.4S, v12.4S,v2.s[3] -sub v25.4s, v13.4s, v6.4s -str q25, [x0, #992] -mla v27.4S, v19.4S, v31.s[0] -mla v12.4S, v18.4S, v31.s[0] -add v13.4s, v13.4s, v6.4s -str q13, [x0, #928] -mul v0.4S, v0.4S,v2.s[0] -mul v14.4S, v14.4S,v2.s[1] -sub v13.4s, v21.4s, v11.4s -str q13, [x0, #864] -mla v0.4S, v28.4S, v31.s[0] -mla v14.4S, v20.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -sub v11.4s, v16.4s, v8.4s -ldr q20, [x0, #1008] -sqrdmulh v28.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v16.4s, v16.4s, v8.4s -str q21, [x0, #800] -ldr q21, [x0, #944] -sqrdmulh v8.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -sub v13.4s, v24.4s, v26.4s -str q11, [x0, #736] -ldr q11, [x0, #880] -sqrdmulh v6.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -add v24.4s, v24.4s, v26.4s -str q16, [x0, #672] -ldr q16, [x0, #816] -sqrdmulh v26.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -str q13, [x0, #608] -sub v13.4s, v3.4s, v12.4s -ldr q18, [x0, #752] -sqrdmulh v19.4S, v18.4S, v29.s[0] -mla v20.4S, v28.4S, v31.s[0] -add v3.4s, v3.4s, v12.4s -str q24, [x0, #544] -ldr q24, [x0, #688] -sqrdmulh v12.4S, v24.4S, v29.s[0] -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v17.4s, v27.4s -str q13, [x0, #480] -ldr q13, [x0, #624] -sqrdmulh v28.4S, v13.4S, v29.s[0] -mla v11.4S, v6.4S, v31.s[0] -add v17.4s, v17.4s, v27.4s -str q3, [x0, #416] -ldr q3, [x0, #560] -sqrdmulh v27.4S, v3.4S, v29.s[0] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v10.4s, v14.4s -str q8, [x0, #352] -ldr q8, [x0, #496] -ldr q6, [x0, #432] -mul v24.4S, v24.4S,v30.s[0] -mul v18.4S, v18.4S,v30.s[0] -add v10.4s, v10.4s, v14.4s -str q17, [x0, #288] -ldr q17, [x0, #368] -ldr q14, [x0, #304] -mla v24.4S, v12.4S, v31.s[0] -mla v18.4S, v19.4S, v31.s[0] -sub v19.4s, v15.4s, v0.4s -str q26, [x0, #224] -ldr q26, [x0, #240] -ldr q12, [x0, #176] -mul v3.4S, v3.4S,v30.s[0] -mul v13.4S, v13.4S,v30.s[0] -add v15.4s, v15.4s, v0.4s -str q10, [x0, #160] -ldr q10, [x0, #112] -ldr q0, [x0, #48] -mla v3.4S, v27.4S, v31.s[0] -mla v13.4S, v28.4S, v31.s[0] -sub v28.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -sqrdmulh v20.4S, v28.4S, v29.s[2] -mul v28.4S, v28.4S,v30.s[2] -sub v27.4s, v6.4s, v21.4s -add v6.4s, v6.4s, v21.4s -sqrdmulh v21.4S, v27.4S, v29.s[2] -mul v27.4S, v27.4S,v30.s[2] -sub v25.4s, v17.4s, v11.4s -add v17.4s, v17.4s, v11.4s -sqrdmulh v11.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sub v5.4s, v14.4s, v16.4s -add v14.4s, v14.4s, v16.4s -sqrdmulh v16.4S, v6.4S, v29.s[1] -mul v6.4S, v6.4S,v30.s[1] -sub v4.4s, v26.4s, v18.4s -add v26.4s, v26.4s, v18.4s -sqrdmulh v18.4S, v25.4S, v29.s[2] -mla v28.4S, v20.4S, v31.s[0] -sub v20.4s, v12.4s, v24.4s -add v12.4s, v12.4s, v24.4s -sqrdmulh v24.4S, v5.4S, v29.s[2] -mla v27.4S, v21.4S, v31.s[0] -sub v21.4s, v10.4s, v13.4s -add v10.4s, v10.4s, v13.4s -sqrdmulh v13.4S, v17.4S, v29.s[1] -mla v8.4S, v11.4S, v31.s[0] -str q19, [x0, #96] -sub v19.4s, v0.4s, v3.4s -sqrdmulh v11.4S, v14.4S, v29.s[1] -mla v6.4S, v16.4S, v31.s[0] -str q15, [x0, #32] -add v0.4s, v0.4s, v3.4s -mul v5.4S, v5.4S,v30.s[2] -mul v25.4S, v25.4S,v30.s[2] -sub v3.4s, v4.4s, v28.4s -add v4.4s, v4.4s, v28.4s -mla v5.4S, v24.4S, v31.s[0] -mla v25.4S, v18.4S, v31.s[0] -sub v18.4s, v20.4s, v27.4s -add v20.4s, v20.4s, v27.4s -mul v14.4S, v14.4S,v30.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v27.4s, v26.4s, v8.4s -add v26.4s, v26.4s, v8.4s -mla v14.4S, v11.4S, v31.s[0] -mla v17.4S, v13.4S, v31.s[0] -sub v13.4s, v12.4s, v6.4s -add v12.4s, v12.4s, v6.4s -sqrdmulh v6.4S, v3.4S, v22.s[3] -mul v3.4S, v3.4S,v23.s[3] -sub v11.4s, v21.4s, v25.4s -add v21.4s, v21.4s, v25.4s -sqrdmulh v25.4S, v4.4S, v22.s[2] -mul v4.4S, v4.4S,v23.s[2] -sub v8.4s, v19.4s, v5.4s -add v19.4s, v19.4s, v5.4s -sqrdmulh v5.4S, v27.4S, v22.s[1] -mul v27.4S, v27.4S,v23.s[1] -sub v24.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -sqrdmulh v17.4S, v26.4S, v22.s[0] -mul v26.4S, v26.4S,v23.s[0] -sub v28.4s, v0.4s, v14.4s -add v0.4s, v0.4s, v14.4s -sqrdmulh v14.4S, v18.4S, v22.s[3] -mla v3.4S, v6.4S, v31.s[0] -nop -nop -sqrdmulh v6.4S, v20.4S, v22.s[2] -mla v4.4S, v25.4S, v31.s[0] -nop -nop -sqrdmulh v25.4S, v13.4S, v22.s[1] -mla v27.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v12.4S, v22.s[0] -mla v26.4S, v17.4S, v31.s[0] -nop -nop -mul v20.4S, v20.4S,v23.s[2] -mul v18.4S, v18.4S,v23.s[3] -sub v17.4s, v11.4s, v3.4s -add v11.4s, v11.4s, v3.4s -mla v20.4S, v6.4S, v31.s[0] -mla v18.4S, v14.4S, v31.s[0] -sub v14.4s, v21.4s, v4.4s -add v21.4s, v21.4s, v4.4s -mul v12.4S, v12.4S,v23.s[0] -mul v13.4S, v13.4S,v23.s[1] -sub v4.4s, v24.4s, v27.4s -add v24.4s, v24.4s, v27.4s -mla v12.4S, v5.4S, v31.s[0] -mla v13.4S, v25.4S, v31.s[0] -sub v25.4s, v10.4s, v26.4s -add v10.4s, v10.4s, v26.4s -sqrdmulh v26.4S, v17.4S, v9.s[3] -mul v17.4S, v17.4S,v1.s[3] -sub v5.4s, v8.4s, v18.4s -add v8.4s, v8.4s, v18.4s -sqrdmulh v18.4S, v11.4S, v9.s[2] -mul v11.4S, v11.4S,v1.s[2] -sub v27.4s, v19.4s, v20.4s -add v19.4s, v19.4s, v20.4s -sqrdmulh v20.4S, v14.4S, v9.s[1] -mul v14.4S, v14.4S,v1.s[1] -sub v6.4s, v28.4s, v13.4s -add v28.4s, v28.4s, v13.4s -sqrdmulh v13.4S, v21.4S, v9.s[0] -mul v21.4S, v21.4S,v1.s[0] -sub v3.4s, v0.4s, v12.4s -add v0.4s, v0.4s, v12.4s -sqrdmulh v12.4S, v4.4S, v7.s[3] -mla v17.4S, v26.4S, v31.s[0] -nop -nop -sqrdmulh v26.4S, v24.4S, v7.s[2] -mla v11.4S, v18.4S, v31.s[0] -nop -nop -sqrdmulh v18.4S, v25.4S, v7.s[1] -mla v14.4S, v20.4S, v31.s[0] -nop -nop -sqrdmulh v20.4S, v10.4S, v7.s[0] -mla v21.4S, v13.4S, v31.s[0] -nop -nop -mul v24.4S, v24.4S,v2.s[2] -mul v4.4S, v4.4S,v2.s[3] -sub v13.4s, v5.4s, v17.4s -str q13, [x0, #1008] -mla v24.4S, v26.4S, v31.s[0] -mla v4.4S, v12.4S, v31.s[0] -add v5.4s, v5.4s, v17.4s -str q5, [x0, #944] -mul v10.4S, v10.4S,v2.s[0] -mul v25.4S, v25.4S,v2.s[1] -sub v5.4s, v8.4s, v11.4s -str q5, [x0, #880] -mla v10.4S, v20.4S, v31.s[0] -mla v25.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v11.4s -sub v11.4s, v27.4s, v14.4s -ldr q18, [x0, #960] -sqrdmulh v20.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -add v27.4s, v27.4s, v14.4s -str q8, [x0, #816] -ldr q8, [x0, #896] -sqrdmulh v14.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v5.4s, v19.4s, v21.4s -str q11, [x0, #752] -ldr q11, [x0, #832] -sqrdmulh v17.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -add v19.4s, v19.4s, v21.4s -str q27, [x0, #688] -ldr q27, [x0, #768] -sqrdmulh v21.4S, v27.4S, v29.s[0] -mul v27.4S, v27.4S,v30.s[0] -str q5, [x0, #624] -sub v5.4s, v6.4s, v4.4s -ldr q12, [x0, #704] -sqrdmulh v26.4S, v12.4S, v29.s[0] -mla v18.4S, v20.4S, v31.s[0] -add v6.4s, v6.4s, v4.4s -str q19, [x0, #560] -ldr q19, [x0, #640] -sqrdmulh v4.4S, v19.4S, v29.s[0] -mla v8.4S, v14.4S, v31.s[0] -sub v14.4s, v28.4s, v24.4s -str q5, [x0, #496] -ldr q5, [x0, #576] -sqrdmulh v20.4S, v5.4S, v29.s[0] -mla v11.4S, v17.4S, v31.s[0] -add v28.4s, v28.4s, v24.4s -str q6, [x0, #432] -ldr q6, [x0, #512] -sqrdmulh v24.4S, v6.4S, v29.s[0] -mla v27.4S, v21.4S, v31.s[0] -sub v21.4s, v3.4s, v25.4s -str q14, [x0, #368] -ldr q14, [x0, #448] -ldr q17, [x0, #384] -mul v19.4S, v19.4S,v30.s[0] -mul v12.4S, v12.4S,v30.s[0] -add v3.4s, v3.4s, v25.4s -str q28, [x0, #304] -ldr q28, [x0, #320] -ldr q25, [x0, #256] -mla v19.4S, v4.4S, v31.s[0] -mla v12.4S, v26.4S, v31.s[0] -sub v26.4s, v0.4s, v10.4s -str q21, [x0, #240] -ldr q21, [x0, #192] -ldr q4, [x0, #128] -mul v6.4S, v6.4S,v30.s[0] -mul v5.4S, v5.4S,v30.s[0] -add v0.4s, v0.4s, v10.4s -str q3, [x0, #176] -ldr q3, [x0, #64] -ldr q10, [x0, #0] -mla v6.4S, v24.4S, v31.s[0] -mla v5.4S, v20.4S, v31.s[0] -sub v20.4s, v14.4s, v18.4s -add v14.4s, v14.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v24.4s, v17.4s, v8.4s -add v17.4s, v17.4s, v8.4s -sqrdmulh v8.4S, v24.4S, v29.s[2] -mul v24.4S, v24.4S,v30.s[2] -sub v13.4s, v28.4s, v11.4s -add v28.4s, v28.4s, v11.4s -sqrdmulh v11.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -sub v15.4s, v25.4s, v27.4s -add v25.4s, v25.4s, v27.4s -sqrdmulh v27.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v16.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v13.4S, v29.s[2] -mla v20.4S, v18.4S, v31.s[0] -sub v18.4s, v4.4s, v19.4s -add v4.4s, v4.4s, v19.4s -sqrdmulh v19.4S, v15.4S, v29.s[2] -mla v24.4S, v8.4S, v31.s[0] -sub v8.4s, v3.4s, v5.4s -add v3.4s, v3.4s, v5.4s -sqrdmulh v5.4S, v28.4S, v29.s[1] -mla v14.4S, v11.4S, v31.s[0] -str q26, [x0, #112] -sub v26.4s, v10.4s, v6.4s -sqrdmulh v11.4S, v25.4S, v29.s[1] -mla v17.4S, v27.4S, v31.s[0] -str q0, [x0, #48] -add v10.4s, v10.4s, v6.4s -mul v15.4S, v15.4S,v30.s[2] -mul v13.4S, v13.4S,v30.s[2] -sub v6.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -mla v15.4S, v19.4S, v31.s[0] -mla v13.4S, v12.4S, v31.s[0] -sub v12.4s, v18.4s, v24.4s -add v18.4s, v18.4s, v24.4s -mul v25.4S, v25.4S,v30.s[1] -mul v28.4S, v28.4S,v30.s[1] -sub v24.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -mla v25.4S, v11.4S, v31.s[0] -mla v28.4S, v5.4S, v31.s[0] -sub v5.4s, v4.4s, v17.4s -add v4.4s, v4.4s, v17.4s -sqrdmulh v17.4S, v6.4S, v22.s[3] -mul v6.4S, v6.4S,v23.s[3] -sub v11.4s, v8.4s, v13.4s -add v8.4s, v8.4s, v13.4s -sqrdmulh v13.4S, v16.4S, v22.s[2] -mul v16.4S, v16.4S,v23.s[2] -sub v14.4s, v26.4s, v15.4s -add v26.4s, v26.4s, v15.4s -sqrdmulh v15.4S, v24.4S, v22.s[1] -mul v24.4S, v24.4S,v23.s[1] -sub v19.4s, v3.4s, v28.4s -add v3.4s, v3.4s, v28.4s -sqrdmulh v28.4S, v21.4S, v22.s[0] -mul v21.4S, v21.4S,v23.s[0] -sub v20.4s, v10.4s, v25.4s -add v10.4s, v10.4s, v25.4s -sqrdmulh v25.4S, v12.4S, v22.s[3] -mla v6.4S, v17.4S, v31.s[0] -nop -nop -sqrdmulh v17.4S, v18.4S, v22.s[2] -mla v16.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v5.4S, v22.s[1] -mla v24.4S, v15.4S, v31.s[0] -nop -nop -sqrdmulh v15.4S, v4.4S, v22.s[0] -mla v21.4S, v28.4S, v31.s[0] -nop -nop -mul v18.4S, v18.4S,v23.s[2] -mul v12.4S, v12.4S,v23.s[3] -sub v28.4s, v11.4s, v6.4s -add v11.4s, v11.4s, v6.4s -mla v18.4S, v17.4S, v31.s[0] -mla v12.4S, v25.4S, v31.s[0] -sub v25.4s, v8.4s, v16.4s -add v8.4s, v8.4s, v16.4s -mul v4.4S, v4.4S,v23.s[0] -mul v5.4S, v5.4S,v23.s[1] -sub v16.4s, v19.4s, v24.4s -add v19.4s, v19.4s, v24.4s -mla v4.4S, v15.4S, v31.s[0] -mla v5.4S, v13.4S, v31.s[0] -sub v13.4s, v3.4s, v21.4s -add v3.4s, v3.4s, v21.4s -sqrdmulh v21.4S, v28.4S, v9.s[3] -mul v28.4S, v28.4S,v1.s[3] -sub v15.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -sqrdmulh v12.4S, v11.4S, v9.s[2] -mul v11.4S, v11.4S,v1.s[2] -sub v24.4s, v26.4s, v18.4s -add v26.4s, v26.4s, v18.4s -sqrdmulh v18.4S, v25.4S, v9.s[1] -mul v25.4S, v25.4S,v1.s[1] -sub v17.4s, v20.4s, v5.4s -add v20.4s, v20.4s, v5.4s -sqrdmulh v5.4S, v8.4S, v9.s[0] -mul v8.4S, v8.4S,v1.s[0] -sub v6.4s, v10.4s, v4.4s -add v10.4s, v10.4s, v4.4s -sqrdmulh v4.4S, v16.4S, v7.s[3] -mla v28.4S, v21.4S, v31.s[0] -nop -nop -sqrdmulh v21.4S, v19.4S, v7.s[2] -mla v11.4S, v12.4S, v31.s[0] -nop -nop -sqrdmulh v12.4S, v13.4S, v7.s[1] -mla v25.4S, v18.4S, v31.s[0] -nop -nop -sqrdmulh v18.4S, v3.4S, v7.s[0] -mla v8.4S, v5.4S, v31.s[0] -nop -nop -mul v19.4S, v19.4S,v2.s[2] -mul v16.4S, v16.4S,v2.s[3] -sub v5.4s, v15.4s, v28.4s -str q5, [x0, #960] -mla v19.4S, v21.4S, v31.s[0] -mla v16.4S, v4.4S, v31.s[0] -add v15.4s, v15.4s, v28.4s -str q15, [x0, #896] -mul v3.4S, v3.4S,v2.s[0] -mul v13.4S, v13.4S,v2.s[1] -sub v15.4s, v14.4s, v11.4s -str q15, [x0, #832] -mla v3.4S, v18.4S, v31.s[0] -mla v13.4S, v12.4S, v31.s[0] -add v14.4s, v14.4s, v11.4s -sub v11.4s, v24.4s, v25.4s -ldr q12, [x0, #976] -sqrdmulh v18.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -add v24.4s, v24.4s, v25.4s -str q14, [x0, #768] -ldr q14, [x0, #912] -sqrdmulh v25.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v15.4s, v26.4s, v8.4s -str q11, [x0, #704] -ldr q11, [x0, #848] -sqrdmulh v28.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -add v26.4s, v26.4s, v8.4s -str q24, [x0, #640] -ldr q24, [x0, #784] -sqrdmulh v8.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -str q15, [x0, #576] -sub v15.4s, v17.4s, v16.4s -ldr q4, [x0, #720] -sqrdmulh v21.4S, v4.4S, v29.s[0] -mla v12.4S, v18.4S, v31.s[0] -add v17.4s, v17.4s, v16.4s -str q26, [x0, #512] -ldr q26, [x0, #656] -sqrdmulh v16.4S, v26.4S, v29.s[0] -mla v14.4S, v25.4S, v31.s[0] -sub v25.4s, v20.4s, v19.4s -str q15, [x0, #448] -ldr q15, [x0, #592] -sqrdmulh v18.4S, v15.4S, v29.s[0] -mla v11.4S, v28.4S, v31.s[0] -add v20.4s, v20.4s, v19.4s -str q17, [x0, #384] -ldr q17, [x0, #528] -sqrdmulh v19.4S, v17.4S, v29.s[0] -mla v24.4S, v8.4S, v31.s[0] -sub v8.4s, v6.4s, v13.4s -str q25, [x0, #320] -ldr q25, [x0, #464] -ldr q28, [x0, #400] -mul v26.4S, v26.4S,v30.s[0] -mul v4.4S, v4.4S,v30.s[0] -add v6.4s, v6.4s, v13.4s -str q20, [x0, #256] -ldr q20, [x0, #336] -ldr q13, [x0, #272] -mla v26.4S, v16.4S, v31.s[0] -mla v4.4S, v21.4S, v31.s[0] -sub v21.4s, v10.4s, v3.4s -str q8, [x0, #192] -ldr q8, [x0, #208] -ldr q16, [x0, #144] -mul v17.4S, v17.4S,v30.s[0] -mul v15.4S, v15.4S,v30.s[0] -add v10.4s, v10.4s, v3.4s -str q6, [x0, #128] -ldr q6, [x0, #80] -ldr q3, [x0, #16] -mla v17.4S, v19.4S, v31.s[0] -mla v15.4S, v18.4S, v31.s[0] -sub v18.4s, v25.4s, v12.4s -add v25.4s, v25.4s, v12.4s -sqrdmulh v12.4S, v18.4S, v29.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v19.4s, v28.4s, v14.4s -add v28.4s, v28.4s, v14.4s -sqrdmulh v14.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v5.4s, v20.4s, v11.4s -add v20.4s, v20.4s, v11.4s -sqrdmulh v11.4S, v25.4S, v29.s[1] -mul v25.4S, v25.4S,v30.s[1] -sub v0.4s, v13.4s, v24.4s -add v13.4s, v13.4s, v24.4s -sqrdmulh v24.4S, v28.4S, v29.s[1] -mul v28.4S, v28.4S,v30.s[1] -sub v27.4s, v8.4s, v4.4s -add v8.4s, v8.4s, v4.4s -sqrdmulh v4.4S, v5.4S, v29.s[2] -mla v18.4S, v12.4S, v31.s[0] -sub v12.4s, v16.4s, v26.4s -add v16.4s, v16.4s, v26.4s -sqrdmulh v26.4S, v0.4S, v29.s[2] -mla v19.4S, v14.4S, v31.s[0] -sub v14.4s, v6.4s, v15.4s -add v6.4s, v6.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v29.s[1] -mla v25.4S, v11.4S, v31.s[0] -str q21, [x0, #64] -sub v21.4s, v3.4s, v17.4s -sqrdmulh v11.4S, v13.4S, v29.s[1] -mla v28.4S, v24.4S, v31.s[0] -str q10, [x0, #0] -add v3.4s, v3.4s, v17.4s -mul v0.4S, v0.4S,v30.s[2] -mul v5.4S, v5.4S,v30.s[2] -sub v17.4s, v27.4s, v18.4s -add v27.4s, v27.4s, v18.4s -mla v0.4S, v26.4S, v31.s[0] -mla v5.4S, v4.4S, v31.s[0] -sub v4.4s, v12.4s, v19.4s -add v12.4s, v12.4s, v19.4s -mul v13.4S, v13.4S,v30.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v19.4s, v8.4s, v25.4s -add v8.4s, v8.4s, v25.4s -mla v13.4S, v11.4S, v31.s[0] -mla v20.4S, v15.4S, v31.s[0] -sub v15.4s, v16.4s, v28.4s -add v16.4s, v16.4s, v28.4s -sqrdmulh v29.4S, v17.4S, v22.s[3] -mul v17.4S, v17.4S,v23.s[3] -sub v30.4s, v14.4s, v5.4s -add v14.4s, v14.4s, v5.4s -sqrdmulh v5.4S, v27.4S, v22.s[2] -mul v27.4S, v27.4S,v23.s[2] -sub v28.4s, v21.4s, v0.4s -add v21.4s, v21.4s, v0.4s -sqrdmulh v0.4S, v19.4S, v22.s[1] -mul v19.4S, v19.4S,v23.s[1] -sub v11.4s, v6.4s, v20.4s -add v6.4s, v6.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v22.s[0] -mul v8.4S, v8.4S,v23.s[0] -sub v25.4s, v3.4s, v13.4s -add v3.4s, v3.4s, v13.4s -sqrdmulh v13.4S, v4.4S, v22.s[3] -mla v17.4S, v29.4S, v31.s[0] -nop -nop -sqrdmulh v29.4S, v12.4S, v22.s[2] -mla v27.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v15.4S, v22.s[1] -mla v19.4S, v0.4S, v31.s[0] -nop -nop -sqrdmulh v0.4S, v16.4S, v22.s[0] -mla v8.4S, v20.4S, v31.s[0] -nop -nop -mul v12.4S, v12.4S,v23.s[2] -mul v4.4S, v4.4S,v23.s[3] -sub v20.4s, v30.4s, v17.4s -add v30.4s, v30.4s, v17.4s -mla v12.4S, v29.4S, v31.s[0] -mla v4.4S, v13.4S, v31.s[0] -sub v13.4s, v14.4s, v27.4s -add v14.4s, v14.4s, v27.4s -mul v16.4S, v16.4S,v23.s[0] -mul v15.4S, v15.4S,v23.s[1] -sub v27.4s, v11.4s, v19.4s -add v11.4s, v11.4s, v19.4s -mla v16.4S, v0.4S, v31.s[0] -mla v15.4S, v5.4S, v31.s[0] -sub v5.4s, v6.4s, v8.4s -add v6.4s, v6.4s, v8.4s -sqrdmulh v22.4S, v20.4S, v9.s[3] -mul v20.4S, v20.4S,v1.s[3] -sub v23.4s, v28.4s, v4.4s -add v28.4s, v28.4s, v4.4s -sqrdmulh v4.4S, v30.4S, v9.s[2] -mul v30.4S, v30.4S,v1.s[2] -sub v8.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v13.4S, v9.s[1] -mul v13.4S, v13.4S,v1.s[1] -sub v0.4s, v25.4s, v15.4s -add v25.4s, v25.4s, v15.4s -sqrdmulh v15.4S, v14.4S, v9.s[0] -mul v14.4S, v14.4S,v1.s[0] -sub v19.4s, v3.4s, v16.4s -add v3.4s, v3.4s, v16.4s -sqrdmulh v9.4S, v27.4S, v7.s[3] -mla v20.4S, v22.4S, v31.s[0] -nop -nop -sqrdmulh v22.4S, v11.4S, v7.s[2] -mla v30.4S, v4.4S, v31.s[0] -nop -nop -sqrdmulh v4.4S, v5.4S, v7.s[1] -mla v13.4S, v12.4S, v31.s[0] -nop -nop -sqrdmulh v12.4S, v6.4S, v7.s[0] -mla v14.4S, v15.4S, v31.s[0] -nop -nop -mul v11.4S, v11.4S,v2.s[2] -mul v27.4S, v27.4S,v2.s[3] -sub v15.4s, v23.4s, v20.4s -str q15, [x0, #976] -mla v11.4S, v22.4S, v31.s[0] -mla v27.4S, v9.4S, v31.s[0] -add v23.4s, v23.4s, v20.4s -str q23, [x0, #912] -mul v6.4S, v6.4S,v2.s[0] -mul v5.4S, v5.4S,v2.s[1] -sub v23.4s, v28.4s, v30.4s -str q23, [x0, #848] -mla v6.4S, v12.4S, v31.s[0] -mla v5.4S, v4.4S, v31.s[0] -add v28.4s, v28.4s, v30.4s -sub v30.4s, v8.4s, v13.4s -add v8.4s, v8.4s, v13.4s -str q28, [x0, #784] -sub v28.4s, v21.4s, v14.4s -str q30, [x0, #720] -add v21.4s, v21.4s, v14.4s -str q8, [x0, #656] -str q28, [x0, #592] -sub v28.4s, v0.4s, v27.4s -add v0.4s, v0.4s, v27.4s -str q21, [x0, #528] -sub v21.4s, v25.4s, v11.4s -str q28, [x0, #464] -add v25.4s, v25.4s, v11.4s -str q0, [x0, #400] -sub v0.4s, v19.4s, v5.4s -str q21, [x0, #336] -add v19.4s, v19.4s, v5.4s -str q25, [x0, #272] -sub v25.4s, v3.4s, v6.4s -str q0, [x0, #208] -add v3.4s, v3.4s, v6.4s -str q19, [x0, #144] -str q25, [x0, #80] -str q3, [x0, #16] -ldr q24, [x0, #224] -ldr q10, [x0, #160] -ldr q18, [x0, #32] -ldr q26, [x17, #+128] -ldr q17, [x17, #+144] -sqrdmulh v29.4S, v18.4S, v17.s[0] -mul v18.4S, v18.4S,v26.s[0] -ldr q16, [x0, #48] -sqrdmulh v1.4S, v16.4S, v17.s[0] -mul v16.4S, v16.4S,v26.s[0] -ldr q15, [x17, #+160] -ldr q22, [x17, #+176] -ldr q9, [x0, #96] -sqrdmulh v20.4S, v9.4S, v22.s[0] -mul v9.4S, v9.4S,v15.s[0] -ldr q23, [x0, #112] -sqrdmulh v12.4S, v23.4S, v22.s[0] -mul v23.4S, v23.4S,v15.s[0] -ldr q4, [x17, #+192] -ldr q2, [x17, #+208] -mla v18.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v10.4S, v2.s[0] -ldr q7, [x0, #176] -mla v16.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v7.4S, v2.s[0] -ldr q13, [x17, #+224] -ldr q30, [x17, #+240] -mla v9.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v24.4S, v30.s[0] -ldr q14, [x0, #240] -mla v23.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v14.4S, v30.s[0] -ldr q8, [x0, #0] -ldr q27, [x0, #128] -mul v10.4S, v10.4S,v4.s[0] -sub v28.4s, v8.4s, v18.4s -ldr q11, [x0, #16] -mul v7.4S, v7.4S,v4.s[0] -add v8.4s, v8.4s, v18.4s -ldr q18, [x0, #144] -mla v10.4S, v29.4S, v31.s[0] -sub v29.4s, v11.4s, v16.4s -ldr q21, [x0, #64] -mla v7.4S, v1.4S, v31.s[0] -add v11.4s, v11.4s, v16.4s -ldr q16, [x0, #192] -mul v24.4S, v24.4S,v13.s[0] -sub v1.4s, v21.4s, v9.4s -ldr q5, [x0, #80] -mul v14.4S, v14.4S,v13.s[0] -add v21.4s, v21.4s, v9.4s -ldr q9, [x0, #208] -mla v24.4S, v20.4S, v31.s[0] -mla v14.4S, v12.4S, v31.s[0] -sub v12.4s, v5.4s, v23.4s -sqrdmulh v20.4S, v11.4S, v17.s[1] -add v5.4s, v5.4s, v23.4s -mul v11.4S, v11.4S,v26.s[1] -sqrdmulh v23.4S, v29.4S, v17.s[2] -sub v0.4s, v27.4s, v10.4s -mul v29.4S, v29.4S,v26.s[2] -add v27.4s, v27.4s, v10.4s -sqrdmulh v17.4S, v5.4S, v22.s[1] -sub v26.4s, v18.4s, v7.4s -mul v5.4S, v5.4S,v15.s[1] -add v18.4s, v18.4s, v7.4s -sqrdmulh v7.4S, v12.4S, v22.s[2] -sub v10.4s, v16.4s, v24.4s -mul v12.4S, v12.4S,v15.s[2] -add v16.4s, v16.4s, v24.4s -mla v11.4S, v20.4S, v31.s[0] -sub v20.4s, v9.4s, v14.4s -ldr q22, [x0, #480] -sqrdmulh v15.4S, v18.4S, v2.s[1] -add v9.4s, v9.4s, v14.4s -mla v29.4S, v23.4S, v31.s[0] -ldr q23, [x0, #416] -sqrdmulh v14.4S, v26.4S, v2.s[2] -sub v24.4s, v8.4s, v11.4s -mla v5.4S, v17.4S, v31.s[0] -ldr q17, [x0, #288] -sqrdmulh v6.4S, v9.4S, v30.s[1] -add v8.4s, v8.4s, v11.4s -str q24, [x0, #16] -mla v12.4S, v7.4S, v31.s[0] -ldr q7, [x17, #+256] -ldr q24, [x17, #+272] -sqrdmulh v11.4S, v20.4S, v30.s[2] -sub v19.4s, v28.4s, v29.4s -str q8, [x0, #0] -mul v18.4S, v18.4S,v4.s[1] -add v28.4s, v28.4s, v29.4s -mul v26.4S, v26.4S,v4.s[2] -str q19, [x0, #48] -mla v18.4S, v15.4S, v31.s[0] -sub v15.4s, v21.4s, v5.4s -mla v26.4S, v14.4S, v31.s[0] -str q28, [x0, #32] -mul v9.4S, v9.4S,v13.s[1] -str q15, [x0, #80] -mul v20.4S, v20.4S,v13.s[2] -add v21.4s, v21.4s, v5.4s -str q21, [x0, #64] -mla v9.4S, v6.4S, v31.s[0] -sub v6.4s, v1.4s, v12.4s -str q6, [x0, #112] -mla v20.4S, v11.4S, v31.s[0] -add v1.4s, v1.4s, v12.4s -str q1, [x0, #96] -sqrdmulh v30.4S, v17.4S, v24.s[0] -sub v13.4s, v27.4s, v18.4s -mul v17.4S, v17.4S,v7.s[0] -str q13, [x0, #144] -ldr q13, [x0, #304] -sqrdmulh v1.4S, v13.4S, v24.s[0] -add v27.4s, v27.4s, v18.4s -mul v13.4S, v13.4S,v7.s[0] -str q27, [x0, #128] -ldr q27, [x17, #+288] -ldr q18, [x17, #+304] -ldr q12, [x0, #352] -sqrdmulh v11.4S, v12.4S, v18.s[0] -sub v6.4s, v0.4s, v26.4s -mul v12.4S, v12.4S,v27.s[0] -str q6, [x0, #176] -ldr q6, [x0, #368] -sqrdmulh v21.4S, v6.4S, v18.s[0] -add v0.4s, v0.4s, v26.4s -mul v6.4S, v6.4S,v27.s[0] -str q0, [x0, #160] -ldr q0, [x17, #+320] -ldr q26, [x17, #+336] -mla v17.4S, v30.4S, v31.s[0] -sub v30.4s, v16.4s, v9.4s -sqrdmulh v5.4S, v23.4S, v26.s[0] -str q30, [x0, #208] -ldr q30, [x0, #432] -mla v13.4S, v1.4S, v31.s[0] -add v16.4s, v16.4s, v9.4s -sqrdmulh v9.4S, v30.4S, v26.s[0] -str q16, [x0, #192] -ldr q16, [x17, #+352] -ldr q1, [x17, #+368] -mla v12.4S, v11.4S, v31.s[0] -sub v11.4s, v10.4s, v20.4s -sqrdmulh v15.4S, v22.4S, v1.s[0] -str q11, [x0, #240] -ldr q11, [x0, #496] -mla v6.4S, v21.4S, v31.s[0] -add v10.4s, v10.4s, v20.4s -sqrdmulh v20.4S, v11.4S, v1.s[0] -str q10, [x0, #224] -ldr q10, [x0, #256] -ldr q21, [x0, #384] -mul v23.4S, v23.4S,v0.s[0] -sub v2.4s, v10.4s, v17.4s -ldr q4, [x0, #272] -mul v30.4S, v30.4S,v0.s[0] -add v10.4s, v10.4s, v17.4s -ldr q17, [x0, #400] -mla v23.4S, v5.4S, v31.s[0] -sub v5.4s, v4.4s, v13.4s -ldr q28, [x0, #320] -mla v30.4S, v9.4S, v31.s[0] -add v4.4s, v4.4s, v13.4s -ldr q13, [x0, #448] -mul v22.4S, v22.4S,v16.s[0] -sub v9.4s, v28.4s, v12.4s -ldr q14, [x0, #336] -mul v11.4S, v11.4S,v16.s[0] -add v28.4s, v28.4s, v12.4s -ldr q12, [x0, #464] -mla v22.4S, v15.4S, v31.s[0] -mla v11.4S, v20.4S, v31.s[0] -sub v20.4s, v14.4s, v6.4s -sqrdmulh v15.4S, v4.4S, v24.s[1] -add v14.4s, v14.4s, v6.4s -mul v4.4S, v4.4S,v7.s[1] -sqrdmulh v6.4S, v5.4S, v24.s[2] -sub v19.4s, v21.4s, v23.4s -mul v5.4S, v5.4S,v7.s[2] -add v21.4s, v21.4s, v23.4s -sqrdmulh v24.4S, v14.4S, v18.s[1] -sub v7.4s, v17.4s, v30.4s -mul v14.4S, v14.4S,v27.s[1] -add v17.4s, v17.4s, v30.4s -sqrdmulh v30.4S, v20.4S, v18.s[2] -sub v23.4s, v13.4s, v22.4s -mul v20.4S, v20.4S,v27.s[2] -add v13.4s, v13.4s, v22.4s -mla v4.4S, v15.4S, v31.s[0] -sub v15.4s, v12.4s, v11.4s -ldr q18, [x0, #736] -sqrdmulh v27.4S, v17.4S, v26.s[1] -add v12.4s, v12.4s, v11.4s -mla v5.4S, v6.4S, v31.s[0] -ldr q6, [x0, #672] -sqrdmulh v11.4S, v7.4S, v26.s[2] -sub v22.4s, v10.4s, v4.4s -mla v14.4S, v24.4S, v31.s[0] -ldr q24, [x0, #544] -sqrdmulh v29.4S, v12.4S, v1.s[1] -add v10.4s, v10.4s, v4.4s -str q22, [x0, #272] -mla v20.4S, v30.4S, v31.s[0] -ldr q30, [x17, #+384] -ldr q22, [x17, #+400] -sqrdmulh v4.4S, v15.4S, v1.s[2] -sub v8.4s, v2.4s, v5.4s -str q10, [x0, #256] -mul v17.4S, v17.4S,v0.s[1] -add v2.4s, v2.4s, v5.4s -mul v7.4S, v7.4S,v0.s[2] -str q8, [x0, #304] -mla v17.4S, v27.4S, v31.s[0] -sub v27.4s, v28.4s, v14.4s -mla v7.4S, v11.4S, v31.s[0] -str q2, [x0, #288] -mul v12.4S, v12.4S,v16.s[1] -str q27, [x0, #336] -mul v15.4S, v15.4S,v16.s[2] -add v28.4s, v28.4s, v14.4s -str q28, [x0, #320] -mla v12.4S, v29.4S, v31.s[0] -sub v29.4s, v9.4s, v20.4s -str q29, [x0, #368] -mla v15.4S, v4.4S, v31.s[0] -add v9.4s, v9.4s, v20.4s -str q9, [x0, #352] -sqrdmulh v1.4S, v24.4S, v22.s[0] -sub v16.4s, v21.4s, v17.4s -mul v24.4S, v24.4S,v30.s[0] -str q16, [x0, #400] -ldr q16, [x0, #560] -sqrdmulh v9.4S, v16.4S, v22.s[0] -add v21.4s, v21.4s, v17.4s -mul v16.4S, v16.4S,v30.s[0] -str q21, [x0, #384] -ldr q21, [x17, #+416] -ldr q17, [x17, #+432] -ldr q20, [x0, #608] -sqrdmulh v4.4S, v20.4S, v17.s[0] -sub v29.4s, v19.4s, v7.4s -mul v20.4S, v20.4S,v21.s[0] -str q29, [x0, #432] -ldr q29, [x0, #624] -sqrdmulh v28.4S, v29.4S, v17.s[0] -add v19.4s, v19.4s, v7.4s -mul v29.4S, v29.4S,v21.s[0] -str q19, [x0, #416] -ldr q19, [x17, #+448] -ldr q7, [x17, #+464] -mla v24.4S, v1.4S, v31.s[0] -sub v1.4s, v13.4s, v12.4s -sqrdmulh v14.4S, v6.4S, v7.s[0] -str q1, [x0, #464] -ldr q1, [x0, #688] -mla v16.4S, v9.4S, v31.s[0] -add v13.4s, v13.4s, v12.4s -sqrdmulh v12.4S, v1.4S, v7.s[0] -str q13, [x0, #448] -ldr q13, [x17, #+480] -ldr q9, [x17, #+496] -mla v20.4S, v4.4S, v31.s[0] -sub v4.4s, v23.4s, v15.4s -sqrdmulh v27.4S, v18.4S, v9.s[0] -str q4, [x0, #496] -ldr q4, [x0, #752] -mla v29.4S, v28.4S, v31.s[0] -add v23.4s, v23.4s, v15.4s -sqrdmulh v15.4S, v4.4S, v9.s[0] -str q23, [x0, #480] -ldr q23, [x0, #512] -ldr q28, [x0, #640] -mul v6.4S, v6.4S,v19.s[0] -sub v26.4s, v23.4s, v24.4s -ldr q0, [x0, #528] -mul v1.4S, v1.4S,v19.s[0] -add v23.4s, v23.4s, v24.4s -ldr q24, [x0, #656] -mla v6.4S, v14.4S, v31.s[0] -sub v14.4s, v0.4s, v16.4s -ldr q2, [x0, #576] -mla v1.4S, v12.4S, v31.s[0] -add v0.4s, v0.4s, v16.4s -ldr q16, [x0, #704] -mul v18.4S, v18.4S,v13.s[0] -sub v12.4s, v2.4s, v20.4s -ldr q11, [x0, #592] -mul v4.4S, v4.4S,v13.s[0] -add v2.4s, v2.4s, v20.4s -ldr q20, [x0, #720] -mla v18.4S, v27.4S, v31.s[0] -mla v4.4S, v15.4S, v31.s[0] -sub v15.4s, v11.4s, v29.4s -sqrdmulh v27.4S, v0.4S, v22.s[1] -add v11.4s, v11.4s, v29.4s -mul v0.4S, v0.4S,v30.s[1] -sqrdmulh v29.4S, v14.4S, v22.s[2] -sub v8.4s, v28.4s, v6.4s -mul v14.4S, v14.4S,v30.s[2] -add v28.4s, v28.4s, v6.4s -sqrdmulh v22.4S, v11.4S, v17.s[1] -sub v30.4s, v24.4s, v1.4s -mul v11.4S, v11.4S,v21.s[1] -add v24.4s, v24.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v17.s[2] -sub v6.4s, v16.4s, v18.4s -mul v15.4S, v15.4S,v21.s[2] -add v16.4s, v16.4s, v18.4s -mla v0.4S, v27.4S, v31.s[0] -sub v27.4s, v20.4s, v4.4s -ldr q17, [x0, #992] -sqrdmulh v21.4S, v24.4S, v7.s[1] -add v20.4s, v20.4s, v4.4s -mla v14.4S, v29.4S, v31.s[0] -ldr q29, [x0, #928] -sqrdmulh v4.4S, v30.4S, v7.s[2] -sub v18.4s, v23.4s, v0.4s -mla v11.4S, v22.4S, v31.s[0] -ldr q22, [x0, #800] -sqrdmulh v5.4S, v20.4S, v9.s[1] -add v23.4s, v23.4s, v0.4s -str q18, [x0, #528] -mla v15.4S, v1.4S, v31.s[0] -ldr q1, [x17, #+512] -ldr q18, [x17, #+528] -sqrdmulh v0.4S, v27.4S, v9.s[2] -sub v10.4s, v26.4s, v14.4s -str q23, [x0, #512] -mul v24.4S, v24.4S,v19.s[1] -add v26.4s, v26.4s, v14.4s -mul v30.4S, v30.4S,v19.s[2] -str q10, [x0, #560] -mla v24.4S, v21.4S, v31.s[0] -sub v21.4s, v2.4s, v11.4s -mla v30.4S, v4.4S, v31.s[0] -str q26, [x0, #544] -mul v20.4S, v20.4S,v13.s[1] -str q21, [x0, #592] -mul v27.4S, v27.4S,v13.s[2] -add v2.4s, v2.4s, v11.4s -str q2, [x0, #576] -mla v20.4S, v5.4S, v31.s[0] -sub v5.4s, v12.4s, v15.4s -str q5, [x0, #624] -mla v27.4S, v0.4S, v31.s[0] -add v12.4s, v12.4s, v15.4s -str q12, [x0, #608] -sqrdmulh v9.4S, v22.4S, v18.s[0] -sub v13.4s, v28.4s, v24.4s -mul v22.4S, v22.4S,v1.s[0] -str q13, [x0, #656] -ldr q13, [x0, #816] -sqrdmulh v12.4S, v13.4S, v18.s[0] -add v28.4s, v28.4s, v24.4s -mul v13.4S, v13.4S,v1.s[0] -str q28, [x0, #640] -ldr q28, [x17, #+544] -ldr q24, [x17, #+560] -ldr q15, [x0, #864] -sqrdmulh v0.4S, v15.4S, v24.s[0] -sub v5.4s, v8.4s, v30.4s -mul v15.4S, v15.4S,v28.s[0] -str q5, [x0, #688] -ldr q5, [x0, #880] -sqrdmulh v2.4S, v5.4S, v24.s[0] -add v8.4s, v8.4s, v30.4s -mul v5.4S, v5.4S,v28.s[0] -str q8, [x0, #672] -ldr q8, [x17, #+576] -ldr q30, [x17, #+592] -mla v22.4S, v9.4S, v31.s[0] -sub v9.4s, v16.4s, v20.4s -sqrdmulh v11.4S, v29.4S, v30.s[0] -str q9, [x0, #720] -ldr q9, [x0, #944] -mla v13.4S, v12.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v9.4S, v30.s[0] -str q16, [x0, #704] -ldr q16, [x17, #+608] -ldr q12, [x17, #+624] -mla v15.4S, v0.4S, v31.s[0] -sub v0.4s, v6.4s, v27.4s -sqrdmulh v21.4S, v17.4S, v12.s[0] -str q0, [x0, #752] -ldr q0, [x0, #1008] -mla v5.4S, v2.4S, v31.s[0] -add v6.4s, v6.4s, v27.4s -sqrdmulh v27.4S, v0.4S, v12.s[0] -str q6, [x0, #736] -ldr q6, [x0, #768] -ldr q2, [x0, #896] -mul v29.4S, v29.4S,v8.s[0] -sub v7.4s, v6.4s, v22.4s -ldr q19, [x0, #784] -mul v9.4S, v9.4S,v8.s[0] -add v6.4s, v6.4s, v22.4s -ldr q22, [x0, #912] -mla v29.4S, v11.4S, v31.s[0] -sub v11.4s, v19.4s, v13.4s -ldr q26, [x0, #832] -mla v9.4S, v20.4S, v31.s[0] -add v19.4s, v19.4s, v13.4s -ldr q13, [x0, #960] -mul v17.4S, v17.4S,v16.s[0] -sub v20.4s, v26.4s, v15.4s -ldr q4, [x0, #848] -mul v0.4S, v0.4S,v16.s[0] -add v26.4s, v26.4s, v15.4s -ldr q15, [x0, #976] -mla v17.4S, v21.4S, v31.s[0] -mla v0.4S, v27.4S, v31.s[0] -sub v27.4s, v4.4s, v5.4s -sqrdmulh v21.4S, v19.4S, v18.s[1] -add v4.4s, v4.4s, v5.4s -mul v19.4S, v19.4S,v1.s[1] -sqrdmulh v5.4S, v11.4S, v18.s[2] -sub v10.4s, v2.4s, v29.4s -mul v11.4S, v11.4S,v1.s[2] -add v2.4s, v2.4s, v29.4s -sqrdmulh v18.4S, v4.4S, v24.s[1] -sub v1.4s, v22.4s, v9.4s -mul v4.4S, v4.4S,v28.s[1] -add v22.4s, v22.4s, v9.4s -sqrdmulh v9.4S, v27.4S, v24.s[2] -sub v29.4s, v13.4s, v17.4s -mul v27.4S, v27.4S,v28.s[2] -add v13.4s, v13.4s, v17.4s -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v0.4s -sqrdmulh v24.4S, v22.4S, v30.s[1] -add v15.4s, v15.4s, v0.4s -mla v11.4S, v5.4S, v31.s[0] -sqrdmulh v5.4S, v1.4S, v30.s[2] -sub v0.4s, v6.4s, v19.4s -mla v4.4S, v18.4S, v31.s[0] -sqrdmulh v18.4S, v15.4S, v12.s[1] -add v6.4s, v6.4s, v19.4s -str q0, [x0, #784] -mla v27.4S, v9.4S, v31.s[0] -sqrdmulh v9.4S, v21.4S, v12.s[2] -sub v0.4s, v7.4s, v11.4s -str q6, [x0, #768] -mul v22.4S, v22.4S,v8.s[1] -add v7.4s, v7.4s, v11.4s -mul v1.4S, v1.4S,v8.s[2] -str q0, [x0, #816] -mla v22.4S, v24.4S, v31.s[0] -sub v24.4s, v26.4s, v4.4s -mla v1.4S, v5.4S, v31.s[0] -str q7, [x0, #800] -mul v15.4S, v15.4S,v16.s[1] -str q24, [x0, #848] -mul v21.4S, v21.4S,v16.s[2] -add v26.4s, v26.4s, v4.4s -str q26, [x0, #832] -mla v15.4S, v18.4S, v31.s[0] -sub v18.4s, v20.4s, v27.4s -str q18, [x0, #880] -mla v21.4S, v9.4S, v31.s[0] -add v20.4s, v20.4s, v27.4s -str q20, [x0, #864] -sub v12.4s, v2.4s, v22.4s -str q12, [x0, #912] -add v2.4s, v2.4s, v22.4s -str q2, [x0, #896] -sub v2.4s, v10.4s, v1.4s -str q2, [x0, #944] -add v10.4s, v10.4s, v1.4s -str q10, [x0, #928] -sub v10.4s, v13.4s, v15.4s -str q10, [x0, #976] -add v13.4s, v13.4s, v15.4s -str q13, [x0, #960] -sub v13.4s, v29.4s, v21.4s -str q13, [x0, #1008] -add v29.4s, v29.4s, v21.4s -str q29, [x0, #992] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1528 -// Instruction count: 1524 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_10.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_10.s deleted file mode 100644 index 9f6d143..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_10.s +++ /dev/null @@ -1,1550 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_22_z4_10 -.global _ntt_u32_incomplete_neon_asm_var_4_2_22_z4_10 -ntt_u32_incomplete_neon_asm_var_4_2_22_z4_10: -_ntt_u32_incomplete_neon_asm_var_4_2_22_z4_10: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x0, #992] -sqrdmulh v27.4S, v28.4S, v29.s[0] -mul v28.4S, v28.4S,v30.s[0] -ldr q26, [x0, #928] -sqrdmulh v25.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -ldr q24, [x0, #864] -sqrdmulh v23.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -ldr q22, [x0, #800] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #736] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mla v28.4S, v27.4S, v31.s[0] -ldr q27, [x0, #672] -sqrdmulh v18.4S, v27.4S, v29.s[0] -mla v26.4S, v25.4S, v31.s[0] -ldr q25, [x0, #608] -sqrdmulh v17.4S, v25.4S, v29.s[0] -mla v24.4S, v23.4S, v31.s[0] -ldr q23, [x0, #544] -sqrdmulh v16.4S, v23.4S, v29.s[0] -mla v22.4S, v21.4S, v31.s[0] -ldr q21, [x0, #480] -mul v27.4S, v27.4S,v30.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q3, [x0, #416] -ldr q2, [x0, #352] -ldr q1, [x0, #288] -mla v27.4S, v18.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -ldr q19, [x0, #224] -ldr q18, [x0, #160] -mul v23.4S, v23.4S,v30.s[0] -mul v25.4S, v25.4S,v30.s[0] -ldr q0, [x0, #96] -ldr q15, [x0, #32] -mla v23.4S, v16.4S, v31.s[0] -mla v25.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v28.4s -add v21.4s, v21.4s, v28.4s -sqrdmulh v28.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v16.4s, v3.4s, v26.4s -add v3.4s, v3.4s, v26.4s -sqrdmulh v26.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -sub v14.4s, v2.4s, v24.4s -add v2.4s, v2.4s, v24.4s -sqrdmulh v24.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v13.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v12.4s, v19.4s, v20.4s -add v19.4s, v19.4s, v20.4s -sqrdmulh v20.4S, v14.4S, v29.s[2] -mla v17.4S, v28.4S, v31.s[0] -sub v28.4s, v18.4s, v27.4s -add v18.4s, v18.4s, v27.4s -sqrdmulh v27.4S, v13.4S, v29.s[2] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v0.4s, v25.4s -add v0.4s, v0.4s, v25.4s -sqrdmulh v25.4S, v2.4S, v29.s[1] -mla v21.4S, v24.4S, v31.s[0] -sub v24.4s, v15.4s, v23.4s -sqrdmulh v11.4S, v1.4S, v29.s[1] -mla v3.4S, v22.4S, v31.s[0] -add v15.4s, v15.4s, v23.4s -ldr q23, [x17, #+32] -ldr q22, [x17, #+48] -mul v13.4S, v13.4S,v30.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v10.4s, v12.4s, v17.4s -add v12.4s, v12.4s, v17.4s -mla v13.4S, v27.4S, v31.s[0] -mla v14.4S, v20.4S, v31.s[0] -sub v20.4s, v28.4s, v16.4s -add v28.4s, v28.4s, v16.4s -mul v1.4S, v1.4S,v30.s[1] -mul v2.4S, v2.4S,v30.s[1] -sub v16.4s, v19.4s, v21.4s -add v19.4s, v19.4s, v21.4s -mla v1.4S, v11.4S, v31.s[0] -mla v2.4S, v25.4S, v31.s[0] -sub v25.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v10.4S, v22.s[3] -mul v10.4S, v10.4S,v23.s[3] -sub v11.4s, v26.4s, v14.4s -add v26.4s, v26.4s, v14.4s -sqrdmulh v14.4S, v12.4S, v22.s[2] -mul v12.4S, v12.4S,v23.s[2] -sub v21.4s, v24.4s, v13.4s -add v24.4s, v24.4s, v13.4s -sqrdmulh v13.4S, v16.4S, v22.s[1] -mul v16.4S, v16.4S,v23.s[1] -sub v27.4s, v0.4s, v2.4s -add v0.4s, v0.4s, v2.4s -sqrdmulh v2.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v17.4s, v15.4s, v1.4s -add v15.4s, v15.4s, v1.4s -ldr q1, [x17, #+96] -ldr q9, [x17, #+112] -sqrdmulh v8.4S, v20.4S, v22.s[3] -mla v10.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v28.4S, v22.s[2] -mla v12.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v25.4S, v22.s[1] -mla v16.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v18.4S, v22.s[0] -mla v19.4S, v2.4S, v31.s[0] -nop -nop -ldr q2, [x17, #+64] -ldr q7, [x17, #+80] -mul v28.4S, v28.4S,v23.s[2] -mul v20.4S, v20.4S,v23.s[3] -sub v6.4s, v11.4s, v10.4s -add v11.4s, v11.4s, v10.4s -mla v28.4S, v3.4S, v31.s[0] -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v26.4s, v12.4s -add v26.4s, v26.4s, v12.4s -mul v18.4S, v18.4S,v23.s[0] -mul v25.4S, v25.4S,v23.s[1] -sub v12.4s, v27.4s, v16.4s -add v27.4s, v27.4s, v16.4s -mla v18.4S, v13.4S, v31.s[0] -mla v25.4S, v14.4S, v31.s[0] -sub v14.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v9.s[3] -mul v6.4S, v6.4S,v1.s[3] -sub v13.4s, v21.4s, v20.4s -add v21.4s, v21.4s, v20.4s -sqrdmulh v20.4S, v11.4S, v9.s[2] -mul v11.4S, v11.4S,v1.s[2] -sub v16.4s, v24.4s, v28.4s -add v24.4s, v24.4s, v28.4s -sqrdmulh v28.4S, v8.4S, v9.s[1] -mul v8.4S, v8.4S,v1.s[1] -sub v3.4s, v17.4s, v25.4s -add v17.4s, v17.4s, v25.4s -sqrdmulh v25.4S, v26.4S, v9.s[0] -mul v26.4S, v26.4S,v1.s[0] -sub v10.4s, v15.4s, v18.4s -add v15.4s, v15.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v7.s[3] -mla v6.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v27.4S, v7.s[2] -mla v11.4S, v20.4S, v31.s[0] -nop -nop -sqrdmulh v20.4S, v14.4S, v7.s[1] -mla v8.4S, v28.4S, v31.s[0] -nop -nop -sqrdmulh v28.4S, v0.4S, v7.s[0] -mla v26.4S, v25.4S, v31.s[0] -nop -nop -mul v27.4S, v27.4S,v2.s[2] -mul v12.4S, v12.4S,v2.s[3] -sub v25.4s, v13.4s, v6.4s -str q25, [x0, #992] -mla v27.4S, v19.4S, v31.s[0] -mla v12.4S, v18.4S, v31.s[0] -add v13.4s, v13.4s, v6.4s -str q13, [x0, #928] -mul v0.4S, v0.4S,v2.s[0] -mul v14.4S, v14.4S,v2.s[1] -sub v13.4s, v21.4s, v11.4s -str q13, [x0, #864] -mla v0.4S, v28.4S, v31.s[0] -mla v14.4S, v20.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -sub v11.4s, v16.4s, v8.4s -ldr q20, [x0, #1008] -sqrdmulh v28.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v16.4s, v16.4s, v8.4s -str q21, [x0, #800] -ldr q21, [x0, #944] -sqrdmulh v8.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -sub v13.4s, v24.4s, v26.4s -str q11, [x0, #736] -ldr q11, [x0, #880] -sqrdmulh v6.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -add v24.4s, v24.4s, v26.4s -str q16, [x0, #672] -ldr q16, [x0, #816] -sqrdmulh v26.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v18.4s, v3.4s, v12.4s -str q13, [x0, #608] -ldr q13, [x0, #752] -sqrdmulh v19.4S, v13.4S, v29.s[0] -mla v20.4S, v28.4S, v31.s[0] -add v3.4s, v3.4s, v12.4s -str q24, [x0, #544] -ldr q24, [x0, #688] -sqrdmulh v12.4S, v24.4S, v29.s[0] -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v17.4s, v27.4s -str q18, [x0, #480] -ldr q18, [x0, #624] -sqrdmulh v28.4S, v18.4S, v29.s[0] -mla v11.4S, v6.4S, v31.s[0] -add v17.4s, v17.4s, v27.4s -str q3, [x0, #416] -ldr q3, [x0, #560] -sqrdmulh v27.4S, v3.4S, v29.s[0] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v10.4s, v14.4s -str q8, [x0, #352] -ldr q8, [x0, #496] -add v10.4s, v10.4s, v14.4s -mul v24.4S, v24.4S,v30.s[0] -mul v13.4S, v13.4S,v30.s[0] -ldr q14, [x0, #432] -str q17, [x0, #288] -ldr q17, [x0, #368] -ldr q6, [x0, #304] -mla v24.4S, v12.4S, v31.s[0] -mla v13.4S, v19.4S, v31.s[0] -str q26, [x0, #224] -sub v26.4s, v15.4s, v0.4s -ldr q19, [x0, #240] -ldr q12, [x0, #176] -mul v3.4S, v3.4S,v30.s[0] -mul v18.4S, v18.4S,v30.s[0] -str q10, [x0, #160] -add v15.4s, v15.4s, v0.4s -ldr q0, [x0, #112] -ldr q10, [x0, #48] -mla v3.4S, v27.4S, v31.s[0] -mla v18.4S, v28.4S, v31.s[0] -sub v28.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -sqrdmulh v20.4S, v28.4S, v29.s[2] -mul v28.4S, v28.4S,v30.s[2] -sub v27.4s, v14.4s, v21.4s -add v14.4s, v14.4s, v21.4s -sqrdmulh v21.4S, v27.4S, v29.s[2] -mul v27.4S, v27.4S,v30.s[2] -sub v25.4s, v17.4s, v11.4s -add v17.4s, v17.4s, v11.4s -sqrdmulh v11.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sub v5.4s, v6.4s, v16.4s -add v6.4s, v6.4s, v16.4s -sqrdmulh v16.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -sub v4.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -sqrdmulh v13.4S, v25.4S, v29.s[2] -mla v28.4S, v20.4S, v31.s[0] -sub v20.4s, v12.4s, v24.4s -add v12.4s, v12.4s, v24.4s -sqrdmulh v24.4S, v5.4S, v29.s[2] -mla v27.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v18.4s -add v0.4s, v0.4s, v18.4s -sqrdmulh v18.4S, v17.4S, v29.s[1] -mla v8.4S, v11.4S, v31.s[0] -sub v11.4s, v10.4s, v3.4s -str q26, [x0, #96] -sqrdmulh v26.4S, v6.4S, v29.s[1] -mla v14.4S, v16.4S, v31.s[0] -add v10.4s, v10.4s, v3.4s -str q15, [x0, #32] -mul v5.4S, v5.4S,v30.s[2] -mul v25.4S, v25.4S,v30.s[2] -sub v15.4s, v4.4s, v28.4s -add v4.4s, v4.4s, v28.4s -mla v5.4S, v24.4S, v31.s[0] -mla v25.4S, v13.4S, v31.s[0] -sub v13.4s, v20.4s, v27.4s -add v20.4s, v20.4s, v27.4s -mul v6.4S, v6.4S,v30.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v27.4s, v19.4s, v8.4s -add v19.4s, v19.4s, v8.4s -mla v6.4S, v26.4S, v31.s[0] -mla v17.4S, v18.4S, v31.s[0] -sub v18.4s, v12.4s, v14.4s -add v12.4s, v12.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v22.s[3] -mul v15.4S, v15.4S,v23.s[3] -sub v26.4s, v21.4s, v25.4s -add v21.4s, v21.4s, v25.4s -sqrdmulh v25.4S, v4.4S, v22.s[2] -mul v4.4S, v4.4S,v23.s[2] -sub v8.4s, v11.4s, v5.4s -add v11.4s, v11.4s, v5.4s -sqrdmulh v5.4S, v27.4S, v22.s[1] -mul v27.4S, v27.4S,v23.s[1] -sub v24.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v28.4s, v10.4s, v6.4s -add v10.4s, v10.4s, v6.4s -sqrdmulh v6.4S, v13.4S, v22.s[3] -mla v15.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v20.4S, v22.s[2] -mla v4.4S, v25.4S, v31.s[0] -nop -nop -sqrdmulh v25.4S, v18.4S, v22.s[1] -mla v27.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v12.4S, v22.s[0] -mla v19.4S, v17.4S, v31.s[0] -nop -nop -mul v20.4S, v20.4S,v23.s[2] -mul v13.4S, v13.4S,v23.s[3] -sub v17.4s, v26.4s, v15.4s -add v26.4s, v26.4s, v15.4s -mla v20.4S, v14.4S, v31.s[0] -mla v13.4S, v6.4S, v31.s[0] -sub v6.4s, v21.4s, v4.4s -add v21.4s, v21.4s, v4.4s -mul v12.4S, v12.4S,v23.s[0] -mul v18.4S, v18.4S,v23.s[1] -sub v4.4s, v24.4s, v27.4s -add v24.4s, v24.4s, v27.4s -mla v12.4S, v5.4S, v31.s[0] -mla v18.4S, v25.4S, v31.s[0] -sub v25.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v17.4S, v9.s[3] -mul v17.4S, v17.4S,v1.s[3] -sub v5.4s, v8.4s, v13.4s -add v8.4s, v8.4s, v13.4s -sqrdmulh v13.4S, v26.4S, v9.s[2] -mul v26.4S, v26.4S,v1.s[2] -sub v27.4s, v11.4s, v20.4s -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v6.4S, v9.s[1] -mul v6.4S, v6.4S,v1.s[1] -sub v14.4s, v28.4s, v18.4s -add v28.4s, v28.4s, v18.4s -sqrdmulh v18.4S, v21.4S, v9.s[0] -mul v21.4S, v21.4S,v1.s[0] -sub v15.4s, v10.4s, v12.4s -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v4.4S, v7.s[3] -mla v17.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v24.4S, v7.s[2] -mla v26.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v25.4S, v7.s[1] -mla v6.4S, v20.4S, v31.s[0] -nop -nop -sqrdmulh v20.4S, v0.4S, v7.s[0] -mla v21.4S, v18.4S, v31.s[0] -nop -nop -mul v24.4S, v24.4S,v2.s[2] -mul v4.4S, v4.4S,v2.s[3] -sub v18.4s, v5.4s, v17.4s -str q18, [x0, #1008] -mla v24.4S, v19.4S, v31.s[0] -mla v4.4S, v12.4S, v31.s[0] -add v5.4s, v5.4s, v17.4s -str q5, [x0, #944] -mul v0.4S, v0.4S,v2.s[0] -mul v25.4S, v25.4S,v2.s[1] -sub v5.4s, v8.4s, v26.4s -str q5, [x0, #880] -mla v0.4S, v20.4S, v31.s[0] -mla v25.4S, v13.4S, v31.s[0] -add v8.4s, v8.4s, v26.4s -sub v26.4s, v27.4s, v6.4s -ldr q13, [x0, #960] -sqrdmulh v20.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -add v27.4s, v27.4s, v6.4s -str q8, [x0, #816] -ldr q8, [x0, #896] -sqrdmulh v6.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v5.4s, v11.4s, v21.4s -str q26, [x0, #752] -ldr q26, [x0, #832] -sqrdmulh v17.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -add v11.4s, v11.4s, v21.4s -str q27, [x0, #688] -ldr q27, [x0, #768] -sqrdmulh v21.4S, v27.4S, v29.s[0] -mul v27.4S, v27.4S,v30.s[0] -sub v12.4s, v14.4s, v4.4s -str q5, [x0, #624] -ldr q5, [x0, #704] -sqrdmulh v19.4S, v5.4S, v29.s[0] -mla v13.4S, v20.4S, v31.s[0] -add v14.4s, v14.4s, v4.4s -str q11, [x0, #560] -ldr q11, [x0, #640] -sqrdmulh v4.4S, v11.4S, v29.s[0] -mla v8.4S, v6.4S, v31.s[0] -sub v6.4s, v28.4s, v24.4s -str q12, [x0, #496] -ldr q12, [x0, #576] -sqrdmulh v20.4S, v12.4S, v29.s[0] -mla v26.4S, v17.4S, v31.s[0] -add v28.4s, v28.4s, v24.4s -str q14, [x0, #432] -ldr q14, [x0, #512] -sqrdmulh v24.4S, v14.4S, v29.s[0] -mla v27.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v25.4s -str q6, [x0, #368] -ldr q6, [x0, #448] -add v15.4s, v15.4s, v25.4s -mul v11.4S, v11.4S,v30.s[0] -mul v5.4S, v5.4S,v30.s[0] -ldr q25, [x0, #384] -str q28, [x0, #304] -ldr q28, [x0, #320] -ldr q17, [x0, #256] -mla v11.4S, v4.4S, v31.s[0] -mla v5.4S, v19.4S, v31.s[0] -str q21, [x0, #240] -sub v21.4s, v10.4s, v0.4s -ldr q19, [x0, #192] -ldr q4, [x0, #128] -mul v14.4S, v14.4S,v30.s[0] -mul v12.4S, v12.4S,v30.s[0] -str q15, [x0, #176] -add v10.4s, v10.4s, v0.4s -ldr q0, [x0, #64] -ldr q15, [x0, #0] -mla v14.4S, v24.4S, v31.s[0] -mla v12.4S, v20.4S, v31.s[0] -sub v20.4s, v6.4s, v13.4s -add v6.4s, v6.4s, v13.4s -sqrdmulh v13.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v24.4s, v25.4s, v8.4s -add v25.4s, v25.4s, v8.4s -sqrdmulh v8.4S, v24.4S, v29.s[2] -mul v24.4S, v24.4S,v30.s[2] -sub v18.4s, v28.4s, v26.4s -add v28.4s, v28.4s, v26.4s -sqrdmulh v26.4S, v6.4S, v29.s[1] -mul v6.4S, v6.4S,v30.s[1] -sub v3.4s, v17.4s, v27.4s -add v17.4s, v17.4s, v27.4s -sqrdmulh v27.4S, v25.4S, v29.s[1] -mul v25.4S, v25.4S,v30.s[1] -sub v16.4s, v19.4s, v5.4s -add v19.4s, v19.4s, v5.4s -sqrdmulh v5.4S, v18.4S, v29.s[2] -mla v20.4S, v13.4S, v31.s[0] -sub v13.4s, v4.4s, v11.4s -add v4.4s, v4.4s, v11.4s -sqrdmulh v11.4S, v3.4S, v29.s[2] -mla v24.4S, v8.4S, v31.s[0] -sub v8.4s, v0.4s, v12.4s -add v0.4s, v0.4s, v12.4s -sqrdmulh v12.4S, v28.4S, v29.s[1] -mla v6.4S, v26.4S, v31.s[0] -sub v26.4s, v15.4s, v14.4s -str q21, [x0, #112] -sqrdmulh v21.4S, v17.4S, v29.s[1] -mla v25.4S, v27.4S, v31.s[0] -add v15.4s, v15.4s, v14.4s -str q10, [x0, #48] -mul v3.4S, v3.4S,v30.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v10.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -mla v3.4S, v11.4S, v31.s[0] -mla v18.4S, v5.4S, v31.s[0] -sub v5.4s, v13.4s, v24.4s -add v13.4s, v13.4s, v24.4s -mul v17.4S, v17.4S,v30.s[1] -mul v28.4S, v28.4S,v30.s[1] -sub v24.4s, v19.4s, v6.4s -add v19.4s, v19.4s, v6.4s -mla v17.4S, v21.4S, v31.s[0] -mla v28.4S, v12.4S, v31.s[0] -sub v12.4s, v4.4s, v25.4s -add v4.4s, v4.4s, v25.4s -sqrdmulh v25.4S, v10.4S, v22.s[3] -mul v10.4S, v10.4S,v23.s[3] -sub v21.4s, v8.4s, v18.4s -add v8.4s, v8.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v22.s[2] -mul v16.4S, v16.4S,v23.s[2] -sub v6.4s, v26.4s, v3.4s -add v26.4s, v26.4s, v3.4s -sqrdmulh v3.4S, v24.4S, v22.s[1] -mul v24.4S, v24.4S,v23.s[1] -sub v11.4s, v0.4s, v28.4s -add v0.4s, v0.4s, v28.4s -sqrdmulh v28.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v20.4s, v15.4s, v17.4s -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v5.4S, v22.s[3] -mla v10.4S, v25.4S, v31.s[0] -nop -nop -sqrdmulh v25.4S, v13.4S, v22.s[2] -mla v16.4S, v18.4S, v31.s[0] -nop -nop -sqrdmulh v18.4S, v12.4S, v22.s[1] -mla v24.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v4.4S, v22.s[0] -mla v19.4S, v28.4S, v31.s[0] -nop -nop -mul v13.4S, v13.4S,v23.s[2] -mul v5.4S, v5.4S,v23.s[3] -sub v28.4s, v21.4s, v10.4s -add v21.4s, v21.4s, v10.4s -mla v13.4S, v25.4S, v31.s[0] -mla v5.4S, v17.4S, v31.s[0] -sub v17.4s, v8.4s, v16.4s -add v8.4s, v8.4s, v16.4s -mul v4.4S, v4.4S,v23.s[0] -mul v12.4S, v12.4S,v23.s[1] -sub v16.4s, v11.4s, v24.4s -add v11.4s, v11.4s, v24.4s -mla v4.4S, v3.4S, v31.s[0] -mla v12.4S, v18.4S, v31.s[0] -sub v18.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v28.4S, v9.s[3] -mul v28.4S, v28.4S,v1.s[3] -sub v3.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v21.4S, v9.s[2] -mul v21.4S, v21.4S,v1.s[2] -sub v24.4s, v26.4s, v13.4s -add v26.4s, v26.4s, v13.4s -sqrdmulh v13.4S, v17.4S, v9.s[1] -mul v17.4S, v17.4S,v1.s[1] -sub v25.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -sqrdmulh v12.4S, v8.4S, v9.s[0] -mul v8.4S, v8.4S,v1.s[0] -sub v10.4s, v15.4s, v4.4s -add v15.4s, v15.4s, v4.4s -sqrdmulh v4.4S, v16.4S, v7.s[3] -mla v28.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v11.4S, v7.s[2] -mla v21.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v18.4S, v7.s[1] -mla v17.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v0.4S, v7.s[0] -mla v8.4S, v12.4S, v31.s[0] -nop -nop -mul v11.4S, v11.4S,v2.s[2] -mul v16.4S, v16.4S,v2.s[3] -sub v12.4s, v3.4s, v28.4s -str q12, [x0, #960] -mla v11.4S, v19.4S, v31.s[0] -mla v16.4S, v4.4S, v31.s[0] -add v3.4s, v3.4s, v28.4s -str q3, [x0, #896] -mul v0.4S, v0.4S,v2.s[0] -mul v18.4S, v18.4S,v2.s[1] -sub v3.4s, v6.4s, v21.4s -str q3, [x0, #832] -mla v0.4S, v13.4S, v31.s[0] -mla v18.4S, v5.4S, v31.s[0] -add v6.4s, v6.4s, v21.4s -sub v21.4s, v24.4s, v17.4s -ldr q5, [x0, #976] -sqrdmulh v13.4S, v5.4S, v29.s[0] -mul v5.4S, v5.4S,v30.s[0] -add v24.4s, v24.4s, v17.4s -str q6, [x0, #768] -ldr q6, [x0, #912] -sqrdmulh v17.4S, v6.4S, v29.s[0] -mul v6.4S, v6.4S,v30.s[0] -sub v3.4s, v26.4s, v8.4s -str q21, [x0, #704] -ldr q21, [x0, #848] -sqrdmulh v28.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -add v26.4s, v26.4s, v8.4s -str q24, [x0, #640] -ldr q24, [x0, #784] -sqrdmulh v8.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -sub v4.4s, v25.4s, v16.4s -str q3, [x0, #576] -ldr q3, [x0, #720] -sqrdmulh v19.4S, v3.4S, v29.s[0] -mla v5.4S, v13.4S, v31.s[0] -add v25.4s, v25.4s, v16.4s -str q26, [x0, #512] -ldr q26, [x0, #656] -sqrdmulh v16.4S, v26.4S, v29.s[0] -mla v6.4S, v17.4S, v31.s[0] -sub v17.4s, v20.4s, v11.4s -str q4, [x0, #448] -ldr q4, [x0, #592] -sqrdmulh v13.4S, v4.4S, v29.s[0] -mla v21.4S, v28.4S, v31.s[0] -add v20.4s, v20.4s, v11.4s -str q25, [x0, #384] -ldr q25, [x0, #528] -sqrdmulh v11.4S, v25.4S, v29.s[0] -mla v24.4S, v8.4S, v31.s[0] -sub v8.4s, v10.4s, v18.4s -str q17, [x0, #320] -ldr q17, [x0, #464] -add v10.4s, v10.4s, v18.4s -mul v26.4S, v26.4S,v30.s[0] -mul v3.4S, v3.4S,v30.s[0] -ldr q18, [x0, #400] -str q20, [x0, #256] -ldr q20, [x0, #336] -ldr q28, [x0, #272] -mla v26.4S, v16.4S, v31.s[0] -mla v3.4S, v19.4S, v31.s[0] -str q8, [x0, #192] -sub v8.4s, v15.4s, v0.4s -ldr q19, [x0, #208] -ldr q16, [x0, #144] -mul v25.4S, v25.4S,v30.s[0] -mul v4.4S, v4.4S,v30.s[0] -str q10, [x0, #128] -add v15.4s, v15.4s, v0.4s -ldr q0, [x0, #80] -ldr q10, [x0, #16] -mla v25.4S, v11.4S, v31.s[0] -mla v4.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v5.4s -add v17.4s, v17.4s, v5.4s -sqrdmulh v5.4S, v13.4S, v29.s[2] -mul v13.4S, v13.4S,v30.s[2] -sub v11.4s, v18.4s, v6.4s -add v18.4s, v18.4s, v6.4s -sqrdmulh v6.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v12.4s, v20.4s, v21.4s -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v14.4s, v28.4s, v24.4s -add v28.4s, v28.4s, v24.4s -sqrdmulh v24.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v27.4s, v19.4s, v3.4s -add v19.4s, v19.4s, v3.4s -sqrdmulh v3.4S, v12.4S, v29.s[2] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v16.4s, v26.4s -add v16.4s, v16.4s, v26.4s -sqrdmulh v26.4S, v14.4S, v29.s[2] -mla v11.4S, v6.4S, v31.s[0] -sub v6.4s, v0.4s, v4.4s -add v0.4s, v0.4s, v4.4s -sqrdmulh v4.4S, v20.4S, v29.s[1] -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v10.4s, v25.4s -str q8, [x0, #64] -sqrdmulh v8.4S, v28.4S, v29.s[1] -mla v18.4S, v24.4S, v31.s[0] -add v10.4s, v10.4s, v25.4s -str q15, [x0, #0] -mul v14.4S, v14.4S,v30.s[2] -mul v12.4S, v12.4S,v30.s[2] -sub v15.4s, v27.4s, v13.4s -add v27.4s, v27.4s, v13.4s -mla v14.4S, v26.4S, v31.s[0] -mla v12.4S, v3.4S, v31.s[0] -sub v3.4s, v5.4s, v11.4s -add v5.4s, v5.4s, v11.4s -mul v28.4S, v28.4S,v30.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v11.4s, v19.4s, v17.4s -add v19.4s, v19.4s, v17.4s -mla v28.4S, v8.4S, v31.s[0] -mla v20.4S, v4.4S, v31.s[0] -sub v4.4s, v16.4s, v18.4s -add v16.4s, v16.4s, v18.4s -sqrdmulh v29.4S, v15.4S, v22.s[3] -mul v15.4S, v15.4S,v23.s[3] -sub v30.4s, v6.4s, v12.4s -add v6.4s, v6.4s, v12.4s -sqrdmulh v12.4S, v27.4S, v22.s[2] -mul v27.4S, v27.4S,v23.s[2] -sub v18.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v22.s[1] -mul v11.4S, v11.4S,v23.s[1] -sub v8.4s, v0.4s, v20.4s -add v0.4s, v0.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v17.4s, v10.4s, v28.4s -add v10.4s, v10.4s, v28.4s -sqrdmulh v28.4S, v3.4S, v22.s[3] -mla v15.4S, v29.4S, v31.s[0] -nop -nop -sqrdmulh v29.4S, v5.4S, v22.s[2] -mla v27.4S, v12.4S, v31.s[0] -nop -nop -sqrdmulh v12.4S, v4.4S, v22.s[1] -mla v11.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v16.4S, v22.s[0] -mla v19.4S, v20.4S, v31.s[0] -nop -nop -mul v5.4S, v5.4S,v23.s[2] -mul v3.4S, v3.4S,v23.s[3] -sub v20.4s, v30.4s, v15.4s -add v30.4s, v30.4s, v15.4s -mla v5.4S, v29.4S, v31.s[0] -mla v3.4S, v28.4S, v31.s[0] -sub v28.4s, v6.4s, v27.4s -add v6.4s, v6.4s, v27.4s -mul v16.4S, v16.4S,v23.s[0] -mul v4.4S, v4.4S,v23.s[1] -sub v27.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -mla v16.4S, v14.4S, v31.s[0] -mla v4.4S, v12.4S, v31.s[0] -sub v12.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v22.4S, v20.4S, v9.s[3] -mul v20.4S, v20.4S,v1.s[3] -sub v23.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v30.4S, v9.s[2] -mul v30.4S, v30.4S,v1.s[2] -sub v19.4s, v21.4s, v5.4s -add v21.4s, v21.4s, v5.4s -sqrdmulh v5.4S, v28.4S, v9.s[1] -mul v28.4S, v28.4S,v1.s[1] -sub v14.4s, v17.4s, v4.4s -add v17.4s, v17.4s, v4.4s -sqrdmulh v4.4S, v6.4S, v9.s[0] -mul v6.4S, v6.4S,v1.s[0] -sub v11.4s, v10.4s, v16.4s -add v10.4s, v10.4s, v16.4s -sqrdmulh v9.4S, v27.4S, v7.s[3] -mla v20.4S, v22.4S, v31.s[0] -nop -nop -sqrdmulh v22.4S, v8.4S, v7.s[2] -mla v30.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v12.4S, v7.s[1] -mla v28.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v0.4S, v7.s[0] -mla v6.4S, v4.4S, v31.s[0] -nop -nop -mul v8.4S, v8.4S,v2.s[2] -mul v27.4S, v27.4S,v2.s[3] -sub v4.4s, v23.4s, v20.4s -str q4, [x0, #976] -mla v8.4S, v22.4S, v31.s[0] -mla v27.4S, v9.4S, v31.s[0] -add v23.4s, v23.4s, v20.4s -str q23, [x0, #912] -mul v0.4S, v0.4S,v2.s[0] -mul v12.4S, v12.4S,v2.s[1] -sub v23.4s, v18.4s, v30.4s -str q23, [x0, #848] -mla v0.4S, v5.4S, v31.s[0] -mla v12.4S, v3.4S, v31.s[0] -add v18.4s, v18.4s, v30.4s -sub v30.4s, v19.4s, v28.4s -add v19.4s, v19.4s, v28.4s -str q18, [x0, #784] -sub v18.4s, v21.4s, v6.4s -str q30, [x0, #720] -add v21.4s, v21.4s, v6.4s -str q19, [x0, #656] -sub v19.4s, v14.4s, v27.4s -str q18, [x0, #592] -add v14.4s, v14.4s, v27.4s -str q21, [x0, #528] -sub v21.4s, v17.4s, v8.4s -str q19, [x0, #464] -add v17.4s, v17.4s, v8.4s -str q14, [x0, #400] -sub v14.4s, v11.4s, v12.4s -str q21, [x0, #336] -add v11.4s, v11.4s, v12.4s -str q17, [x0, #272] -sub v17.4s, v10.4s, v0.4s -add v10.4s, v10.4s, v0.4s -ldr q24, [x0, #224] -ldr q25, [x0, #160] -ldr q13, [x0, #32] -ldr q26, [x17, #+128] -ldr q15, [x17, #+144] -sqrdmulh v29.4S, v13.4S, v15.s[0] -mul v13.4S, v13.4S,v26.s[0] -ldr q16, [x0, #48] -ldr q1, [x17, #+160] -sqrdmulh v4.4S, v16.4S, v15.s[0] -mul v16.4S, v16.4S,v26.s[0] -ldr q22, [x17, #+176] -ldr q9, [x0, #96] -sqrdmulh v20.4S, v9.4S, v22.s[0] -mul v9.4S, v9.4S,v1.s[0] -ldr q23, [x0, #112] -sqrdmulh v5.4S, v23.4S, v22.s[0] -mul v23.4S, v23.4S,v1.s[0] -ldr q3, [x17, #+192] -ldr q2, [x17, #+208] -mla v13.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v25.4S, v2.s[0] -ldr q7, [x0, #176] -mla v16.4S, v4.4S, v31.s[0] -sqrdmulh v4.4S, v7.4S, v2.s[0] -ldr q28, [x17, #+224] -ldr q30, [x17, #+240] -mla v9.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v24.4S, v30.s[0] -ldr q6, [x0, #240] -mla v23.4S, v5.4S, v31.s[0] -sqrdmulh v5.4S, v6.4S, v30.s[0] -ldr q18, [x0, #0] -ldr q27, [x0, #128] -mul v25.4S, v25.4S,v3.s[0] -mul v7.4S, v7.4S,v3.s[0] -mla v25.4S, v29.4S, v31.s[0] -mla v7.4S, v4.4S, v31.s[0] -sub v4.4s, v18.4s, v13.4s -ldr q29, [x0, #64] -add v18.4s, v18.4s, v13.4s -ldr q13, [x0, #192] -mul v24.4S, v24.4S,v28.s[0] -mul v6.4S, v6.4S,v28.s[0] -sub v19.4s, v10.4s, v16.4s -add v10.4s, v10.4s, v16.4s -mla v24.4S, v20.4S, v31.s[0] -mla v6.4S, v5.4S, v31.s[0] -sub v5.4s, v29.4s, v9.4s -add v29.4s, v29.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v15.s[1] -mul v10.4S, v10.4S,v26.s[1] -sub v20.4s, v17.4s, v23.4s -add v17.4s, v17.4s, v23.4s -sqrdmulh v23.4S, v19.4S, v15.s[2] -mul v19.4S, v19.4S,v26.s[2] -sub v16.4s, v27.4s, v25.4s -add v27.4s, v27.4s, v25.4s -sqrdmulh v15.4S, v17.4S, v22.s[1] -mul v17.4S, v17.4S,v1.s[1] -sub v25.4s, v11.4s, v7.4s -add v11.4s, v11.4s, v7.4s -sqrdmulh v7.4S, v20.4S, v22.s[2] -mul v20.4S, v20.4S,v1.s[2] -sub v26.4s, v13.4s, v24.4s -add v13.4s, v13.4s, v24.4s -mla v10.4S, v9.4S, v31.s[0] -sqrdmulh v9.4S, v11.4S, v2.s[1] -sub v22.4s, v14.4s, v6.4s -ldr q24, [x0, #480] -add v14.4s, v14.4s, v6.4s -mla v19.4S, v23.4S, v31.s[0] -sqrdmulh v23.4S, v25.4S, v2.s[2] -sub v6.4s, v18.4s, v10.4s -ldr q1, [x0, #416] -str q6, [x0, #16] -mla v17.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v14.4S, v30.s[1] -add v18.4s, v18.4s, v10.4s -ldr q10, [x0, #288] -str q18, [x0, #0] -mla v20.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v22.4S, v30.s[2] -sub v18.4s, v4.4s, v19.4s -ldr q6, [x17, #+256] -str q18, [x0, #48] -mul v11.4S, v11.4S,v3.s[1] -mul v25.4S, v25.4S,v3.s[2] -add v4.4s, v4.4s, v19.4s -str q4, [x0, #32] -ldr q4, [x17, #+272] -mla v11.4S, v9.4S, v31.s[0] -mla v25.4S, v23.4S, v31.s[0] -sub v23.4s, v29.4s, v17.4s -str q23, [x0, #80] -mul v14.4S, v14.4S,v28.s[1] -mul v22.4S, v22.4S,v28.s[2] -add v29.4s, v29.4s, v17.4s -str q29, [x0, #64] -mla v14.4S, v15.4S, v31.s[0] -mla v22.4S, v7.4S, v31.s[0] -sub v7.4s, v5.4s, v20.4s -str q7, [x0, #112] -sqrdmulh v30.4S, v10.4S, v4.s[0] -mul v10.4S, v10.4S,v6.s[0] -add v5.4s, v5.4s, v20.4s -ldr q20, [x0, #304] -str q5, [x0, #96] -ldr q5, [x17, #+288] -sqrdmulh v7.4S, v20.4S, v4.s[0] -mul v20.4S, v20.4S,v6.s[0] -sub v28.4s, v27.4s, v11.4s -ldr q15, [x17, #+304] -str q28, [x0, #144] -ldr q28, [x0, #352] -sqrdmulh v29.4S, v28.4S, v15.s[0] -mul v28.4S, v28.4S,v5.s[0] -add v27.4s, v27.4s, v11.4s -str q27, [x0, #128] -ldr q27, [x0, #368] -sqrdmulh v11.4S, v27.4S, v15.s[0] -mul v27.4S, v27.4S,v5.s[0] -sub v17.4s, v16.4s, v25.4s -ldr q2, [x17, #+320] -str q17, [x0, #176] -ldr q17, [x17, #+336] -mla v10.4S, v30.4S, v31.s[0] -sqrdmulh v30.4S, v1.4S, v17.s[0] -add v16.4s, v16.4s, v25.4s -ldr q25, [x0, #432] -str q16, [x0, #160] -mla v20.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v25.4S, v17.s[0] -sub v16.4s, v13.4s, v14.4s -ldr q23, [x17, #+352] -str q16, [x0, #208] -ldr q16, [x17, #+368] -mla v28.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v24.4S, v16.s[0] -add v13.4s, v13.4s, v14.4s -str q13, [x0, #192] -ldr q13, [x0, #496] -mla v27.4S, v11.4S, v31.s[0] -sqrdmulh v11.4S, v13.4S, v16.s[0] -sub v14.4s, v26.4s, v22.4s -ldr q3, [x0, #256] -str q14, [x0, #240] -ldr q14, [x0, #384] -mul v1.4S, v1.4S,v2.s[0] -mul v25.4S, v25.4S,v2.s[0] -add v26.4s, v26.4s, v22.4s -ldr q22, [x0, #272] -str q26, [x0, #224] -ldr q26, [x0, #400] -mla v1.4S, v30.4S, v31.s[0] -mla v25.4S, v7.4S, v31.s[0] -sub v7.4s, v3.4s, v10.4s -ldr q30, [x0, #320] -add v3.4s, v3.4s, v10.4s -ldr q10, [x0, #448] -mul v24.4S, v24.4S,v23.s[0] -mul v13.4S, v13.4S,v23.s[0] -sub v9.4s, v22.4s, v20.4s -ldr q19, [x0, #336] -add v22.4s, v22.4s, v20.4s -ldr q20, [x0, #464] -mla v24.4S, v29.4S, v31.s[0] -mla v13.4S, v11.4S, v31.s[0] -sub v11.4s, v30.4s, v28.4s -add v30.4s, v30.4s, v28.4s -sqrdmulh v28.4S, v22.4S, v4.s[1] -mul v22.4S, v22.4S,v6.s[1] -sub v29.4s, v19.4s, v27.4s -add v19.4s, v19.4s, v27.4s -sqrdmulh v27.4S, v9.4S, v4.s[2] -mul v9.4S, v9.4S,v6.s[2] -sub v18.4s, v14.4s, v1.4s -add v14.4s, v14.4s, v1.4s -sqrdmulh v4.4S, v19.4S, v15.s[1] -mul v19.4S, v19.4S,v5.s[1] -sub v1.4s, v26.4s, v25.4s -add v26.4s, v26.4s, v25.4s -sqrdmulh v25.4S, v29.4S, v15.s[2] -mul v29.4S, v29.4S,v5.s[2] -sub v6.4s, v10.4s, v24.4s -add v10.4s, v10.4s, v24.4s -mla v22.4S, v28.4S, v31.s[0] -sqrdmulh v28.4S, v26.4S, v17.s[1] -sub v15.4s, v20.4s, v13.4s -ldr q24, [x0, #736] -add v20.4s, v20.4s, v13.4s -mla v9.4S, v27.4S, v31.s[0] -sqrdmulh v27.4S, v1.4S, v17.s[2] -sub v13.4s, v3.4s, v22.4s -ldr q5, [x0, #672] -str q13, [x0, #272] -mla v19.4S, v4.4S, v31.s[0] -sqrdmulh v4.4S, v20.4S, v16.s[1] -add v3.4s, v3.4s, v22.4s -ldr q22, [x0, #544] -str q3, [x0, #256] -mla v29.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v15.4S, v16.s[2] -sub v3.4s, v7.4s, v9.4s -ldr q13, [x17, #+384] -str q3, [x0, #304] -mul v26.4S, v26.4S,v2.s[1] -mul v1.4S, v1.4S,v2.s[2] -add v7.4s, v7.4s, v9.4s -str q7, [x0, #288] -ldr q7, [x17, #+400] -mla v26.4S, v28.4S, v31.s[0] -mla v1.4S, v27.4S, v31.s[0] -sub v27.4s, v30.4s, v19.4s -str q27, [x0, #336] -mul v20.4S, v20.4S,v23.s[1] -mul v15.4S, v15.4S,v23.s[2] -add v30.4s, v30.4s, v19.4s -str q30, [x0, #320] -mla v20.4S, v4.4S, v31.s[0] -mla v15.4S, v25.4S, v31.s[0] -sub v25.4s, v11.4s, v29.4s -str q25, [x0, #368] -sqrdmulh v16.4S, v22.4S, v7.s[0] -mul v22.4S, v22.4S,v13.s[0] -add v11.4s, v11.4s, v29.4s -ldr q29, [x0, #560] -str q11, [x0, #352] -ldr q11, [x17, #+416] -sqrdmulh v25.4S, v29.4S, v7.s[0] -mul v29.4S, v29.4S,v13.s[0] -sub v23.4s, v14.4s, v26.4s -ldr q4, [x17, #+432] -str q23, [x0, #400] -ldr q23, [x0, #608] -sqrdmulh v30.4S, v23.4S, v4.s[0] -mul v23.4S, v23.4S,v11.s[0] -add v14.4s, v14.4s, v26.4s -str q14, [x0, #384] -ldr q14, [x0, #624] -sqrdmulh v26.4S, v14.4S, v4.s[0] -mul v14.4S, v14.4S,v11.s[0] -sub v19.4s, v18.4s, v1.4s -ldr q17, [x17, #+448] -str q19, [x0, #432] -ldr q19, [x17, #+464] -mla v22.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v5.4S, v19.s[0] -add v18.4s, v18.4s, v1.4s -ldr q1, [x0, #688] -str q18, [x0, #416] -mla v29.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v1.4S, v19.s[0] -sub v18.4s, v10.4s, v20.4s -ldr q27, [x17, #+480] -str q18, [x0, #464] -ldr q18, [x17, #+496] -mla v23.4S, v30.4S, v31.s[0] -sqrdmulh v30.4S, v24.4S, v18.s[0] -add v10.4s, v10.4s, v20.4s -str q10, [x0, #448] -ldr q10, [x0, #752] -mla v14.4S, v26.4S, v31.s[0] -sqrdmulh v26.4S, v10.4S, v18.s[0] -sub v20.4s, v6.4s, v15.4s -ldr q2, [x0, #512] -str q20, [x0, #496] -ldr q20, [x0, #640] -mul v5.4S, v5.4S,v17.s[0] -mul v1.4S, v1.4S,v17.s[0] -add v6.4s, v6.4s, v15.4s -ldr q15, [x0, #528] -str q6, [x0, #480] -ldr q6, [x0, #656] -mla v5.4S, v16.4S, v31.s[0] -mla v1.4S, v25.4S, v31.s[0] -sub v25.4s, v2.4s, v22.4s -ldr q16, [x0, #576] -add v2.4s, v2.4s, v22.4s -ldr q22, [x0, #704] -mul v24.4S, v24.4S,v27.s[0] -mul v10.4S, v10.4S,v27.s[0] -sub v28.4s, v15.4s, v29.4s -ldr q9, [x0, #592] -add v15.4s, v15.4s, v29.4s -ldr q29, [x0, #720] -mla v24.4S, v30.4S, v31.s[0] -mla v10.4S, v26.4S, v31.s[0] -sub v26.4s, v16.4s, v23.4s -add v16.4s, v16.4s, v23.4s -sqrdmulh v23.4S, v15.4S, v7.s[1] -mul v15.4S, v15.4S,v13.s[1] -sub v30.4s, v9.4s, v14.4s -add v9.4s, v9.4s, v14.4s -sqrdmulh v14.4S, v28.4S, v7.s[2] -mul v28.4S, v28.4S,v13.s[2] -sub v3.4s, v20.4s, v5.4s -add v20.4s, v20.4s, v5.4s -sqrdmulh v7.4S, v9.4S, v4.s[1] -mul v9.4S, v9.4S,v11.s[1] -sub v5.4s, v6.4s, v1.4s -add v6.4s, v6.4s, v1.4s -sqrdmulh v1.4S, v30.4S, v4.s[2] -mul v30.4S, v30.4S,v11.s[2] -sub v13.4s, v22.4s, v24.4s -add v22.4s, v22.4s, v24.4s -mla v15.4S, v23.4S, v31.s[0] -sqrdmulh v23.4S, v6.4S, v19.s[1] -sub v4.4s, v29.4s, v10.4s -ldr q24, [x0, #992] -add v29.4s, v29.4s, v10.4s -mla v28.4S, v14.4S, v31.s[0] -sqrdmulh v14.4S, v5.4S, v19.s[2] -sub v10.4s, v2.4s, v15.4s -ldr q11, [x0, #928] -str q10, [x0, #528] -mla v9.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v29.4S, v18.s[1] -add v2.4s, v2.4s, v15.4s -ldr q15, [x0, #800] -str q2, [x0, #512] -mla v30.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v4.4S, v18.s[2] -sub v2.4s, v25.4s, v28.4s -ldr q10, [x17, #+512] -str q2, [x0, #560] -mul v6.4S, v6.4S,v17.s[1] -mul v5.4S, v5.4S,v17.s[2] -add v25.4s, v25.4s, v28.4s -str q25, [x0, #544] -ldr q25, [x17, #+528] -mla v6.4S, v23.4S, v31.s[0] -mla v5.4S, v14.4S, v31.s[0] -sub v14.4s, v16.4s, v9.4s -str q14, [x0, #592] -mul v29.4S, v29.4S,v27.s[1] -mul v4.4S, v4.4S,v27.s[2] -add v16.4s, v16.4s, v9.4s -str q16, [x0, #576] -mla v29.4S, v7.4S, v31.s[0] -mla v4.4S, v1.4S, v31.s[0] -sub v1.4s, v26.4s, v30.4s -str q1, [x0, #624] -sqrdmulh v18.4S, v15.4S, v25.s[0] -mul v15.4S, v15.4S,v10.s[0] -add v26.4s, v26.4s, v30.4s -ldr q30, [x0, #816] -str q26, [x0, #608] -ldr q26, [x17, #+544] -sqrdmulh v1.4S, v30.4S, v25.s[0] -mul v30.4S, v30.4S,v10.s[0] -sub v27.4s, v20.4s, v6.4s -ldr q7, [x17, #+560] -str q27, [x0, #656] -ldr q27, [x0, #864] -sqrdmulh v16.4S, v27.4S, v7.s[0] -mul v27.4S, v27.4S,v26.s[0] -add v20.4s, v20.4s, v6.4s -str q20, [x0, #640] -ldr q20, [x0, #880] -sqrdmulh v6.4S, v20.4S, v7.s[0] -mul v20.4S, v20.4S,v26.s[0] -sub v9.4s, v3.4s, v5.4s -ldr q19, [x17, #+576] -str q9, [x0, #688] -ldr q9, [x17, #+592] -mla v15.4S, v18.4S, v31.s[0] -sqrdmulh v18.4S, v11.4S, v9.s[0] -add v3.4s, v3.4s, v5.4s -ldr q5, [x0, #944] -str q3, [x0, #672] -mla v30.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v5.4S, v9.s[0] -sub v3.4s, v22.4s, v29.4s -ldr q14, [x17, #+608] -str q3, [x0, #720] -ldr q3, [x17, #+624] -mla v27.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v24.4S, v3.s[0] -add v22.4s, v22.4s, v29.4s -str q22, [x0, #704] -ldr q22, [x0, #1008] -mla v20.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v22.4S, v3.s[0] -sub v29.4s, v13.4s, v4.4s -ldr q17, [x0, #768] -str q29, [x0, #752] -ldr q29, [x0, #896] -mul v11.4S, v11.4S,v19.s[0] -mul v5.4S, v5.4S,v19.s[0] -add v13.4s, v13.4s, v4.4s -ldr q4, [x0, #784] -str q13, [x0, #736] -ldr q13, [x0, #912] -mla v11.4S, v18.4S, v31.s[0] -mla v5.4S, v1.4S, v31.s[0] -sub v1.4s, v17.4s, v15.4s -ldr q18, [x0, #832] -add v17.4s, v17.4s, v15.4s -ldr q15, [x0, #960] -mul v24.4S, v24.4S,v14.s[0] -mul v22.4S, v22.4S,v14.s[0] -sub v23.4s, v4.4s, v30.4s -ldr q28, [x0, #848] -add v4.4s, v4.4s, v30.4s -ldr q30, [x0, #976] -mla v24.4S, v16.4S, v31.s[0] -mla v22.4S, v6.4S, v31.s[0] -sub v6.4s, v18.4s, v27.4s -add v18.4s, v18.4s, v27.4s -sqrdmulh v27.4S, v4.4S, v25.s[1] -mul v4.4S, v4.4S,v10.s[1] -sub v16.4s, v28.4s, v20.4s -add v28.4s, v28.4s, v20.4s -sqrdmulh v20.4S, v23.4S, v25.s[2] -mul v23.4S, v23.4S,v10.s[2] -sub v2.4s, v29.4s, v11.4s -add v29.4s, v29.4s, v11.4s -sqrdmulh v25.4S, v28.4S, v7.s[1] -mul v28.4S, v28.4S,v26.s[1] -sub v11.4s, v13.4s, v5.4s -add v13.4s, v13.4s, v5.4s -sqrdmulh v5.4S, v16.4S, v7.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v10.4s, v15.4s, v24.4s -add v15.4s, v15.4s, v24.4s -mla v4.4S, v27.4S, v31.s[0] -sqrdmulh v27.4S, v13.4S, v9.s[1] -sub v7.4s, v30.4s, v22.4s -add v30.4s, v30.4s, v22.4s -mla v23.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v11.4S, v9.s[2] -sub v22.4s, v17.4s, v4.4s -str q22, [x0, #784] -mla v28.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v30.4S, v3.s[1] -add v17.4s, v17.4s, v4.4s -str q17, [x0, #768] -mla v16.4S, v5.4S, v31.s[0] -sqrdmulh v5.4S, v7.4S, v3.s[2] -sub v17.4s, v1.4s, v23.4s -str q17, [x0, #816] -mul v13.4S, v13.4S,v19.s[1] -mul v11.4S, v11.4S,v19.s[2] -add v1.4s, v1.4s, v23.4s -str q1, [x0, #800] -mla v13.4S, v27.4S, v31.s[0] -mla v11.4S, v20.4S, v31.s[0] -sub v20.4s, v18.4s, v28.4s -str q20, [x0, #848] -mul v30.4S, v30.4S,v14.s[1] -mul v7.4S, v7.4S,v14.s[2] -add v18.4s, v18.4s, v28.4s -str q18, [x0, #832] -mla v30.4S, v25.4S, v31.s[0] -mla v7.4S, v5.4S, v31.s[0] -sub v5.4s, v6.4s, v16.4s -str q5, [x0, #880] -add v6.4s, v6.4s, v16.4s -str q6, [x0, #864] -sub v6.4s, v29.4s, v13.4s -str q6, [x0, #912] -add v29.4s, v29.4s, v13.4s -str q29, [x0, #896] -sub v29.4s, v2.4s, v11.4s -str q29, [x0, #944] -add v2.4s, v2.4s, v11.4s -str q2, [x0, #928] -sub v2.4s, v15.4s, v30.4s -str q2, [x0, #976] -add v15.4s, v15.4s, v30.4s -str q15, [x0, #960] -sub v15.4s, v10.4s, v7.4s -str q15, [x0, #1008] -add v10.4s, v10.4s, v7.4s -str q10, [x0, #992] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1520 -// Instruction count: 1516 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_11.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_11.s deleted file mode 100644 index ba9add4..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_11.s +++ /dev/null @@ -1,1550 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_22_z4_11 -.global _ntt_u32_incomplete_neon_asm_var_4_2_22_z4_11 -ntt_u32_incomplete_neon_asm_var_4_2_22_z4_11: -_ntt_u32_incomplete_neon_asm_var_4_2_22_z4_11: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x0, #992] -sqrdmulh v27.4S, v28.4S, v29.s[0] -mul v28.4S, v28.4S,v30.s[0] -ldr q26, [x0, #928] -sqrdmulh v25.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -ldr q24, [x0, #864] -sqrdmulh v23.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -ldr q22, [x0, #800] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #736] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mla v28.4S, v27.4S, v31.s[0] -ldr q27, [x0, #672] -sqrdmulh v18.4S, v27.4S, v29.s[0] -mla v26.4S, v25.4S, v31.s[0] -ldr q25, [x0, #608] -sqrdmulh v17.4S, v25.4S, v29.s[0] -mla v24.4S, v23.4S, v31.s[0] -ldr q23, [x0, #544] -sqrdmulh v16.4S, v23.4S, v29.s[0] -mla v22.4S, v21.4S, v31.s[0] -ldr q21, [x0, #480] -mul v27.4S, v27.4S,v30.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q3, [x0, #416] -ldr q2, [x0, #352] -ldr q1, [x0, #288] -mla v27.4S, v18.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -ldr q19, [x0, #224] -ldr q18, [x0, #160] -mul v23.4S, v23.4S,v30.s[0] -mul v25.4S, v25.4S,v30.s[0] -ldr q0, [x0, #96] -ldr q15, [x0, #32] -mla v23.4S, v16.4S, v31.s[0] -mla v25.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v28.4s -add v21.4s, v21.4s, v28.4s -sqrdmulh v28.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v16.4s, v3.4s, v26.4s -add v3.4s, v3.4s, v26.4s -sqrdmulh v26.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -sub v14.4s, v2.4s, v24.4s -add v2.4s, v2.4s, v24.4s -sqrdmulh v24.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v13.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v12.4s, v19.4s, v20.4s -add v19.4s, v19.4s, v20.4s -sqrdmulh v20.4S, v14.4S, v29.s[2] -mla v17.4S, v28.4S, v31.s[0] -sub v28.4s, v18.4s, v27.4s -add v18.4s, v18.4s, v27.4s -sqrdmulh v27.4S, v13.4S, v29.s[2] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v0.4s, v25.4s -add v0.4s, v0.4s, v25.4s -sqrdmulh v25.4S, v2.4S, v29.s[1] -mla v21.4S, v24.4S, v31.s[0] -sub v24.4s, v15.4s, v23.4s -sqrdmulh v11.4S, v1.4S, v29.s[1] -mla v3.4S, v22.4S, v31.s[0] -add v15.4s, v15.4s, v23.4s -ldr q23, [x17, #+32] -ldr q22, [x17, #+48] -mul v13.4S, v13.4S,v30.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v10.4s, v12.4s, v17.4s -add v12.4s, v12.4s, v17.4s -mla v13.4S, v27.4S, v31.s[0] -mla v14.4S, v20.4S, v31.s[0] -sub v20.4s, v28.4s, v16.4s -add v28.4s, v28.4s, v16.4s -mul v1.4S, v1.4S,v30.s[1] -mul v2.4S, v2.4S,v30.s[1] -sub v16.4s, v19.4s, v21.4s -add v19.4s, v19.4s, v21.4s -mla v1.4S, v11.4S, v31.s[0] -mla v2.4S, v25.4S, v31.s[0] -sub v25.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v10.4S, v22.s[3] -mul v10.4S, v10.4S,v23.s[3] -sub v11.4s, v26.4s, v14.4s -add v26.4s, v26.4s, v14.4s -sqrdmulh v14.4S, v12.4S, v22.s[2] -mul v12.4S, v12.4S,v23.s[2] -sub v21.4s, v24.4s, v13.4s -add v24.4s, v24.4s, v13.4s -sqrdmulh v13.4S, v16.4S, v22.s[1] -mul v16.4S, v16.4S,v23.s[1] -sub v27.4s, v0.4s, v2.4s -add v0.4s, v0.4s, v2.4s -sqrdmulh v2.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v17.4s, v15.4s, v1.4s -add v15.4s, v15.4s, v1.4s -ldr q1, [x17, #+96] -ldr q9, [x17, #+112] -sqrdmulh v8.4S, v20.4S, v22.s[3] -mla v10.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v28.4S, v22.s[2] -mla v12.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v25.4S, v22.s[1] -mla v16.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v18.4S, v22.s[0] -mla v19.4S, v2.4S, v31.s[0] -nop -nop -ldr q2, [x17, #+64] -ldr q7, [x17, #+80] -mul v28.4S, v28.4S,v23.s[2] -mul v20.4S, v20.4S,v23.s[3] -sub v6.4s, v11.4s, v10.4s -add v11.4s, v11.4s, v10.4s -mla v28.4S, v3.4S, v31.s[0] -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v26.4s, v12.4s -add v26.4s, v26.4s, v12.4s -mul v18.4S, v18.4S,v23.s[0] -mul v25.4S, v25.4S,v23.s[1] -sub v12.4s, v27.4s, v16.4s -add v27.4s, v27.4s, v16.4s -mla v18.4S, v13.4S, v31.s[0] -mla v25.4S, v14.4S, v31.s[0] -sub v14.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v9.s[3] -mul v6.4S, v6.4S,v1.s[3] -sub v13.4s, v21.4s, v20.4s -add v21.4s, v21.4s, v20.4s -sqrdmulh v20.4S, v11.4S, v9.s[2] -mul v11.4S, v11.4S,v1.s[2] -sub v16.4s, v24.4s, v28.4s -add v24.4s, v24.4s, v28.4s -sqrdmulh v28.4S, v8.4S, v9.s[1] -mul v8.4S, v8.4S,v1.s[1] -sub v3.4s, v17.4s, v25.4s -add v17.4s, v17.4s, v25.4s -sqrdmulh v25.4S, v26.4S, v9.s[0] -mul v26.4S, v26.4S,v1.s[0] -sub v10.4s, v15.4s, v18.4s -add v15.4s, v15.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v7.s[3] -mla v6.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v27.4S, v7.s[2] -mla v11.4S, v20.4S, v31.s[0] -nop -nop -sqrdmulh v20.4S, v14.4S, v7.s[1] -mla v8.4S, v28.4S, v31.s[0] -nop -nop -sqrdmulh v28.4S, v0.4S, v7.s[0] -mla v26.4S, v25.4S, v31.s[0] -nop -nop -mul v27.4S, v27.4S,v2.s[2] -mul v12.4S, v12.4S,v2.s[3] -sub v25.4s, v13.4s, v6.4s -str q25, [x0, #992] -mla v27.4S, v19.4S, v31.s[0] -mla v12.4S, v18.4S, v31.s[0] -add v13.4s, v13.4s, v6.4s -str q13, [x0, #928] -mul v0.4S, v0.4S,v2.s[0] -mul v14.4S, v14.4S,v2.s[1] -sub v13.4s, v21.4s, v11.4s -str q13, [x0, #864] -mla v0.4S, v28.4S, v31.s[0] -mla v14.4S, v20.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -sub v11.4s, v16.4s, v8.4s -ldr q20, [x0, #1008] -sqrdmulh v28.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v16.4s, v16.4s, v8.4s -str q21, [x0, #800] -ldr q21, [x0, #944] -sqrdmulh v8.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -sub v13.4s, v24.4s, v26.4s -str q11, [x0, #736] -ldr q11, [x0, #880] -sqrdmulh v6.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -add v24.4s, v24.4s, v26.4s -str q16, [x0, #672] -ldr q16, [x0, #816] -sqrdmulh v26.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v18.4s, v3.4s, v12.4s -str q13, [x0, #608] -ldr q13, [x0, #752] -sqrdmulh v19.4S, v13.4S, v29.s[0] -mla v20.4S, v28.4S, v31.s[0] -add v3.4s, v3.4s, v12.4s -str q24, [x0, #544] -ldr q24, [x0, #688] -sqrdmulh v12.4S, v24.4S, v29.s[0] -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v17.4s, v27.4s -str q18, [x0, #480] -ldr q18, [x0, #624] -sqrdmulh v28.4S, v18.4S, v29.s[0] -mla v11.4S, v6.4S, v31.s[0] -add v17.4s, v17.4s, v27.4s -str q3, [x0, #416] -ldr q3, [x0, #560] -sqrdmulh v27.4S, v3.4S, v29.s[0] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v10.4s, v14.4s -str q8, [x0, #352] -ldr q8, [x0, #496] -add v10.4s, v10.4s, v14.4s -mul v24.4S, v24.4S,v30.s[0] -mul v13.4S, v13.4S,v30.s[0] -ldr q14, [x0, #432] -str q17, [x0, #288] -ldr q17, [x0, #368] -ldr q6, [x0, #304] -mla v24.4S, v12.4S, v31.s[0] -mla v13.4S, v19.4S, v31.s[0] -str q26, [x0, #224] -sub v26.4s, v15.4s, v0.4s -ldr q19, [x0, #240] -ldr q12, [x0, #176] -mul v3.4S, v3.4S,v30.s[0] -mul v18.4S, v18.4S,v30.s[0] -str q10, [x0, #160] -add v15.4s, v15.4s, v0.4s -ldr q0, [x0, #112] -ldr q10, [x0, #48] -mla v3.4S, v27.4S, v31.s[0] -mla v18.4S, v28.4S, v31.s[0] -sub v28.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -sqrdmulh v20.4S, v28.4S, v29.s[2] -mul v28.4S, v28.4S,v30.s[2] -sub v27.4s, v14.4s, v21.4s -add v14.4s, v14.4s, v21.4s -sqrdmulh v21.4S, v27.4S, v29.s[2] -mul v27.4S, v27.4S,v30.s[2] -sub v25.4s, v17.4s, v11.4s -add v17.4s, v17.4s, v11.4s -sqrdmulh v11.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sub v5.4s, v6.4s, v16.4s -add v6.4s, v6.4s, v16.4s -sqrdmulh v16.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -sub v4.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -sqrdmulh v13.4S, v25.4S, v29.s[2] -mla v28.4S, v20.4S, v31.s[0] -sub v20.4s, v12.4s, v24.4s -add v12.4s, v12.4s, v24.4s -sqrdmulh v24.4S, v5.4S, v29.s[2] -mla v27.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v18.4s -add v0.4s, v0.4s, v18.4s -sqrdmulh v18.4S, v17.4S, v29.s[1] -mla v8.4S, v11.4S, v31.s[0] -sub v11.4s, v10.4s, v3.4s -str q26, [x0, #96] -sqrdmulh v26.4S, v6.4S, v29.s[1] -mla v14.4S, v16.4S, v31.s[0] -add v10.4s, v10.4s, v3.4s -str q15, [x0, #32] -mul v5.4S, v5.4S,v30.s[2] -mul v25.4S, v25.4S,v30.s[2] -sub v15.4s, v4.4s, v28.4s -add v4.4s, v4.4s, v28.4s -mla v5.4S, v24.4S, v31.s[0] -mla v25.4S, v13.4S, v31.s[0] -sub v13.4s, v20.4s, v27.4s -add v20.4s, v20.4s, v27.4s -mul v6.4S, v6.4S,v30.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v27.4s, v19.4s, v8.4s -add v19.4s, v19.4s, v8.4s -mla v6.4S, v26.4S, v31.s[0] -mla v17.4S, v18.4S, v31.s[0] -sub v18.4s, v12.4s, v14.4s -add v12.4s, v12.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v22.s[3] -mul v15.4S, v15.4S,v23.s[3] -sub v26.4s, v21.4s, v25.4s -add v21.4s, v21.4s, v25.4s -sqrdmulh v25.4S, v4.4S, v22.s[2] -mul v4.4S, v4.4S,v23.s[2] -sub v8.4s, v11.4s, v5.4s -add v11.4s, v11.4s, v5.4s -sqrdmulh v5.4S, v27.4S, v22.s[1] -mul v27.4S, v27.4S,v23.s[1] -sub v24.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v28.4s, v10.4s, v6.4s -add v10.4s, v10.4s, v6.4s -sqrdmulh v6.4S, v13.4S, v22.s[3] -mla v15.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v20.4S, v22.s[2] -mla v4.4S, v25.4S, v31.s[0] -nop -nop -sqrdmulh v25.4S, v18.4S, v22.s[1] -mla v27.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v12.4S, v22.s[0] -mla v19.4S, v17.4S, v31.s[0] -nop -nop -mul v20.4S, v20.4S,v23.s[2] -mul v13.4S, v13.4S,v23.s[3] -sub v17.4s, v26.4s, v15.4s -add v26.4s, v26.4s, v15.4s -mla v20.4S, v14.4S, v31.s[0] -mla v13.4S, v6.4S, v31.s[0] -sub v6.4s, v21.4s, v4.4s -add v21.4s, v21.4s, v4.4s -mul v12.4S, v12.4S,v23.s[0] -mul v18.4S, v18.4S,v23.s[1] -sub v4.4s, v24.4s, v27.4s -add v24.4s, v24.4s, v27.4s -mla v12.4S, v5.4S, v31.s[0] -mla v18.4S, v25.4S, v31.s[0] -sub v25.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v17.4S, v9.s[3] -mul v17.4S, v17.4S,v1.s[3] -sub v5.4s, v8.4s, v13.4s -add v8.4s, v8.4s, v13.4s -sqrdmulh v13.4S, v26.4S, v9.s[2] -mul v26.4S, v26.4S,v1.s[2] -sub v27.4s, v11.4s, v20.4s -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v6.4S, v9.s[1] -mul v6.4S, v6.4S,v1.s[1] -sub v14.4s, v28.4s, v18.4s -add v28.4s, v28.4s, v18.4s -sqrdmulh v18.4S, v21.4S, v9.s[0] -mul v21.4S, v21.4S,v1.s[0] -sub v15.4s, v10.4s, v12.4s -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v4.4S, v7.s[3] -mla v17.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v24.4S, v7.s[2] -mla v26.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v25.4S, v7.s[1] -mla v6.4S, v20.4S, v31.s[0] -nop -nop -sqrdmulh v20.4S, v0.4S, v7.s[0] -mla v21.4S, v18.4S, v31.s[0] -nop -nop -mul v24.4S, v24.4S,v2.s[2] -mul v4.4S, v4.4S,v2.s[3] -sub v18.4s, v5.4s, v17.4s -str q18, [x0, #1008] -mla v24.4S, v19.4S, v31.s[0] -mla v4.4S, v12.4S, v31.s[0] -add v5.4s, v5.4s, v17.4s -str q5, [x0, #944] -mul v0.4S, v0.4S,v2.s[0] -mul v25.4S, v25.4S,v2.s[1] -sub v5.4s, v8.4s, v26.4s -str q5, [x0, #880] -mla v0.4S, v20.4S, v31.s[0] -mla v25.4S, v13.4S, v31.s[0] -add v8.4s, v8.4s, v26.4s -sub v26.4s, v27.4s, v6.4s -ldr q13, [x0, #960] -sqrdmulh v20.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -add v27.4s, v27.4s, v6.4s -str q8, [x0, #816] -ldr q8, [x0, #896] -sqrdmulh v6.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v5.4s, v11.4s, v21.4s -str q26, [x0, #752] -ldr q26, [x0, #832] -sqrdmulh v17.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -add v11.4s, v11.4s, v21.4s -str q27, [x0, #688] -ldr q27, [x0, #768] -sqrdmulh v21.4S, v27.4S, v29.s[0] -mul v27.4S, v27.4S,v30.s[0] -sub v12.4s, v14.4s, v4.4s -str q5, [x0, #624] -ldr q5, [x0, #704] -sqrdmulh v19.4S, v5.4S, v29.s[0] -mla v13.4S, v20.4S, v31.s[0] -add v14.4s, v14.4s, v4.4s -str q11, [x0, #560] -ldr q11, [x0, #640] -sqrdmulh v4.4S, v11.4S, v29.s[0] -mla v8.4S, v6.4S, v31.s[0] -sub v6.4s, v28.4s, v24.4s -str q12, [x0, #496] -ldr q12, [x0, #576] -sqrdmulh v20.4S, v12.4S, v29.s[0] -mla v26.4S, v17.4S, v31.s[0] -add v28.4s, v28.4s, v24.4s -str q14, [x0, #432] -ldr q14, [x0, #512] -sqrdmulh v24.4S, v14.4S, v29.s[0] -mla v27.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v25.4s -str q6, [x0, #368] -ldr q6, [x0, #448] -add v15.4s, v15.4s, v25.4s -mul v11.4S, v11.4S,v30.s[0] -mul v5.4S, v5.4S,v30.s[0] -ldr q25, [x0, #384] -str q28, [x0, #304] -ldr q28, [x0, #320] -ldr q17, [x0, #256] -mla v11.4S, v4.4S, v31.s[0] -mla v5.4S, v19.4S, v31.s[0] -str q21, [x0, #240] -sub v21.4s, v10.4s, v0.4s -ldr q19, [x0, #192] -ldr q4, [x0, #128] -mul v14.4S, v14.4S,v30.s[0] -mul v12.4S, v12.4S,v30.s[0] -str q15, [x0, #176] -add v10.4s, v10.4s, v0.4s -ldr q0, [x0, #64] -ldr q15, [x0, #0] -mla v14.4S, v24.4S, v31.s[0] -mla v12.4S, v20.4S, v31.s[0] -sub v20.4s, v6.4s, v13.4s -add v6.4s, v6.4s, v13.4s -sqrdmulh v13.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v24.4s, v25.4s, v8.4s -add v25.4s, v25.4s, v8.4s -sqrdmulh v8.4S, v24.4S, v29.s[2] -mul v24.4S, v24.4S,v30.s[2] -sub v18.4s, v28.4s, v26.4s -add v28.4s, v28.4s, v26.4s -sqrdmulh v26.4S, v6.4S, v29.s[1] -mul v6.4S, v6.4S,v30.s[1] -sub v3.4s, v17.4s, v27.4s -add v17.4s, v17.4s, v27.4s -sqrdmulh v27.4S, v25.4S, v29.s[1] -mul v25.4S, v25.4S,v30.s[1] -sub v16.4s, v19.4s, v5.4s -add v19.4s, v19.4s, v5.4s -sqrdmulh v5.4S, v18.4S, v29.s[2] -mla v20.4S, v13.4S, v31.s[0] -sub v13.4s, v4.4s, v11.4s -add v4.4s, v4.4s, v11.4s -sqrdmulh v11.4S, v3.4S, v29.s[2] -mla v24.4S, v8.4S, v31.s[0] -sub v8.4s, v0.4s, v12.4s -add v0.4s, v0.4s, v12.4s -sqrdmulh v12.4S, v28.4S, v29.s[1] -mla v6.4S, v26.4S, v31.s[0] -sub v26.4s, v15.4s, v14.4s -str q21, [x0, #112] -sqrdmulh v21.4S, v17.4S, v29.s[1] -mla v25.4S, v27.4S, v31.s[0] -add v15.4s, v15.4s, v14.4s -str q10, [x0, #48] -mul v3.4S, v3.4S,v30.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v10.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -mla v3.4S, v11.4S, v31.s[0] -mla v18.4S, v5.4S, v31.s[0] -sub v5.4s, v13.4s, v24.4s -add v13.4s, v13.4s, v24.4s -mul v17.4S, v17.4S,v30.s[1] -mul v28.4S, v28.4S,v30.s[1] -sub v24.4s, v19.4s, v6.4s -add v19.4s, v19.4s, v6.4s -mla v17.4S, v21.4S, v31.s[0] -mla v28.4S, v12.4S, v31.s[0] -sub v12.4s, v4.4s, v25.4s -add v4.4s, v4.4s, v25.4s -sqrdmulh v25.4S, v10.4S, v22.s[3] -mul v10.4S, v10.4S,v23.s[3] -sub v21.4s, v8.4s, v18.4s -add v8.4s, v8.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v22.s[2] -mul v16.4S, v16.4S,v23.s[2] -sub v6.4s, v26.4s, v3.4s -add v26.4s, v26.4s, v3.4s -sqrdmulh v3.4S, v24.4S, v22.s[1] -mul v24.4S, v24.4S,v23.s[1] -sub v11.4s, v0.4s, v28.4s -add v0.4s, v0.4s, v28.4s -sqrdmulh v28.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v20.4s, v15.4s, v17.4s -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v5.4S, v22.s[3] -mla v10.4S, v25.4S, v31.s[0] -nop -nop -sqrdmulh v25.4S, v13.4S, v22.s[2] -mla v16.4S, v18.4S, v31.s[0] -nop -nop -sqrdmulh v18.4S, v12.4S, v22.s[1] -mla v24.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v4.4S, v22.s[0] -mla v19.4S, v28.4S, v31.s[0] -nop -nop -mul v13.4S, v13.4S,v23.s[2] -mul v5.4S, v5.4S,v23.s[3] -sub v28.4s, v21.4s, v10.4s -add v21.4s, v21.4s, v10.4s -mla v13.4S, v25.4S, v31.s[0] -mla v5.4S, v17.4S, v31.s[0] -sub v17.4s, v8.4s, v16.4s -add v8.4s, v8.4s, v16.4s -mul v4.4S, v4.4S,v23.s[0] -mul v12.4S, v12.4S,v23.s[1] -sub v16.4s, v11.4s, v24.4s -add v11.4s, v11.4s, v24.4s -mla v4.4S, v3.4S, v31.s[0] -mla v12.4S, v18.4S, v31.s[0] -sub v18.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v28.4S, v9.s[3] -mul v28.4S, v28.4S,v1.s[3] -sub v3.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v21.4S, v9.s[2] -mul v21.4S, v21.4S,v1.s[2] -sub v24.4s, v26.4s, v13.4s -add v26.4s, v26.4s, v13.4s -sqrdmulh v13.4S, v17.4S, v9.s[1] -mul v17.4S, v17.4S,v1.s[1] -sub v25.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -sqrdmulh v12.4S, v8.4S, v9.s[0] -mul v8.4S, v8.4S,v1.s[0] -sub v10.4s, v15.4s, v4.4s -add v15.4s, v15.4s, v4.4s -sqrdmulh v4.4S, v16.4S, v7.s[3] -mla v28.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v11.4S, v7.s[2] -mla v21.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v18.4S, v7.s[1] -mla v17.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v0.4S, v7.s[0] -mla v8.4S, v12.4S, v31.s[0] -nop -nop -mul v11.4S, v11.4S,v2.s[2] -mul v16.4S, v16.4S,v2.s[3] -sub v12.4s, v3.4s, v28.4s -str q12, [x0, #960] -mla v11.4S, v19.4S, v31.s[0] -mla v16.4S, v4.4S, v31.s[0] -add v3.4s, v3.4s, v28.4s -str q3, [x0, #896] -mul v0.4S, v0.4S,v2.s[0] -mul v18.4S, v18.4S,v2.s[1] -sub v3.4s, v6.4s, v21.4s -str q3, [x0, #832] -mla v0.4S, v13.4S, v31.s[0] -mla v18.4S, v5.4S, v31.s[0] -add v6.4s, v6.4s, v21.4s -sub v21.4s, v24.4s, v17.4s -ldr q5, [x0, #976] -sqrdmulh v13.4S, v5.4S, v29.s[0] -mul v5.4S, v5.4S,v30.s[0] -add v24.4s, v24.4s, v17.4s -str q6, [x0, #768] -ldr q6, [x0, #912] -sqrdmulh v17.4S, v6.4S, v29.s[0] -mul v6.4S, v6.4S,v30.s[0] -sub v3.4s, v26.4s, v8.4s -str q21, [x0, #704] -ldr q21, [x0, #848] -sqrdmulh v28.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -add v26.4s, v26.4s, v8.4s -str q24, [x0, #640] -ldr q24, [x0, #784] -sqrdmulh v8.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -sub v4.4s, v25.4s, v16.4s -str q3, [x0, #576] -ldr q3, [x0, #720] -sqrdmulh v19.4S, v3.4S, v29.s[0] -mla v5.4S, v13.4S, v31.s[0] -add v25.4s, v25.4s, v16.4s -str q26, [x0, #512] -ldr q26, [x0, #656] -sqrdmulh v16.4S, v26.4S, v29.s[0] -mla v6.4S, v17.4S, v31.s[0] -sub v17.4s, v20.4s, v11.4s -str q4, [x0, #448] -ldr q4, [x0, #592] -sqrdmulh v13.4S, v4.4S, v29.s[0] -mla v21.4S, v28.4S, v31.s[0] -add v20.4s, v20.4s, v11.4s -str q25, [x0, #384] -ldr q25, [x0, #528] -sqrdmulh v11.4S, v25.4S, v29.s[0] -mla v24.4S, v8.4S, v31.s[0] -sub v8.4s, v10.4s, v18.4s -str q17, [x0, #320] -ldr q17, [x0, #464] -add v10.4s, v10.4s, v18.4s -mul v26.4S, v26.4S,v30.s[0] -mul v3.4S, v3.4S,v30.s[0] -ldr q18, [x0, #400] -str q20, [x0, #256] -ldr q20, [x0, #336] -ldr q28, [x0, #272] -mla v26.4S, v16.4S, v31.s[0] -mla v3.4S, v19.4S, v31.s[0] -str q8, [x0, #192] -sub v8.4s, v15.4s, v0.4s -ldr q19, [x0, #208] -ldr q16, [x0, #144] -mul v25.4S, v25.4S,v30.s[0] -mul v4.4S, v4.4S,v30.s[0] -str q10, [x0, #128] -add v15.4s, v15.4s, v0.4s -ldr q0, [x0, #80] -ldr q10, [x0, #16] -mla v25.4S, v11.4S, v31.s[0] -mla v4.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v5.4s -add v17.4s, v17.4s, v5.4s -sqrdmulh v5.4S, v13.4S, v29.s[2] -mul v13.4S, v13.4S,v30.s[2] -sub v11.4s, v18.4s, v6.4s -add v18.4s, v18.4s, v6.4s -sqrdmulh v6.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v12.4s, v20.4s, v21.4s -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v14.4s, v28.4s, v24.4s -add v28.4s, v28.4s, v24.4s -sqrdmulh v24.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v27.4s, v19.4s, v3.4s -add v19.4s, v19.4s, v3.4s -sqrdmulh v3.4S, v12.4S, v29.s[2] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v16.4s, v26.4s -add v16.4s, v16.4s, v26.4s -sqrdmulh v26.4S, v14.4S, v29.s[2] -mla v11.4S, v6.4S, v31.s[0] -sub v6.4s, v0.4s, v4.4s -add v0.4s, v0.4s, v4.4s -sqrdmulh v4.4S, v20.4S, v29.s[1] -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v10.4s, v25.4s -str q8, [x0, #64] -sqrdmulh v8.4S, v28.4S, v29.s[1] -mla v18.4S, v24.4S, v31.s[0] -add v10.4s, v10.4s, v25.4s -str q15, [x0, #0] -mul v14.4S, v14.4S,v30.s[2] -mul v12.4S, v12.4S,v30.s[2] -sub v15.4s, v27.4s, v13.4s -add v27.4s, v27.4s, v13.4s -mla v14.4S, v26.4S, v31.s[0] -mla v12.4S, v3.4S, v31.s[0] -sub v3.4s, v5.4s, v11.4s -add v5.4s, v5.4s, v11.4s -mul v28.4S, v28.4S,v30.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v11.4s, v19.4s, v17.4s -add v19.4s, v19.4s, v17.4s -mla v28.4S, v8.4S, v31.s[0] -mla v20.4S, v4.4S, v31.s[0] -sub v4.4s, v16.4s, v18.4s -add v16.4s, v16.4s, v18.4s -sqrdmulh v29.4S, v15.4S, v22.s[3] -mul v15.4S, v15.4S,v23.s[3] -sub v30.4s, v6.4s, v12.4s -add v6.4s, v6.4s, v12.4s -sqrdmulh v12.4S, v27.4S, v22.s[2] -mul v27.4S, v27.4S,v23.s[2] -sub v18.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v22.s[1] -mul v11.4S, v11.4S,v23.s[1] -sub v8.4s, v0.4s, v20.4s -add v0.4s, v0.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v17.4s, v10.4s, v28.4s -add v10.4s, v10.4s, v28.4s -sqrdmulh v28.4S, v3.4S, v22.s[3] -mla v15.4S, v29.4S, v31.s[0] -nop -nop -sqrdmulh v29.4S, v5.4S, v22.s[2] -mla v27.4S, v12.4S, v31.s[0] -nop -nop -sqrdmulh v12.4S, v4.4S, v22.s[1] -mla v11.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v16.4S, v22.s[0] -mla v19.4S, v20.4S, v31.s[0] -nop -nop -mul v5.4S, v5.4S,v23.s[2] -mul v3.4S, v3.4S,v23.s[3] -sub v20.4s, v30.4s, v15.4s -add v30.4s, v30.4s, v15.4s -mla v5.4S, v29.4S, v31.s[0] -mla v3.4S, v28.4S, v31.s[0] -sub v28.4s, v6.4s, v27.4s -add v6.4s, v6.4s, v27.4s -mul v16.4S, v16.4S,v23.s[0] -mul v4.4S, v4.4S,v23.s[1] -sub v27.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -mla v16.4S, v14.4S, v31.s[0] -mla v4.4S, v12.4S, v31.s[0] -sub v12.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v22.4S, v20.4S, v9.s[3] -mul v20.4S, v20.4S,v1.s[3] -sub v23.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v30.4S, v9.s[2] -mul v30.4S, v30.4S,v1.s[2] -sub v19.4s, v21.4s, v5.4s -add v21.4s, v21.4s, v5.4s -sqrdmulh v5.4S, v28.4S, v9.s[1] -mul v28.4S, v28.4S,v1.s[1] -sub v14.4s, v17.4s, v4.4s -add v17.4s, v17.4s, v4.4s -sqrdmulh v4.4S, v6.4S, v9.s[0] -mul v6.4S, v6.4S,v1.s[0] -sub v11.4s, v10.4s, v16.4s -add v10.4s, v10.4s, v16.4s -sqrdmulh v9.4S, v27.4S, v7.s[3] -mla v20.4S, v22.4S, v31.s[0] -nop -nop -sqrdmulh v22.4S, v8.4S, v7.s[2] -mla v30.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v12.4S, v7.s[1] -mla v28.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v0.4S, v7.s[0] -mla v6.4S, v4.4S, v31.s[0] -nop -nop -mul v8.4S, v8.4S,v2.s[2] -mul v27.4S, v27.4S,v2.s[3] -sub v4.4s, v23.4s, v20.4s -str q4, [x0, #976] -mla v8.4S, v22.4S, v31.s[0] -mla v27.4S, v9.4S, v31.s[0] -add v23.4s, v23.4s, v20.4s -str q23, [x0, #912] -mul v0.4S, v0.4S,v2.s[0] -mul v12.4S, v12.4S,v2.s[1] -sub v23.4s, v18.4s, v30.4s -str q23, [x0, #848] -mla v0.4S, v5.4S, v31.s[0] -mla v12.4S, v3.4S, v31.s[0] -add v18.4s, v18.4s, v30.4s -sub v30.4s, v19.4s, v28.4s -add v19.4s, v19.4s, v28.4s -str q18, [x0, #784] -sub v18.4s, v21.4s, v6.4s -str q30, [x0, #720] -add v21.4s, v21.4s, v6.4s -str q19, [x0, #656] -sub v19.4s, v14.4s, v27.4s -str q18, [x0, #592] -add v14.4s, v14.4s, v27.4s -str q21, [x0, #528] -sub v21.4s, v17.4s, v8.4s -str q19, [x0, #464] -add v17.4s, v17.4s, v8.4s -str q14, [x0, #400] -sub v14.4s, v11.4s, v12.4s -str q21, [x0, #336] -add v11.4s, v11.4s, v12.4s -str q17, [x0, #272] -sub v17.4s, v10.4s, v0.4s -add v10.4s, v10.4s, v0.4s -ldr q24, [x0, #32] -ldr q25, [x0, #48] -ldr q13, [x0, #96] -ldr q26, [x0, #112] -ldr q15, [x17, #+128] -ldr q29, [x17, #+144] -ldr q16, [x17, #+160] -ldr q1, [x17, #+176] -ldr q4, [x0, #160] -ldr q22, [x0, #176] -sqrdmulh v9.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v15.s[0] -ldr q20, [x0, #224] -sqrdmulh v23.4S, v25.4S, v29.s[0] -mul v25.4S, v25.4S,v15.s[0] -ldr q5, [x0, #240] -sqrdmulh v3.4S, v13.4S, v1.s[0] -mul v13.4S, v13.4S,v16.s[0] -ldr q2, [x17, #+192] -sqrdmulh v7.4S, v26.4S, v1.s[0] -mul v26.4S, v26.4S,v16.s[0] -ldr q28, [x17, #+208] -mla v24.4S, v9.4S, v31.s[0] -sqrdmulh v9.4S, v4.4S, v28.s[0] -ldr q30, [x17, #+224] -mla v25.4S, v23.4S, v31.s[0] -sqrdmulh v23.4S, v22.4S, v28.s[0] -ldr q6, [x17, #+240] -mla v13.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v20.4S, v6.s[0] -ldr q18, [x0, #0] -mla v26.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v5.4S, v6.s[0] -mul v4.4S, v4.4S,v2.s[0] -mul v22.4S, v22.4S,v2.s[0] -ldr q27, [x0, #64] -mla v4.4S, v9.4S, v31.s[0] -mla v22.4S, v23.4S, v31.s[0] -sub v23.4s, v18.4s, v24.4s -add v18.4s, v18.4s, v24.4s -mul v20.4S, v20.4S,v30.s[0] -mul v5.4S, v5.4S,v30.s[0] -sub v24.4s, v10.4s, v25.4s -ldr q9, [x0, #128] -add v10.4s, v10.4s, v25.4s -mla v20.4S, v3.4S, v31.s[0] -mla v5.4S, v7.4S, v31.s[0] -sub v7.4s, v27.4s, v13.4s -add v27.4s, v27.4s, v13.4s -sqrdmulh v13.4S, v10.4S, v29.s[1] -mul v10.4S, v10.4S,v15.s[1] -sub v3.4s, v17.4s, v26.4s -ldr q25, [x0, #192] -add v17.4s, v17.4s, v26.4s -sqrdmulh v26.4S, v24.4S, v29.s[2] -mul v24.4S, v24.4S,v15.s[2] -sub v19.4s, v9.4s, v4.4s -add v9.4s, v9.4s, v4.4s -sqrdmulh v29.4S, v17.4S, v1.s[1] -mul v17.4S, v17.4S,v16.s[1] -sub v4.4s, v11.4s, v22.4s -ldr q15, [x0, #288] -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v3.4S, v1.s[2] -mul v3.4S, v3.4S,v16.s[2] -sub v8.4s, v25.4s, v20.4s -ldr q21, [x0, #304] -add v25.4s, v25.4s, v20.4s -mla v10.4S, v13.4S, v31.s[0] -sqrdmulh v13.4S, v11.4S, v28.s[1] -sub v1.4s, v14.4s, v5.4s -ldr q20, [x0, #352] -add v14.4s, v14.4s, v5.4s -mla v24.4S, v26.4S, v31.s[0] -sqrdmulh v26.4S, v4.4S, v28.s[2] -sub v5.4s, v18.4s, v10.4s -ldr q16, [x0, #368] -str q5, [x0, #16] -mla v17.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v14.4S, v6.s[1] -add v18.4s, v18.4s, v10.4s -ldr q10, [x17, #+256] -str q18, [x0, #0] -mla v3.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v1.4S, v6.s[2] -sub v18.4s, v23.4s, v24.4s -ldr q5, [x17, #+272] -str q18, [x0, #48] -mul v11.4S, v11.4S,v2.s[1] -mul v4.4S, v4.4S,v2.s[2] -add v23.4s, v23.4s, v24.4s -ldr q24, [x17, #+288] -str q23, [x0, #32] -mla v11.4S, v13.4S, v31.s[0] -mla v4.4S, v26.4S, v31.s[0] -sub v26.4s, v27.4s, v17.4s -ldr q13, [x17, #+304] -str q26, [x0, #80] -mul v14.4S, v14.4S,v30.s[1] -mul v1.4S, v1.4S,v30.s[2] -add v27.4s, v27.4s, v17.4s -ldr q17, [x0, #416] -str q27, [x0, #64] -mla v14.4S, v29.4S, v31.s[0] -mla v1.4S, v22.4S, v31.s[0] -sub v22.4s, v7.4s, v3.4s -ldr q29, [x0, #432] -str q22, [x0, #112] -sqrdmulh v6.4S, v15.4S, v5.s[0] -mul v15.4S, v15.4S,v10.s[0] -add v7.4s, v7.4s, v3.4s -ldr q3, [x0, #480] -str q7, [x0, #96] -sqrdmulh v7.4S, v21.4S, v5.s[0] -mul v21.4S, v21.4S,v10.s[0] -sub v22.4s, v9.4s, v11.4s -ldr q30, [x0, #496] -str q22, [x0, #144] -sqrdmulh v22.4S, v20.4S, v13.s[0] -mul v20.4S, v20.4S,v24.s[0] -add v9.4s, v9.4s, v11.4s -ldr q11, [x17, #+320] -str q9, [x0, #128] -sqrdmulh v9.4S, v16.4S, v13.s[0] -mul v16.4S, v16.4S,v24.s[0] -sub v27.4s, v19.4s, v4.4s -ldr q28, [x17, #+336] -str q27, [x0, #176] -mla v15.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v17.4S, v28.s[0] -add v19.4s, v19.4s, v4.4s -ldr q4, [x17, #+352] -str q19, [x0, #160] -mla v21.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v29.4S, v28.s[0] -sub v19.4s, v25.4s, v14.4s -ldr q27, [x17, #+368] -str q19, [x0, #208] -mla v20.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v3.4S, v27.s[0] -add v25.4s, v25.4s, v14.4s -ldr q14, [x0, #256] -str q25, [x0, #192] -mla v16.4S, v9.4S, v31.s[0] -sqrdmulh v9.4S, v30.4S, v27.s[0] -sub v25.4s, v8.4s, v1.4s -ldr q19, [x0, #272] -str q25, [x0, #240] -mul v17.4S, v17.4S,v11.s[0] -mul v29.4S, v29.4S,v11.s[0] -add v8.4s, v8.4s, v1.4s -ldr q1, [x0, #320] -str q8, [x0, #224] -mla v17.4S, v6.4S, v31.s[0] -mla v29.4S, v7.4S, v31.s[0] -sub v7.4s, v14.4s, v15.4s -ldr q6, [x0, #336] -add v14.4s, v14.4s, v15.4s -mul v3.4S, v3.4S,v4.s[0] -mul v30.4S, v30.4S,v4.s[0] -sub v15.4s, v19.4s, v21.4s -ldr q8, [x0, #384] -add v19.4s, v19.4s, v21.4s -mla v3.4S, v22.4S, v31.s[0] -mla v30.4S, v9.4S, v31.s[0] -sub v9.4s, v1.4s, v20.4s -ldr q22, [x0, #400] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v5.s[1] -mul v19.4S, v19.4S,v10.s[1] -sub v21.4s, v6.4s, v16.4s -ldr q25, [x0, #448] -add v6.4s, v6.4s, v16.4s -sqrdmulh v16.4S, v15.4S, v5.s[2] -mul v15.4S, v15.4S,v10.s[2] -sub v26.4s, v8.4s, v17.4s -ldr q2, [x0, #464] -add v8.4s, v8.4s, v17.4s -sqrdmulh v5.4S, v6.4S, v13.s[1] -mul v6.4S, v6.4S,v24.s[1] -sub v17.4s, v22.4s, v29.4s -ldr q10, [x0, #544] -add v22.4s, v22.4s, v29.4s -sqrdmulh v29.4S, v21.4S, v13.s[2] -mul v21.4S, v21.4S,v24.s[2] -sub v23.4s, v25.4s, v3.4s -ldr q18, [x0, #560] -add v25.4s, v25.4s, v3.4s -mla v19.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v22.4S, v28.s[1] -sub v13.4s, v2.4s, v30.4s -ldr q3, [x0, #608] -add v2.4s, v2.4s, v30.4s -mla v15.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v17.4S, v28.s[2] -sub v30.4s, v14.4s, v19.4s -ldr q24, [x0, #624] -str q30, [x0, #272] -mla v6.4S, v5.4S, v31.s[0] -sqrdmulh v5.4S, v2.4S, v27.s[1] -add v14.4s, v14.4s, v19.4s -ldr q19, [x17, #+384] -str q14, [x0, #256] -mla v21.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v13.4S, v27.s[2] -sub v14.4s, v7.4s, v15.4s -ldr q30, [x17, #+400] -str q14, [x0, #304] -mul v22.4S, v22.4S,v11.s[1] -mul v17.4S, v17.4S,v11.s[2] -add v7.4s, v7.4s, v15.4s -ldr q15, [x17, #+416] -str q7, [x0, #288] -mla v22.4S, v20.4S, v31.s[0] -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v1.4s, v6.4s -ldr q20, [x17, #+432] -str q16, [x0, #336] -mul v2.4S, v2.4S,v4.s[1] -mul v13.4S, v13.4S,v4.s[2] -add v1.4s, v1.4s, v6.4s -ldr q6, [x0, #672] -str q1, [x0, #320] -mla v2.4S, v5.4S, v31.s[0] -mla v13.4S, v29.4S, v31.s[0] -sub v29.4s, v9.4s, v21.4s -ldr q5, [x0, #688] -str q29, [x0, #368] -sqrdmulh v27.4S, v10.4S, v30.s[0] -mul v10.4S, v10.4S,v19.s[0] -add v9.4s, v9.4s, v21.4s -ldr q21, [x0, #736] -str q9, [x0, #352] -sqrdmulh v9.4S, v18.4S, v30.s[0] -mul v18.4S, v18.4S,v19.s[0] -sub v29.4s, v8.4s, v22.4s -ldr q4, [x0, #752] -str q29, [x0, #400] -sqrdmulh v29.4S, v3.4S, v20.s[0] -mul v3.4S, v3.4S,v15.s[0] -add v8.4s, v8.4s, v22.4s -ldr q22, [x17, #+448] -str q8, [x0, #384] -sqrdmulh v8.4S, v24.4S, v20.s[0] -mul v24.4S, v24.4S,v15.s[0] -sub v1.4s, v26.4s, v17.4s -ldr q28, [x17, #+464] -str q1, [x0, #432] -mla v10.4S, v27.4S, v31.s[0] -sqrdmulh v27.4S, v6.4S, v28.s[0] -add v26.4s, v26.4s, v17.4s -ldr q17, [x17, #+480] -str q26, [x0, #416] -mla v18.4S, v9.4S, v31.s[0] -sqrdmulh v9.4S, v5.4S, v28.s[0] -sub v26.4s, v25.4s, v2.4s -ldr q1, [x17, #+496] -str q26, [x0, #464] -mla v3.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v21.4S, v1.s[0] -add v25.4s, v25.4s, v2.4s -ldr q2, [x0, #512] -str q25, [x0, #448] -mla v24.4S, v8.4S, v31.s[0] -sqrdmulh v8.4S, v4.4S, v1.s[0] -sub v25.4s, v23.4s, v13.4s -ldr q26, [x0, #528] -str q25, [x0, #496] -mul v6.4S, v6.4S,v22.s[0] -mul v5.4S, v5.4S,v22.s[0] -add v23.4s, v23.4s, v13.4s -ldr q13, [x0, #576] -str q23, [x0, #480] -mla v6.4S, v27.4S, v31.s[0] -mla v5.4S, v9.4S, v31.s[0] -sub v9.4s, v2.4s, v10.4s -ldr q27, [x0, #592] -add v2.4s, v2.4s, v10.4s -mul v21.4S, v21.4S,v17.s[0] -mul v4.4S, v4.4S,v17.s[0] -sub v10.4s, v26.4s, v18.4s -ldr q23, [x0, #640] -add v26.4s, v26.4s, v18.4s -mla v21.4S, v29.4S, v31.s[0] -mla v4.4S, v8.4S, v31.s[0] -sub v8.4s, v13.4s, v3.4s -ldr q29, [x0, #656] -add v13.4s, v13.4s, v3.4s -sqrdmulh v3.4S, v26.4S, v30.s[1] -mul v26.4S, v26.4S,v19.s[1] -sub v18.4s, v27.4s, v24.4s -ldr q25, [x0, #704] -add v27.4s, v27.4s, v24.4s -sqrdmulh v24.4S, v10.4S, v30.s[2] -mul v10.4S, v10.4S,v19.s[2] -sub v16.4s, v23.4s, v6.4s -ldr q11, [x0, #720] -add v23.4s, v23.4s, v6.4s -sqrdmulh v30.4S, v27.4S, v20.s[1] -mul v27.4S, v27.4S,v15.s[1] -sub v6.4s, v29.4s, v5.4s -ldr q19, [x0, #800] -add v29.4s, v29.4s, v5.4s -sqrdmulh v5.4S, v18.4S, v20.s[2] -mul v18.4S, v18.4S,v15.s[2] -sub v7.4s, v25.4s, v21.4s -ldr q14, [x0, #816] -add v25.4s, v25.4s, v21.4s -mla v26.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v29.4S, v28.s[1] -sub v20.4s, v11.4s, v4.4s -ldr q21, [x0, #864] -add v11.4s, v11.4s, v4.4s -mla v10.4S, v24.4S, v31.s[0] -sqrdmulh v24.4S, v6.4S, v28.s[2] -sub v4.4s, v2.4s, v26.4s -ldr q15, [x0, #880] -str q4, [x0, #528] -mla v27.4S, v30.4S, v31.s[0] -sqrdmulh v30.4S, v11.4S, v1.s[1] -add v2.4s, v2.4s, v26.4s -ldr q26, [x17, #+512] -str q2, [x0, #512] -mla v18.4S, v5.4S, v31.s[0] -sqrdmulh v5.4S, v20.4S, v1.s[2] -sub v2.4s, v9.4s, v10.4s -ldr q4, [x17, #+528] -str q2, [x0, #560] -mul v29.4S, v29.4S,v22.s[1] -mul v6.4S, v6.4S,v22.s[2] -add v9.4s, v9.4s, v10.4s -ldr q10, [x17, #+544] -str q9, [x0, #544] -mla v29.4S, v3.4S, v31.s[0] -mla v6.4S, v24.4S, v31.s[0] -sub v24.4s, v13.4s, v27.4s -ldr q3, [x17, #+560] -str q24, [x0, #592] -mul v11.4S, v11.4S,v17.s[1] -mul v20.4S, v20.4S,v17.s[2] -add v13.4s, v13.4s, v27.4s -ldr q27, [x0, #928] -str q13, [x0, #576] -mla v11.4S, v30.4S, v31.s[0] -mla v20.4S, v5.4S, v31.s[0] -sub v5.4s, v8.4s, v18.4s -ldr q30, [x0, #944] -str q5, [x0, #624] -sqrdmulh v1.4S, v19.4S, v4.s[0] -mul v19.4S, v19.4S,v26.s[0] -add v8.4s, v8.4s, v18.4s -ldr q18, [x0, #992] -str q8, [x0, #608] -sqrdmulh v8.4S, v14.4S, v4.s[0] -mul v14.4S, v14.4S,v26.s[0] -sub v5.4s, v23.4s, v29.4s -ldr q17, [x0, #1008] -str q5, [x0, #656] -sqrdmulh v5.4S, v21.4S, v3.s[0] -mul v21.4S, v21.4S,v10.s[0] -add v23.4s, v23.4s, v29.4s -ldr q29, [x17, #+576] -str q23, [x0, #640] -sqrdmulh v23.4S, v15.4S, v3.s[0] -mul v15.4S, v15.4S,v10.s[0] -sub v13.4s, v16.4s, v6.4s -ldr q28, [x17, #+592] -str q13, [x0, #688] -mla v19.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v27.4S, v28.s[0] -add v16.4s, v16.4s, v6.4s -ldr q6, [x17, #+608] -str q16, [x0, #672] -mla v14.4S, v8.4S, v31.s[0] -sqrdmulh v8.4S, v30.4S, v28.s[0] -sub v16.4s, v25.4s, v11.4s -ldr q13, [x17, #+624] -str q16, [x0, #720] -mla v21.4S, v5.4S, v31.s[0] -sqrdmulh v5.4S, v18.4S, v13.s[0] -add v25.4s, v25.4s, v11.4s -ldr q11, [x0, #768] -str q25, [x0, #704] -mla v15.4S, v23.4S, v31.s[0] -sqrdmulh v23.4S, v17.4S, v13.s[0] -sub v25.4s, v7.4s, v20.4s -ldr q16, [x0, #784] -str q25, [x0, #752] -mul v27.4S, v27.4S,v29.s[0] -mul v30.4S, v30.4S,v29.s[0] -add v7.4s, v7.4s, v20.4s -ldr q20, [x0, #832] -str q7, [x0, #736] -mla v27.4S, v1.4S, v31.s[0] -mla v30.4S, v8.4S, v31.s[0] -sub v8.4s, v11.4s, v19.4s -ldr q1, [x0, #848] -add v11.4s, v11.4s, v19.4s -mul v18.4S, v18.4S,v6.s[0] -mul v17.4S, v17.4S,v6.s[0] -sub v19.4s, v16.4s, v14.4s -ldr q7, [x0, #896] -add v16.4s, v16.4s, v14.4s -mla v18.4S, v5.4S, v31.s[0] -mla v17.4S, v23.4S, v31.s[0] -sub v23.4s, v20.4s, v21.4s -ldr q5, [x0, #912] -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v16.4S, v4.s[1] -mul v16.4S, v16.4S,v26.s[1] -sub v14.4s, v1.4s, v15.4s -ldr q25, [x0, #960] -add v1.4s, v1.4s, v15.4s -sqrdmulh v15.4S, v19.4S, v4.s[2] -mul v19.4S, v19.4S,v26.s[2] -sub v24.4s, v7.4s, v27.4s -ldr q22, [x0, #976] -add v7.4s, v7.4s, v27.4s -sqrdmulh v4.4S, v1.4S, v3.s[1] -mul v1.4S, v1.4S,v10.s[1] -sub v27.4s, v5.4s, v30.4s -add v5.4s, v5.4s, v30.4s -sqrdmulh v30.4S, v14.4S, v3.s[2] -mul v14.4S, v14.4S,v10.s[2] -sub v26.4s, v25.4s, v18.4s -add v25.4s, v25.4s, v18.4s -mla v16.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v5.4S, v28.s[1] -sub v3.4s, v22.4s, v17.4s -add v22.4s, v22.4s, v17.4s -mla v19.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v27.4S, v28.s[2] -sub v17.4s, v11.4s, v16.4s -str q17, [x0, #784] -mla v1.4S, v4.4S, v31.s[0] -sqrdmulh v4.4S, v22.4S, v13.s[1] -add v11.4s, v11.4s, v16.4s -str q11, [x0, #768] -mla v14.4S, v30.4S, v31.s[0] -sqrdmulh v30.4S, v3.4S, v13.s[2] -sub v11.4s, v8.4s, v19.4s -str q11, [x0, #816] -mul v5.4S, v5.4S,v29.s[1] -mul v27.4S, v27.4S,v29.s[2] -add v8.4s, v8.4s, v19.4s -str q8, [x0, #800] -mla v5.4S, v21.4S, v31.s[0] -mla v27.4S, v15.4S, v31.s[0] -sub v15.4s, v20.4s, v1.4s -str q15, [x0, #848] -mul v22.4S, v22.4S,v6.s[1] -mul v3.4S, v3.4S,v6.s[2] -add v20.4s, v20.4s, v1.4s -str q20, [x0, #832] -mla v22.4S, v4.4S, v31.s[0] -mla v3.4S, v30.4S, v31.s[0] -sub v30.4s, v23.4s, v14.4s -str q30, [x0, #880] -add v23.4s, v23.4s, v14.4s -str q23, [x0, #864] -sub v23.4s, v7.4s, v5.4s -str q23, [x0, #912] -add v7.4s, v7.4s, v5.4s -str q7, [x0, #896] -sub v7.4s, v24.4s, v27.4s -str q7, [x0, #944] -add v24.4s, v24.4s, v27.4s -str q24, [x0, #928] -sub v24.4s, v25.4s, v22.4s -str q24, [x0, #976] -add v25.4s, v25.4s, v22.4s -str q25, [x0, #960] -sub v25.4s, v26.4s, v3.4s -str q25, [x0, #1008] -add v26.4s, v26.4s, v3.4s -str q26, [x0, #992] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1520 -// Instruction count: 1516 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_12.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_12.s deleted file mode 100644 index 68e1c14..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_12.s +++ /dev/null @@ -1,1550 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_22_z4_12 -.global _ntt_u32_incomplete_neon_asm_var_4_2_22_z4_12 -ntt_u32_incomplete_neon_asm_var_4_2_22_z4_12: -_ntt_u32_incomplete_neon_asm_var_4_2_22_z4_12: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x0, #992] -sqrdmulh v27.4S, v28.4S, v29.s[0] -mul v28.4S, v28.4S,v30.s[0] -ldr q26, [x0, #928] -sqrdmulh v25.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -ldr q24, [x0, #864] -sqrdmulh v23.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -ldr q22, [x0, #800] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #736] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mla v28.4S, v27.4S, v31.s[0] -ldr q27, [x0, #672] -sqrdmulh v18.4S, v27.4S, v29.s[0] -mla v26.4S, v25.4S, v31.s[0] -ldr q25, [x0, #608] -sqrdmulh v17.4S, v25.4S, v29.s[0] -mla v24.4S, v23.4S, v31.s[0] -ldr q23, [x0, #544] -sqrdmulh v16.4S, v23.4S, v29.s[0] -mla v22.4S, v21.4S, v31.s[0] -ldr q21, [x0, #480] -mul v27.4S, v27.4S,v30.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q3, [x0, #416] -ldr q2, [x0, #352] -ldr q1, [x0, #288] -mla v27.4S, v18.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -ldr q19, [x0, #224] -ldr q18, [x0, #160] -mul v23.4S, v23.4S,v30.s[0] -mul v25.4S, v25.4S,v30.s[0] -ldr q0, [x0, #96] -ldr q15, [x0, #32] -mla v23.4S, v16.4S, v31.s[0] -mla v25.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v28.4s -add v21.4s, v21.4s, v28.4s -sqrdmulh v28.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v16.4s, v3.4s, v26.4s -add v3.4s, v3.4s, v26.4s -sqrdmulh v26.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -sub v14.4s, v2.4s, v24.4s -add v2.4s, v2.4s, v24.4s -sqrdmulh v24.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v13.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v12.4s, v19.4s, v20.4s -add v19.4s, v19.4s, v20.4s -sqrdmulh v20.4S, v14.4S, v29.s[2] -mla v17.4S, v28.4S, v31.s[0] -sub v28.4s, v18.4s, v27.4s -add v18.4s, v18.4s, v27.4s -sqrdmulh v27.4S, v13.4S, v29.s[2] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v0.4s, v25.4s -add v0.4s, v0.4s, v25.4s -sqrdmulh v25.4S, v2.4S, v29.s[1] -mla v21.4S, v24.4S, v31.s[0] -sub v24.4s, v15.4s, v23.4s -sqrdmulh v11.4S, v1.4S, v29.s[1] -mla v3.4S, v22.4S, v31.s[0] -add v15.4s, v15.4s, v23.4s -ldr q23, [x17, #+32] -ldr q22, [x17, #+48] -mul v13.4S, v13.4S,v30.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v10.4s, v12.4s, v17.4s -add v12.4s, v12.4s, v17.4s -mla v13.4S, v27.4S, v31.s[0] -mla v14.4S, v20.4S, v31.s[0] -sub v20.4s, v28.4s, v16.4s -add v28.4s, v28.4s, v16.4s -mul v1.4S, v1.4S,v30.s[1] -mul v2.4S, v2.4S,v30.s[1] -sub v16.4s, v19.4s, v21.4s -add v19.4s, v19.4s, v21.4s -mla v1.4S, v11.4S, v31.s[0] -mla v2.4S, v25.4S, v31.s[0] -sub v25.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v10.4S, v22.s[3] -mul v10.4S, v10.4S,v23.s[3] -sub v11.4s, v26.4s, v14.4s -add v26.4s, v26.4s, v14.4s -sqrdmulh v14.4S, v12.4S, v22.s[2] -mul v12.4S, v12.4S,v23.s[2] -sub v21.4s, v24.4s, v13.4s -add v24.4s, v24.4s, v13.4s -sqrdmulh v13.4S, v16.4S, v22.s[1] -mul v16.4S, v16.4S,v23.s[1] -sub v27.4s, v0.4s, v2.4s -add v0.4s, v0.4s, v2.4s -sqrdmulh v2.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v17.4s, v15.4s, v1.4s -add v15.4s, v15.4s, v1.4s -ldr q1, [x17, #+96] -ldr q9, [x17, #+112] -sqrdmulh v8.4S, v20.4S, v22.s[3] -mla v10.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v28.4S, v22.s[2] -mla v12.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v25.4S, v22.s[1] -mla v16.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v18.4S, v22.s[0] -mla v19.4S, v2.4S, v31.s[0] -nop -nop -ldr q2, [x17, #+64] -ldr q7, [x17, #+80] -mul v28.4S, v28.4S,v23.s[2] -mul v20.4S, v20.4S,v23.s[3] -sub v6.4s, v11.4s, v10.4s -add v11.4s, v11.4s, v10.4s -mla v28.4S, v3.4S, v31.s[0] -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v26.4s, v12.4s -add v26.4s, v26.4s, v12.4s -mul v18.4S, v18.4S,v23.s[0] -mul v25.4S, v25.4S,v23.s[1] -sub v12.4s, v27.4s, v16.4s -add v27.4s, v27.4s, v16.4s -mla v18.4S, v13.4S, v31.s[0] -mla v25.4S, v14.4S, v31.s[0] -sub v14.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v9.s[3] -mul v6.4S, v6.4S,v1.s[3] -sub v13.4s, v21.4s, v20.4s -add v21.4s, v21.4s, v20.4s -sqrdmulh v20.4S, v11.4S, v9.s[2] -mul v11.4S, v11.4S,v1.s[2] -sub v16.4s, v24.4s, v28.4s -add v24.4s, v24.4s, v28.4s -sqrdmulh v28.4S, v8.4S, v9.s[1] -mul v8.4S, v8.4S,v1.s[1] -sub v3.4s, v17.4s, v25.4s -add v17.4s, v17.4s, v25.4s -sqrdmulh v25.4S, v26.4S, v9.s[0] -mul v26.4S, v26.4S,v1.s[0] -sub v10.4s, v15.4s, v18.4s -add v15.4s, v15.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v7.s[3] -mla v6.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v27.4S, v7.s[2] -mla v11.4S, v20.4S, v31.s[0] -nop -nop -sqrdmulh v20.4S, v14.4S, v7.s[1] -mla v8.4S, v28.4S, v31.s[0] -nop -nop -sqrdmulh v28.4S, v0.4S, v7.s[0] -mla v26.4S, v25.4S, v31.s[0] -nop -nop -mul v27.4S, v27.4S,v2.s[2] -mul v12.4S, v12.4S,v2.s[3] -sub v25.4s, v13.4s, v6.4s -str q25, [x0, #992] -mla v27.4S, v19.4S, v31.s[0] -mla v12.4S, v18.4S, v31.s[0] -add v13.4s, v13.4s, v6.4s -str q13, [x0, #928] -mul v0.4S, v0.4S,v2.s[0] -mul v14.4S, v14.4S,v2.s[1] -sub v13.4s, v21.4s, v11.4s -str q13, [x0, #864] -mla v0.4S, v28.4S, v31.s[0] -mla v14.4S, v20.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -sub v11.4s, v16.4s, v8.4s -ldr q20, [x0, #1008] -sqrdmulh v28.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v16.4s, v16.4s, v8.4s -str q21, [x0, #800] -ldr q21, [x0, #944] -sqrdmulh v8.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -sub v13.4s, v24.4s, v26.4s -str q11, [x0, #736] -ldr q11, [x0, #880] -sqrdmulh v6.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -add v24.4s, v24.4s, v26.4s -str q16, [x0, #672] -ldr q16, [x0, #816] -sqrdmulh v26.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v18.4s, v3.4s, v12.4s -str q13, [x0, #608] -ldr q13, [x0, #752] -sqrdmulh v19.4S, v13.4S, v29.s[0] -mla v20.4S, v28.4S, v31.s[0] -add v3.4s, v3.4s, v12.4s -str q24, [x0, #544] -ldr q24, [x0, #688] -sqrdmulh v12.4S, v24.4S, v29.s[0] -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v17.4s, v27.4s -str q18, [x0, #480] -ldr q18, [x0, #624] -sqrdmulh v28.4S, v18.4S, v29.s[0] -mla v11.4S, v6.4S, v31.s[0] -add v17.4s, v17.4s, v27.4s -str q3, [x0, #416] -ldr q3, [x0, #560] -sqrdmulh v27.4S, v3.4S, v29.s[0] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v10.4s, v14.4s -str q8, [x0, #352] -ldr q8, [x0, #496] -add v10.4s, v10.4s, v14.4s -mul v24.4S, v24.4S,v30.s[0] -mul v13.4S, v13.4S,v30.s[0] -ldr q14, [x0, #432] -str q17, [x0, #288] -ldr q17, [x0, #368] -ldr q6, [x0, #304] -mla v24.4S, v12.4S, v31.s[0] -mla v13.4S, v19.4S, v31.s[0] -str q26, [x0, #224] -sub v26.4s, v15.4s, v0.4s -ldr q19, [x0, #240] -ldr q12, [x0, #176] -mul v3.4S, v3.4S,v30.s[0] -mul v18.4S, v18.4S,v30.s[0] -str q10, [x0, #160] -add v15.4s, v15.4s, v0.4s -ldr q0, [x0, #112] -ldr q10, [x0, #48] -mla v3.4S, v27.4S, v31.s[0] -mla v18.4S, v28.4S, v31.s[0] -sub v28.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -sqrdmulh v20.4S, v28.4S, v29.s[2] -mul v28.4S, v28.4S,v30.s[2] -sub v27.4s, v14.4s, v21.4s -add v14.4s, v14.4s, v21.4s -sqrdmulh v21.4S, v27.4S, v29.s[2] -mul v27.4S, v27.4S,v30.s[2] -sub v25.4s, v17.4s, v11.4s -add v17.4s, v17.4s, v11.4s -sqrdmulh v11.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sub v5.4s, v6.4s, v16.4s -add v6.4s, v6.4s, v16.4s -sqrdmulh v16.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -sub v4.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -sqrdmulh v13.4S, v25.4S, v29.s[2] -mla v28.4S, v20.4S, v31.s[0] -sub v20.4s, v12.4s, v24.4s -add v12.4s, v12.4s, v24.4s -sqrdmulh v24.4S, v5.4S, v29.s[2] -mla v27.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v18.4s -add v0.4s, v0.4s, v18.4s -sqrdmulh v18.4S, v17.4S, v29.s[1] -mla v8.4S, v11.4S, v31.s[0] -sub v11.4s, v10.4s, v3.4s -str q26, [x0, #96] -sqrdmulh v26.4S, v6.4S, v29.s[1] -mla v14.4S, v16.4S, v31.s[0] -add v10.4s, v10.4s, v3.4s -str q15, [x0, #32] -mul v5.4S, v5.4S,v30.s[2] -mul v25.4S, v25.4S,v30.s[2] -sub v15.4s, v4.4s, v28.4s -add v4.4s, v4.4s, v28.4s -mla v5.4S, v24.4S, v31.s[0] -mla v25.4S, v13.4S, v31.s[0] -sub v13.4s, v20.4s, v27.4s -add v20.4s, v20.4s, v27.4s -mul v6.4S, v6.4S,v30.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v27.4s, v19.4s, v8.4s -add v19.4s, v19.4s, v8.4s -mla v6.4S, v26.4S, v31.s[0] -mla v17.4S, v18.4S, v31.s[0] -sub v18.4s, v12.4s, v14.4s -add v12.4s, v12.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v22.s[3] -mul v15.4S, v15.4S,v23.s[3] -sub v26.4s, v21.4s, v25.4s -add v21.4s, v21.4s, v25.4s -sqrdmulh v25.4S, v4.4S, v22.s[2] -mul v4.4S, v4.4S,v23.s[2] -sub v8.4s, v11.4s, v5.4s -add v11.4s, v11.4s, v5.4s -sqrdmulh v5.4S, v27.4S, v22.s[1] -mul v27.4S, v27.4S,v23.s[1] -sub v24.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v28.4s, v10.4s, v6.4s -add v10.4s, v10.4s, v6.4s -sqrdmulh v6.4S, v13.4S, v22.s[3] -mla v15.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v20.4S, v22.s[2] -mla v4.4S, v25.4S, v31.s[0] -nop -nop -sqrdmulh v25.4S, v18.4S, v22.s[1] -mla v27.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v12.4S, v22.s[0] -mla v19.4S, v17.4S, v31.s[0] -nop -nop -mul v20.4S, v20.4S,v23.s[2] -mul v13.4S, v13.4S,v23.s[3] -sub v17.4s, v26.4s, v15.4s -add v26.4s, v26.4s, v15.4s -mla v20.4S, v14.4S, v31.s[0] -mla v13.4S, v6.4S, v31.s[0] -sub v6.4s, v21.4s, v4.4s -add v21.4s, v21.4s, v4.4s -mul v12.4S, v12.4S,v23.s[0] -mul v18.4S, v18.4S,v23.s[1] -sub v4.4s, v24.4s, v27.4s -add v24.4s, v24.4s, v27.4s -mla v12.4S, v5.4S, v31.s[0] -mla v18.4S, v25.4S, v31.s[0] -sub v25.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v17.4S, v9.s[3] -mul v17.4S, v17.4S,v1.s[3] -sub v5.4s, v8.4s, v13.4s -add v8.4s, v8.4s, v13.4s -sqrdmulh v13.4S, v26.4S, v9.s[2] -mul v26.4S, v26.4S,v1.s[2] -sub v27.4s, v11.4s, v20.4s -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v6.4S, v9.s[1] -mul v6.4S, v6.4S,v1.s[1] -sub v14.4s, v28.4s, v18.4s -add v28.4s, v28.4s, v18.4s -sqrdmulh v18.4S, v21.4S, v9.s[0] -mul v21.4S, v21.4S,v1.s[0] -sub v15.4s, v10.4s, v12.4s -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v4.4S, v7.s[3] -mla v17.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v24.4S, v7.s[2] -mla v26.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v25.4S, v7.s[1] -mla v6.4S, v20.4S, v31.s[0] -nop -nop -sqrdmulh v20.4S, v0.4S, v7.s[0] -mla v21.4S, v18.4S, v31.s[0] -nop -nop -mul v24.4S, v24.4S,v2.s[2] -mul v4.4S, v4.4S,v2.s[3] -sub v18.4s, v5.4s, v17.4s -str q18, [x0, #1008] -mla v24.4S, v19.4S, v31.s[0] -mla v4.4S, v12.4S, v31.s[0] -add v5.4s, v5.4s, v17.4s -str q5, [x0, #944] -mul v0.4S, v0.4S,v2.s[0] -mul v25.4S, v25.4S,v2.s[1] -sub v5.4s, v8.4s, v26.4s -str q5, [x0, #880] -mla v0.4S, v20.4S, v31.s[0] -mla v25.4S, v13.4S, v31.s[0] -add v8.4s, v8.4s, v26.4s -sub v26.4s, v27.4s, v6.4s -ldr q13, [x0, #960] -sqrdmulh v20.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -add v27.4s, v27.4s, v6.4s -str q8, [x0, #816] -ldr q8, [x0, #896] -sqrdmulh v6.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v5.4s, v11.4s, v21.4s -str q26, [x0, #752] -ldr q26, [x0, #832] -sqrdmulh v17.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -add v11.4s, v11.4s, v21.4s -str q27, [x0, #688] -ldr q27, [x0, #768] -sqrdmulh v21.4S, v27.4S, v29.s[0] -mul v27.4S, v27.4S,v30.s[0] -sub v12.4s, v14.4s, v4.4s -str q5, [x0, #624] -ldr q5, [x0, #704] -sqrdmulh v19.4S, v5.4S, v29.s[0] -mla v13.4S, v20.4S, v31.s[0] -add v14.4s, v14.4s, v4.4s -str q11, [x0, #560] -ldr q11, [x0, #640] -sqrdmulh v4.4S, v11.4S, v29.s[0] -mla v8.4S, v6.4S, v31.s[0] -sub v6.4s, v28.4s, v24.4s -str q12, [x0, #496] -ldr q12, [x0, #576] -sqrdmulh v20.4S, v12.4S, v29.s[0] -mla v26.4S, v17.4S, v31.s[0] -add v28.4s, v28.4s, v24.4s -str q14, [x0, #432] -ldr q14, [x0, #512] -sqrdmulh v24.4S, v14.4S, v29.s[0] -mla v27.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v25.4s -str q6, [x0, #368] -ldr q6, [x0, #448] -add v15.4s, v15.4s, v25.4s -mul v11.4S, v11.4S,v30.s[0] -mul v5.4S, v5.4S,v30.s[0] -ldr q25, [x0, #384] -str q28, [x0, #304] -ldr q28, [x0, #320] -ldr q17, [x0, #256] -mla v11.4S, v4.4S, v31.s[0] -mla v5.4S, v19.4S, v31.s[0] -str q21, [x0, #240] -sub v21.4s, v10.4s, v0.4s -ldr q19, [x0, #192] -ldr q4, [x0, #128] -mul v14.4S, v14.4S,v30.s[0] -mul v12.4S, v12.4S,v30.s[0] -str q15, [x0, #176] -add v10.4s, v10.4s, v0.4s -ldr q0, [x0, #64] -ldr q15, [x0, #0] -mla v14.4S, v24.4S, v31.s[0] -mla v12.4S, v20.4S, v31.s[0] -sub v20.4s, v6.4s, v13.4s -add v6.4s, v6.4s, v13.4s -sqrdmulh v13.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v24.4s, v25.4s, v8.4s -add v25.4s, v25.4s, v8.4s -sqrdmulh v8.4S, v24.4S, v29.s[2] -mul v24.4S, v24.4S,v30.s[2] -sub v18.4s, v28.4s, v26.4s -add v28.4s, v28.4s, v26.4s -sqrdmulh v26.4S, v6.4S, v29.s[1] -mul v6.4S, v6.4S,v30.s[1] -sub v3.4s, v17.4s, v27.4s -add v17.4s, v17.4s, v27.4s -sqrdmulh v27.4S, v25.4S, v29.s[1] -mul v25.4S, v25.4S,v30.s[1] -sub v16.4s, v19.4s, v5.4s -add v19.4s, v19.4s, v5.4s -sqrdmulh v5.4S, v18.4S, v29.s[2] -mla v20.4S, v13.4S, v31.s[0] -sub v13.4s, v4.4s, v11.4s -add v4.4s, v4.4s, v11.4s -sqrdmulh v11.4S, v3.4S, v29.s[2] -mla v24.4S, v8.4S, v31.s[0] -sub v8.4s, v0.4s, v12.4s -add v0.4s, v0.4s, v12.4s -sqrdmulh v12.4S, v28.4S, v29.s[1] -mla v6.4S, v26.4S, v31.s[0] -sub v26.4s, v15.4s, v14.4s -str q21, [x0, #112] -sqrdmulh v21.4S, v17.4S, v29.s[1] -mla v25.4S, v27.4S, v31.s[0] -add v15.4s, v15.4s, v14.4s -str q10, [x0, #48] -mul v3.4S, v3.4S,v30.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v10.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -mla v3.4S, v11.4S, v31.s[0] -mla v18.4S, v5.4S, v31.s[0] -sub v5.4s, v13.4s, v24.4s -add v13.4s, v13.4s, v24.4s -mul v17.4S, v17.4S,v30.s[1] -mul v28.4S, v28.4S,v30.s[1] -sub v24.4s, v19.4s, v6.4s -add v19.4s, v19.4s, v6.4s -mla v17.4S, v21.4S, v31.s[0] -mla v28.4S, v12.4S, v31.s[0] -sub v12.4s, v4.4s, v25.4s -add v4.4s, v4.4s, v25.4s -sqrdmulh v25.4S, v10.4S, v22.s[3] -mul v10.4S, v10.4S,v23.s[3] -sub v21.4s, v8.4s, v18.4s -add v8.4s, v8.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v22.s[2] -mul v16.4S, v16.4S,v23.s[2] -sub v6.4s, v26.4s, v3.4s -add v26.4s, v26.4s, v3.4s -sqrdmulh v3.4S, v24.4S, v22.s[1] -mul v24.4S, v24.4S,v23.s[1] -sub v11.4s, v0.4s, v28.4s -add v0.4s, v0.4s, v28.4s -sqrdmulh v28.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v20.4s, v15.4s, v17.4s -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v5.4S, v22.s[3] -mla v10.4S, v25.4S, v31.s[0] -nop -nop -sqrdmulh v25.4S, v13.4S, v22.s[2] -mla v16.4S, v18.4S, v31.s[0] -nop -nop -sqrdmulh v18.4S, v12.4S, v22.s[1] -mla v24.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v4.4S, v22.s[0] -mla v19.4S, v28.4S, v31.s[0] -nop -nop -mul v13.4S, v13.4S,v23.s[2] -mul v5.4S, v5.4S,v23.s[3] -sub v28.4s, v21.4s, v10.4s -add v21.4s, v21.4s, v10.4s -mla v13.4S, v25.4S, v31.s[0] -mla v5.4S, v17.4S, v31.s[0] -sub v17.4s, v8.4s, v16.4s -add v8.4s, v8.4s, v16.4s -mul v4.4S, v4.4S,v23.s[0] -mul v12.4S, v12.4S,v23.s[1] -sub v16.4s, v11.4s, v24.4s -add v11.4s, v11.4s, v24.4s -mla v4.4S, v3.4S, v31.s[0] -mla v12.4S, v18.4S, v31.s[0] -sub v18.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v28.4S, v9.s[3] -mul v28.4S, v28.4S,v1.s[3] -sub v3.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v21.4S, v9.s[2] -mul v21.4S, v21.4S,v1.s[2] -sub v24.4s, v26.4s, v13.4s -add v26.4s, v26.4s, v13.4s -sqrdmulh v13.4S, v17.4S, v9.s[1] -mul v17.4S, v17.4S,v1.s[1] -sub v25.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -sqrdmulh v12.4S, v8.4S, v9.s[0] -mul v8.4S, v8.4S,v1.s[0] -sub v10.4s, v15.4s, v4.4s -add v15.4s, v15.4s, v4.4s -sqrdmulh v4.4S, v16.4S, v7.s[3] -mla v28.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v11.4S, v7.s[2] -mla v21.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v18.4S, v7.s[1] -mla v17.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v0.4S, v7.s[0] -mla v8.4S, v12.4S, v31.s[0] -nop -nop -mul v11.4S, v11.4S,v2.s[2] -mul v16.4S, v16.4S,v2.s[3] -sub v12.4s, v3.4s, v28.4s -str q12, [x0, #960] -mla v11.4S, v19.4S, v31.s[0] -mla v16.4S, v4.4S, v31.s[0] -add v3.4s, v3.4s, v28.4s -str q3, [x0, #896] -mul v0.4S, v0.4S,v2.s[0] -mul v18.4S, v18.4S,v2.s[1] -sub v3.4s, v6.4s, v21.4s -str q3, [x0, #832] -mla v0.4S, v13.4S, v31.s[0] -mla v18.4S, v5.4S, v31.s[0] -add v6.4s, v6.4s, v21.4s -sub v21.4s, v24.4s, v17.4s -ldr q5, [x0, #976] -sqrdmulh v13.4S, v5.4S, v29.s[0] -mul v5.4S, v5.4S,v30.s[0] -add v24.4s, v24.4s, v17.4s -str q6, [x0, #768] -ldr q6, [x0, #912] -sqrdmulh v17.4S, v6.4S, v29.s[0] -mul v6.4S, v6.4S,v30.s[0] -sub v3.4s, v26.4s, v8.4s -str q21, [x0, #704] -ldr q21, [x0, #848] -sqrdmulh v28.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -add v26.4s, v26.4s, v8.4s -str q24, [x0, #640] -ldr q24, [x0, #784] -sqrdmulh v8.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -sub v4.4s, v25.4s, v16.4s -str q3, [x0, #576] -ldr q3, [x0, #720] -sqrdmulh v19.4S, v3.4S, v29.s[0] -mla v5.4S, v13.4S, v31.s[0] -add v25.4s, v25.4s, v16.4s -str q26, [x0, #512] -ldr q26, [x0, #656] -sqrdmulh v16.4S, v26.4S, v29.s[0] -mla v6.4S, v17.4S, v31.s[0] -sub v17.4s, v20.4s, v11.4s -str q4, [x0, #448] -ldr q4, [x0, #592] -sqrdmulh v13.4S, v4.4S, v29.s[0] -mla v21.4S, v28.4S, v31.s[0] -add v20.4s, v20.4s, v11.4s -str q25, [x0, #384] -ldr q25, [x0, #528] -sqrdmulh v11.4S, v25.4S, v29.s[0] -mla v24.4S, v8.4S, v31.s[0] -sub v8.4s, v10.4s, v18.4s -str q17, [x0, #320] -ldr q17, [x0, #464] -add v10.4s, v10.4s, v18.4s -mul v26.4S, v26.4S,v30.s[0] -mul v3.4S, v3.4S,v30.s[0] -ldr q18, [x0, #400] -str q20, [x0, #256] -ldr q20, [x0, #336] -ldr q28, [x0, #272] -mla v26.4S, v16.4S, v31.s[0] -mla v3.4S, v19.4S, v31.s[0] -str q8, [x0, #192] -sub v8.4s, v15.4s, v0.4s -ldr q19, [x0, #208] -ldr q16, [x0, #144] -mul v25.4S, v25.4S,v30.s[0] -mul v4.4S, v4.4S,v30.s[0] -str q10, [x0, #128] -add v15.4s, v15.4s, v0.4s -ldr q0, [x0, #80] -ldr q10, [x0, #16] -mla v25.4S, v11.4S, v31.s[0] -mla v4.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v5.4s -add v17.4s, v17.4s, v5.4s -sqrdmulh v5.4S, v13.4S, v29.s[2] -mul v13.4S, v13.4S,v30.s[2] -sub v11.4s, v18.4s, v6.4s -add v18.4s, v18.4s, v6.4s -sqrdmulh v6.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v12.4s, v20.4s, v21.4s -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v14.4s, v28.4s, v24.4s -add v28.4s, v28.4s, v24.4s -sqrdmulh v24.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v27.4s, v19.4s, v3.4s -add v19.4s, v19.4s, v3.4s -sqrdmulh v3.4S, v12.4S, v29.s[2] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v16.4s, v26.4s -add v16.4s, v16.4s, v26.4s -sqrdmulh v26.4S, v14.4S, v29.s[2] -mla v11.4S, v6.4S, v31.s[0] -sub v6.4s, v0.4s, v4.4s -add v0.4s, v0.4s, v4.4s -sqrdmulh v4.4S, v20.4S, v29.s[1] -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v10.4s, v25.4s -str q8, [x0, #64] -sqrdmulh v8.4S, v28.4S, v29.s[1] -mla v18.4S, v24.4S, v31.s[0] -add v10.4s, v10.4s, v25.4s -str q15, [x0, #0] -mul v14.4S, v14.4S,v30.s[2] -mul v12.4S, v12.4S,v30.s[2] -sub v15.4s, v27.4s, v13.4s -add v27.4s, v27.4s, v13.4s -mla v14.4S, v26.4S, v31.s[0] -mla v12.4S, v3.4S, v31.s[0] -sub v3.4s, v5.4s, v11.4s -add v5.4s, v5.4s, v11.4s -mul v28.4S, v28.4S,v30.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v11.4s, v19.4s, v17.4s -add v19.4s, v19.4s, v17.4s -mla v28.4S, v8.4S, v31.s[0] -mla v20.4S, v4.4S, v31.s[0] -sub v4.4s, v16.4s, v18.4s -add v16.4s, v16.4s, v18.4s -sqrdmulh v29.4S, v15.4S, v22.s[3] -mul v15.4S, v15.4S,v23.s[3] -sub v30.4s, v6.4s, v12.4s -add v6.4s, v6.4s, v12.4s -sqrdmulh v12.4S, v27.4S, v22.s[2] -mul v27.4S, v27.4S,v23.s[2] -sub v18.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v22.s[1] -mul v11.4S, v11.4S,v23.s[1] -sub v8.4s, v0.4s, v20.4s -add v0.4s, v0.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v17.4s, v10.4s, v28.4s -add v10.4s, v10.4s, v28.4s -sqrdmulh v28.4S, v3.4S, v22.s[3] -mla v15.4S, v29.4S, v31.s[0] -nop -nop -sqrdmulh v29.4S, v5.4S, v22.s[2] -mla v27.4S, v12.4S, v31.s[0] -nop -nop -sqrdmulh v12.4S, v4.4S, v22.s[1] -mla v11.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v16.4S, v22.s[0] -mla v19.4S, v20.4S, v31.s[0] -nop -nop -mul v5.4S, v5.4S,v23.s[2] -mul v3.4S, v3.4S,v23.s[3] -sub v20.4s, v30.4s, v15.4s -add v30.4s, v30.4s, v15.4s -mla v5.4S, v29.4S, v31.s[0] -mla v3.4S, v28.4S, v31.s[0] -sub v28.4s, v6.4s, v27.4s -add v6.4s, v6.4s, v27.4s -mul v16.4S, v16.4S,v23.s[0] -mul v4.4S, v4.4S,v23.s[1] -sub v27.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -mla v16.4S, v14.4S, v31.s[0] -mla v4.4S, v12.4S, v31.s[0] -sub v12.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v22.4S, v20.4S, v9.s[3] -mul v20.4S, v20.4S,v1.s[3] -sub v23.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v30.4S, v9.s[2] -mul v30.4S, v30.4S,v1.s[2] -sub v19.4s, v21.4s, v5.4s -add v21.4s, v21.4s, v5.4s -sqrdmulh v5.4S, v28.4S, v9.s[1] -mul v28.4S, v28.4S,v1.s[1] -sub v14.4s, v17.4s, v4.4s -add v17.4s, v17.4s, v4.4s -sqrdmulh v4.4S, v6.4S, v9.s[0] -mul v6.4S, v6.4S,v1.s[0] -sub v11.4s, v10.4s, v16.4s -add v10.4s, v10.4s, v16.4s -sqrdmulh v9.4S, v27.4S, v7.s[3] -mla v20.4S, v22.4S, v31.s[0] -nop -nop -sqrdmulh v22.4S, v8.4S, v7.s[2] -mla v30.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v12.4S, v7.s[1] -mla v28.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v0.4S, v7.s[0] -mla v6.4S, v4.4S, v31.s[0] -nop -nop -mul v8.4S, v8.4S,v2.s[2] -mul v27.4S, v27.4S,v2.s[3] -sub v4.4s, v23.4s, v20.4s -str q4, [x0, #976] -mla v8.4S, v22.4S, v31.s[0] -mla v27.4S, v9.4S, v31.s[0] -add v23.4s, v23.4s, v20.4s -str q23, [x0, #912] -mul v0.4S, v0.4S,v2.s[0] -mul v12.4S, v12.4S,v2.s[1] -sub v23.4s, v18.4s, v30.4s -str q23, [x0, #848] -mla v0.4S, v5.4S, v31.s[0] -mla v12.4S, v3.4S, v31.s[0] -add v18.4s, v18.4s, v30.4s -sub v30.4s, v19.4s, v28.4s -add v19.4s, v19.4s, v28.4s -str q18, [x0, #784] -sub v18.4s, v21.4s, v6.4s -str q30, [x0, #720] -add v21.4s, v21.4s, v6.4s -str q19, [x0, #656] -sub v19.4s, v14.4s, v27.4s -str q18, [x0, #592] -add v14.4s, v14.4s, v27.4s -str q21, [x0, #528] -sub v21.4s, v17.4s, v8.4s -str q19, [x0, #464] -add v17.4s, v17.4s, v8.4s -str q14, [x0, #400] -sub v14.4s, v11.4s, v12.4s -str q21, [x0, #336] -add v11.4s, v11.4s, v12.4s -str q17, [x0, #272] -sub v17.4s, v10.4s, v0.4s -add v10.4s, v10.4s, v0.4s -ldr q24, [x0, #48] -ldr q25, [x0, #32] -ldr q13, [x0, #112] -ldr q26, [x0, #96] -ldr q15, [x17, #+128] -ldr q29, [x17, #+144] -ldr q16, [x17, #+160] -ldr q1, [x17, #+176] -ldr q4, [x0, #176] -ldr q22, [x0, #160] -sqrdmulh v9.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v15.s[0] -ldr q20, [x0, #240] -sqrdmulh v23.4S, v25.4S, v29.s[0] -mul v25.4S, v25.4S,v15.s[0] -ldr q5, [x0, #224] -sqrdmulh v3.4S, v13.4S, v1.s[0] -mul v13.4S, v13.4S,v16.s[0] -ldr q2, [x17, #+192] -sqrdmulh v7.4S, v26.4S, v1.s[0] -mul v26.4S, v26.4S,v16.s[0] -ldr q28, [x17, #+208] -mla v24.4S, v9.4S, v31.s[0] -sqrdmulh v9.4S, v4.4S, v28.s[0] -ldr q30, [x17, #+224] -mla v25.4S, v23.4S, v31.s[0] -sqrdmulh v23.4S, v22.4S, v28.s[0] -ldr q6, [x17, #+240] -mla v13.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v20.4S, v6.s[0] -mla v26.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v5.4S, v6.s[0] -ldr q18, [x0, #0] -mul v4.4S, v4.4S,v2.s[0] -mul v22.4S, v22.4S,v2.s[0] -mla v4.4S, v9.4S, v31.s[0] -mla v22.4S, v23.4S, v31.s[0] -sub v23.4s, v10.4s, v24.4s -ldr q9, [x0, #64] -add v10.4s, v10.4s, v24.4s -mul v20.4S, v20.4S,v30.s[0] -mul v5.4S, v5.4S,v30.s[0] -sub v24.4s, v18.4s, v25.4s -add v18.4s, v18.4s, v25.4s -mla v20.4S, v3.4S, v31.s[0] -mla v5.4S, v7.4S, v31.s[0] -sub v7.4s, v17.4s, v13.4s -ldr q3, [x0, #128] -add v17.4s, v17.4s, v13.4s -sqrdmulh v13.4S, v23.4S, v29.s[2] -mul v23.4S, v23.4S,v15.s[2] -sub v25.4s, v9.4s, v26.4s -add v9.4s, v9.4s, v26.4s -sqrdmulh v26.4S, v10.4S, v29.s[1] -mul v10.4S, v10.4S,v15.s[1] -sub v27.4s, v11.4s, v4.4s -ldr q19, [x0, #192] -add v11.4s, v11.4s, v4.4s -sqrdmulh v29.4S, v7.4S, v1.s[2] -mul v7.4S, v7.4S,v16.s[2] -sub v4.4s, v3.4s, v22.4s -ldr q15, [x0, #304] -add v3.4s, v3.4s, v22.4s -sqrdmulh v22.4S, v17.4S, v1.s[1] -mul v17.4S, v17.4S,v16.s[1] -sub v8.4s, v14.4s, v20.4s -ldr q21, [x0, #288] -add v14.4s, v14.4s, v20.4s -mla v23.4S, v13.4S, v31.s[0] -sqrdmulh v13.4S, v27.4S, v28.s[2] -sub v1.4s, v19.4s, v5.4s -ldr q20, [x0, #368] -add v19.4s, v19.4s, v5.4s -mla v10.4S, v26.4S, v31.s[0] -sqrdmulh v26.4S, v11.4S, v28.s[1] -sub v5.4s, v24.4s, v23.4s -ldr q16, [x0, #352] -str q5, [x0, #48] -mla v7.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v8.4S, v6.s[2] -add v24.4s, v24.4s, v23.4s -ldr q23, [x17, #+256] -str q24, [x0, #32] -mla v17.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v14.4S, v6.s[1] -sub v24.4s, v18.4s, v10.4s -ldr q5, [x17, #+272] -str q24, [x0, #16] -mul v27.4S, v27.4S,v2.s[2] -mul v11.4S, v11.4S,v2.s[1] -add v18.4s, v18.4s, v10.4s -ldr q10, [x17, #+288] -str q18, [x0, #0] -mla v27.4S, v13.4S, v31.s[0] -mla v11.4S, v26.4S, v31.s[0] -sub v26.4s, v25.4s, v7.4s -ldr q13, [x17, #+304] -str q26, [x0, #112] -mul v8.4S, v8.4S,v30.s[2] -mul v14.4S, v14.4S,v30.s[1] -add v25.4s, v25.4s, v7.4s -ldr q7, [x0, #432] -str q25, [x0, #96] -mla v8.4S, v29.4S, v31.s[0] -mla v14.4S, v22.4S, v31.s[0] -sub v22.4s, v9.4s, v17.4s -ldr q29, [x0, #416] -str q22, [x0, #80] -sqrdmulh v6.4S, v15.4S, v5.s[0] -mul v15.4S, v15.4S,v23.s[0] -add v9.4s, v9.4s, v17.4s -ldr q17, [x0, #496] -str q9, [x0, #64] -sqrdmulh v9.4S, v21.4S, v5.s[0] -mul v21.4S, v21.4S,v23.s[0] -sub v22.4s, v4.4s, v27.4s -ldr q30, [x0, #480] -str q22, [x0, #176] -sqrdmulh v22.4S, v20.4S, v13.s[0] -mul v20.4S, v20.4S,v10.s[0] -add v4.4s, v4.4s, v27.4s -ldr q27, [x17, #+320] -str q4, [x0, #160] -sqrdmulh v4.4S, v16.4S, v13.s[0] -mul v16.4S, v16.4S,v10.s[0] -sub v25.4s, v3.4s, v11.4s -ldr q28, [x17, #+336] -str q25, [x0, #144] -mla v15.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v7.4S, v28.s[0] -add v3.4s, v3.4s, v11.4s -ldr q11, [x17, #+352] -str q3, [x0, #128] -mla v21.4S, v9.4S, v31.s[0] -sqrdmulh v9.4S, v29.4S, v28.s[0] -sub v3.4s, v1.4s, v8.4s -ldr q25, [x17, #+368] -str q3, [x0, #240] -mla v20.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v17.4S, v25.s[0] -add v1.4s, v1.4s, v8.4s -ldr q8, [x0, #272] -str q1, [x0, #224] -mla v16.4S, v4.4S, v31.s[0] -sqrdmulh v4.4S, v30.4S, v25.s[0] -sub v1.4s, v19.4s, v14.4s -ldr q3, [x0, #256] -str q1, [x0, #208] -mul v7.4S, v7.4S,v27.s[0] -mul v29.4S, v29.4S,v27.s[0] -add v19.4s, v19.4s, v14.4s -ldr q14, [x0, #336] -str q19, [x0, #192] -mla v7.4S, v6.4S, v31.s[0] -mla v29.4S, v9.4S, v31.s[0] -sub v9.4s, v8.4s, v15.4s -ldr q6, [x0, #320] -add v8.4s, v8.4s, v15.4s -mul v17.4S, v17.4S,v11.s[0] -mul v30.4S, v30.4S,v11.s[0] -sub v15.4s, v3.4s, v21.4s -ldr q19, [x0, #400] -add v3.4s, v3.4s, v21.4s -mla v17.4S, v22.4S, v31.s[0] -mla v30.4S, v4.4S, v31.s[0] -sub v4.4s, v14.4s, v20.4s -ldr q22, [x0, #384] -add v14.4s, v14.4s, v20.4s -sqrdmulh v20.4S, v9.4S, v5.s[2] -mul v9.4S, v9.4S,v23.s[2] -sub v21.4s, v6.4s, v16.4s -ldr q1, [x0, #464] -add v6.4s, v6.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v5.s[1] -mul v8.4S, v8.4S,v23.s[1] -sub v26.4s, v19.4s, v7.4s -ldr q2, [x0, #448] -add v19.4s, v19.4s, v7.4s -sqrdmulh v5.4S, v4.4S, v13.s[2] -mul v4.4S, v4.4S,v10.s[2] -sub v7.4s, v22.4s, v29.4s -ldr q23, [x0, #560] -add v22.4s, v22.4s, v29.4s -sqrdmulh v29.4S, v14.4S, v13.s[1] -mul v14.4S, v14.4S,v10.s[1] -sub v18.4s, v1.4s, v17.4s -ldr q24, [x0, #544] -add v1.4s, v1.4s, v17.4s -mla v9.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v26.4S, v28.s[2] -sub v13.4s, v2.4s, v30.4s -ldr q17, [x0, #624] -add v2.4s, v2.4s, v30.4s -mla v8.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v19.4S, v28.s[1] -sub v30.4s, v15.4s, v9.4s -ldr q10, [x0, #608] -str q30, [x0, #304] -mla v4.4S, v5.4S, v31.s[0] -sqrdmulh v5.4S, v18.4S, v25.s[2] -add v15.4s, v15.4s, v9.4s -ldr q9, [x17, #+384] -str q15, [x0, #288] -mla v14.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v1.4S, v25.s[1] -sub v15.4s, v3.4s, v8.4s -ldr q30, [x17, #+400] -str q15, [x0, #272] -mul v26.4S, v26.4S,v27.s[2] -mul v19.4S, v19.4S,v27.s[1] -add v3.4s, v3.4s, v8.4s -ldr q8, [x17, #+416] -str q3, [x0, #256] -mla v26.4S, v20.4S, v31.s[0] -mla v19.4S, v16.4S, v31.s[0] -sub v16.4s, v21.4s, v4.4s -ldr q20, [x17, #+432] -str q16, [x0, #368] -mul v18.4S, v18.4S,v11.s[2] -mul v1.4S, v1.4S,v11.s[1] -add v21.4s, v21.4s, v4.4s -ldr q4, [x0, #688] -str q21, [x0, #352] -mla v18.4S, v5.4S, v31.s[0] -mla v1.4S, v29.4S, v31.s[0] -sub v29.4s, v6.4s, v14.4s -ldr q5, [x0, #672] -str q29, [x0, #336] -sqrdmulh v25.4S, v23.4S, v30.s[0] -mul v23.4S, v23.4S,v9.s[0] -add v6.4s, v6.4s, v14.4s -ldr q14, [x0, #752] -str q6, [x0, #320] -sqrdmulh v6.4S, v24.4S, v30.s[0] -mul v24.4S, v24.4S,v9.s[0] -sub v29.4s, v7.4s, v26.4s -ldr q11, [x0, #736] -str q29, [x0, #432] -sqrdmulh v29.4S, v17.4S, v20.s[0] -mul v17.4S, v17.4S,v8.s[0] -add v7.4s, v7.4s, v26.4s -ldr q26, [x17, #+448] -str q7, [x0, #416] -sqrdmulh v7.4S, v10.4S, v20.s[0] -mul v10.4S, v10.4S,v8.s[0] -sub v21.4s, v22.4s, v19.4s -ldr q28, [x17, #+464] -str q21, [x0, #400] -mla v23.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v4.4S, v28.s[0] -add v22.4s, v22.4s, v19.4s -ldr q19, [x17, #+480] -str q22, [x0, #384] -mla v24.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v5.4S, v28.s[0] -sub v22.4s, v13.4s, v18.4s -ldr q21, [x17, #+496] -str q22, [x0, #496] -mla v17.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v14.4S, v21.s[0] -add v13.4s, v13.4s, v18.4s -ldr q18, [x0, #528] -str q13, [x0, #480] -mla v10.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v11.4S, v21.s[0] -sub v13.4s, v2.4s, v1.4s -ldr q22, [x0, #512] -str q13, [x0, #464] -mul v4.4S, v4.4S,v26.s[0] -mul v5.4S, v5.4S,v26.s[0] -add v2.4s, v2.4s, v1.4s -ldr q1, [x0, #592] -str q2, [x0, #448] -mla v4.4S, v25.4S, v31.s[0] -mla v5.4S, v6.4S, v31.s[0] -sub v6.4s, v18.4s, v23.4s -ldr q25, [x0, #576] -add v18.4s, v18.4s, v23.4s -mul v14.4S, v14.4S,v19.s[0] -mul v11.4S, v11.4S,v19.s[0] -sub v23.4s, v22.4s, v24.4s -ldr q2, [x0, #656] -add v22.4s, v22.4s, v24.4s -mla v14.4S, v29.4S, v31.s[0] -mla v11.4S, v7.4S, v31.s[0] -sub v7.4s, v1.4s, v17.4s -ldr q29, [x0, #640] -add v1.4s, v1.4s, v17.4s -sqrdmulh v17.4S, v6.4S, v30.s[2] -mul v6.4S, v6.4S,v9.s[2] -sub v24.4s, v25.4s, v10.4s -ldr q13, [x0, #720] -add v25.4s, v25.4s, v10.4s -sqrdmulh v10.4S, v18.4S, v30.s[1] -mul v18.4S, v18.4S,v9.s[1] -sub v16.4s, v2.4s, v4.4s -ldr q27, [x0, #704] -add v2.4s, v2.4s, v4.4s -sqrdmulh v30.4S, v7.4S, v20.s[2] -mul v7.4S, v7.4S,v8.s[2] -sub v4.4s, v29.4s, v5.4s -ldr q9, [x0, #816] -add v29.4s, v29.4s, v5.4s -sqrdmulh v5.4S, v1.4S, v20.s[1] -mul v1.4S, v1.4S,v8.s[1] -sub v3.4s, v13.4s, v14.4s -ldr q15, [x0, #800] -add v13.4s, v13.4s, v14.4s -mla v6.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v16.4S, v28.s[2] -sub v20.4s, v27.4s, v11.4s -ldr q14, [x0, #880] -add v27.4s, v27.4s, v11.4s -mla v18.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v2.4S, v28.s[1] -sub v11.4s, v23.4s, v6.4s -ldr q8, [x0, #864] -str q11, [x0, #560] -mla v7.4S, v30.4S, v31.s[0] -sqrdmulh v30.4S, v3.4S, v21.s[2] -add v23.4s, v23.4s, v6.4s -ldr q6, [x17, #+512] -str q23, [x0, #544] -mla v1.4S, v5.4S, v31.s[0] -sqrdmulh v5.4S, v13.4S, v21.s[1] -sub v23.4s, v22.4s, v18.4s -ldr q11, [x17, #+528] -str q23, [x0, #528] -mul v16.4S, v16.4S,v26.s[2] -mul v2.4S, v2.4S,v26.s[1] -add v22.4s, v22.4s, v18.4s -ldr q18, [x17, #+544] -str q22, [x0, #512] -mla v16.4S, v17.4S, v31.s[0] -mla v2.4S, v10.4S, v31.s[0] -sub v10.4s, v24.4s, v7.4s -ldr q17, [x17, #+560] -str q10, [x0, #624] -mul v3.4S, v3.4S,v19.s[2] -mul v13.4S, v13.4S,v19.s[1] -add v24.4s, v24.4s, v7.4s -ldr q7, [x0, #944] -str q24, [x0, #608] -mla v3.4S, v30.4S, v31.s[0] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v25.4s, v1.4s -ldr q30, [x0, #928] -str q5, [x0, #592] -sqrdmulh v21.4S, v9.4S, v11.s[0] -mul v9.4S, v9.4S,v6.s[0] -add v25.4s, v25.4s, v1.4s -ldr q1, [x0, #1008] -str q25, [x0, #576] -sqrdmulh v25.4S, v15.4S, v11.s[0] -mul v15.4S, v15.4S,v6.s[0] -sub v5.4s, v4.4s, v16.4s -ldr q19, [x0, #992] -str q5, [x0, #688] -sqrdmulh v5.4S, v14.4S, v17.s[0] -mul v14.4S, v14.4S,v18.s[0] -add v4.4s, v4.4s, v16.4s -ldr q16, [x17, #+576] -str q4, [x0, #672] -sqrdmulh v4.4S, v8.4S, v17.s[0] -mul v8.4S, v8.4S,v18.s[0] -sub v24.4s, v29.4s, v2.4s -ldr q28, [x17, #+592] -str q24, [x0, #656] -mla v9.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v7.4S, v28.s[0] -add v29.4s, v29.4s, v2.4s -ldr q2, [x17, #+608] -str q29, [x0, #640] -mla v15.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v30.4S, v28.s[0] -sub v29.4s, v20.4s, v3.4s -ldr q24, [x17, #+624] -str q29, [x0, #752] -mla v14.4S, v5.4S, v31.s[0] -sqrdmulh v5.4S, v1.4S, v24.s[0] -add v20.4s, v20.4s, v3.4s -ldr q3, [x0, #784] -str q20, [x0, #736] -mla v8.4S, v4.4S, v31.s[0] -sqrdmulh v4.4S, v19.4S, v24.s[0] -sub v20.4s, v27.4s, v13.4s -ldr q29, [x0, #768] -str q20, [x0, #720] -mul v7.4S, v7.4S,v16.s[0] -mul v30.4S, v30.4S,v16.s[0] -add v27.4s, v27.4s, v13.4s -ldr q13, [x0, #848] -str q27, [x0, #704] -mla v7.4S, v21.4S, v31.s[0] -mla v30.4S, v25.4S, v31.s[0] -sub v25.4s, v3.4s, v9.4s -ldr q21, [x0, #832] -add v3.4s, v3.4s, v9.4s -mul v1.4S, v1.4S,v2.s[0] -mul v19.4S, v19.4S,v2.s[0] -sub v9.4s, v29.4s, v15.4s -ldr q27, [x0, #912] -add v29.4s, v29.4s, v15.4s -mla v1.4S, v5.4S, v31.s[0] -mla v19.4S, v4.4S, v31.s[0] -sub v4.4s, v13.4s, v14.4s -ldr q5, [x0, #896] -add v13.4s, v13.4s, v14.4s -sqrdmulh v14.4S, v25.4S, v11.s[2] -mul v25.4S, v25.4S,v6.s[2] -sub v15.4s, v21.4s, v8.4s -ldr q20, [x0, #976] -add v21.4s, v21.4s, v8.4s -sqrdmulh v8.4S, v3.4S, v11.s[1] -mul v3.4S, v3.4S,v6.s[1] -sub v10.4s, v27.4s, v7.4s -ldr q26, [x0, #960] -add v27.4s, v27.4s, v7.4s -sqrdmulh v11.4S, v4.4S, v17.s[2] -mul v4.4S, v4.4S,v18.s[2] -sub v7.4s, v5.4s, v30.4s -add v5.4s, v5.4s, v30.4s -sqrdmulh v30.4S, v13.4S, v17.s[1] -mul v13.4S, v13.4S,v18.s[1] -sub v6.4s, v20.4s, v1.4s -add v20.4s, v20.4s, v1.4s -mla v25.4S, v14.4S, v31.s[0] -sqrdmulh v14.4S, v10.4S, v28.s[2] -sub v17.4s, v26.4s, v19.4s -add v26.4s, v26.4s, v19.4s -mla v3.4S, v8.4S, v31.s[0] -sqrdmulh v8.4S, v27.4S, v28.s[1] -sub v19.4s, v9.4s, v25.4s -str q19, [x0, #816] -mla v4.4S, v11.4S, v31.s[0] -sqrdmulh v11.4S, v6.4S, v24.s[2] -add v9.4s, v9.4s, v25.4s -str q9, [x0, #800] -mla v13.4S, v30.4S, v31.s[0] -sqrdmulh v30.4S, v20.4S, v24.s[1] -sub v9.4s, v29.4s, v3.4s -str q9, [x0, #784] -mul v10.4S, v10.4S,v16.s[2] -mul v27.4S, v27.4S,v16.s[1] -add v29.4s, v29.4s, v3.4s -str q29, [x0, #768] -mla v10.4S, v14.4S, v31.s[0] -mla v27.4S, v8.4S, v31.s[0] -sub v8.4s, v15.4s, v4.4s -str q8, [x0, #880] -mul v6.4S, v6.4S,v2.s[2] -mul v20.4S, v20.4S,v2.s[1] -add v15.4s, v15.4s, v4.4s -str q15, [x0, #864] -mla v6.4S, v11.4S, v31.s[0] -mla v20.4S, v30.4S, v31.s[0] -sub v30.4s, v21.4s, v13.4s -str q30, [x0, #848] -add v21.4s, v21.4s, v13.4s -str q21, [x0, #832] -sub v21.4s, v7.4s, v10.4s -str q21, [x0, #944] -add v7.4s, v7.4s, v10.4s -str q7, [x0, #928] -sub v7.4s, v5.4s, v27.4s -str q7, [x0, #912] -add v5.4s, v5.4s, v27.4s -str q5, [x0, #896] -sub v5.4s, v17.4s, v6.4s -str q5, [x0, #1008] -add v17.4s, v17.4s, v6.4s -str q17, [x0, #992] -sub v17.4s, v26.4s, v20.4s -str q17, [x0, #976] -add v26.4s, v26.4s, v20.4s -str q26, [x0, #960] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1520 -// Instruction count: 1516 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_13.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_13.s deleted file mode 100644 index e12e06a..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_13.s +++ /dev/null @@ -1,1550 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_22_z4_13 -.global _ntt_u32_incomplete_neon_asm_var_4_2_22_z4_13 -ntt_u32_incomplete_neon_asm_var_4_2_22_z4_13: -_ntt_u32_incomplete_neon_asm_var_4_2_22_z4_13: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x0, #992] -sqrdmulh v27.4S, v28.4S, v29.s[0] -mul v28.4S, v28.4S,v30.s[0] -ldr q26, [x0, #928] -sqrdmulh v25.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -ldr q24, [x0, #864] -sqrdmulh v23.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -ldr q22, [x0, #800] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #736] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mla v28.4S, v27.4S, v31.s[0] -ldr q27, [x0, #672] -sqrdmulh v18.4S, v27.4S, v29.s[0] -mla v26.4S, v25.4S, v31.s[0] -ldr q25, [x0, #608] -sqrdmulh v17.4S, v25.4S, v29.s[0] -mla v24.4S, v23.4S, v31.s[0] -ldr q23, [x0, #544] -sqrdmulh v16.4S, v23.4S, v29.s[0] -mla v22.4S, v21.4S, v31.s[0] -ldr q21, [x0, #480] -mul v27.4S, v27.4S,v30.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q3, [x0, #416] -ldr q2, [x0, #352] -ldr q1, [x0, #288] -mla v27.4S, v18.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -ldr q19, [x0, #224] -ldr q18, [x0, #160] -mul v23.4S, v23.4S,v30.s[0] -mul v25.4S, v25.4S,v30.s[0] -ldr q0, [x0, #96] -ldr q15, [x0, #32] -mla v23.4S, v16.4S, v31.s[0] -mla v25.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v28.4s -add v21.4s, v21.4s, v28.4s -sqrdmulh v28.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v16.4s, v3.4s, v26.4s -add v3.4s, v3.4s, v26.4s -sqrdmulh v26.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -sub v14.4s, v2.4s, v24.4s -add v2.4s, v2.4s, v24.4s -sqrdmulh v24.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v13.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v12.4s, v19.4s, v20.4s -add v19.4s, v19.4s, v20.4s -sqrdmulh v20.4S, v14.4S, v29.s[2] -mla v17.4S, v28.4S, v31.s[0] -sub v28.4s, v18.4s, v27.4s -add v18.4s, v18.4s, v27.4s -sqrdmulh v27.4S, v13.4S, v29.s[2] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v0.4s, v25.4s -add v0.4s, v0.4s, v25.4s -sqrdmulh v25.4S, v2.4S, v29.s[1] -mla v21.4S, v24.4S, v31.s[0] -sub v24.4s, v15.4s, v23.4s -sqrdmulh v11.4S, v1.4S, v29.s[1] -mla v3.4S, v22.4S, v31.s[0] -add v15.4s, v15.4s, v23.4s -ldr q23, [x17, #+32] -ldr q22, [x17, #+48] -mul v13.4S, v13.4S,v30.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v10.4s, v12.4s, v17.4s -add v12.4s, v12.4s, v17.4s -mla v13.4S, v27.4S, v31.s[0] -mla v14.4S, v20.4S, v31.s[0] -sub v20.4s, v28.4s, v16.4s -add v28.4s, v28.4s, v16.4s -mul v1.4S, v1.4S,v30.s[1] -mul v2.4S, v2.4S,v30.s[1] -sub v16.4s, v19.4s, v21.4s -add v19.4s, v19.4s, v21.4s -mla v1.4S, v11.4S, v31.s[0] -mla v2.4S, v25.4S, v31.s[0] -sub v25.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v10.4S, v22.s[3] -mul v10.4S, v10.4S,v23.s[3] -sub v11.4s, v26.4s, v14.4s -add v26.4s, v26.4s, v14.4s -sqrdmulh v14.4S, v12.4S, v22.s[2] -mul v12.4S, v12.4S,v23.s[2] -sub v21.4s, v24.4s, v13.4s -add v24.4s, v24.4s, v13.4s -sqrdmulh v13.4S, v16.4S, v22.s[1] -mul v16.4S, v16.4S,v23.s[1] -sub v27.4s, v0.4s, v2.4s -add v0.4s, v0.4s, v2.4s -sqrdmulh v2.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v17.4s, v15.4s, v1.4s -add v15.4s, v15.4s, v1.4s -ldr q1, [x17, #+96] -ldr q9, [x17, #+112] -sqrdmulh v8.4S, v20.4S, v22.s[3] -mla v10.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v28.4S, v22.s[2] -mla v12.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v25.4S, v22.s[1] -mla v16.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v18.4S, v22.s[0] -mla v19.4S, v2.4S, v31.s[0] -nop -nop -ldr q2, [x17, #+64] -ldr q7, [x17, #+80] -mul v28.4S, v28.4S,v23.s[2] -mul v20.4S, v20.4S,v23.s[3] -sub v6.4s, v11.4s, v10.4s -add v11.4s, v11.4s, v10.4s -mla v28.4S, v3.4S, v31.s[0] -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v26.4s, v12.4s -add v26.4s, v26.4s, v12.4s -mul v18.4S, v18.4S,v23.s[0] -mul v25.4S, v25.4S,v23.s[1] -sub v12.4s, v27.4s, v16.4s -add v27.4s, v27.4s, v16.4s -mla v18.4S, v13.4S, v31.s[0] -mla v25.4S, v14.4S, v31.s[0] -sub v14.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v9.s[3] -mul v6.4S, v6.4S,v1.s[3] -sub v13.4s, v21.4s, v20.4s -add v21.4s, v21.4s, v20.4s -sqrdmulh v20.4S, v11.4S, v9.s[2] -mul v11.4S, v11.4S,v1.s[2] -sub v16.4s, v24.4s, v28.4s -add v24.4s, v24.4s, v28.4s -sqrdmulh v28.4S, v8.4S, v9.s[1] -mul v8.4S, v8.4S,v1.s[1] -sub v3.4s, v17.4s, v25.4s -add v17.4s, v17.4s, v25.4s -sqrdmulh v25.4S, v26.4S, v9.s[0] -mul v26.4S, v26.4S,v1.s[0] -sub v10.4s, v15.4s, v18.4s -add v15.4s, v15.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v7.s[3] -mla v6.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v27.4S, v7.s[2] -mla v11.4S, v20.4S, v31.s[0] -nop -nop -sqrdmulh v20.4S, v14.4S, v7.s[1] -mla v8.4S, v28.4S, v31.s[0] -nop -nop -sqrdmulh v28.4S, v0.4S, v7.s[0] -mla v26.4S, v25.4S, v31.s[0] -nop -nop -mul v27.4S, v27.4S,v2.s[2] -mul v12.4S, v12.4S,v2.s[3] -sub v25.4s, v13.4s, v6.4s -str q25, [x0, #992] -mla v27.4S, v19.4S, v31.s[0] -mla v12.4S, v18.4S, v31.s[0] -add v13.4s, v13.4s, v6.4s -str q13, [x0, #928] -mul v0.4S, v0.4S,v2.s[0] -mul v14.4S, v14.4S,v2.s[1] -sub v13.4s, v21.4s, v11.4s -str q13, [x0, #864] -mla v0.4S, v28.4S, v31.s[0] -mla v14.4S, v20.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -sub v11.4s, v16.4s, v8.4s -ldr q20, [x0, #1008] -sqrdmulh v28.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v16.4s, v16.4s, v8.4s -str q21, [x0, #800] -ldr q21, [x0, #944] -sqrdmulh v8.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -sub v13.4s, v24.4s, v26.4s -str q11, [x0, #736] -ldr q11, [x0, #880] -sqrdmulh v6.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -add v24.4s, v24.4s, v26.4s -str q16, [x0, #672] -ldr q16, [x0, #816] -sqrdmulh v26.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v18.4s, v3.4s, v12.4s -str q13, [x0, #608] -ldr q13, [x0, #752] -sqrdmulh v19.4S, v13.4S, v29.s[0] -mla v20.4S, v28.4S, v31.s[0] -add v3.4s, v3.4s, v12.4s -str q24, [x0, #544] -ldr q24, [x0, #688] -sqrdmulh v12.4S, v24.4S, v29.s[0] -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v17.4s, v27.4s -str q18, [x0, #480] -ldr q18, [x0, #624] -sqrdmulh v28.4S, v18.4S, v29.s[0] -mla v11.4S, v6.4S, v31.s[0] -add v17.4s, v17.4s, v27.4s -str q3, [x0, #416] -ldr q3, [x0, #560] -sqrdmulh v27.4S, v3.4S, v29.s[0] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v10.4s, v14.4s -str q8, [x0, #352] -ldr q8, [x0, #496] -add v10.4s, v10.4s, v14.4s -mul v24.4S, v24.4S,v30.s[0] -mul v13.4S, v13.4S,v30.s[0] -ldr q14, [x0, #432] -str q17, [x0, #288] -ldr q17, [x0, #368] -ldr q6, [x0, #304] -mla v24.4S, v12.4S, v31.s[0] -mla v13.4S, v19.4S, v31.s[0] -str q26, [x0, #224] -sub v26.4s, v15.4s, v0.4s -ldr q19, [x0, #240] -ldr q12, [x0, #176] -mul v3.4S, v3.4S,v30.s[0] -mul v18.4S, v18.4S,v30.s[0] -str q10, [x0, #160] -add v15.4s, v15.4s, v0.4s -ldr q0, [x0, #112] -ldr q10, [x0, #48] -mla v3.4S, v27.4S, v31.s[0] -mla v18.4S, v28.4S, v31.s[0] -sub v28.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -sqrdmulh v20.4S, v28.4S, v29.s[2] -mul v28.4S, v28.4S,v30.s[2] -sub v27.4s, v14.4s, v21.4s -add v14.4s, v14.4s, v21.4s -sqrdmulh v21.4S, v27.4S, v29.s[2] -mul v27.4S, v27.4S,v30.s[2] -sub v25.4s, v17.4s, v11.4s -add v17.4s, v17.4s, v11.4s -sqrdmulh v11.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sub v5.4s, v6.4s, v16.4s -add v6.4s, v6.4s, v16.4s -sqrdmulh v16.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -sub v4.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -sqrdmulh v13.4S, v25.4S, v29.s[2] -mla v28.4S, v20.4S, v31.s[0] -sub v20.4s, v12.4s, v24.4s -add v12.4s, v12.4s, v24.4s -sqrdmulh v24.4S, v5.4S, v29.s[2] -mla v27.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v18.4s -add v0.4s, v0.4s, v18.4s -sqrdmulh v18.4S, v17.4S, v29.s[1] -mla v8.4S, v11.4S, v31.s[0] -sub v11.4s, v10.4s, v3.4s -str q26, [x0, #96] -sqrdmulh v26.4S, v6.4S, v29.s[1] -mla v14.4S, v16.4S, v31.s[0] -add v10.4s, v10.4s, v3.4s -str q15, [x0, #32] -mul v5.4S, v5.4S,v30.s[2] -mul v25.4S, v25.4S,v30.s[2] -sub v15.4s, v4.4s, v28.4s -add v4.4s, v4.4s, v28.4s -mla v5.4S, v24.4S, v31.s[0] -mla v25.4S, v13.4S, v31.s[0] -sub v13.4s, v20.4s, v27.4s -add v20.4s, v20.4s, v27.4s -mul v6.4S, v6.4S,v30.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v27.4s, v19.4s, v8.4s -add v19.4s, v19.4s, v8.4s -mla v6.4S, v26.4S, v31.s[0] -mla v17.4S, v18.4S, v31.s[0] -sub v18.4s, v12.4s, v14.4s -add v12.4s, v12.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v22.s[3] -mul v15.4S, v15.4S,v23.s[3] -sub v26.4s, v21.4s, v25.4s -add v21.4s, v21.4s, v25.4s -sqrdmulh v25.4S, v4.4S, v22.s[2] -mul v4.4S, v4.4S,v23.s[2] -sub v8.4s, v11.4s, v5.4s -add v11.4s, v11.4s, v5.4s -sqrdmulh v5.4S, v27.4S, v22.s[1] -mul v27.4S, v27.4S,v23.s[1] -sub v24.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v28.4s, v10.4s, v6.4s -add v10.4s, v10.4s, v6.4s -sqrdmulh v6.4S, v13.4S, v22.s[3] -mla v15.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v20.4S, v22.s[2] -mla v4.4S, v25.4S, v31.s[0] -nop -nop -sqrdmulh v25.4S, v18.4S, v22.s[1] -mla v27.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v12.4S, v22.s[0] -mla v19.4S, v17.4S, v31.s[0] -nop -nop -mul v20.4S, v20.4S,v23.s[2] -mul v13.4S, v13.4S,v23.s[3] -sub v17.4s, v26.4s, v15.4s -add v26.4s, v26.4s, v15.4s -mla v20.4S, v14.4S, v31.s[0] -mla v13.4S, v6.4S, v31.s[0] -sub v6.4s, v21.4s, v4.4s -add v21.4s, v21.4s, v4.4s -mul v12.4S, v12.4S,v23.s[0] -mul v18.4S, v18.4S,v23.s[1] -sub v4.4s, v24.4s, v27.4s -add v24.4s, v24.4s, v27.4s -mla v12.4S, v5.4S, v31.s[0] -mla v18.4S, v25.4S, v31.s[0] -sub v25.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v17.4S, v9.s[3] -mul v17.4S, v17.4S,v1.s[3] -sub v5.4s, v8.4s, v13.4s -add v8.4s, v8.4s, v13.4s -sqrdmulh v13.4S, v26.4S, v9.s[2] -mul v26.4S, v26.4S,v1.s[2] -sub v27.4s, v11.4s, v20.4s -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v6.4S, v9.s[1] -mul v6.4S, v6.4S,v1.s[1] -sub v14.4s, v28.4s, v18.4s -add v28.4s, v28.4s, v18.4s -sqrdmulh v18.4S, v21.4S, v9.s[0] -mul v21.4S, v21.4S,v1.s[0] -sub v15.4s, v10.4s, v12.4s -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v4.4S, v7.s[3] -mla v17.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v24.4S, v7.s[2] -mla v26.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v25.4S, v7.s[1] -mla v6.4S, v20.4S, v31.s[0] -nop -nop -sqrdmulh v20.4S, v0.4S, v7.s[0] -mla v21.4S, v18.4S, v31.s[0] -nop -nop -mul v24.4S, v24.4S,v2.s[2] -mul v4.4S, v4.4S,v2.s[3] -sub v18.4s, v5.4s, v17.4s -str q18, [x0, #1008] -mla v24.4S, v19.4S, v31.s[0] -mla v4.4S, v12.4S, v31.s[0] -add v5.4s, v5.4s, v17.4s -str q5, [x0, #944] -mul v0.4S, v0.4S,v2.s[0] -mul v25.4S, v25.4S,v2.s[1] -sub v5.4s, v8.4s, v26.4s -str q5, [x0, #880] -mla v0.4S, v20.4S, v31.s[0] -mla v25.4S, v13.4S, v31.s[0] -add v8.4s, v8.4s, v26.4s -sub v26.4s, v27.4s, v6.4s -ldr q13, [x0, #960] -sqrdmulh v20.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -add v27.4s, v27.4s, v6.4s -str q8, [x0, #816] -ldr q8, [x0, #896] -sqrdmulh v6.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v5.4s, v11.4s, v21.4s -str q26, [x0, #752] -ldr q26, [x0, #832] -sqrdmulh v17.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -add v11.4s, v11.4s, v21.4s -str q27, [x0, #688] -ldr q27, [x0, #768] -sqrdmulh v21.4S, v27.4S, v29.s[0] -mul v27.4S, v27.4S,v30.s[0] -sub v12.4s, v14.4s, v4.4s -str q5, [x0, #624] -ldr q5, [x0, #704] -sqrdmulh v19.4S, v5.4S, v29.s[0] -mla v13.4S, v20.4S, v31.s[0] -add v14.4s, v14.4s, v4.4s -str q11, [x0, #560] -ldr q11, [x0, #640] -sqrdmulh v4.4S, v11.4S, v29.s[0] -mla v8.4S, v6.4S, v31.s[0] -sub v6.4s, v28.4s, v24.4s -str q12, [x0, #496] -ldr q12, [x0, #576] -sqrdmulh v20.4S, v12.4S, v29.s[0] -mla v26.4S, v17.4S, v31.s[0] -add v28.4s, v28.4s, v24.4s -str q14, [x0, #432] -ldr q14, [x0, #512] -sqrdmulh v24.4S, v14.4S, v29.s[0] -mla v27.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v25.4s -str q6, [x0, #368] -ldr q6, [x0, #448] -add v15.4s, v15.4s, v25.4s -mul v11.4S, v11.4S,v30.s[0] -mul v5.4S, v5.4S,v30.s[0] -ldr q25, [x0, #384] -str q28, [x0, #304] -ldr q28, [x0, #320] -ldr q17, [x0, #256] -mla v11.4S, v4.4S, v31.s[0] -mla v5.4S, v19.4S, v31.s[0] -str q21, [x0, #240] -sub v21.4s, v10.4s, v0.4s -ldr q19, [x0, #192] -ldr q4, [x0, #128] -mul v14.4S, v14.4S,v30.s[0] -mul v12.4S, v12.4S,v30.s[0] -str q15, [x0, #176] -add v10.4s, v10.4s, v0.4s -ldr q0, [x0, #64] -ldr q15, [x0, #0] -mla v14.4S, v24.4S, v31.s[0] -mla v12.4S, v20.4S, v31.s[0] -sub v20.4s, v6.4s, v13.4s -add v6.4s, v6.4s, v13.4s -sqrdmulh v13.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v24.4s, v25.4s, v8.4s -add v25.4s, v25.4s, v8.4s -sqrdmulh v8.4S, v24.4S, v29.s[2] -mul v24.4S, v24.4S,v30.s[2] -sub v18.4s, v28.4s, v26.4s -add v28.4s, v28.4s, v26.4s -sqrdmulh v26.4S, v6.4S, v29.s[1] -mul v6.4S, v6.4S,v30.s[1] -sub v3.4s, v17.4s, v27.4s -add v17.4s, v17.4s, v27.4s -sqrdmulh v27.4S, v25.4S, v29.s[1] -mul v25.4S, v25.4S,v30.s[1] -sub v16.4s, v19.4s, v5.4s -add v19.4s, v19.4s, v5.4s -sqrdmulh v5.4S, v18.4S, v29.s[2] -mla v20.4S, v13.4S, v31.s[0] -sub v13.4s, v4.4s, v11.4s -add v4.4s, v4.4s, v11.4s -sqrdmulh v11.4S, v3.4S, v29.s[2] -mla v24.4S, v8.4S, v31.s[0] -sub v8.4s, v0.4s, v12.4s -add v0.4s, v0.4s, v12.4s -sqrdmulh v12.4S, v28.4S, v29.s[1] -mla v6.4S, v26.4S, v31.s[0] -sub v26.4s, v15.4s, v14.4s -str q21, [x0, #112] -sqrdmulh v21.4S, v17.4S, v29.s[1] -mla v25.4S, v27.4S, v31.s[0] -add v15.4s, v15.4s, v14.4s -str q10, [x0, #48] -mul v3.4S, v3.4S,v30.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v10.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -mla v3.4S, v11.4S, v31.s[0] -mla v18.4S, v5.4S, v31.s[0] -sub v5.4s, v13.4s, v24.4s -add v13.4s, v13.4s, v24.4s -mul v17.4S, v17.4S,v30.s[1] -mul v28.4S, v28.4S,v30.s[1] -sub v24.4s, v19.4s, v6.4s -add v19.4s, v19.4s, v6.4s -mla v17.4S, v21.4S, v31.s[0] -mla v28.4S, v12.4S, v31.s[0] -sub v12.4s, v4.4s, v25.4s -add v4.4s, v4.4s, v25.4s -sqrdmulh v25.4S, v10.4S, v22.s[3] -mul v10.4S, v10.4S,v23.s[3] -sub v21.4s, v8.4s, v18.4s -add v8.4s, v8.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v22.s[2] -mul v16.4S, v16.4S,v23.s[2] -sub v6.4s, v26.4s, v3.4s -add v26.4s, v26.4s, v3.4s -sqrdmulh v3.4S, v24.4S, v22.s[1] -mul v24.4S, v24.4S,v23.s[1] -sub v11.4s, v0.4s, v28.4s -add v0.4s, v0.4s, v28.4s -sqrdmulh v28.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v20.4s, v15.4s, v17.4s -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v5.4S, v22.s[3] -mla v10.4S, v25.4S, v31.s[0] -nop -nop -sqrdmulh v25.4S, v13.4S, v22.s[2] -mla v16.4S, v18.4S, v31.s[0] -nop -nop -sqrdmulh v18.4S, v12.4S, v22.s[1] -mla v24.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v4.4S, v22.s[0] -mla v19.4S, v28.4S, v31.s[0] -nop -nop -mul v13.4S, v13.4S,v23.s[2] -mul v5.4S, v5.4S,v23.s[3] -sub v28.4s, v21.4s, v10.4s -add v21.4s, v21.4s, v10.4s -mla v13.4S, v25.4S, v31.s[0] -mla v5.4S, v17.4S, v31.s[0] -sub v17.4s, v8.4s, v16.4s -add v8.4s, v8.4s, v16.4s -mul v4.4S, v4.4S,v23.s[0] -mul v12.4S, v12.4S,v23.s[1] -sub v16.4s, v11.4s, v24.4s -add v11.4s, v11.4s, v24.4s -mla v4.4S, v3.4S, v31.s[0] -mla v12.4S, v18.4S, v31.s[0] -sub v18.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v28.4S, v9.s[3] -mul v28.4S, v28.4S,v1.s[3] -sub v3.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v21.4S, v9.s[2] -mul v21.4S, v21.4S,v1.s[2] -sub v24.4s, v26.4s, v13.4s -add v26.4s, v26.4s, v13.4s -sqrdmulh v13.4S, v17.4S, v9.s[1] -mul v17.4S, v17.4S,v1.s[1] -sub v25.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -sqrdmulh v12.4S, v8.4S, v9.s[0] -mul v8.4S, v8.4S,v1.s[0] -sub v10.4s, v15.4s, v4.4s -add v15.4s, v15.4s, v4.4s -sqrdmulh v4.4S, v16.4S, v7.s[3] -mla v28.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v11.4S, v7.s[2] -mla v21.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v18.4S, v7.s[1] -mla v17.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v0.4S, v7.s[0] -mla v8.4S, v12.4S, v31.s[0] -nop -nop -mul v11.4S, v11.4S,v2.s[2] -mul v16.4S, v16.4S,v2.s[3] -sub v12.4s, v3.4s, v28.4s -str q12, [x0, #960] -mla v11.4S, v19.4S, v31.s[0] -mla v16.4S, v4.4S, v31.s[0] -add v3.4s, v3.4s, v28.4s -str q3, [x0, #896] -mul v0.4S, v0.4S,v2.s[0] -mul v18.4S, v18.4S,v2.s[1] -sub v3.4s, v6.4s, v21.4s -str q3, [x0, #832] -mla v0.4S, v13.4S, v31.s[0] -mla v18.4S, v5.4S, v31.s[0] -add v6.4s, v6.4s, v21.4s -sub v21.4s, v24.4s, v17.4s -ldr q5, [x0, #976] -sqrdmulh v13.4S, v5.4S, v29.s[0] -mul v5.4S, v5.4S,v30.s[0] -add v24.4s, v24.4s, v17.4s -str q6, [x0, #768] -ldr q6, [x0, #912] -sqrdmulh v17.4S, v6.4S, v29.s[0] -mul v6.4S, v6.4S,v30.s[0] -sub v3.4s, v26.4s, v8.4s -str q21, [x0, #704] -ldr q21, [x0, #848] -sqrdmulh v28.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -add v26.4s, v26.4s, v8.4s -str q24, [x0, #640] -ldr q24, [x0, #784] -sqrdmulh v8.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -sub v4.4s, v25.4s, v16.4s -str q3, [x0, #576] -ldr q3, [x0, #720] -sqrdmulh v19.4S, v3.4S, v29.s[0] -mla v5.4S, v13.4S, v31.s[0] -add v25.4s, v25.4s, v16.4s -str q26, [x0, #512] -ldr q26, [x0, #656] -sqrdmulh v16.4S, v26.4S, v29.s[0] -mla v6.4S, v17.4S, v31.s[0] -sub v17.4s, v20.4s, v11.4s -str q4, [x0, #448] -ldr q4, [x0, #592] -sqrdmulh v13.4S, v4.4S, v29.s[0] -mla v21.4S, v28.4S, v31.s[0] -add v20.4s, v20.4s, v11.4s -str q25, [x0, #384] -ldr q25, [x0, #528] -sqrdmulh v11.4S, v25.4S, v29.s[0] -mla v24.4S, v8.4S, v31.s[0] -sub v8.4s, v10.4s, v18.4s -str q17, [x0, #320] -ldr q17, [x0, #464] -add v10.4s, v10.4s, v18.4s -mul v26.4S, v26.4S,v30.s[0] -mul v3.4S, v3.4S,v30.s[0] -ldr q18, [x0, #400] -str q20, [x0, #256] -ldr q20, [x0, #336] -ldr q28, [x0, #272] -mla v26.4S, v16.4S, v31.s[0] -mla v3.4S, v19.4S, v31.s[0] -str q8, [x0, #192] -sub v8.4s, v15.4s, v0.4s -ldr q19, [x0, #208] -ldr q16, [x0, #144] -mul v25.4S, v25.4S,v30.s[0] -mul v4.4S, v4.4S,v30.s[0] -str q10, [x0, #128] -add v15.4s, v15.4s, v0.4s -ldr q0, [x0, #80] -ldr q10, [x0, #16] -mla v25.4S, v11.4S, v31.s[0] -mla v4.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v5.4s -add v17.4s, v17.4s, v5.4s -sqrdmulh v5.4S, v13.4S, v29.s[2] -mul v13.4S, v13.4S,v30.s[2] -sub v11.4s, v18.4s, v6.4s -add v18.4s, v18.4s, v6.4s -sqrdmulh v6.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v12.4s, v20.4s, v21.4s -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v14.4s, v28.4s, v24.4s -add v28.4s, v28.4s, v24.4s -sqrdmulh v24.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v27.4s, v19.4s, v3.4s -add v19.4s, v19.4s, v3.4s -sqrdmulh v3.4S, v12.4S, v29.s[2] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v16.4s, v26.4s -add v16.4s, v16.4s, v26.4s -sqrdmulh v26.4S, v14.4S, v29.s[2] -mla v11.4S, v6.4S, v31.s[0] -sub v6.4s, v0.4s, v4.4s -add v0.4s, v0.4s, v4.4s -sqrdmulh v4.4S, v20.4S, v29.s[1] -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v10.4s, v25.4s -str q8, [x0, #64] -sqrdmulh v8.4S, v28.4S, v29.s[1] -mla v18.4S, v24.4S, v31.s[0] -add v10.4s, v10.4s, v25.4s -str q15, [x0, #0] -mul v14.4S, v14.4S,v30.s[2] -mul v12.4S, v12.4S,v30.s[2] -sub v15.4s, v27.4s, v13.4s -add v27.4s, v27.4s, v13.4s -mla v14.4S, v26.4S, v31.s[0] -mla v12.4S, v3.4S, v31.s[0] -sub v3.4s, v5.4s, v11.4s -add v5.4s, v5.4s, v11.4s -mul v28.4S, v28.4S,v30.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v11.4s, v19.4s, v17.4s -add v19.4s, v19.4s, v17.4s -mla v28.4S, v8.4S, v31.s[0] -mla v20.4S, v4.4S, v31.s[0] -sub v4.4s, v16.4s, v18.4s -add v16.4s, v16.4s, v18.4s -sqrdmulh v29.4S, v15.4S, v22.s[3] -mul v15.4S, v15.4S,v23.s[3] -sub v30.4s, v6.4s, v12.4s -add v6.4s, v6.4s, v12.4s -sqrdmulh v12.4S, v27.4S, v22.s[2] -mul v27.4S, v27.4S,v23.s[2] -sub v18.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v22.s[1] -mul v11.4S, v11.4S,v23.s[1] -sub v8.4s, v0.4s, v20.4s -add v0.4s, v0.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v17.4s, v10.4s, v28.4s -add v10.4s, v10.4s, v28.4s -sqrdmulh v28.4S, v3.4S, v22.s[3] -mla v15.4S, v29.4S, v31.s[0] -nop -nop -sqrdmulh v29.4S, v5.4S, v22.s[2] -mla v27.4S, v12.4S, v31.s[0] -nop -nop -sqrdmulh v12.4S, v4.4S, v22.s[1] -mla v11.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v16.4S, v22.s[0] -mla v19.4S, v20.4S, v31.s[0] -nop -nop -mul v5.4S, v5.4S,v23.s[2] -mul v3.4S, v3.4S,v23.s[3] -sub v20.4s, v30.4s, v15.4s -add v30.4s, v30.4s, v15.4s -mla v5.4S, v29.4S, v31.s[0] -mla v3.4S, v28.4S, v31.s[0] -sub v28.4s, v6.4s, v27.4s -add v6.4s, v6.4s, v27.4s -mul v16.4S, v16.4S,v23.s[0] -mul v4.4S, v4.4S,v23.s[1] -sub v27.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -mla v16.4S, v14.4S, v31.s[0] -mla v4.4S, v12.4S, v31.s[0] -sub v12.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v22.4S, v20.4S, v9.s[3] -mul v20.4S, v20.4S,v1.s[3] -sub v23.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v30.4S, v9.s[2] -mul v30.4S, v30.4S,v1.s[2] -sub v19.4s, v21.4s, v5.4s -add v21.4s, v21.4s, v5.4s -sqrdmulh v5.4S, v28.4S, v9.s[1] -mul v28.4S, v28.4S,v1.s[1] -sub v14.4s, v17.4s, v4.4s -add v17.4s, v17.4s, v4.4s -sqrdmulh v4.4S, v6.4S, v9.s[0] -mul v6.4S, v6.4S,v1.s[0] -sub v11.4s, v10.4s, v16.4s -add v10.4s, v10.4s, v16.4s -sqrdmulh v9.4S, v27.4S, v7.s[3] -mla v20.4S, v22.4S, v31.s[0] -nop -nop -sqrdmulh v22.4S, v8.4S, v7.s[2] -mla v30.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v12.4S, v7.s[1] -mla v28.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v0.4S, v7.s[0] -mla v6.4S, v4.4S, v31.s[0] -nop -nop -mul v8.4S, v8.4S,v2.s[2] -mul v27.4S, v27.4S,v2.s[3] -sub v4.4s, v23.4s, v20.4s -str q4, [x0, #976] -mla v8.4S, v22.4S, v31.s[0] -mla v27.4S, v9.4S, v31.s[0] -add v23.4s, v23.4s, v20.4s -str q23, [x0, #912] -mul v0.4S, v0.4S,v2.s[0] -mul v12.4S, v12.4S,v2.s[1] -sub v23.4s, v18.4s, v30.4s -str q23, [x0, #848] -mla v0.4S, v5.4S, v31.s[0] -mla v12.4S, v3.4S, v31.s[0] -add v18.4s, v18.4s, v30.4s -sub v30.4s, v19.4s, v28.4s -add v19.4s, v19.4s, v28.4s -str q18, [x0, #784] -sub v18.4s, v21.4s, v6.4s -str q30, [x0, #720] -add v21.4s, v21.4s, v6.4s -str q19, [x0, #656] -sub v19.4s, v14.4s, v27.4s -str q18, [x0, #592] -add v14.4s, v14.4s, v27.4s -str q21, [x0, #528] -sub v21.4s, v17.4s, v8.4s -str q19, [x0, #464] -add v17.4s, v17.4s, v8.4s -str q14, [x0, #400] -sub v14.4s, v11.4s, v12.4s -str q21, [x0, #336] -add v11.4s, v11.4s, v12.4s -str q17, [x0, #272] -sub v17.4s, v10.4s, v0.4s -add v10.4s, v10.4s, v0.4s -ldr q24, [x0, #32] -ldr q25, [x0, #48] -ldr q13, [x0, #96] -ldr q26, [x0, #112] -ldr q15, [x17, #+128] -ldr q29, [x17, #+144] -ldr q16, [x17, #+160] -ldr q1, [x17, #+176] -ldr q4, [x0, #160] -ldr q22, [x0, #176] -sqrdmulh v9.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v15.s[0] -ldr q20, [x0, #224] -sqrdmulh v23.4S, v25.4S, v29.s[0] -mul v25.4S, v25.4S,v15.s[0] -ldr q5, [x0, #240] -sqrdmulh v3.4S, v13.4S, v1.s[0] -mul v13.4S, v13.4S,v16.s[0] -ldr q2, [x17, #+192] -sqrdmulh v7.4S, v26.4S, v1.s[0] -mul v26.4S, v26.4S,v16.s[0] -ldr q28, [x17, #+208] -mla v24.4S, v9.4S, v31.s[0] -sqrdmulh v9.4S, v4.4S, v28.s[0] -ldr q30, [x17, #+224] -mla v25.4S, v23.4S, v31.s[0] -sqrdmulh v23.4S, v22.4S, v28.s[0] -ldr q6, [x17, #+240] -mla v13.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v20.4S, v6.s[0] -ldr q18, [x0, #0] -mla v26.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v5.4S, v6.s[0] -mul v4.4S, v4.4S,v2.s[0] -mul v22.4S, v22.4S,v2.s[0] -sub v27.4s, v18.4s, v24.4s -ldr q19, [x0, #64] -add v18.4s, v18.4s, v24.4s -mla v4.4S, v9.4S, v31.s[0] -mla v22.4S, v23.4S, v31.s[0] -sub v23.4s, v10.4s, v25.4s -add v10.4s, v10.4s, v25.4s -mul v20.4S, v20.4S,v30.s[0] -mul v5.4S, v5.4S,v30.s[0] -sub v25.4s, v19.4s, v13.4s -ldr q9, [x0, #128] -add v19.4s, v19.4s, v13.4s -mla v20.4S, v3.4S, v31.s[0] -mla v5.4S, v7.4S, v31.s[0] -sub v7.4s, v17.4s, v26.4s -add v17.4s, v17.4s, v26.4s -sqrdmulh v26.4S, v10.4S, v29.s[1] -mul v10.4S, v10.4S,v15.s[1] -sub v3.4s, v9.4s, v4.4s -ldr q13, [x0, #192] -add v9.4s, v9.4s, v4.4s -sqrdmulh v4.4S, v23.4S, v29.s[2] -mul v23.4S, v23.4S,v15.s[2] -sub v24.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v29.4S, v17.4S, v1.s[1] -mul v17.4S, v17.4S,v16.s[1] -sub v22.4s, v13.4s, v20.4s -ldr q15, [x0, #288] -add v13.4s, v13.4s, v20.4s -sqrdmulh v20.4S, v7.4S, v1.s[2] -mul v7.4S, v7.4S,v16.s[2] -sub v8.4s, v14.4s, v5.4s -ldr q21, [x0, #304] -add v14.4s, v14.4s, v5.4s -mla v10.4S, v26.4S, v31.s[0] -sqrdmulh v26.4S, v11.4S, v28.s[1] -sub v1.4s, v18.4s, v10.4s -ldr q5, [x0, #352] -str q1, [x0, #16] -mla v23.4S, v4.4S, v31.s[0] -sqrdmulh v4.4S, v24.4S, v28.s[2] -add v18.4s, v18.4s, v10.4s -ldr q10, [x0, #368] -str q18, [x0, #0] -mla v17.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v14.4S, v6.s[1] -sub v18.4s, v27.4s, v23.4s -ldr q1, [x17, #+256] -str q18, [x0, #48] -mla v7.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v8.4S, v6.s[2] -add v27.4s, v27.4s, v23.4s -ldr q23, [x17, #+272] -str q27, [x0, #32] -mul v11.4S, v11.4S,v2.s[1] -mul v24.4S, v24.4S,v2.s[2] -sub v27.4s, v19.4s, v17.4s -ldr q18, [x17, #+288] -str q27, [x0, #80] -mla v11.4S, v26.4S, v31.s[0] -mla v24.4S, v4.4S, v31.s[0] -add v19.4s, v19.4s, v17.4s -ldr q17, [x17, #+304] -str q19, [x0, #64] -mul v14.4S, v14.4S,v30.s[1] -mul v8.4S, v8.4S,v30.s[2] -sub v28.4s, v25.4s, v7.4s -ldr q19, [x0, #416] -str q28, [x0, #112] -mla v14.4S, v29.4S, v31.s[0] -mla v8.4S, v20.4S, v31.s[0] -add v25.4s, v25.4s, v7.4s -ldr q7, [x0, #432] -str q25, [x0, #96] -sqrdmulh v6.4S, v15.4S, v23.s[0] -mul v15.4S, v15.4S,v1.s[0] -sub v25.4s, v9.4s, v11.4s -ldr q30, [x0, #480] -str q25, [x0, #144] -sqrdmulh v25.4S, v21.4S, v23.s[0] -mul v21.4S, v21.4S,v1.s[0] -add v9.4s, v9.4s, v11.4s -ldr q11, [x0, #496] -str q9, [x0, #128] -sqrdmulh v9.4S, v5.4S, v17.s[0] -mul v5.4S, v5.4S,v18.s[0] -sub v20.4s, v3.4s, v24.4s -ldr q29, [x17, #+320] -str q20, [x0, #176] -sqrdmulh v20.4S, v10.4S, v17.s[0] -mul v10.4S, v10.4S,v18.s[0] -add v3.4s, v3.4s, v24.4s -ldr q24, [x17, #+336] -str q3, [x0, #160] -mla v15.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v19.4S, v24.s[0] -sub v3.4s, v13.4s, v14.4s -ldr q28, [x17, #+352] -str q3, [x0, #208] -mla v21.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v7.4S, v24.s[0] -add v13.4s, v13.4s, v14.4s -ldr q14, [x17, #+368] -str q13, [x0, #192] -mla v5.4S, v9.4S, v31.s[0] -sqrdmulh v9.4S, v30.4S, v14.s[0] -sub v13.4s, v22.4s, v8.4s -ldr q3, [x0, #256] -str q13, [x0, #240] -mla v10.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v11.4S, v14.s[0] -add v22.4s, v22.4s, v8.4s -ldr q8, [x0, #272] -str q22, [x0, #224] -mul v19.4S, v19.4S,v29.s[0] -mul v7.4S, v7.4S,v29.s[0] -sub v22.4s, v3.4s, v15.4s -ldr q13, [x0, #320] -add v3.4s, v3.4s, v15.4s -mla v19.4S, v6.4S, v31.s[0] -mla v7.4S, v25.4S, v31.s[0] -sub v25.4s, v8.4s, v21.4s -ldr q6, [x0, #336] -add v8.4s, v8.4s, v21.4s -mul v30.4S, v30.4S,v28.s[0] -mul v11.4S, v11.4S,v28.s[0] -sub v21.4s, v13.4s, v5.4s -ldr q15, [x0, #384] -add v13.4s, v13.4s, v5.4s -mla v30.4S, v9.4S, v31.s[0] -mla v11.4S, v20.4S, v31.s[0] -sub v20.4s, v6.4s, v10.4s -ldr q9, [x0, #400] -add v6.4s, v6.4s, v10.4s -sqrdmulh v10.4S, v8.4S, v23.s[1] -mul v8.4S, v8.4S,v1.s[1] -sub v5.4s, v15.4s, v19.4s -ldr q2, [x0, #448] -add v15.4s, v15.4s, v19.4s -sqrdmulh v19.4S, v25.4S, v23.s[2] -mul v25.4S, v25.4S,v1.s[2] -sub v4.4s, v9.4s, v7.4s -ldr q26, [x0, #464] -add v9.4s, v9.4s, v7.4s -sqrdmulh v23.4S, v6.4S, v17.s[1] -mul v6.4S, v6.4S,v18.s[1] -sub v7.4s, v2.4s, v30.4s -ldr q1, [x0, #544] -add v2.4s, v2.4s, v30.4s -sqrdmulh v30.4S, v20.4S, v17.s[2] -mul v20.4S, v20.4S,v18.s[2] -sub v27.4s, v26.4s, v11.4s -ldr q16, [x0, #560] -add v26.4s, v26.4s, v11.4s -mla v8.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v9.4S, v24.s[1] -sub v17.4s, v3.4s, v8.4s -ldr q11, [x0, #608] -str q17, [x0, #272] -mla v25.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v4.4S, v24.s[2] -add v3.4s, v3.4s, v8.4s -ldr q8, [x0, #624] -str q3, [x0, #256] -mla v6.4S, v23.4S, v31.s[0] -sqrdmulh v23.4S, v26.4S, v14.s[1] -sub v3.4s, v22.4s, v25.4s -ldr q17, [x17, #+384] -str q3, [x0, #304] -mla v20.4S, v30.4S, v31.s[0] -sqrdmulh v30.4S, v27.4S, v14.s[2] -add v22.4s, v22.4s, v25.4s -ldr q25, [x17, #+400] -str q22, [x0, #288] -mul v9.4S, v9.4S,v29.s[1] -mul v4.4S, v4.4S,v29.s[2] -sub v22.4s, v13.4s, v6.4s -ldr q3, [x17, #+416] -str q22, [x0, #336] -mla v9.4S, v10.4S, v31.s[0] -mla v4.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v6.4s -ldr q6, [x17, #+432] -str q13, [x0, #320] -mul v26.4S, v26.4S,v28.s[1] -mul v27.4S, v27.4S,v28.s[2] -sub v24.4s, v21.4s, v20.4s -ldr q13, [x0, #672] -str q24, [x0, #368] -mla v26.4S, v23.4S, v31.s[0] -mla v27.4S, v30.4S, v31.s[0] -add v21.4s, v21.4s, v20.4s -ldr q20, [x0, #688] -str q21, [x0, #352] -sqrdmulh v14.4S, v1.4S, v25.s[0] -mul v1.4S, v1.4S,v17.s[0] -sub v21.4s, v15.4s, v9.4s -ldr q28, [x0, #736] -str q21, [x0, #400] -sqrdmulh v21.4S, v16.4S, v25.s[0] -mul v16.4S, v16.4S,v17.s[0] -add v15.4s, v15.4s, v9.4s -ldr q9, [x0, #752] -str q15, [x0, #384] -sqrdmulh v15.4S, v11.4S, v6.s[0] -mul v11.4S, v11.4S,v3.s[0] -sub v30.4s, v5.4s, v4.4s -ldr q23, [x17, #+448] -str q30, [x0, #432] -sqrdmulh v30.4S, v8.4S, v6.s[0] -mul v8.4S, v8.4S,v3.s[0] -add v5.4s, v5.4s, v4.4s -ldr q4, [x17, #+464] -str q5, [x0, #416] -mla v1.4S, v14.4S, v31.s[0] -sqrdmulh v14.4S, v13.4S, v4.s[0] -sub v5.4s, v2.4s, v26.4s -ldr q24, [x17, #+480] -str q5, [x0, #464] -mla v16.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v20.4S, v4.s[0] -add v2.4s, v2.4s, v26.4s -ldr q26, [x17, #+496] -str q2, [x0, #448] -mla v11.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v28.4S, v26.s[0] -sub v2.4s, v7.4s, v27.4s -ldr q5, [x0, #512] -str q2, [x0, #496] -mla v8.4S, v30.4S, v31.s[0] -sqrdmulh v30.4S, v9.4S, v26.s[0] -add v7.4s, v7.4s, v27.4s -ldr q27, [x0, #528] -str q7, [x0, #480] -mul v13.4S, v13.4S,v23.s[0] -mul v20.4S, v20.4S,v23.s[0] -sub v7.4s, v5.4s, v1.4s -ldr q2, [x0, #576] -add v5.4s, v5.4s, v1.4s -mla v13.4S, v14.4S, v31.s[0] -mla v20.4S, v21.4S, v31.s[0] -sub v21.4s, v27.4s, v16.4s -ldr q14, [x0, #592] -add v27.4s, v27.4s, v16.4s -mul v28.4S, v28.4S,v24.s[0] -mul v9.4S, v9.4S,v24.s[0] -sub v16.4s, v2.4s, v11.4s -ldr q1, [x0, #640] -add v2.4s, v2.4s, v11.4s -mla v28.4S, v15.4S, v31.s[0] -mla v9.4S, v30.4S, v31.s[0] -sub v30.4s, v14.4s, v8.4s -ldr q15, [x0, #656] -add v14.4s, v14.4s, v8.4s -sqrdmulh v8.4S, v27.4S, v25.s[1] -mul v27.4S, v27.4S,v17.s[1] -sub v11.4s, v1.4s, v13.4s -ldr q29, [x0, #704] -add v1.4s, v1.4s, v13.4s -sqrdmulh v13.4S, v21.4S, v25.s[2] -mul v21.4S, v21.4S,v17.s[2] -sub v19.4s, v15.4s, v20.4s -ldr q10, [x0, #720] -add v15.4s, v15.4s, v20.4s -sqrdmulh v25.4S, v14.4S, v6.s[1] -mul v14.4S, v14.4S,v3.s[1] -sub v20.4s, v29.4s, v28.4s -ldr q17, [x0, #800] -add v29.4s, v29.4s, v28.4s -sqrdmulh v28.4S, v30.4S, v6.s[2] -mul v30.4S, v30.4S,v3.s[2] -sub v22.4s, v10.4s, v9.4s -ldr q18, [x0, #816] -add v10.4s, v10.4s, v9.4s -mla v27.4S, v8.4S, v31.s[0] -sqrdmulh v8.4S, v15.4S, v4.s[1] -sub v6.4s, v5.4s, v27.4s -ldr q9, [x0, #864] -str q6, [x0, #528] -mla v21.4S, v13.4S, v31.s[0] -sqrdmulh v13.4S, v19.4S, v4.s[2] -add v5.4s, v5.4s, v27.4s -ldr q27, [x0, #880] -str q5, [x0, #512] -mla v14.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v10.4S, v26.s[1] -sub v5.4s, v7.4s, v21.4s -ldr q6, [x17, #+512] -str q5, [x0, #560] -mla v30.4S, v28.4S, v31.s[0] -sqrdmulh v28.4S, v22.4S, v26.s[2] -add v7.4s, v7.4s, v21.4s -ldr q21, [x17, #+528] -str q7, [x0, #544] -mul v15.4S, v15.4S,v23.s[1] -mul v19.4S, v19.4S,v23.s[2] -sub v7.4s, v2.4s, v14.4s -ldr q5, [x17, #+544] -str q7, [x0, #592] -mla v15.4S, v8.4S, v31.s[0] -mla v19.4S, v13.4S, v31.s[0] -add v2.4s, v2.4s, v14.4s -ldr q14, [x17, #+560] -str q2, [x0, #576] -mul v10.4S, v10.4S,v24.s[1] -mul v22.4S, v22.4S,v24.s[2] -sub v4.4s, v16.4s, v30.4s -ldr q2, [x0, #928] -str q4, [x0, #624] -mla v10.4S, v25.4S, v31.s[0] -mla v22.4S, v28.4S, v31.s[0] -add v16.4s, v16.4s, v30.4s -ldr q30, [x0, #944] -str q16, [x0, #608] -sqrdmulh v26.4S, v17.4S, v21.s[0] -mul v17.4S, v17.4S,v6.s[0] -sub v16.4s, v1.4s, v15.4s -ldr q24, [x0, #992] -str q16, [x0, #656] -sqrdmulh v16.4S, v18.4S, v21.s[0] -mul v18.4S, v18.4S,v6.s[0] -add v1.4s, v1.4s, v15.4s -ldr q15, [x0, #1008] -str q1, [x0, #640] -sqrdmulh v1.4S, v9.4S, v14.s[0] -mul v9.4S, v9.4S,v5.s[0] -sub v28.4s, v11.4s, v19.4s -ldr q25, [x17, #+576] -str q28, [x0, #688] -sqrdmulh v28.4S, v27.4S, v14.s[0] -mul v27.4S, v27.4S,v5.s[0] -add v11.4s, v11.4s, v19.4s -ldr q19, [x17, #+592] -str q11, [x0, #672] -mla v17.4S, v26.4S, v31.s[0] -sqrdmulh v26.4S, v2.4S, v19.s[0] -sub v11.4s, v29.4s, v10.4s -ldr q4, [x17, #+608] -str q11, [x0, #720] -mla v18.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v30.4S, v19.s[0] -add v29.4s, v29.4s, v10.4s -ldr q10, [x17, #+624] -str q29, [x0, #704] -mla v9.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v24.4S, v10.s[0] -sub v29.4s, v20.4s, v22.4s -ldr q11, [x0, #768] -str q29, [x0, #752] -mla v27.4S, v28.4S, v31.s[0] -sqrdmulh v28.4S, v15.4S, v10.s[0] -add v20.4s, v20.4s, v22.4s -ldr q22, [x0, #784] -str q20, [x0, #736] -mul v2.4S, v2.4S,v25.s[0] -mul v30.4S, v30.4S,v25.s[0] -sub v20.4s, v11.4s, v17.4s -ldr q29, [x0, #832] -add v11.4s, v11.4s, v17.4s -mla v2.4S, v26.4S, v31.s[0] -mla v30.4S, v16.4S, v31.s[0] -sub v16.4s, v22.4s, v18.4s -ldr q26, [x0, #848] -add v22.4s, v22.4s, v18.4s -mul v24.4S, v24.4S,v4.s[0] -mul v15.4S, v15.4S,v4.s[0] -sub v18.4s, v29.4s, v9.4s -ldr q17, [x0, #896] -add v29.4s, v29.4s, v9.4s -mla v24.4S, v1.4S, v31.s[0] -mla v15.4S, v28.4S, v31.s[0] -sub v28.4s, v26.4s, v27.4s -ldr q1, [x0, #912] -add v26.4s, v26.4s, v27.4s -sqrdmulh v27.4S, v22.4S, v21.s[1] -mul v22.4S, v22.4S,v6.s[1] -sub v9.4s, v17.4s, v2.4s -ldr q23, [x0, #960] -add v17.4s, v17.4s, v2.4s -sqrdmulh v2.4S, v16.4S, v21.s[2] -mul v16.4S, v16.4S,v6.s[2] -sub v13.4s, v1.4s, v30.4s -ldr q8, [x0, #976] -add v1.4s, v1.4s, v30.4s -sqrdmulh v21.4S, v26.4S, v14.s[1] -mul v26.4S, v26.4S,v5.s[1] -sub v30.4s, v23.4s, v24.4s -add v23.4s, v23.4s, v24.4s -sqrdmulh v24.4S, v28.4S, v14.s[2] -mul v28.4S, v28.4S,v5.s[2] -sub v6.4s, v8.4s, v15.4s -add v8.4s, v8.4s, v15.4s -mla v22.4S, v27.4S, v31.s[0] -sqrdmulh v27.4S, v1.4S, v19.s[1] -sub v14.4s, v11.4s, v22.4s -str q14, [x0, #784] -mla v16.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v13.4S, v19.s[2] -add v11.4s, v11.4s, v22.4s -str q11, [x0, #768] -mla v26.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v8.4S, v10.s[1] -sub v11.4s, v20.4s, v16.4s -str q11, [x0, #816] -mla v28.4S, v24.4S, v31.s[0] -sqrdmulh v24.4S, v6.4S, v10.s[2] -add v20.4s, v20.4s, v16.4s -str q20, [x0, #800] -mul v1.4S, v1.4S,v25.s[1] -mul v13.4S, v13.4S,v25.s[2] -sub v20.4s, v29.4s, v26.4s -str q20, [x0, #848] -mla v1.4S, v27.4S, v31.s[0] -mla v13.4S, v2.4S, v31.s[0] -add v29.4s, v29.4s, v26.4s -str q29, [x0, #832] -mul v8.4S, v8.4S,v4.s[1] -mul v6.4S, v6.4S,v4.s[2] -sub v19.4s, v18.4s, v28.4s -str q19, [x0, #880] -mla v8.4S, v21.4S, v31.s[0] -mla v6.4S, v24.4S, v31.s[0] -add v18.4s, v18.4s, v28.4s -str q18, [x0, #864] -sub v10.4s, v17.4s, v1.4s -str q10, [x0, #912] -add v17.4s, v17.4s, v1.4s -str q17, [x0, #896] -sub v17.4s, v9.4s, v13.4s -str q17, [x0, #944] -add v9.4s, v9.4s, v13.4s -str q9, [x0, #928] -sub v9.4s, v23.4s, v8.4s -str q9, [x0, #976] -add v23.4s, v23.4s, v8.4s -str q23, [x0, #960] -sub v23.4s, v30.4s, v6.4s -str q23, [x0, #1008] -add v30.4s, v30.4s, v6.4s -str q30, [x0, #992] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1520 -// Instruction count: 1516 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_14.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_14.s deleted file mode 100644 index 9604c58..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_14.s +++ /dev/null @@ -1,1550 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_22_z4_14 -.global _ntt_u32_incomplete_neon_asm_var_4_2_22_z4_14 -ntt_u32_incomplete_neon_asm_var_4_2_22_z4_14: -_ntt_u32_incomplete_neon_asm_var_4_2_22_z4_14: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x0, #992] -sqrdmulh v27.4S, v28.4S, v29.s[0] -mul v28.4S, v28.4S,v30.s[0] -ldr q26, [x0, #928] -sqrdmulh v25.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -ldr q24, [x0, #864] -sqrdmulh v23.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -ldr q22, [x0, #800] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #736] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mla v28.4S, v27.4S, v31.s[0] -ldr q27, [x0, #672] -sqrdmulh v18.4S, v27.4S, v29.s[0] -mla v26.4S, v25.4S, v31.s[0] -ldr q25, [x0, #608] -sqrdmulh v17.4S, v25.4S, v29.s[0] -mla v24.4S, v23.4S, v31.s[0] -ldr q23, [x0, #544] -sqrdmulh v16.4S, v23.4S, v29.s[0] -mla v22.4S, v21.4S, v31.s[0] -ldr q21, [x0, #480] -mul v27.4S, v27.4S,v30.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q3, [x0, #416] -ldr q2, [x0, #352] -ldr q1, [x0, #288] -mla v27.4S, v18.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -ldr q19, [x0, #224] -ldr q18, [x0, #160] -mul v23.4S, v23.4S,v30.s[0] -mul v25.4S, v25.4S,v30.s[0] -ldr q0, [x0, #96] -ldr q15, [x0, #32] -mla v23.4S, v16.4S, v31.s[0] -mla v25.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v28.4s -add v21.4s, v21.4s, v28.4s -sqrdmulh v28.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v16.4s, v3.4s, v26.4s -add v3.4s, v3.4s, v26.4s -sqrdmulh v26.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -sub v14.4s, v2.4s, v24.4s -add v2.4s, v2.4s, v24.4s -sqrdmulh v24.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v13.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v12.4s, v19.4s, v20.4s -add v19.4s, v19.4s, v20.4s -sqrdmulh v20.4S, v14.4S, v29.s[2] -mla v17.4S, v28.4S, v31.s[0] -sub v28.4s, v18.4s, v27.4s -add v18.4s, v18.4s, v27.4s -sqrdmulh v27.4S, v13.4S, v29.s[2] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v0.4s, v25.4s -add v0.4s, v0.4s, v25.4s -sqrdmulh v25.4S, v2.4S, v29.s[1] -mla v21.4S, v24.4S, v31.s[0] -sub v24.4s, v15.4s, v23.4s -sqrdmulh v11.4S, v1.4S, v29.s[1] -mla v3.4S, v22.4S, v31.s[0] -add v15.4s, v15.4s, v23.4s -ldr q23, [x17, #+32] -ldr q22, [x17, #+48] -mul v13.4S, v13.4S,v30.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v10.4s, v12.4s, v17.4s -add v12.4s, v12.4s, v17.4s -mla v13.4S, v27.4S, v31.s[0] -mla v14.4S, v20.4S, v31.s[0] -sub v20.4s, v28.4s, v16.4s -add v28.4s, v28.4s, v16.4s -mul v1.4S, v1.4S,v30.s[1] -mul v2.4S, v2.4S,v30.s[1] -sub v16.4s, v19.4s, v21.4s -add v19.4s, v19.4s, v21.4s -mla v1.4S, v11.4S, v31.s[0] -mla v2.4S, v25.4S, v31.s[0] -sub v25.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v10.4S, v22.s[3] -mul v10.4S, v10.4S,v23.s[3] -sub v11.4s, v26.4s, v14.4s -add v26.4s, v26.4s, v14.4s -sqrdmulh v14.4S, v12.4S, v22.s[2] -mul v12.4S, v12.4S,v23.s[2] -sub v21.4s, v24.4s, v13.4s -add v24.4s, v24.4s, v13.4s -sqrdmulh v13.4S, v16.4S, v22.s[1] -mul v16.4S, v16.4S,v23.s[1] -sub v27.4s, v0.4s, v2.4s -add v0.4s, v0.4s, v2.4s -sqrdmulh v2.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v17.4s, v15.4s, v1.4s -add v15.4s, v15.4s, v1.4s -ldr q1, [x17, #+96] -ldr q9, [x17, #+112] -sqrdmulh v8.4S, v20.4S, v22.s[3] -mla v10.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v28.4S, v22.s[2] -mla v12.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v25.4S, v22.s[1] -mla v16.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v18.4S, v22.s[0] -mla v19.4S, v2.4S, v31.s[0] -nop -nop -ldr q2, [x17, #+64] -ldr q7, [x17, #+80] -mul v28.4S, v28.4S,v23.s[2] -mul v20.4S, v20.4S,v23.s[3] -sub v6.4s, v11.4s, v10.4s -add v11.4s, v11.4s, v10.4s -mla v28.4S, v3.4S, v31.s[0] -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v26.4s, v12.4s -add v26.4s, v26.4s, v12.4s -mul v18.4S, v18.4S,v23.s[0] -mul v25.4S, v25.4S,v23.s[1] -sub v12.4s, v27.4s, v16.4s -add v27.4s, v27.4s, v16.4s -mla v18.4S, v13.4S, v31.s[0] -mla v25.4S, v14.4S, v31.s[0] -sub v14.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v9.s[3] -mul v6.4S, v6.4S,v1.s[3] -sub v13.4s, v21.4s, v20.4s -add v21.4s, v21.4s, v20.4s -sqrdmulh v20.4S, v11.4S, v9.s[2] -mul v11.4S, v11.4S,v1.s[2] -sub v16.4s, v24.4s, v28.4s -add v24.4s, v24.4s, v28.4s -sqrdmulh v28.4S, v8.4S, v9.s[1] -mul v8.4S, v8.4S,v1.s[1] -sub v3.4s, v17.4s, v25.4s -add v17.4s, v17.4s, v25.4s -sqrdmulh v25.4S, v26.4S, v9.s[0] -mul v26.4S, v26.4S,v1.s[0] -sub v10.4s, v15.4s, v18.4s -add v15.4s, v15.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v7.s[3] -mla v6.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v27.4S, v7.s[2] -mla v11.4S, v20.4S, v31.s[0] -nop -nop -sqrdmulh v20.4S, v14.4S, v7.s[1] -mla v8.4S, v28.4S, v31.s[0] -nop -nop -sqrdmulh v28.4S, v0.4S, v7.s[0] -mla v26.4S, v25.4S, v31.s[0] -nop -nop -mul v27.4S, v27.4S,v2.s[2] -mul v12.4S, v12.4S,v2.s[3] -sub v25.4s, v13.4s, v6.4s -str q25, [x0, #992] -mla v27.4S, v19.4S, v31.s[0] -mla v12.4S, v18.4S, v31.s[0] -add v13.4s, v13.4s, v6.4s -str q13, [x0, #928] -mul v0.4S, v0.4S,v2.s[0] -mul v14.4S, v14.4S,v2.s[1] -sub v13.4s, v21.4s, v11.4s -str q13, [x0, #864] -mla v0.4S, v28.4S, v31.s[0] -mla v14.4S, v20.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -sub v11.4s, v16.4s, v8.4s -ldr q20, [x0, #1008] -sqrdmulh v28.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v16.4s, v16.4s, v8.4s -str q21, [x0, #800] -ldr q21, [x0, #944] -sqrdmulh v8.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -sub v13.4s, v24.4s, v26.4s -str q11, [x0, #736] -ldr q11, [x0, #880] -sqrdmulh v6.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -add v24.4s, v24.4s, v26.4s -str q16, [x0, #672] -ldr q16, [x0, #816] -sqrdmulh v26.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v18.4s, v3.4s, v12.4s -str q13, [x0, #608] -ldr q13, [x0, #752] -sqrdmulh v19.4S, v13.4S, v29.s[0] -mla v20.4S, v28.4S, v31.s[0] -add v3.4s, v3.4s, v12.4s -str q24, [x0, #544] -ldr q24, [x0, #688] -sqrdmulh v12.4S, v24.4S, v29.s[0] -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v17.4s, v27.4s -str q18, [x0, #480] -ldr q18, [x0, #624] -sqrdmulh v28.4S, v18.4S, v29.s[0] -mla v11.4S, v6.4S, v31.s[0] -add v17.4s, v17.4s, v27.4s -str q3, [x0, #416] -ldr q3, [x0, #560] -sqrdmulh v27.4S, v3.4S, v29.s[0] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v10.4s, v14.4s -str q8, [x0, #352] -ldr q8, [x0, #496] -add v10.4s, v10.4s, v14.4s -mul v24.4S, v24.4S,v30.s[0] -mul v13.4S, v13.4S,v30.s[0] -ldr q14, [x0, #432] -str q17, [x0, #288] -ldr q17, [x0, #368] -ldr q6, [x0, #304] -mla v24.4S, v12.4S, v31.s[0] -mla v13.4S, v19.4S, v31.s[0] -str q26, [x0, #224] -sub v26.4s, v15.4s, v0.4s -ldr q19, [x0, #240] -ldr q12, [x0, #176] -mul v3.4S, v3.4S,v30.s[0] -mul v18.4S, v18.4S,v30.s[0] -str q10, [x0, #160] -add v15.4s, v15.4s, v0.4s -ldr q0, [x0, #112] -ldr q10, [x0, #48] -mla v3.4S, v27.4S, v31.s[0] -mla v18.4S, v28.4S, v31.s[0] -sub v28.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -sqrdmulh v20.4S, v28.4S, v29.s[2] -mul v28.4S, v28.4S,v30.s[2] -sub v27.4s, v14.4s, v21.4s -add v14.4s, v14.4s, v21.4s -sqrdmulh v21.4S, v27.4S, v29.s[2] -mul v27.4S, v27.4S,v30.s[2] -sub v25.4s, v17.4s, v11.4s -add v17.4s, v17.4s, v11.4s -sqrdmulh v11.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sub v5.4s, v6.4s, v16.4s -add v6.4s, v6.4s, v16.4s -sqrdmulh v16.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -sub v4.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -sqrdmulh v13.4S, v25.4S, v29.s[2] -mla v28.4S, v20.4S, v31.s[0] -sub v20.4s, v12.4s, v24.4s -add v12.4s, v12.4s, v24.4s -sqrdmulh v24.4S, v5.4S, v29.s[2] -mla v27.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v18.4s -add v0.4s, v0.4s, v18.4s -sqrdmulh v18.4S, v17.4S, v29.s[1] -mla v8.4S, v11.4S, v31.s[0] -sub v11.4s, v10.4s, v3.4s -str q26, [x0, #96] -sqrdmulh v26.4S, v6.4S, v29.s[1] -mla v14.4S, v16.4S, v31.s[0] -add v10.4s, v10.4s, v3.4s -str q15, [x0, #32] -mul v5.4S, v5.4S,v30.s[2] -mul v25.4S, v25.4S,v30.s[2] -sub v15.4s, v4.4s, v28.4s -add v4.4s, v4.4s, v28.4s -mla v5.4S, v24.4S, v31.s[0] -mla v25.4S, v13.4S, v31.s[0] -sub v13.4s, v20.4s, v27.4s -add v20.4s, v20.4s, v27.4s -mul v6.4S, v6.4S,v30.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v27.4s, v19.4s, v8.4s -add v19.4s, v19.4s, v8.4s -mla v6.4S, v26.4S, v31.s[0] -mla v17.4S, v18.4S, v31.s[0] -sub v18.4s, v12.4s, v14.4s -add v12.4s, v12.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v22.s[3] -mul v15.4S, v15.4S,v23.s[3] -sub v26.4s, v21.4s, v25.4s -add v21.4s, v21.4s, v25.4s -sqrdmulh v25.4S, v4.4S, v22.s[2] -mul v4.4S, v4.4S,v23.s[2] -sub v8.4s, v11.4s, v5.4s -add v11.4s, v11.4s, v5.4s -sqrdmulh v5.4S, v27.4S, v22.s[1] -mul v27.4S, v27.4S,v23.s[1] -sub v24.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v28.4s, v10.4s, v6.4s -add v10.4s, v10.4s, v6.4s -sqrdmulh v6.4S, v13.4S, v22.s[3] -mla v15.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v20.4S, v22.s[2] -mla v4.4S, v25.4S, v31.s[0] -nop -nop -sqrdmulh v25.4S, v18.4S, v22.s[1] -mla v27.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v12.4S, v22.s[0] -mla v19.4S, v17.4S, v31.s[0] -nop -nop -mul v20.4S, v20.4S,v23.s[2] -mul v13.4S, v13.4S,v23.s[3] -sub v17.4s, v26.4s, v15.4s -add v26.4s, v26.4s, v15.4s -mla v20.4S, v14.4S, v31.s[0] -mla v13.4S, v6.4S, v31.s[0] -sub v6.4s, v21.4s, v4.4s -add v21.4s, v21.4s, v4.4s -mul v12.4S, v12.4S,v23.s[0] -mul v18.4S, v18.4S,v23.s[1] -sub v4.4s, v24.4s, v27.4s -add v24.4s, v24.4s, v27.4s -mla v12.4S, v5.4S, v31.s[0] -mla v18.4S, v25.4S, v31.s[0] -sub v25.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v17.4S, v9.s[3] -mul v17.4S, v17.4S,v1.s[3] -sub v5.4s, v8.4s, v13.4s -add v8.4s, v8.4s, v13.4s -sqrdmulh v13.4S, v26.4S, v9.s[2] -mul v26.4S, v26.4S,v1.s[2] -sub v27.4s, v11.4s, v20.4s -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v6.4S, v9.s[1] -mul v6.4S, v6.4S,v1.s[1] -sub v14.4s, v28.4s, v18.4s -add v28.4s, v28.4s, v18.4s -sqrdmulh v18.4S, v21.4S, v9.s[0] -mul v21.4S, v21.4S,v1.s[0] -sub v15.4s, v10.4s, v12.4s -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v4.4S, v7.s[3] -mla v17.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v24.4S, v7.s[2] -mla v26.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v25.4S, v7.s[1] -mla v6.4S, v20.4S, v31.s[0] -nop -nop -sqrdmulh v20.4S, v0.4S, v7.s[0] -mla v21.4S, v18.4S, v31.s[0] -nop -nop -mul v24.4S, v24.4S,v2.s[2] -mul v4.4S, v4.4S,v2.s[3] -sub v18.4s, v5.4s, v17.4s -str q18, [x0, #1008] -mla v24.4S, v19.4S, v31.s[0] -mla v4.4S, v12.4S, v31.s[0] -add v5.4s, v5.4s, v17.4s -str q5, [x0, #944] -mul v0.4S, v0.4S,v2.s[0] -mul v25.4S, v25.4S,v2.s[1] -sub v5.4s, v8.4s, v26.4s -str q5, [x0, #880] -mla v0.4S, v20.4S, v31.s[0] -mla v25.4S, v13.4S, v31.s[0] -add v8.4s, v8.4s, v26.4s -sub v26.4s, v27.4s, v6.4s -ldr q13, [x0, #960] -sqrdmulh v20.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -add v27.4s, v27.4s, v6.4s -str q8, [x0, #816] -ldr q8, [x0, #896] -sqrdmulh v6.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v5.4s, v11.4s, v21.4s -str q26, [x0, #752] -ldr q26, [x0, #832] -sqrdmulh v17.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -add v11.4s, v11.4s, v21.4s -str q27, [x0, #688] -ldr q27, [x0, #768] -sqrdmulh v21.4S, v27.4S, v29.s[0] -mul v27.4S, v27.4S,v30.s[0] -sub v12.4s, v14.4s, v4.4s -str q5, [x0, #624] -ldr q5, [x0, #704] -sqrdmulh v19.4S, v5.4S, v29.s[0] -mla v13.4S, v20.4S, v31.s[0] -add v14.4s, v14.4s, v4.4s -str q11, [x0, #560] -ldr q11, [x0, #640] -sqrdmulh v4.4S, v11.4S, v29.s[0] -mla v8.4S, v6.4S, v31.s[0] -sub v6.4s, v28.4s, v24.4s -str q12, [x0, #496] -ldr q12, [x0, #576] -sqrdmulh v20.4S, v12.4S, v29.s[0] -mla v26.4S, v17.4S, v31.s[0] -add v28.4s, v28.4s, v24.4s -str q14, [x0, #432] -ldr q14, [x0, #512] -sqrdmulh v24.4S, v14.4S, v29.s[0] -mla v27.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v25.4s -str q6, [x0, #368] -ldr q6, [x0, #448] -add v15.4s, v15.4s, v25.4s -mul v11.4S, v11.4S,v30.s[0] -mul v5.4S, v5.4S,v30.s[0] -ldr q25, [x0, #384] -str q28, [x0, #304] -ldr q28, [x0, #320] -ldr q17, [x0, #256] -mla v11.4S, v4.4S, v31.s[0] -mla v5.4S, v19.4S, v31.s[0] -str q21, [x0, #240] -sub v21.4s, v10.4s, v0.4s -ldr q19, [x0, #192] -ldr q4, [x0, #128] -mul v14.4S, v14.4S,v30.s[0] -mul v12.4S, v12.4S,v30.s[0] -str q15, [x0, #176] -add v10.4s, v10.4s, v0.4s -ldr q0, [x0, #64] -ldr q15, [x0, #0] -mla v14.4S, v24.4S, v31.s[0] -mla v12.4S, v20.4S, v31.s[0] -sub v20.4s, v6.4s, v13.4s -add v6.4s, v6.4s, v13.4s -sqrdmulh v13.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v24.4s, v25.4s, v8.4s -add v25.4s, v25.4s, v8.4s -sqrdmulh v8.4S, v24.4S, v29.s[2] -mul v24.4S, v24.4S,v30.s[2] -sub v18.4s, v28.4s, v26.4s -add v28.4s, v28.4s, v26.4s -sqrdmulh v26.4S, v6.4S, v29.s[1] -mul v6.4S, v6.4S,v30.s[1] -sub v3.4s, v17.4s, v27.4s -add v17.4s, v17.4s, v27.4s -sqrdmulh v27.4S, v25.4S, v29.s[1] -mul v25.4S, v25.4S,v30.s[1] -sub v16.4s, v19.4s, v5.4s -add v19.4s, v19.4s, v5.4s -sqrdmulh v5.4S, v18.4S, v29.s[2] -mla v20.4S, v13.4S, v31.s[0] -sub v13.4s, v4.4s, v11.4s -add v4.4s, v4.4s, v11.4s -sqrdmulh v11.4S, v3.4S, v29.s[2] -mla v24.4S, v8.4S, v31.s[0] -sub v8.4s, v0.4s, v12.4s -add v0.4s, v0.4s, v12.4s -sqrdmulh v12.4S, v28.4S, v29.s[1] -mla v6.4S, v26.4S, v31.s[0] -sub v26.4s, v15.4s, v14.4s -str q21, [x0, #112] -sqrdmulh v21.4S, v17.4S, v29.s[1] -mla v25.4S, v27.4S, v31.s[0] -add v15.4s, v15.4s, v14.4s -str q10, [x0, #48] -mul v3.4S, v3.4S,v30.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v10.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -mla v3.4S, v11.4S, v31.s[0] -mla v18.4S, v5.4S, v31.s[0] -sub v5.4s, v13.4s, v24.4s -add v13.4s, v13.4s, v24.4s -mul v17.4S, v17.4S,v30.s[1] -mul v28.4S, v28.4S,v30.s[1] -sub v24.4s, v19.4s, v6.4s -add v19.4s, v19.4s, v6.4s -mla v17.4S, v21.4S, v31.s[0] -mla v28.4S, v12.4S, v31.s[0] -sub v12.4s, v4.4s, v25.4s -add v4.4s, v4.4s, v25.4s -sqrdmulh v25.4S, v10.4S, v22.s[3] -mul v10.4S, v10.4S,v23.s[3] -sub v21.4s, v8.4s, v18.4s -add v8.4s, v8.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v22.s[2] -mul v16.4S, v16.4S,v23.s[2] -sub v6.4s, v26.4s, v3.4s -add v26.4s, v26.4s, v3.4s -sqrdmulh v3.4S, v24.4S, v22.s[1] -mul v24.4S, v24.4S,v23.s[1] -sub v11.4s, v0.4s, v28.4s -add v0.4s, v0.4s, v28.4s -sqrdmulh v28.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v20.4s, v15.4s, v17.4s -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v5.4S, v22.s[3] -mla v10.4S, v25.4S, v31.s[0] -nop -nop -sqrdmulh v25.4S, v13.4S, v22.s[2] -mla v16.4S, v18.4S, v31.s[0] -nop -nop -sqrdmulh v18.4S, v12.4S, v22.s[1] -mla v24.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v4.4S, v22.s[0] -mla v19.4S, v28.4S, v31.s[0] -nop -nop -mul v13.4S, v13.4S,v23.s[2] -mul v5.4S, v5.4S,v23.s[3] -sub v28.4s, v21.4s, v10.4s -add v21.4s, v21.4s, v10.4s -mla v13.4S, v25.4S, v31.s[0] -mla v5.4S, v17.4S, v31.s[0] -sub v17.4s, v8.4s, v16.4s -add v8.4s, v8.4s, v16.4s -mul v4.4S, v4.4S,v23.s[0] -mul v12.4S, v12.4S,v23.s[1] -sub v16.4s, v11.4s, v24.4s -add v11.4s, v11.4s, v24.4s -mla v4.4S, v3.4S, v31.s[0] -mla v12.4S, v18.4S, v31.s[0] -sub v18.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v28.4S, v9.s[3] -mul v28.4S, v28.4S,v1.s[3] -sub v3.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v21.4S, v9.s[2] -mul v21.4S, v21.4S,v1.s[2] -sub v24.4s, v26.4s, v13.4s -add v26.4s, v26.4s, v13.4s -sqrdmulh v13.4S, v17.4S, v9.s[1] -mul v17.4S, v17.4S,v1.s[1] -sub v25.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -sqrdmulh v12.4S, v8.4S, v9.s[0] -mul v8.4S, v8.4S,v1.s[0] -sub v10.4s, v15.4s, v4.4s -add v15.4s, v15.4s, v4.4s -sqrdmulh v4.4S, v16.4S, v7.s[3] -mla v28.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v11.4S, v7.s[2] -mla v21.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v18.4S, v7.s[1] -mla v17.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v0.4S, v7.s[0] -mla v8.4S, v12.4S, v31.s[0] -nop -nop -mul v11.4S, v11.4S,v2.s[2] -mul v16.4S, v16.4S,v2.s[3] -sub v12.4s, v3.4s, v28.4s -str q12, [x0, #960] -mla v11.4S, v19.4S, v31.s[0] -mla v16.4S, v4.4S, v31.s[0] -add v3.4s, v3.4s, v28.4s -str q3, [x0, #896] -mul v0.4S, v0.4S,v2.s[0] -mul v18.4S, v18.4S,v2.s[1] -sub v3.4s, v6.4s, v21.4s -str q3, [x0, #832] -mla v0.4S, v13.4S, v31.s[0] -mla v18.4S, v5.4S, v31.s[0] -add v6.4s, v6.4s, v21.4s -sub v21.4s, v24.4s, v17.4s -ldr q5, [x0, #976] -sqrdmulh v13.4S, v5.4S, v29.s[0] -mul v5.4S, v5.4S,v30.s[0] -add v24.4s, v24.4s, v17.4s -str q6, [x0, #768] -ldr q6, [x0, #912] -sqrdmulh v17.4S, v6.4S, v29.s[0] -mul v6.4S, v6.4S,v30.s[0] -sub v3.4s, v26.4s, v8.4s -str q21, [x0, #704] -ldr q21, [x0, #848] -sqrdmulh v28.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -add v26.4s, v26.4s, v8.4s -str q24, [x0, #640] -ldr q24, [x0, #784] -sqrdmulh v8.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -sub v4.4s, v25.4s, v16.4s -str q3, [x0, #576] -ldr q3, [x0, #720] -sqrdmulh v19.4S, v3.4S, v29.s[0] -mla v5.4S, v13.4S, v31.s[0] -add v25.4s, v25.4s, v16.4s -str q26, [x0, #512] -ldr q26, [x0, #656] -sqrdmulh v16.4S, v26.4S, v29.s[0] -mla v6.4S, v17.4S, v31.s[0] -sub v17.4s, v20.4s, v11.4s -str q4, [x0, #448] -ldr q4, [x0, #592] -sqrdmulh v13.4S, v4.4S, v29.s[0] -mla v21.4S, v28.4S, v31.s[0] -add v20.4s, v20.4s, v11.4s -str q25, [x0, #384] -ldr q25, [x0, #528] -sqrdmulh v11.4S, v25.4S, v29.s[0] -mla v24.4S, v8.4S, v31.s[0] -sub v8.4s, v10.4s, v18.4s -str q17, [x0, #320] -ldr q17, [x0, #464] -add v10.4s, v10.4s, v18.4s -mul v26.4S, v26.4S,v30.s[0] -mul v3.4S, v3.4S,v30.s[0] -ldr q18, [x0, #400] -str q20, [x0, #256] -ldr q20, [x0, #336] -ldr q28, [x0, #272] -mla v26.4S, v16.4S, v31.s[0] -mla v3.4S, v19.4S, v31.s[0] -str q8, [x0, #192] -sub v8.4s, v15.4s, v0.4s -ldr q19, [x0, #208] -ldr q16, [x0, #144] -mul v25.4S, v25.4S,v30.s[0] -mul v4.4S, v4.4S,v30.s[0] -str q10, [x0, #128] -add v15.4s, v15.4s, v0.4s -ldr q0, [x0, #80] -ldr q10, [x0, #16] -mla v25.4S, v11.4S, v31.s[0] -mla v4.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v5.4s -add v17.4s, v17.4s, v5.4s -sqrdmulh v5.4S, v13.4S, v29.s[2] -mul v13.4S, v13.4S,v30.s[2] -sub v11.4s, v18.4s, v6.4s -add v18.4s, v18.4s, v6.4s -sqrdmulh v6.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v12.4s, v20.4s, v21.4s -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v14.4s, v28.4s, v24.4s -add v28.4s, v28.4s, v24.4s -sqrdmulh v24.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v27.4s, v19.4s, v3.4s -add v19.4s, v19.4s, v3.4s -sqrdmulh v3.4S, v12.4S, v29.s[2] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v16.4s, v26.4s -add v16.4s, v16.4s, v26.4s -sqrdmulh v26.4S, v14.4S, v29.s[2] -mla v11.4S, v6.4S, v31.s[0] -sub v6.4s, v0.4s, v4.4s -add v0.4s, v0.4s, v4.4s -sqrdmulh v4.4S, v20.4S, v29.s[1] -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v10.4s, v25.4s -str q8, [x0, #64] -sqrdmulh v8.4S, v28.4S, v29.s[1] -mla v18.4S, v24.4S, v31.s[0] -add v10.4s, v10.4s, v25.4s -str q15, [x0, #0] -mul v14.4S, v14.4S,v30.s[2] -mul v12.4S, v12.4S,v30.s[2] -sub v15.4s, v27.4s, v13.4s -add v27.4s, v27.4s, v13.4s -mla v14.4S, v26.4S, v31.s[0] -mla v12.4S, v3.4S, v31.s[0] -sub v3.4s, v5.4s, v11.4s -add v5.4s, v5.4s, v11.4s -mul v28.4S, v28.4S,v30.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v11.4s, v19.4s, v17.4s -add v19.4s, v19.4s, v17.4s -mla v28.4S, v8.4S, v31.s[0] -mla v20.4S, v4.4S, v31.s[0] -sub v4.4s, v16.4s, v18.4s -add v16.4s, v16.4s, v18.4s -sqrdmulh v29.4S, v15.4S, v22.s[3] -mul v15.4S, v15.4S,v23.s[3] -sub v30.4s, v6.4s, v12.4s -add v6.4s, v6.4s, v12.4s -sqrdmulh v12.4S, v27.4S, v22.s[2] -mul v27.4S, v27.4S,v23.s[2] -sub v18.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v22.s[1] -mul v11.4S, v11.4S,v23.s[1] -sub v8.4s, v0.4s, v20.4s -add v0.4s, v0.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v17.4s, v10.4s, v28.4s -add v10.4s, v10.4s, v28.4s -sqrdmulh v28.4S, v3.4S, v22.s[3] -mla v15.4S, v29.4S, v31.s[0] -nop -nop -sqrdmulh v29.4S, v5.4S, v22.s[2] -mla v27.4S, v12.4S, v31.s[0] -nop -nop -sqrdmulh v12.4S, v4.4S, v22.s[1] -mla v11.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v16.4S, v22.s[0] -mla v19.4S, v20.4S, v31.s[0] -nop -nop -mul v5.4S, v5.4S,v23.s[2] -mul v3.4S, v3.4S,v23.s[3] -sub v20.4s, v30.4s, v15.4s -add v30.4s, v30.4s, v15.4s -mla v5.4S, v29.4S, v31.s[0] -mla v3.4S, v28.4S, v31.s[0] -sub v28.4s, v6.4s, v27.4s -add v6.4s, v6.4s, v27.4s -mul v16.4S, v16.4S,v23.s[0] -mul v4.4S, v4.4S,v23.s[1] -sub v27.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -mla v16.4S, v14.4S, v31.s[0] -mla v4.4S, v12.4S, v31.s[0] -sub v12.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v22.4S, v20.4S, v9.s[3] -mul v20.4S, v20.4S,v1.s[3] -sub v23.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v30.4S, v9.s[2] -mul v30.4S, v30.4S,v1.s[2] -sub v19.4s, v21.4s, v5.4s -add v21.4s, v21.4s, v5.4s -sqrdmulh v5.4S, v28.4S, v9.s[1] -mul v28.4S, v28.4S,v1.s[1] -sub v14.4s, v17.4s, v4.4s -add v17.4s, v17.4s, v4.4s -sqrdmulh v4.4S, v6.4S, v9.s[0] -mul v6.4S, v6.4S,v1.s[0] -sub v11.4s, v10.4s, v16.4s -add v10.4s, v10.4s, v16.4s -sqrdmulh v9.4S, v27.4S, v7.s[3] -mla v20.4S, v22.4S, v31.s[0] -nop -nop -sqrdmulh v22.4S, v8.4S, v7.s[2] -mla v30.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v12.4S, v7.s[1] -mla v28.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v0.4S, v7.s[0] -mla v6.4S, v4.4S, v31.s[0] -nop -nop -mul v8.4S, v8.4S,v2.s[2] -mul v27.4S, v27.4S,v2.s[3] -sub v4.4s, v23.4s, v20.4s -str q4, [x0, #976] -mla v8.4S, v22.4S, v31.s[0] -mla v27.4S, v9.4S, v31.s[0] -add v23.4s, v23.4s, v20.4s -str q23, [x0, #912] -mul v0.4S, v0.4S,v2.s[0] -mul v12.4S, v12.4S,v2.s[1] -sub v23.4s, v18.4s, v30.4s -str q23, [x0, #848] -mla v0.4S, v5.4S, v31.s[0] -mla v12.4S, v3.4S, v31.s[0] -add v18.4s, v18.4s, v30.4s -sub v30.4s, v19.4s, v28.4s -add v19.4s, v19.4s, v28.4s -str q18, [x0, #784] -sub v18.4s, v21.4s, v6.4s -str q30, [x0, #720] -add v21.4s, v21.4s, v6.4s -str q19, [x0, #656] -sub v19.4s, v14.4s, v27.4s -str q18, [x0, #592] -add v14.4s, v14.4s, v27.4s -str q21, [x0, #528] -sub v21.4s, v17.4s, v8.4s -str q19, [x0, #464] -add v17.4s, v17.4s, v8.4s -str q14, [x0, #400] -sub v14.4s, v11.4s, v12.4s -str q21, [x0, #336] -add v11.4s, v11.4s, v12.4s -str q17, [x0, #272] -sub v17.4s, v10.4s, v0.4s -add v10.4s, v10.4s, v0.4s -ldr q24, [x0, #48] -ldr q25, [x0, #32] -ldr q13, [x0, #112] -ldr q26, [x0, #96] -ldr q15, [x17, #+128] -ldr q29, [x17, #+144] -ldr q16, [x17, #+160] -ldr q1, [x17, #+176] -ldr q4, [x0, #176] -ldr q22, [x0, #160] -sqrdmulh v9.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v15.s[0] -ldr q20, [x0, #240] -sqrdmulh v23.4S, v25.4S, v29.s[0] -mul v25.4S, v25.4S,v15.s[0] -ldr q5, [x0, #224] -sqrdmulh v3.4S, v13.4S, v1.s[0] -mul v13.4S, v13.4S,v16.s[0] -ldr q2, [x17, #+192] -sqrdmulh v7.4S, v26.4S, v1.s[0] -mul v26.4S, v26.4S,v16.s[0] -ldr q28, [x17, #+208] -mla v24.4S, v9.4S, v31.s[0] -sqrdmulh v9.4S, v4.4S, v28.s[0] -ldr q30, [x17, #+224] -mla v25.4S, v23.4S, v31.s[0] -sqrdmulh v23.4S, v22.4S, v28.s[0] -ldr q6, [x17, #+240] -mla v13.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v20.4S, v6.s[0] -mla v26.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v5.4S, v6.s[0] -ldr q18, [x0, #0] -mul v4.4S, v4.4S,v2.s[0] -mul v22.4S, v22.4S,v2.s[0] -sub v27.4s, v10.4s, v24.4s -add v10.4s, v10.4s, v24.4s -mla v4.4S, v9.4S, v31.s[0] -mla v22.4S, v23.4S, v31.s[0] -sub v23.4s, v18.4s, v25.4s -ldr q9, [x0, #64] -add v18.4s, v18.4s, v25.4s -mul v20.4S, v20.4S,v30.s[0] -mul v5.4S, v5.4S,v30.s[0] -sub v25.4s, v17.4s, v13.4s -add v17.4s, v17.4s, v13.4s -mla v20.4S, v3.4S, v31.s[0] -mla v5.4S, v7.4S, v31.s[0] -sub v7.4s, v9.4s, v26.4s -ldr q3, [x0, #128] -add v9.4s, v9.4s, v26.4s -sqrdmulh v26.4S, v27.4S, v29.s[2] -mul v27.4S, v27.4S,v15.s[2] -sub v13.4s, v11.4s, v4.4s -add v11.4s, v11.4s, v4.4s -sqrdmulh v4.4S, v10.4S, v29.s[1] -mul v10.4S, v10.4S,v15.s[1] -sub v24.4s, v3.4s, v22.4s -ldr q19, [x0, #192] -add v3.4s, v3.4s, v22.4s -sqrdmulh v29.4S, v25.4S, v1.s[2] -mul v25.4S, v25.4S,v16.s[2] -sub v22.4s, v14.4s, v20.4s -ldr q15, [x0, #304] -add v14.4s, v14.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v1.s[1] -mul v17.4S, v17.4S,v16.s[1] -sub v8.4s, v19.4s, v5.4s -ldr q21, [x0, #288] -add v19.4s, v19.4s, v5.4s -mla v27.4S, v26.4S, v31.s[0] -sqrdmulh v26.4S, v13.4S, v28.s[2] -sub v1.4s, v23.4s, v27.4s -ldr q5, [x0, #368] -str q1, [x0, #48] -mla v10.4S, v4.4S, v31.s[0] -sqrdmulh v4.4S, v11.4S, v28.s[1] -add v23.4s, v23.4s, v27.4s -ldr q27, [x0, #352] -str q23, [x0, #32] -mla v25.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v22.4S, v6.s[2] -sub v23.4s, v18.4s, v10.4s -ldr q1, [x17, #+256] -str q23, [x0, #16] -mla v17.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v14.4S, v6.s[1] -add v18.4s, v18.4s, v10.4s -ldr q10, [x17, #+272] -str q18, [x0, #0] -mul v13.4S, v13.4S,v2.s[2] -mul v11.4S, v11.4S,v2.s[1] -sub v18.4s, v7.4s, v25.4s -ldr q23, [x17, #+288] -str q18, [x0, #112] -mla v13.4S, v26.4S, v31.s[0] -mla v11.4S, v4.4S, v31.s[0] -add v7.4s, v7.4s, v25.4s -ldr q25, [x17, #+304] -str q7, [x0, #96] -mul v22.4S, v22.4S,v30.s[2] -mul v14.4S, v14.4S,v30.s[1] -sub v28.4s, v9.4s, v17.4s -ldr q7, [x0, #432] -str q28, [x0, #80] -mla v22.4S, v29.4S, v31.s[0] -mla v14.4S, v20.4S, v31.s[0] -add v9.4s, v9.4s, v17.4s -ldr q17, [x0, #416] -str q9, [x0, #64] -sqrdmulh v6.4S, v15.4S, v10.s[0] -mul v15.4S, v15.4S,v1.s[0] -sub v9.4s, v24.4s, v13.4s -ldr q30, [x0, #496] -str q9, [x0, #176] -sqrdmulh v9.4S, v21.4S, v10.s[0] -mul v21.4S, v21.4S,v1.s[0] -add v24.4s, v24.4s, v13.4s -ldr q13, [x0, #480] -str q24, [x0, #160] -sqrdmulh v24.4S, v5.4S, v25.s[0] -mul v5.4S, v5.4S,v23.s[0] -sub v20.4s, v3.4s, v11.4s -ldr q29, [x17, #+320] -str q20, [x0, #144] -sqrdmulh v20.4S, v27.4S, v25.s[0] -mul v27.4S, v27.4S,v23.s[0] -add v3.4s, v3.4s, v11.4s -ldr q11, [x17, #+336] -str q3, [x0, #128] -mla v15.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v7.4S, v11.s[0] -sub v3.4s, v8.4s, v22.4s -ldr q28, [x17, #+352] -str q3, [x0, #240] -mla v21.4S, v9.4S, v31.s[0] -sqrdmulh v9.4S, v17.4S, v11.s[0] -add v8.4s, v8.4s, v22.4s -ldr q22, [x17, #+368] -str q8, [x0, #224] -mla v5.4S, v24.4S, v31.s[0] -sqrdmulh v24.4S, v30.4S, v22.s[0] -sub v8.4s, v19.4s, v14.4s -ldr q3, [x0, #272] -str q8, [x0, #208] -mla v27.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v13.4S, v22.s[0] -add v19.4s, v19.4s, v14.4s -ldr q14, [x0, #256] -str q19, [x0, #192] -mul v7.4S, v7.4S,v29.s[0] -mul v17.4S, v17.4S,v29.s[0] -sub v19.4s, v3.4s, v15.4s -ldr q8, [x0, #336] -add v3.4s, v3.4s, v15.4s -mla v7.4S, v6.4S, v31.s[0] -mla v17.4S, v9.4S, v31.s[0] -sub v9.4s, v14.4s, v21.4s -ldr q6, [x0, #320] -add v14.4s, v14.4s, v21.4s -mul v30.4S, v30.4S,v28.s[0] -mul v13.4S, v13.4S,v28.s[0] -sub v21.4s, v8.4s, v5.4s -ldr q15, [x0, #400] -add v8.4s, v8.4s, v5.4s -mla v30.4S, v24.4S, v31.s[0] -mla v13.4S, v20.4S, v31.s[0] -sub v20.4s, v6.4s, v27.4s -ldr q24, [x0, #384] -add v6.4s, v6.4s, v27.4s -sqrdmulh v27.4S, v19.4S, v10.s[2] -mul v19.4S, v19.4S,v1.s[2] -sub v5.4s, v15.4s, v7.4s -ldr q2, [x0, #464] -add v15.4s, v15.4s, v7.4s -sqrdmulh v7.4S, v3.4S, v10.s[1] -mul v3.4S, v3.4S,v1.s[1] -sub v4.4s, v24.4s, v17.4s -ldr q26, [x0, #448] -add v24.4s, v24.4s, v17.4s -sqrdmulh v10.4S, v21.4S, v25.s[2] -mul v21.4S, v21.4S,v23.s[2] -sub v17.4s, v2.4s, v30.4s -ldr q1, [x0, #560] -add v2.4s, v2.4s, v30.4s -sqrdmulh v30.4S, v8.4S, v25.s[1] -mul v8.4S, v8.4S,v23.s[1] -sub v18.4s, v26.4s, v13.4s -ldr q16, [x0, #544] -add v26.4s, v26.4s, v13.4s -mla v19.4S, v27.4S, v31.s[0] -sqrdmulh v27.4S, v5.4S, v11.s[2] -sub v25.4s, v9.4s, v19.4s -ldr q13, [x0, #624] -str q25, [x0, #304] -mla v3.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v15.4S, v11.s[1] -add v9.4s, v9.4s, v19.4s -ldr q19, [x0, #608] -str q9, [x0, #288] -mla v21.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v17.4S, v22.s[2] -sub v9.4s, v14.4s, v3.4s -ldr q25, [x17, #+384] -str q9, [x0, #272] -mla v8.4S, v30.4S, v31.s[0] -sqrdmulh v30.4S, v2.4S, v22.s[1] -add v14.4s, v14.4s, v3.4s -ldr q3, [x17, #+400] -str q14, [x0, #256] -mul v5.4S, v5.4S,v29.s[2] -mul v15.4S, v15.4S,v29.s[1] -sub v14.4s, v20.4s, v21.4s -ldr q9, [x17, #+416] -str q14, [x0, #368] -mla v5.4S, v27.4S, v31.s[0] -mla v15.4S, v7.4S, v31.s[0] -add v20.4s, v20.4s, v21.4s -ldr q21, [x17, #+432] -str q20, [x0, #352] -mul v17.4S, v17.4S,v28.s[2] -mul v2.4S, v2.4S,v28.s[1] -sub v11.4s, v6.4s, v8.4s -ldr q20, [x0, #688] -str q11, [x0, #336] -mla v17.4S, v10.4S, v31.s[0] -mla v2.4S, v30.4S, v31.s[0] -add v6.4s, v6.4s, v8.4s -ldr q8, [x0, #672] -str q6, [x0, #320] -sqrdmulh v22.4S, v1.4S, v3.s[0] -mul v1.4S, v1.4S,v25.s[0] -sub v6.4s, v4.4s, v5.4s -ldr q28, [x0, #752] -str q6, [x0, #432] -sqrdmulh v6.4S, v16.4S, v3.s[0] -mul v16.4S, v16.4S,v25.s[0] -add v4.4s, v4.4s, v5.4s -ldr q5, [x0, #736] -str q4, [x0, #416] -sqrdmulh v4.4S, v13.4S, v21.s[0] -mul v13.4S, v13.4S,v9.s[0] -sub v30.4s, v24.4s, v15.4s -ldr q10, [x17, #+448] -str q30, [x0, #400] -sqrdmulh v30.4S, v19.4S, v21.s[0] -mul v19.4S, v19.4S,v9.s[0] -add v24.4s, v24.4s, v15.4s -ldr q15, [x17, #+464] -str q24, [x0, #384] -mla v1.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v20.4S, v15.s[0] -sub v24.4s, v18.4s, v17.4s -ldr q11, [x17, #+480] -str q24, [x0, #496] -mla v16.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v8.4S, v15.s[0] -add v18.4s, v18.4s, v17.4s -ldr q17, [x17, #+496] -str q18, [x0, #480] -mla v13.4S, v4.4S, v31.s[0] -sqrdmulh v4.4S, v28.4S, v17.s[0] -sub v18.4s, v26.4s, v2.4s -ldr q24, [x0, #528] -str q18, [x0, #464] -mla v19.4S, v30.4S, v31.s[0] -sqrdmulh v30.4S, v5.4S, v17.s[0] -add v26.4s, v26.4s, v2.4s -ldr q2, [x0, #512] -str q26, [x0, #448] -mul v20.4S, v20.4S,v10.s[0] -mul v8.4S, v8.4S,v10.s[0] -sub v26.4s, v24.4s, v1.4s -ldr q18, [x0, #592] -add v24.4s, v24.4s, v1.4s -mla v20.4S, v22.4S, v31.s[0] -mla v8.4S, v6.4S, v31.s[0] -sub v6.4s, v2.4s, v16.4s -ldr q22, [x0, #576] -add v2.4s, v2.4s, v16.4s -mul v28.4S, v28.4S,v11.s[0] -mul v5.4S, v5.4S,v11.s[0] -sub v16.4s, v18.4s, v13.4s -ldr q1, [x0, #656] -add v18.4s, v18.4s, v13.4s -mla v28.4S, v4.4S, v31.s[0] -mla v5.4S, v30.4S, v31.s[0] -sub v30.4s, v22.4s, v19.4s -ldr q4, [x0, #640] -add v22.4s, v22.4s, v19.4s -sqrdmulh v19.4S, v26.4S, v3.s[2] -mul v26.4S, v26.4S,v25.s[2] -sub v13.4s, v1.4s, v20.4s -ldr q29, [x0, #720] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v24.4S, v3.s[1] -mul v24.4S, v24.4S,v25.s[1] -sub v7.4s, v4.4s, v8.4s -ldr q27, [x0, #704] -add v4.4s, v4.4s, v8.4s -sqrdmulh v3.4S, v16.4S, v21.s[2] -mul v16.4S, v16.4S,v9.s[2] -sub v8.4s, v29.4s, v28.4s -ldr q25, [x0, #816] -add v29.4s, v29.4s, v28.4s -sqrdmulh v28.4S, v18.4S, v21.s[1] -mul v18.4S, v18.4S,v9.s[1] -sub v14.4s, v27.4s, v5.4s -ldr q23, [x0, #800] -add v27.4s, v27.4s, v5.4s -mla v26.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v13.4S, v15.s[2] -sub v21.4s, v6.4s, v26.4s -ldr q5, [x0, #880] -str q21, [x0, #560] -mla v24.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v1.4S, v15.s[1] -add v6.4s, v6.4s, v26.4s -ldr q26, [x0, #864] -str q6, [x0, #544] -mla v16.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v8.4S, v17.s[2] -sub v6.4s, v2.4s, v24.4s -ldr q21, [x17, #+512] -str q6, [x0, #528] -mla v18.4S, v28.4S, v31.s[0] -sqrdmulh v28.4S, v29.4S, v17.s[1] -add v2.4s, v2.4s, v24.4s -ldr q24, [x17, #+528] -str q2, [x0, #512] -mul v13.4S, v13.4S,v10.s[2] -mul v1.4S, v1.4S,v10.s[1] -sub v2.4s, v30.4s, v16.4s -ldr q6, [x17, #+544] -str q2, [x0, #624] -mla v13.4S, v19.4S, v31.s[0] -mla v1.4S, v20.4S, v31.s[0] -add v30.4s, v30.4s, v16.4s -ldr q16, [x17, #+560] -str q30, [x0, #608] -mul v8.4S, v8.4S,v11.s[2] -mul v29.4S, v29.4S,v11.s[1] -sub v15.4s, v22.4s, v18.4s -ldr q30, [x0, #944] -str q15, [x0, #592] -mla v8.4S, v3.4S, v31.s[0] -mla v29.4S, v28.4S, v31.s[0] -add v22.4s, v22.4s, v18.4s -ldr q18, [x0, #928] -str q22, [x0, #576] -sqrdmulh v17.4S, v25.4S, v24.s[0] -mul v25.4S, v25.4S,v21.s[0] -sub v22.4s, v7.4s, v13.4s -ldr q11, [x0, #1008] -str q22, [x0, #688] -sqrdmulh v22.4S, v23.4S, v24.s[0] -mul v23.4S, v23.4S,v21.s[0] -add v7.4s, v7.4s, v13.4s -ldr q13, [x0, #992] -str q7, [x0, #672] -sqrdmulh v7.4S, v5.4S, v16.s[0] -mul v5.4S, v5.4S,v6.s[0] -sub v28.4s, v4.4s, v1.4s -ldr q3, [x17, #+576] -str q28, [x0, #656] -sqrdmulh v28.4S, v26.4S, v16.s[0] -mul v26.4S, v26.4S,v6.s[0] -add v4.4s, v4.4s, v1.4s -ldr q1, [x17, #+592] -str q4, [x0, #640] -mla v25.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v30.4S, v1.s[0] -sub v4.4s, v14.4s, v8.4s -ldr q15, [x17, #+608] -str q4, [x0, #752] -mla v23.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v18.4S, v1.s[0] -add v14.4s, v14.4s, v8.4s -ldr q8, [x17, #+624] -str q14, [x0, #736] -mla v5.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v11.4S, v8.s[0] -sub v14.4s, v27.4s, v29.4s -ldr q4, [x0, #784] -str q14, [x0, #720] -mla v26.4S, v28.4S, v31.s[0] -sqrdmulh v28.4S, v13.4S, v8.s[0] -add v27.4s, v27.4s, v29.4s -ldr q29, [x0, #768] -str q27, [x0, #704] -mul v30.4S, v30.4S,v3.s[0] -mul v18.4S, v18.4S,v3.s[0] -sub v27.4s, v4.4s, v25.4s -ldr q14, [x0, #848] -add v4.4s, v4.4s, v25.4s -mla v30.4S, v17.4S, v31.s[0] -mla v18.4S, v22.4S, v31.s[0] -sub v22.4s, v29.4s, v23.4s -ldr q17, [x0, #832] -add v29.4s, v29.4s, v23.4s -mul v11.4S, v11.4S,v15.s[0] -mul v13.4S, v13.4S,v15.s[0] -sub v23.4s, v14.4s, v5.4s -ldr q25, [x0, #912] -add v14.4s, v14.4s, v5.4s -mla v11.4S, v7.4S, v31.s[0] -mla v13.4S, v28.4S, v31.s[0] -sub v28.4s, v17.4s, v26.4s -ldr q7, [x0, #896] -add v17.4s, v17.4s, v26.4s -sqrdmulh v26.4S, v27.4S, v24.s[2] -mul v27.4S, v27.4S,v21.s[2] -sub v5.4s, v25.4s, v30.4s -ldr q10, [x0, #976] -add v25.4s, v25.4s, v30.4s -sqrdmulh v30.4S, v4.4S, v24.s[1] -mul v4.4S, v4.4S,v21.s[1] -sub v20.4s, v7.4s, v18.4s -ldr q19, [x0, #960] -add v7.4s, v7.4s, v18.4s -sqrdmulh v24.4S, v23.4S, v16.s[2] -mul v23.4S, v23.4S,v6.s[2] -sub v18.4s, v10.4s, v11.4s -add v10.4s, v10.4s, v11.4s -sqrdmulh v11.4S, v14.4S, v16.s[1] -mul v14.4S, v14.4S,v6.s[1] -sub v21.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -mla v27.4S, v26.4S, v31.s[0] -sqrdmulh v26.4S, v5.4S, v1.s[2] -sub v16.4s, v22.4s, v27.4s -str q16, [x0, #816] -mla v4.4S, v30.4S, v31.s[0] -sqrdmulh v30.4S, v25.4S, v1.s[1] -add v22.4s, v22.4s, v27.4s -str q22, [x0, #800] -mla v23.4S, v24.4S, v31.s[0] -sqrdmulh v24.4S, v18.4S, v8.s[2] -sub v22.4s, v29.4s, v4.4s -str q22, [x0, #784] -mla v14.4S, v11.4S, v31.s[0] -sqrdmulh v11.4S, v10.4S, v8.s[1] -add v29.4s, v29.4s, v4.4s -str q29, [x0, #768] -mul v5.4S, v5.4S,v3.s[2] -mul v25.4S, v25.4S,v3.s[1] -sub v29.4s, v28.4s, v23.4s -str q29, [x0, #880] -mla v5.4S, v26.4S, v31.s[0] -mla v25.4S, v30.4S, v31.s[0] -add v28.4s, v28.4s, v23.4s -str q28, [x0, #864] -mul v18.4S, v18.4S,v15.s[2] -mul v10.4S, v10.4S,v15.s[1] -sub v1.4s, v17.4s, v14.4s -str q1, [x0, #848] -mla v18.4S, v24.4S, v31.s[0] -mla v10.4S, v11.4S, v31.s[0] -add v17.4s, v17.4s, v14.4s -str q17, [x0, #832] -sub v8.4s, v20.4s, v5.4s -str q8, [x0, #944] -add v20.4s, v20.4s, v5.4s -str q20, [x0, #928] -sub v20.4s, v7.4s, v25.4s -str q20, [x0, #912] -add v7.4s, v7.4s, v25.4s -str q7, [x0, #896] -sub v7.4s, v21.4s, v18.4s -str q7, [x0, #1008] -add v21.4s, v21.4s, v18.4s -str q21, [x0, #992] -sub v21.4s, v19.4s, v10.4s -str q21, [x0, #976] -add v19.4s, v19.4s, v10.4s -str q19, [x0, #960] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1520 -// Instruction count: 1516 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_15.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_15.s deleted file mode 100644 index afe097b..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_15.s +++ /dev/null @@ -1,1550 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_22_z4_15 -.global _ntt_u32_incomplete_neon_asm_var_4_2_22_z4_15 -ntt_u32_incomplete_neon_asm_var_4_2_22_z4_15: -_ntt_u32_incomplete_neon_asm_var_4_2_22_z4_15: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x0, #992] -sqrdmulh v27.4S, v28.4S, v29.s[0] -mul v28.4S, v28.4S,v30.s[0] -ldr q26, [x0, #928] -sqrdmulh v25.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -ldr q24, [x0, #864] -sqrdmulh v23.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -ldr q22, [x0, #800] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #736] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mla v28.4S, v27.4S, v31.s[0] -ldr q27, [x0, #672] -sqrdmulh v18.4S, v27.4S, v29.s[0] -mla v26.4S, v25.4S, v31.s[0] -ldr q25, [x0, #608] -sqrdmulh v17.4S, v25.4S, v29.s[0] -mla v24.4S, v23.4S, v31.s[0] -ldr q23, [x0, #544] -sqrdmulh v16.4S, v23.4S, v29.s[0] -mla v22.4S, v21.4S, v31.s[0] -ldr q21, [x0, #480] -mul v27.4S, v27.4S,v30.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q3, [x0, #416] -ldr q2, [x0, #352] -ldr q1, [x0, #288] -mla v27.4S, v18.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -ldr q19, [x0, #224] -ldr q18, [x0, #160] -mul v23.4S, v23.4S,v30.s[0] -mul v25.4S, v25.4S,v30.s[0] -ldr q0, [x0, #96] -ldr q15, [x0, #32] -mla v23.4S, v16.4S, v31.s[0] -mla v25.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v28.4s -add v21.4s, v21.4s, v28.4s -sqrdmulh v28.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v16.4s, v3.4s, v26.4s -add v3.4s, v3.4s, v26.4s -sqrdmulh v26.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -sub v14.4s, v2.4s, v24.4s -add v2.4s, v2.4s, v24.4s -sqrdmulh v24.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v13.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v12.4s, v19.4s, v20.4s -add v19.4s, v19.4s, v20.4s -sqrdmulh v20.4S, v14.4S, v29.s[2] -mla v17.4S, v28.4S, v31.s[0] -sub v28.4s, v18.4s, v27.4s -add v18.4s, v18.4s, v27.4s -sqrdmulh v27.4S, v13.4S, v29.s[2] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v0.4s, v25.4s -add v0.4s, v0.4s, v25.4s -sqrdmulh v25.4S, v2.4S, v29.s[1] -mla v21.4S, v24.4S, v31.s[0] -sub v24.4s, v15.4s, v23.4s -sqrdmulh v11.4S, v1.4S, v29.s[1] -mla v3.4S, v22.4S, v31.s[0] -add v15.4s, v15.4s, v23.4s -ldr q23, [x17, #+32] -ldr q22, [x17, #+48] -mul v13.4S, v13.4S,v30.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v10.4s, v12.4s, v17.4s -add v12.4s, v12.4s, v17.4s -mla v13.4S, v27.4S, v31.s[0] -mla v14.4S, v20.4S, v31.s[0] -sub v20.4s, v28.4s, v16.4s -add v28.4s, v28.4s, v16.4s -mul v1.4S, v1.4S,v30.s[1] -mul v2.4S, v2.4S,v30.s[1] -sub v16.4s, v19.4s, v21.4s -add v19.4s, v19.4s, v21.4s -mla v1.4S, v11.4S, v31.s[0] -mla v2.4S, v25.4S, v31.s[0] -sub v25.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v10.4S, v22.s[3] -mul v10.4S, v10.4S,v23.s[3] -sub v11.4s, v26.4s, v14.4s -add v26.4s, v26.4s, v14.4s -sqrdmulh v14.4S, v12.4S, v22.s[2] -mul v12.4S, v12.4S,v23.s[2] -sub v21.4s, v24.4s, v13.4s -add v24.4s, v24.4s, v13.4s -sqrdmulh v13.4S, v16.4S, v22.s[1] -mul v16.4S, v16.4S,v23.s[1] -sub v27.4s, v0.4s, v2.4s -add v0.4s, v0.4s, v2.4s -sqrdmulh v2.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v17.4s, v15.4s, v1.4s -add v15.4s, v15.4s, v1.4s -ldr q1, [x17, #+96] -ldr q9, [x17, #+112] -sqrdmulh v8.4S, v20.4S, v22.s[3] -mla v10.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v28.4S, v22.s[2] -mla v12.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v25.4S, v22.s[1] -mla v16.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v18.4S, v22.s[0] -mla v19.4S, v2.4S, v31.s[0] -nop -nop -ldr q2, [x17, #+64] -ldr q7, [x17, #+80] -mul v28.4S, v28.4S,v23.s[2] -mul v20.4S, v20.4S,v23.s[3] -sub v6.4s, v11.4s, v10.4s -add v11.4s, v11.4s, v10.4s -mla v28.4S, v3.4S, v31.s[0] -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v26.4s, v12.4s -add v26.4s, v26.4s, v12.4s -mul v18.4S, v18.4S,v23.s[0] -mul v25.4S, v25.4S,v23.s[1] -sub v12.4s, v27.4s, v16.4s -add v27.4s, v27.4s, v16.4s -mla v18.4S, v13.4S, v31.s[0] -mla v25.4S, v14.4S, v31.s[0] -sub v14.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v9.s[3] -mul v6.4S, v6.4S,v1.s[3] -sub v13.4s, v21.4s, v20.4s -add v21.4s, v21.4s, v20.4s -sqrdmulh v20.4S, v11.4S, v9.s[2] -mul v11.4S, v11.4S,v1.s[2] -sub v16.4s, v24.4s, v28.4s -add v24.4s, v24.4s, v28.4s -sqrdmulh v28.4S, v8.4S, v9.s[1] -mul v8.4S, v8.4S,v1.s[1] -sub v3.4s, v17.4s, v25.4s -add v17.4s, v17.4s, v25.4s -sqrdmulh v25.4S, v26.4S, v9.s[0] -mul v26.4S, v26.4S,v1.s[0] -sub v10.4s, v15.4s, v18.4s -add v15.4s, v15.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v7.s[3] -mla v6.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v27.4S, v7.s[2] -mla v11.4S, v20.4S, v31.s[0] -nop -nop -sqrdmulh v20.4S, v14.4S, v7.s[1] -mla v8.4S, v28.4S, v31.s[0] -nop -nop -sqrdmulh v28.4S, v0.4S, v7.s[0] -mla v26.4S, v25.4S, v31.s[0] -nop -nop -mul v27.4S, v27.4S,v2.s[2] -mul v12.4S, v12.4S,v2.s[3] -sub v25.4s, v13.4s, v6.4s -str q25, [x0, #992] -mla v27.4S, v19.4S, v31.s[0] -mla v12.4S, v18.4S, v31.s[0] -add v13.4s, v13.4s, v6.4s -str q13, [x0, #928] -mul v0.4S, v0.4S,v2.s[0] -mul v14.4S, v14.4S,v2.s[1] -sub v13.4s, v21.4s, v11.4s -str q13, [x0, #864] -mla v0.4S, v28.4S, v31.s[0] -mla v14.4S, v20.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -sub v11.4s, v16.4s, v8.4s -ldr q20, [x0, #1008] -sqrdmulh v28.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v16.4s, v16.4s, v8.4s -str q21, [x0, #800] -ldr q21, [x0, #944] -sqrdmulh v8.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -sub v13.4s, v24.4s, v26.4s -str q11, [x0, #736] -ldr q11, [x0, #880] -sqrdmulh v6.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -add v24.4s, v24.4s, v26.4s -str q16, [x0, #672] -ldr q16, [x0, #816] -sqrdmulh v26.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v18.4s, v3.4s, v12.4s -str q13, [x0, #608] -ldr q13, [x0, #752] -sqrdmulh v19.4S, v13.4S, v29.s[0] -mla v20.4S, v28.4S, v31.s[0] -add v3.4s, v3.4s, v12.4s -str q24, [x0, #544] -ldr q24, [x0, #688] -sqrdmulh v12.4S, v24.4S, v29.s[0] -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v17.4s, v27.4s -str q18, [x0, #480] -ldr q18, [x0, #624] -sqrdmulh v28.4S, v18.4S, v29.s[0] -mla v11.4S, v6.4S, v31.s[0] -add v17.4s, v17.4s, v27.4s -str q3, [x0, #416] -ldr q3, [x0, #560] -sqrdmulh v27.4S, v3.4S, v29.s[0] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v10.4s, v14.4s -str q8, [x0, #352] -ldr q8, [x0, #496] -add v10.4s, v10.4s, v14.4s -mul v24.4S, v24.4S,v30.s[0] -mul v13.4S, v13.4S,v30.s[0] -ldr q14, [x0, #432] -str q17, [x0, #288] -ldr q17, [x0, #368] -ldr q6, [x0, #304] -mla v24.4S, v12.4S, v31.s[0] -mla v13.4S, v19.4S, v31.s[0] -str q26, [x0, #224] -sub v26.4s, v15.4s, v0.4s -ldr q19, [x0, #240] -ldr q12, [x0, #176] -mul v3.4S, v3.4S,v30.s[0] -mul v18.4S, v18.4S,v30.s[0] -str q10, [x0, #160] -add v15.4s, v15.4s, v0.4s -ldr q0, [x0, #112] -ldr q10, [x0, #48] -mla v3.4S, v27.4S, v31.s[0] -mla v18.4S, v28.4S, v31.s[0] -sub v28.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -sqrdmulh v20.4S, v28.4S, v29.s[2] -mul v28.4S, v28.4S,v30.s[2] -sub v27.4s, v14.4s, v21.4s -add v14.4s, v14.4s, v21.4s -sqrdmulh v21.4S, v27.4S, v29.s[2] -mul v27.4S, v27.4S,v30.s[2] -sub v25.4s, v17.4s, v11.4s -add v17.4s, v17.4s, v11.4s -sqrdmulh v11.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sub v5.4s, v6.4s, v16.4s -add v6.4s, v6.4s, v16.4s -sqrdmulh v16.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -sub v4.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -sqrdmulh v13.4S, v25.4S, v29.s[2] -mla v28.4S, v20.4S, v31.s[0] -sub v20.4s, v12.4s, v24.4s -add v12.4s, v12.4s, v24.4s -sqrdmulh v24.4S, v5.4S, v29.s[2] -mla v27.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v18.4s -add v0.4s, v0.4s, v18.4s -sqrdmulh v18.4S, v17.4S, v29.s[1] -mla v8.4S, v11.4S, v31.s[0] -sub v11.4s, v10.4s, v3.4s -str q26, [x0, #96] -sqrdmulh v26.4S, v6.4S, v29.s[1] -mla v14.4S, v16.4S, v31.s[0] -add v10.4s, v10.4s, v3.4s -str q15, [x0, #32] -mul v5.4S, v5.4S,v30.s[2] -mul v25.4S, v25.4S,v30.s[2] -sub v15.4s, v4.4s, v28.4s -add v4.4s, v4.4s, v28.4s -mla v5.4S, v24.4S, v31.s[0] -mla v25.4S, v13.4S, v31.s[0] -sub v13.4s, v20.4s, v27.4s -add v20.4s, v20.4s, v27.4s -mul v6.4S, v6.4S,v30.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v27.4s, v19.4s, v8.4s -add v19.4s, v19.4s, v8.4s -mla v6.4S, v26.4S, v31.s[0] -mla v17.4S, v18.4S, v31.s[0] -sub v18.4s, v12.4s, v14.4s -add v12.4s, v12.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v22.s[3] -mul v15.4S, v15.4S,v23.s[3] -sub v26.4s, v21.4s, v25.4s -add v21.4s, v21.4s, v25.4s -sqrdmulh v25.4S, v4.4S, v22.s[2] -mul v4.4S, v4.4S,v23.s[2] -sub v8.4s, v11.4s, v5.4s -add v11.4s, v11.4s, v5.4s -sqrdmulh v5.4S, v27.4S, v22.s[1] -mul v27.4S, v27.4S,v23.s[1] -sub v24.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v28.4s, v10.4s, v6.4s -add v10.4s, v10.4s, v6.4s -sqrdmulh v6.4S, v13.4S, v22.s[3] -mla v15.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v20.4S, v22.s[2] -mla v4.4S, v25.4S, v31.s[0] -nop -nop -sqrdmulh v25.4S, v18.4S, v22.s[1] -mla v27.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v12.4S, v22.s[0] -mla v19.4S, v17.4S, v31.s[0] -nop -nop -mul v20.4S, v20.4S,v23.s[2] -mul v13.4S, v13.4S,v23.s[3] -sub v17.4s, v26.4s, v15.4s -add v26.4s, v26.4s, v15.4s -mla v20.4S, v14.4S, v31.s[0] -mla v13.4S, v6.4S, v31.s[0] -sub v6.4s, v21.4s, v4.4s -add v21.4s, v21.4s, v4.4s -mul v12.4S, v12.4S,v23.s[0] -mul v18.4S, v18.4S,v23.s[1] -sub v4.4s, v24.4s, v27.4s -add v24.4s, v24.4s, v27.4s -mla v12.4S, v5.4S, v31.s[0] -mla v18.4S, v25.4S, v31.s[0] -sub v25.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v17.4S, v9.s[3] -mul v17.4S, v17.4S,v1.s[3] -sub v5.4s, v8.4s, v13.4s -add v8.4s, v8.4s, v13.4s -sqrdmulh v13.4S, v26.4S, v9.s[2] -mul v26.4S, v26.4S,v1.s[2] -sub v27.4s, v11.4s, v20.4s -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v6.4S, v9.s[1] -mul v6.4S, v6.4S,v1.s[1] -sub v14.4s, v28.4s, v18.4s -add v28.4s, v28.4s, v18.4s -sqrdmulh v18.4S, v21.4S, v9.s[0] -mul v21.4S, v21.4S,v1.s[0] -sub v15.4s, v10.4s, v12.4s -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v4.4S, v7.s[3] -mla v17.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v24.4S, v7.s[2] -mla v26.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v25.4S, v7.s[1] -mla v6.4S, v20.4S, v31.s[0] -nop -nop -sqrdmulh v20.4S, v0.4S, v7.s[0] -mla v21.4S, v18.4S, v31.s[0] -nop -nop -mul v24.4S, v24.4S,v2.s[2] -mul v4.4S, v4.4S,v2.s[3] -sub v18.4s, v5.4s, v17.4s -str q18, [x0, #1008] -mla v24.4S, v19.4S, v31.s[0] -mla v4.4S, v12.4S, v31.s[0] -add v5.4s, v5.4s, v17.4s -str q5, [x0, #944] -mul v0.4S, v0.4S,v2.s[0] -mul v25.4S, v25.4S,v2.s[1] -sub v5.4s, v8.4s, v26.4s -str q5, [x0, #880] -mla v0.4S, v20.4S, v31.s[0] -mla v25.4S, v13.4S, v31.s[0] -add v8.4s, v8.4s, v26.4s -sub v26.4s, v27.4s, v6.4s -ldr q13, [x0, #960] -sqrdmulh v20.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -add v27.4s, v27.4s, v6.4s -str q8, [x0, #816] -ldr q8, [x0, #896] -sqrdmulh v6.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v5.4s, v11.4s, v21.4s -str q26, [x0, #752] -ldr q26, [x0, #832] -sqrdmulh v17.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -add v11.4s, v11.4s, v21.4s -str q27, [x0, #688] -ldr q27, [x0, #768] -sqrdmulh v21.4S, v27.4S, v29.s[0] -mul v27.4S, v27.4S,v30.s[0] -sub v12.4s, v14.4s, v4.4s -str q5, [x0, #624] -ldr q5, [x0, #704] -sqrdmulh v19.4S, v5.4S, v29.s[0] -mla v13.4S, v20.4S, v31.s[0] -add v14.4s, v14.4s, v4.4s -str q11, [x0, #560] -ldr q11, [x0, #640] -sqrdmulh v4.4S, v11.4S, v29.s[0] -mla v8.4S, v6.4S, v31.s[0] -sub v6.4s, v28.4s, v24.4s -str q12, [x0, #496] -ldr q12, [x0, #576] -sqrdmulh v20.4S, v12.4S, v29.s[0] -mla v26.4S, v17.4S, v31.s[0] -add v28.4s, v28.4s, v24.4s -str q14, [x0, #432] -ldr q14, [x0, #512] -sqrdmulh v24.4S, v14.4S, v29.s[0] -mla v27.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v25.4s -str q6, [x0, #368] -ldr q6, [x0, #448] -add v15.4s, v15.4s, v25.4s -mul v11.4S, v11.4S,v30.s[0] -mul v5.4S, v5.4S,v30.s[0] -ldr q25, [x0, #384] -str q28, [x0, #304] -ldr q28, [x0, #320] -ldr q17, [x0, #256] -mla v11.4S, v4.4S, v31.s[0] -mla v5.4S, v19.4S, v31.s[0] -str q21, [x0, #240] -sub v21.4s, v10.4s, v0.4s -ldr q19, [x0, #192] -ldr q4, [x0, #128] -mul v14.4S, v14.4S,v30.s[0] -mul v12.4S, v12.4S,v30.s[0] -str q15, [x0, #176] -add v10.4s, v10.4s, v0.4s -ldr q0, [x0, #64] -ldr q15, [x0, #0] -mla v14.4S, v24.4S, v31.s[0] -mla v12.4S, v20.4S, v31.s[0] -sub v20.4s, v6.4s, v13.4s -add v6.4s, v6.4s, v13.4s -sqrdmulh v13.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v24.4s, v25.4s, v8.4s -add v25.4s, v25.4s, v8.4s -sqrdmulh v8.4S, v24.4S, v29.s[2] -mul v24.4S, v24.4S,v30.s[2] -sub v18.4s, v28.4s, v26.4s -add v28.4s, v28.4s, v26.4s -sqrdmulh v26.4S, v6.4S, v29.s[1] -mul v6.4S, v6.4S,v30.s[1] -sub v3.4s, v17.4s, v27.4s -add v17.4s, v17.4s, v27.4s -sqrdmulh v27.4S, v25.4S, v29.s[1] -mul v25.4S, v25.4S,v30.s[1] -sub v16.4s, v19.4s, v5.4s -add v19.4s, v19.4s, v5.4s -sqrdmulh v5.4S, v18.4S, v29.s[2] -mla v20.4S, v13.4S, v31.s[0] -sub v13.4s, v4.4s, v11.4s -add v4.4s, v4.4s, v11.4s -sqrdmulh v11.4S, v3.4S, v29.s[2] -mla v24.4S, v8.4S, v31.s[0] -sub v8.4s, v0.4s, v12.4s -add v0.4s, v0.4s, v12.4s -sqrdmulh v12.4S, v28.4S, v29.s[1] -mla v6.4S, v26.4S, v31.s[0] -sub v26.4s, v15.4s, v14.4s -str q21, [x0, #112] -sqrdmulh v21.4S, v17.4S, v29.s[1] -mla v25.4S, v27.4S, v31.s[0] -add v15.4s, v15.4s, v14.4s -str q10, [x0, #48] -mul v3.4S, v3.4S,v30.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v10.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -mla v3.4S, v11.4S, v31.s[0] -mla v18.4S, v5.4S, v31.s[0] -sub v5.4s, v13.4s, v24.4s -add v13.4s, v13.4s, v24.4s -mul v17.4S, v17.4S,v30.s[1] -mul v28.4S, v28.4S,v30.s[1] -sub v24.4s, v19.4s, v6.4s -add v19.4s, v19.4s, v6.4s -mla v17.4S, v21.4S, v31.s[0] -mla v28.4S, v12.4S, v31.s[0] -sub v12.4s, v4.4s, v25.4s -add v4.4s, v4.4s, v25.4s -sqrdmulh v25.4S, v10.4S, v22.s[3] -mul v10.4S, v10.4S,v23.s[3] -sub v21.4s, v8.4s, v18.4s -add v8.4s, v8.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v22.s[2] -mul v16.4S, v16.4S,v23.s[2] -sub v6.4s, v26.4s, v3.4s -add v26.4s, v26.4s, v3.4s -sqrdmulh v3.4S, v24.4S, v22.s[1] -mul v24.4S, v24.4S,v23.s[1] -sub v11.4s, v0.4s, v28.4s -add v0.4s, v0.4s, v28.4s -sqrdmulh v28.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v20.4s, v15.4s, v17.4s -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v5.4S, v22.s[3] -mla v10.4S, v25.4S, v31.s[0] -nop -nop -sqrdmulh v25.4S, v13.4S, v22.s[2] -mla v16.4S, v18.4S, v31.s[0] -nop -nop -sqrdmulh v18.4S, v12.4S, v22.s[1] -mla v24.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v4.4S, v22.s[0] -mla v19.4S, v28.4S, v31.s[0] -nop -nop -mul v13.4S, v13.4S,v23.s[2] -mul v5.4S, v5.4S,v23.s[3] -sub v28.4s, v21.4s, v10.4s -add v21.4s, v21.4s, v10.4s -mla v13.4S, v25.4S, v31.s[0] -mla v5.4S, v17.4S, v31.s[0] -sub v17.4s, v8.4s, v16.4s -add v8.4s, v8.4s, v16.4s -mul v4.4S, v4.4S,v23.s[0] -mul v12.4S, v12.4S,v23.s[1] -sub v16.4s, v11.4s, v24.4s -add v11.4s, v11.4s, v24.4s -mla v4.4S, v3.4S, v31.s[0] -mla v12.4S, v18.4S, v31.s[0] -sub v18.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v28.4S, v9.s[3] -mul v28.4S, v28.4S,v1.s[3] -sub v3.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v21.4S, v9.s[2] -mul v21.4S, v21.4S,v1.s[2] -sub v24.4s, v26.4s, v13.4s -add v26.4s, v26.4s, v13.4s -sqrdmulh v13.4S, v17.4S, v9.s[1] -mul v17.4S, v17.4S,v1.s[1] -sub v25.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -sqrdmulh v12.4S, v8.4S, v9.s[0] -mul v8.4S, v8.4S,v1.s[0] -sub v10.4s, v15.4s, v4.4s -add v15.4s, v15.4s, v4.4s -sqrdmulh v4.4S, v16.4S, v7.s[3] -mla v28.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v11.4S, v7.s[2] -mla v21.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v18.4S, v7.s[1] -mla v17.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v0.4S, v7.s[0] -mla v8.4S, v12.4S, v31.s[0] -nop -nop -mul v11.4S, v11.4S,v2.s[2] -mul v16.4S, v16.4S,v2.s[3] -sub v12.4s, v3.4s, v28.4s -str q12, [x0, #960] -mla v11.4S, v19.4S, v31.s[0] -mla v16.4S, v4.4S, v31.s[0] -add v3.4s, v3.4s, v28.4s -str q3, [x0, #896] -mul v0.4S, v0.4S,v2.s[0] -mul v18.4S, v18.4S,v2.s[1] -sub v3.4s, v6.4s, v21.4s -str q3, [x0, #832] -mla v0.4S, v13.4S, v31.s[0] -mla v18.4S, v5.4S, v31.s[0] -add v6.4s, v6.4s, v21.4s -sub v21.4s, v24.4s, v17.4s -ldr q5, [x0, #976] -sqrdmulh v13.4S, v5.4S, v29.s[0] -mul v5.4S, v5.4S,v30.s[0] -add v24.4s, v24.4s, v17.4s -str q6, [x0, #768] -ldr q6, [x0, #912] -sqrdmulh v17.4S, v6.4S, v29.s[0] -mul v6.4S, v6.4S,v30.s[0] -sub v3.4s, v26.4s, v8.4s -str q21, [x0, #704] -ldr q21, [x0, #848] -sqrdmulh v28.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -add v26.4s, v26.4s, v8.4s -str q24, [x0, #640] -ldr q24, [x0, #784] -sqrdmulh v8.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -sub v4.4s, v25.4s, v16.4s -str q3, [x0, #576] -ldr q3, [x0, #720] -sqrdmulh v19.4S, v3.4S, v29.s[0] -mla v5.4S, v13.4S, v31.s[0] -add v25.4s, v25.4s, v16.4s -str q26, [x0, #512] -ldr q26, [x0, #656] -sqrdmulh v16.4S, v26.4S, v29.s[0] -mla v6.4S, v17.4S, v31.s[0] -sub v17.4s, v20.4s, v11.4s -str q4, [x0, #448] -ldr q4, [x0, #592] -sqrdmulh v13.4S, v4.4S, v29.s[0] -mla v21.4S, v28.4S, v31.s[0] -add v20.4s, v20.4s, v11.4s -str q25, [x0, #384] -ldr q25, [x0, #528] -sqrdmulh v11.4S, v25.4S, v29.s[0] -mla v24.4S, v8.4S, v31.s[0] -sub v8.4s, v10.4s, v18.4s -str q17, [x0, #320] -ldr q17, [x0, #464] -add v10.4s, v10.4s, v18.4s -mul v26.4S, v26.4S,v30.s[0] -mul v3.4S, v3.4S,v30.s[0] -ldr q18, [x0, #400] -str q20, [x0, #256] -ldr q20, [x0, #336] -ldr q28, [x0, #272] -mla v26.4S, v16.4S, v31.s[0] -mla v3.4S, v19.4S, v31.s[0] -str q8, [x0, #192] -sub v8.4s, v15.4s, v0.4s -ldr q19, [x0, #208] -ldr q16, [x0, #144] -mul v25.4S, v25.4S,v30.s[0] -mul v4.4S, v4.4S,v30.s[0] -str q10, [x0, #128] -add v15.4s, v15.4s, v0.4s -ldr q0, [x0, #80] -ldr q10, [x0, #16] -mla v25.4S, v11.4S, v31.s[0] -mla v4.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v5.4s -add v17.4s, v17.4s, v5.4s -sqrdmulh v5.4S, v13.4S, v29.s[2] -mul v13.4S, v13.4S,v30.s[2] -sub v11.4s, v18.4s, v6.4s -add v18.4s, v18.4s, v6.4s -sqrdmulh v6.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v12.4s, v20.4s, v21.4s -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v14.4s, v28.4s, v24.4s -add v28.4s, v28.4s, v24.4s -sqrdmulh v24.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v27.4s, v19.4s, v3.4s -add v19.4s, v19.4s, v3.4s -sqrdmulh v3.4S, v12.4S, v29.s[2] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v16.4s, v26.4s -add v16.4s, v16.4s, v26.4s -sqrdmulh v26.4S, v14.4S, v29.s[2] -mla v11.4S, v6.4S, v31.s[0] -sub v6.4s, v0.4s, v4.4s -add v0.4s, v0.4s, v4.4s -sqrdmulh v4.4S, v20.4S, v29.s[1] -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v10.4s, v25.4s -str q8, [x0, #64] -sqrdmulh v8.4S, v28.4S, v29.s[1] -mla v18.4S, v24.4S, v31.s[0] -add v10.4s, v10.4s, v25.4s -str q15, [x0, #0] -mul v14.4S, v14.4S,v30.s[2] -mul v12.4S, v12.4S,v30.s[2] -sub v15.4s, v27.4s, v13.4s -add v27.4s, v27.4s, v13.4s -mla v14.4S, v26.4S, v31.s[0] -mla v12.4S, v3.4S, v31.s[0] -sub v3.4s, v5.4s, v11.4s -add v5.4s, v5.4s, v11.4s -mul v28.4S, v28.4S,v30.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v11.4s, v19.4s, v17.4s -add v19.4s, v19.4s, v17.4s -mla v28.4S, v8.4S, v31.s[0] -mla v20.4S, v4.4S, v31.s[0] -sub v4.4s, v16.4s, v18.4s -add v16.4s, v16.4s, v18.4s -sqrdmulh v29.4S, v15.4S, v22.s[3] -mul v15.4S, v15.4S,v23.s[3] -sub v30.4s, v6.4s, v12.4s -add v6.4s, v6.4s, v12.4s -sqrdmulh v12.4S, v27.4S, v22.s[2] -mul v27.4S, v27.4S,v23.s[2] -sub v18.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v22.s[1] -mul v11.4S, v11.4S,v23.s[1] -sub v8.4s, v0.4s, v20.4s -add v0.4s, v0.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v17.4s, v10.4s, v28.4s -add v10.4s, v10.4s, v28.4s -sqrdmulh v28.4S, v3.4S, v22.s[3] -mla v15.4S, v29.4S, v31.s[0] -nop -nop -sqrdmulh v29.4S, v5.4S, v22.s[2] -mla v27.4S, v12.4S, v31.s[0] -nop -nop -sqrdmulh v12.4S, v4.4S, v22.s[1] -mla v11.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v16.4S, v22.s[0] -mla v19.4S, v20.4S, v31.s[0] -nop -nop -mul v5.4S, v5.4S,v23.s[2] -mul v3.4S, v3.4S,v23.s[3] -sub v20.4s, v30.4s, v15.4s -add v30.4s, v30.4s, v15.4s -mla v5.4S, v29.4S, v31.s[0] -mla v3.4S, v28.4S, v31.s[0] -sub v28.4s, v6.4s, v27.4s -add v6.4s, v6.4s, v27.4s -mul v16.4S, v16.4S,v23.s[0] -mul v4.4S, v4.4S,v23.s[1] -sub v27.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -mla v16.4S, v14.4S, v31.s[0] -mla v4.4S, v12.4S, v31.s[0] -sub v12.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v22.4S, v20.4S, v9.s[3] -mul v20.4S, v20.4S,v1.s[3] -sub v23.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v30.4S, v9.s[2] -mul v30.4S, v30.4S,v1.s[2] -sub v19.4s, v21.4s, v5.4s -add v21.4s, v21.4s, v5.4s -sqrdmulh v5.4S, v28.4S, v9.s[1] -mul v28.4S, v28.4S,v1.s[1] -sub v14.4s, v17.4s, v4.4s -add v17.4s, v17.4s, v4.4s -sqrdmulh v4.4S, v6.4S, v9.s[0] -mul v6.4S, v6.4S,v1.s[0] -sub v11.4s, v10.4s, v16.4s -add v10.4s, v10.4s, v16.4s -sqrdmulh v9.4S, v27.4S, v7.s[3] -mla v20.4S, v22.4S, v31.s[0] -nop -nop -sqrdmulh v22.4S, v8.4S, v7.s[2] -mla v30.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v12.4S, v7.s[1] -mla v28.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v0.4S, v7.s[0] -mla v6.4S, v4.4S, v31.s[0] -nop -nop -mul v8.4S, v8.4S,v2.s[2] -mul v27.4S, v27.4S,v2.s[3] -sub v4.4s, v23.4s, v20.4s -str q4, [x0, #976] -mla v8.4S, v22.4S, v31.s[0] -mla v27.4S, v9.4S, v31.s[0] -add v23.4s, v23.4s, v20.4s -str q23, [x0, #912] -mul v0.4S, v0.4S,v2.s[0] -mul v12.4S, v12.4S,v2.s[1] -sub v23.4s, v18.4s, v30.4s -str q23, [x0, #848] -mla v0.4S, v5.4S, v31.s[0] -mla v12.4S, v3.4S, v31.s[0] -add v18.4s, v18.4s, v30.4s -sub v30.4s, v19.4s, v28.4s -add v19.4s, v19.4s, v28.4s -str q18, [x0, #784] -sub v18.4s, v21.4s, v6.4s -str q30, [x0, #720] -add v21.4s, v21.4s, v6.4s -str q19, [x0, #656] -sub v19.4s, v14.4s, v27.4s -str q18, [x0, #592] -add v14.4s, v14.4s, v27.4s -str q21, [x0, #528] -sub v21.4s, v17.4s, v8.4s -str q19, [x0, #464] -add v17.4s, v17.4s, v8.4s -str q14, [x0, #400] -sub v14.4s, v11.4s, v12.4s -str q21, [x0, #336] -add v11.4s, v11.4s, v12.4s -str q17, [x0, #272] -sub v17.4s, v10.4s, v0.4s -add v10.4s, v10.4s, v0.4s -ldr q24, [x0, #48] -ldr q25, [x0, #32] -ldr q13, [x0, #112] -ldr q26, [x0, #96] -ldr q15, [x17, #+128] -ldr q29, [x17, #+144] -ldr q16, [x17, #+160] -ldr q1, [x17, #+176] -ldr q4, [x0, #176] -ldr q22, [x0, #160] -sqrdmulh v9.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v15.s[0] -ldr q20, [x0, #240] -sqrdmulh v23.4S, v25.4S, v29.s[0] -mul v25.4S, v25.4S,v15.s[0] -ldr q5, [x0, #224] -sqrdmulh v3.4S, v13.4S, v1.s[0] -mul v13.4S, v13.4S,v16.s[0] -ldr q2, [x17, #+192] -sqrdmulh v7.4S, v26.4S, v1.s[0] -mul v26.4S, v26.4S,v16.s[0] -ldr q28, [x17, #+208] -mla v24.4S, v9.4S, v31.s[0] -sqrdmulh v9.4S, v4.4S, v28.s[0] -ldr q30, [x17, #+224] -mla v25.4S, v23.4S, v31.s[0] -sqrdmulh v23.4S, v22.4S, v28.s[0] -ldr q6, [x17, #+240] -mla v13.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v20.4S, v6.s[0] -mla v26.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v5.4S, v6.s[0] -ldr q18, [x0, #0] -mul v4.4S, v4.4S,v2.s[0] -mul v22.4S, v22.4S,v2.s[0] -mla v4.4S, v9.4S, v31.s[0] -mla v22.4S, v23.4S, v31.s[0] -ldr q23, [x0, #64] -mul v20.4S, v20.4S,v30.s[0] -mul v5.4S, v5.4S,v30.s[0] -sub v9.4s, v10.4s, v24.4s -add v10.4s, v10.4s, v24.4s -mla v20.4S, v3.4S, v31.s[0] -mla v5.4S, v7.4S, v31.s[0] -sub v7.4s, v18.4s, v25.4s -ldr q3, [x0, #128] -add v18.4s, v18.4s, v25.4s -sqrdmulh v25.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v15.s[2] -sub v24.4s, v17.4s, v13.4s -add v17.4s, v17.4s, v13.4s -sqrdmulh v13.4S, v10.4S, v29.s[1] -mul v10.4S, v10.4S,v15.s[1] -sub v27.4s, v23.4s, v26.4s -ldr q19, [x0, #192] -add v23.4s, v23.4s, v26.4s -sqrdmulh v29.4S, v24.4S, v1.s[2] -mul v24.4S, v24.4S,v16.s[2] -sub v26.4s, v11.4s, v4.4s -ldr q15, [x0, #304] -add v11.4s, v11.4s, v4.4s -sqrdmulh v4.4S, v17.4S, v1.s[1] -mul v17.4S, v17.4S,v16.s[1] -sub v8.4s, v3.4s, v22.4s -ldr q21, [x0, #288] -add v3.4s, v3.4s, v22.4s -mla v9.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v26.4S, v28.s[2] -sub v1.4s, v14.4s, v20.4s -ldr q22, [x0, #368] -add v14.4s, v14.4s, v20.4s -mla v10.4S, v13.4S, v31.s[0] -sqrdmulh v13.4S, v11.4S, v28.s[1] -sub v20.4s, v19.4s, v5.4s -ldr q16, [x0, #352] -add v19.4s, v19.4s, v5.4s -mla v24.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v1.4S, v6.s[2] -sub v5.4s, v7.4s, v9.4s -ldr q12, [x17, #+256] -str q5, [x0, #48] -mla v17.4S, v4.4S, v31.s[0] -sqrdmulh v4.4S, v14.4S, v6.s[1] -add v7.4s, v7.4s, v9.4s -ldr q9, [x17, #+272] -str q7, [x0, #32] -mul v26.4S, v26.4S,v2.s[2] -mul v11.4S, v11.4S,v2.s[1] -sub v7.4s, v18.4s, v10.4s -ldr q5, [x17, #+288] -str q7, [x0, #16] -mla v26.4S, v25.4S, v31.s[0] -mla v11.4S, v13.4S, v31.s[0] -add v18.4s, v18.4s, v10.4s -ldr q10, [x17, #+304] -str q18, [x0, #0] -mul v1.4S, v1.4S,v30.s[2] -mul v14.4S, v14.4S,v30.s[1] -sub v28.4s, v27.4s, v24.4s -ldr q18, [x0, #432] -str q28, [x0, #112] -mla v1.4S, v29.4S, v31.s[0] -mla v14.4S, v4.4S, v31.s[0] -add v27.4s, v27.4s, v24.4s -ldr q24, [x0, #416] -str q27, [x0, #96] -sqrdmulh v6.4S, v15.4S, v9.s[0] -mul v15.4S, v15.4S,v12.s[0] -sub v27.4s, v23.4s, v17.4s -ldr q30, [x0, #496] -str q27, [x0, #80] -sqrdmulh v27.4S, v21.4S, v9.s[0] -mul v21.4S, v21.4S,v12.s[0] -add v23.4s, v23.4s, v17.4s -ldr q17, [x0, #480] -str q23, [x0, #64] -sqrdmulh v23.4S, v22.4S, v10.s[0] -mul v22.4S, v22.4S,v5.s[0] -sub v4.4s, v8.4s, v26.4s -ldr q29, [x17, #+320] -str q4, [x0, #176] -sqrdmulh v4.4S, v16.4S, v10.s[0] -mul v16.4S, v16.4S,v5.s[0] -add v8.4s, v8.4s, v26.4s -ldr q26, [x17, #+336] -str q8, [x0, #160] -mla v15.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v18.4S, v26.s[0] -sub v8.4s, v3.4s, v11.4s -ldr q28, [x17, #+352] -str q8, [x0, #144] -mla v21.4S, v27.4S, v31.s[0] -sqrdmulh v27.4S, v24.4S, v26.s[0] -add v3.4s, v3.4s, v11.4s -ldr q11, [x17, #+368] -str q3, [x0, #128] -mla v22.4S, v23.4S, v31.s[0] -sqrdmulh v23.4S, v30.4S, v11.s[0] -sub v3.4s, v20.4s, v1.4s -ldr q8, [x0, #272] -str q3, [x0, #240] -mla v16.4S, v4.4S, v31.s[0] -sqrdmulh v4.4S, v17.4S, v11.s[0] -add v20.4s, v20.4s, v1.4s -ldr q1, [x0, #256] -str q20, [x0, #224] -mul v18.4S, v18.4S,v29.s[0] -mul v24.4S, v24.4S,v29.s[0] -sub v20.4s, v19.4s, v14.4s -ldr q3, [x0, #336] -str q20, [x0, #208] -mla v18.4S, v6.4S, v31.s[0] -mla v24.4S, v27.4S, v31.s[0] -add v19.4s, v19.4s, v14.4s -ldr q14, [x0, #320] -str q19, [x0, #192] -mul v30.4S, v30.4S,v28.s[0] -mul v17.4S, v17.4S,v28.s[0] -sub v19.4s, v8.4s, v15.4s -ldr q27, [x0, #400] -add v8.4s, v8.4s, v15.4s -mla v30.4S, v23.4S, v31.s[0] -mla v17.4S, v4.4S, v31.s[0] -sub v4.4s, v1.4s, v21.4s -ldr q23, [x0, #384] -add v1.4s, v1.4s, v21.4s -sqrdmulh v21.4S, v19.4S, v9.s[2] -mul v19.4S, v19.4S,v12.s[2] -sub v15.4s, v3.4s, v22.4s -ldr q6, [x0, #464] -add v3.4s, v3.4s, v22.4s -sqrdmulh v22.4S, v8.4S, v9.s[1] -mul v8.4S, v8.4S,v12.s[1] -sub v20.4s, v14.4s, v16.4s -ldr q2, [x0, #448] -add v14.4s, v14.4s, v16.4s -sqrdmulh v9.4S, v15.4S, v10.s[2] -mul v15.4S, v15.4S,v5.s[2] -sub v16.4s, v27.4s, v18.4s -ldr q12, [x0, #560] -add v27.4s, v27.4s, v18.4s -sqrdmulh v18.4S, v3.4S, v10.s[1] -mul v3.4S, v3.4S,v5.s[1] -sub v13.4s, v23.4s, v24.4s -ldr q25, [x0, #544] -add v23.4s, v23.4s, v24.4s -mla v19.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v16.4S, v26.s[2] -sub v10.4s, v6.4s, v30.4s -ldr q24, [x0, #624] -add v6.4s, v6.4s, v30.4s -mla v8.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v27.4S, v26.s[1] -sub v30.4s, v2.4s, v17.4s -ldr q5, [x0, #608] -add v2.4s, v2.4s, v17.4s -mla v15.4S, v9.4S, v31.s[0] -sqrdmulh v9.4S, v10.4S, v11.s[2] -sub v17.4s, v4.4s, v19.4s -ldr q7, [x17, #+384] -str q17, [x0, #304] -mla v3.4S, v18.4S, v31.s[0] -sqrdmulh v18.4S, v6.4S, v11.s[1] -add v4.4s, v4.4s, v19.4s -ldr q19, [x17, #+400] -str q4, [x0, #288] -mul v16.4S, v16.4S,v29.s[2] -mul v27.4S, v27.4S,v29.s[1] -sub v4.4s, v1.4s, v8.4s -ldr q17, [x17, #+416] -str q4, [x0, #272] -mla v16.4S, v21.4S, v31.s[0] -mla v27.4S, v22.4S, v31.s[0] -add v1.4s, v1.4s, v8.4s -ldr q8, [x17, #+432] -str q1, [x0, #256] -mul v10.4S, v10.4S,v28.s[2] -mul v6.4S, v6.4S,v28.s[1] -sub v26.4s, v20.4s, v15.4s -ldr q1, [x0, #688] -str q26, [x0, #368] -mla v10.4S, v9.4S, v31.s[0] -mla v6.4S, v18.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -ldr q15, [x0, #672] -str q20, [x0, #352] -sqrdmulh v11.4S, v12.4S, v19.s[0] -mul v12.4S, v12.4S,v7.s[0] -sub v20.4s, v14.4s, v3.4s -ldr q28, [x0, #752] -str q20, [x0, #336] -sqrdmulh v20.4S, v25.4S, v19.s[0] -mul v25.4S, v25.4S,v7.s[0] -add v14.4s, v14.4s, v3.4s -ldr q3, [x0, #736] -str q14, [x0, #320] -sqrdmulh v14.4S, v24.4S, v8.s[0] -mul v24.4S, v24.4S,v17.s[0] -sub v18.4s, v13.4s, v16.4s -ldr q9, [x17, #+448] -str q18, [x0, #432] -sqrdmulh v18.4S, v5.4S, v8.s[0] -mul v5.4S, v5.4S,v17.s[0] -add v13.4s, v13.4s, v16.4s -ldr q16, [x17, #+464] -str q13, [x0, #416] -mla v12.4S, v11.4S, v31.s[0] -sqrdmulh v11.4S, v1.4S, v16.s[0] -sub v13.4s, v23.4s, v27.4s -ldr q26, [x17, #+480] -str q13, [x0, #400] -mla v25.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v15.4S, v16.s[0] -add v23.4s, v23.4s, v27.4s -ldr q27, [x17, #+496] -str q23, [x0, #384] -mla v24.4S, v14.4S, v31.s[0] -sqrdmulh v14.4S, v28.4S, v27.s[0] -sub v23.4s, v30.4s, v10.4s -ldr q13, [x0, #528] -str q23, [x0, #496] -mla v5.4S, v18.4S, v31.s[0] -sqrdmulh v18.4S, v3.4S, v27.s[0] -add v30.4s, v30.4s, v10.4s -ldr q10, [x0, #512] -str q30, [x0, #480] -mul v1.4S, v1.4S,v9.s[0] -mul v15.4S, v15.4S,v9.s[0] -sub v30.4s, v2.4s, v6.4s -ldr q23, [x0, #592] -str q30, [x0, #464] -mla v1.4S, v11.4S, v31.s[0] -mla v15.4S, v20.4S, v31.s[0] -add v2.4s, v2.4s, v6.4s -ldr q6, [x0, #576] -str q2, [x0, #448] -mul v28.4S, v28.4S,v26.s[0] -mul v3.4S, v3.4S,v26.s[0] -sub v2.4s, v13.4s, v12.4s -ldr q20, [x0, #656] -add v13.4s, v13.4s, v12.4s -mla v28.4S, v14.4S, v31.s[0] -mla v3.4S, v18.4S, v31.s[0] -sub v18.4s, v10.4s, v25.4s -ldr q14, [x0, #640] -add v10.4s, v10.4s, v25.4s -sqrdmulh v25.4S, v2.4S, v19.s[2] -mul v2.4S, v2.4S,v7.s[2] -sub v12.4s, v23.4s, v24.4s -ldr q11, [x0, #720] -add v23.4s, v23.4s, v24.4s -sqrdmulh v24.4S, v13.4S, v19.s[1] -mul v13.4S, v13.4S,v7.s[1] -sub v30.4s, v6.4s, v5.4s -ldr q29, [x0, #704] -add v6.4s, v6.4s, v5.4s -sqrdmulh v19.4S, v12.4S, v8.s[2] -mul v12.4S, v12.4S,v17.s[2] -sub v5.4s, v20.4s, v1.4s -ldr q7, [x0, #816] -add v20.4s, v20.4s, v1.4s -sqrdmulh v1.4S, v23.4S, v8.s[1] -mul v23.4S, v23.4S,v17.s[1] -sub v22.4s, v14.4s, v15.4s -ldr q21, [x0, #800] -add v14.4s, v14.4s, v15.4s -mla v2.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v5.4S, v16.s[2] -sub v8.4s, v11.4s, v28.4s -ldr q15, [x0, #880] -add v11.4s, v11.4s, v28.4s -mla v13.4S, v24.4S, v31.s[0] -sqrdmulh v24.4S, v20.4S, v16.s[1] -sub v28.4s, v29.4s, v3.4s -ldr q17, [x0, #864] -add v29.4s, v29.4s, v3.4s -mla v12.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v8.4S, v27.s[2] -sub v3.4s, v18.4s, v2.4s -ldr q4, [x17, #+512] -str q3, [x0, #560] -mla v23.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v11.4S, v27.s[1] -add v18.4s, v18.4s, v2.4s -ldr q2, [x17, #+528] -str q18, [x0, #544] -mul v5.4S, v5.4S,v9.s[2] -mul v20.4S, v20.4S,v9.s[1] -sub v18.4s, v10.4s, v13.4s -ldr q3, [x17, #+544] -str q18, [x0, #528] -mla v5.4S, v25.4S, v31.s[0] -mla v20.4S, v24.4S, v31.s[0] -add v10.4s, v10.4s, v13.4s -ldr q13, [x17, #+560] -str q10, [x0, #512] -mul v8.4S, v8.4S,v26.s[2] -mul v11.4S, v11.4S,v26.s[1] -sub v16.4s, v30.4s, v12.4s -ldr q10, [x0, #944] -str q16, [x0, #624] -mla v8.4S, v19.4S, v31.s[0] -mla v11.4S, v1.4S, v31.s[0] -add v30.4s, v30.4s, v12.4s -ldr q12, [x0, #928] -str q30, [x0, #608] -sqrdmulh v27.4S, v7.4S, v2.s[0] -mul v7.4S, v7.4S,v4.s[0] -sub v30.4s, v6.4s, v23.4s -ldr q26, [x0, #1008] -str q30, [x0, #592] -sqrdmulh v30.4S, v21.4S, v2.s[0] -mul v21.4S, v21.4S,v4.s[0] -add v6.4s, v6.4s, v23.4s -ldr q23, [x0, #992] -str q6, [x0, #576] -sqrdmulh v6.4S, v15.4S, v13.s[0] -mul v15.4S, v15.4S,v3.s[0] -sub v1.4s, v22.4s, v5.4s -ldr q19, [x17, #+576] -str q1, [x0, #688] -sqrdmulh v1.4S, v17.4S, v13.s[0] -mul v17.4S, v17.4S,v3.s[0] -add v22.4s, v22.4s, v5.4s -ldr q5, [x17, #+592] -str q22, [x0, #672] -mla v7.4S, v27.4S, v31.s[0] -sqrdmulh v27.4S, v10.4S, v5.s[0] -sub v22.4s, v14.4s, v20.4s -ldr q16, [x17, #+608] -str q22, [x0, #656] -mla v21.4S, v30.4S, v31.s[0] -sqrdmulh v30.4S, v12.4S, v5.s[0] -add v14.4s, v14.4s, v20.4s -ldr q20, [x17, #+624] -str q14, [x0, #640] -mla v15.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v26.4S, v20.s[0] -sub v14.4s, v28.4s, v8.4s -ldr q22, [x0, #784] -str q14, [x0, #752] -mla v17.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v23.4S, v20.s[0] -add v28.4s, v28.4s, v8.4s -ldr q8, [x0, #768] -str q28, [x0, #736] -mul v10.4S, v10.4S,v19.s[0] -mul v12.4S, v12.4S,v19.s[0] -sub v28.4s, v29.4s, v11.4s -ldr q14, [x0, #848] -str q28, [x0, #720] -mla v10.4S, v27.4S, v31.s[0] -mla v12.4S, v30.4S, v31.s[0] -add v29.4s, v29.4s, v11.4s -ldr q11, [x0, #832] -str q29, [x0, #704] -mul v26.4S, v26.4S,v16.s[0] -mul v23.4S, v23.4S,v16.s[0] -sub v29.4s, v22.4s, v7.4s -ldr q30, [x0, #912] -add v22.4s, v22.4s, v7.4s -mla v26.4S, v6.4S, v31.s[0] -mla v23.4S, v1.4S, v31.s[0] -sub v1.4s, v8.4s, v21.4s -ldr q6, [x0, #896] -add v8.4s, v8.4s, v21.4s -sqrdmulh v21.4S, v29.4S, v2.s[2] -mul v29.4S, v29.4S,v4.s[2] -sub v7.4s, v14.4s, v15.4s -ldr q27, [x0, #976] -add v14.4s, v14.4s, v15.4s -sqrdmulh v15.4S, v22.4S, v2.s[1] -mul v22.4S, v22.4S,v4.s[1] -sub v28.4s, v11.4s, v17.4s -ldr q9, [x0, #960] -add v11.4s, v11.4s, v17.4s -sqrdmulh v2.4S, v7.4S, v13.s[2] -mul v7.4S, v7.4S,v3.s[2] -sub v17.4s, v30.4s, v10.4s -add v30.4s, v30.4s, v10.4s -sqrdmulh v10.4S, v14.4S, v13.s[1] -mul v14.4S, v14.4S,v3.s[1] -sub v4.4s, v6.4s, v12.4s -add v6.4s, v6.4s, v12.4s -mla v29.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v17.4S, v5.s[2] -sub v13.4s, v27.4s, v26.4s -add v27.4s, v27.4s, v26.4s -mla v22.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v30.4S, v5.s[1] -sub v26.4s, v9.4s, v23.4s -add v9.4s, v9.4s, v23.4s -mla v7.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v13.4S, v20.s[2] -sub v23.4s, v1.4s, v29.4s -str q23, [x0, #816] -mla v14.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v27.4S, v20.s[1] -add v1.4s, v1.4s, v29.4s -str q1, [x0, #800] -mul v17.4S, v17.4S,v19.s[2] -mul v30.4S, v30.4S,v19.s[1] -sub v1.4s, v8.4s, v22.4s -str q1, [x0, #784] -mla v17.4S, v21.4S, v31.s[0] -mla v30.4S, v15.4S, v31.s[0] -add v8.4s, v8.4s, v22.4s -str q8, [x0, #768] -mul v13.4S, v13.4S,v16.s[2] -mul v27.4S, v27.4S,v16.s[1] -sub v5.4s, v28.4s, v7.4s -str q5, [x0, #880] -mla v13.4S, v2.4S, v31.s[0] -mla v27.4S, v10.4S, v31.s[0] -add v28.4s, v28.4s, v7.4s -str q28, [x0, #864] -sub v20.4s, v11.4s, v14.4s -str q20, [x0, #848] -add v11.4s, v11.4s, v14.4s -str q11, [x0, #832] -sub v11.4s, v4.4s, v17.4s -str q11, [x0, #944] -add v4.4s, v4.4s, v17.4s -str q4, [x0, #928] -sub v4.4s, v6.4s, v30.4s -str q4, [x0, #912] -add v6.4s, v6.4s, v30.4s -str q6, [x0, #896] -sub v6.4s, v26.4s, v13.4s -str q6, [x0, #1008] -add v26.4s, v26.4s, v13.4s -str q26, [x0, #992] -sub v26.4s, v9.4s, v27.4s -str q26, [x0, #976] -add v9.4s, v9.4s, v27.4s -str q9, [x0, #960] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1520 -// Instruction count: 1516 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_7.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_7.s deleted file mode 100644 index e7dd93d..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_7.s +++ /dev/null @@ -1,1550 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_22_z4_7 -.global _ntt_u32_incomplete_neon_asm_var_4_2_22_z4_7 -ntt_u32_incomplete_neon_asm_var_4_2_22_z4_7: -_ntt_u32_incomplete_neon_asm_var_4_2_22_z4_7: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x0, #992] -sqrdmulh v27.4S, v28.4S, v29.s[0] -mul v28.4S, v28.4S,v30.s[0] -ldr q26, [x0, #928] -sqrdmulh v25.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -ldr q24, [x0, #864] -sqrdmulh v23.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -ldr q22, [x0, #800] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #736] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mla v28.4S, v27.4S, v31.s[0] -ldr q27, [x0, #672] -sqrdmulh v18.4S, v27.4S, v29.s[0] -mla v26.4S, v25.4S, v31.s[0] -ldr q25, [x0, #608] -sqrdmulh v17.4S, v25.4S, v29.s[0] -mla v24.4S, v23.4S, v31.s[0] -ldr q23, [x0, #544] -sqrdmulh v16.4S, v23.4S, v29.s[0] -mla v22.4S, v21.4S, v31.s[0] -ldr q21, [x0, #480] -mul v27.4S, v27.4S,v30.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q3, [x0, #416] -ldr q2, [x0, #352] -ldr q1, [x0, #288] -mla v27.4S, v18.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -ldr q19, [x0, #224] -ldr q18, [x0, #160] -mul v23.4S, v23.4S,v30.s[0] -mul v25.4S, v25.4S,v30.s[0] -ldr q0, [x0, #96] -ldr q15, [x0, #32] -mla v23.4S, v16.4S, v31.s[0] -mla v25.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v28.4s -add v21.4s, v21.4s, v28.4s -sqrdmulh v28.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v16.4s, v3.4s, v26.4s -add v3.4s, v3.4s, v26.4s -sqrdmulh v26.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -sub v14.4s, v2.4s, v24.4s -add v2.4s, v2.4s, v24.4s -sqrdmulh v24.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v13.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v12.4s, v19.4s, v20.4s -add v19.4s, v19.4s, v20.4s -sqrdmulh v20.4S, v14.4S, v29.s[2] -mla v17.4S, v28.4S, v31.s[0] -sub v28.4s, v18.4s, v27.4s -add v18.4s, v18.4s, v27.4s -sqrdmulh v27.4S, v13.4S, v29.s[2] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v0.4s, v25.4s -add v0.4s, v0.4s, v25.4s -sqrdmulh v25.4S, v2.4S, v29.s[1] -mla v21.4S, v24.4S, v31.s[0] -sub v24.4s, v15.4s, v23.4s -sqrdmulh v11.4S, v1.4S, v29.s[1] -mla v3.4S, v22.4S, v31.s[0] -add v15.4s, v15.4s, v23.4s -ldr q23, [x17, #+32] -ldr q22, [x17, #+48] -mul v13.4S, v13.4S,v30.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v10.4s, v12.4s, v17.4s -add v12.4s, v12.4s, v17.4s -mla v13.4S, v27.4S, v31.s[0] -mla v14.4S, v20.4S, v31.s[0] -sub v20.4s, v28.4s, v16.4s -add v28.4s, v28.4s, v16.4s -mul v1.4S, v1.4S,v30.s[1] -mul v2.4S, v2.4S,v30.s[1] -sub v16.4s, v19.4s, v21.4s -add v19.4s, v19.4s, v21.4s -mla v1.4S, v11.4S, v31.s[0] -mla v2.4S, v25.4S, v31.s[0] -sub v25.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v10.4S, v22.s[3] -mul v10.4S, v10.4S,v23.s[3] -sub v11.4s, v26.4s, v14.4s -add v26.4s, v26.4s, v14.4s -sqrdmulh v14.4S, v12.4S, v22.s[2] -mul v12.4S, v12.4S,v23.s[2] -sub v21.4s, v24.4s, v13.4s -add v24.4s, v24.4s, v13.4s -sqrdmulh v13.4S, v16.4S, v22.s[1] -mul v16.4S, v16.4S,v23.s[1] -sub v27.4s, v0.4s, v2.4s -add v0.4s, v0.4s, v2.4s -sqrdmulh v2.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v17.4s, v15.4s, v1.4s -add v15.4s, v15.4s, v1.4s -ldr q1, [x17, #+96] -ldr q9, [x17, #+112] -sqrdmulh v8.4S, v20.4S, v22.s[3] -mla v10.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v28.4S, v22.s[2] -mla v12.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v25.4S, v22.s[1] -mla v16.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v18.4S, v22.s[0] -mla v19.4S, v2.4S, v31.s[0] -nop -nop -ldr q2, [x17, #+64] -ldr q7, [x17, #+80] -mul v28.4S, v28.4S,v23.s[2] -mul v20.4S, v20.4S,v23.s[3] -sub v6.4s, v11.4s, v10.4s -add v11.4s, v11.4s, v10.4s -mla v28.4S, v3.4S, v31.s[0] -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v26.4s, v12.4s -add v26.4s, v26.4s, v12.4s -mul v18.4S, v18.4S,v23.s[0] -mul v25.4S, v25.4S,v23.s[1] -sub v12.4s, v27.4s, v16.4s -add v27.4s, v27.4s, v16.4s -mla v18.4S, v13.4S, v31.s[0] -mla v25.4S, v14.4S, v31.s[0] -sub v14.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v9.s[3] -mul v6.4S, v6.4S,v1.s[3] -sub v13.4s, v21.4s, v20.4s -add v21.4s, v21.4s, v20.4s -sqrdmulh v20.4S, v11.4S, v9.s[2] -mul v11.4S, v11.4S,v1.s[2] -sub v16.4s, v24.4s, v28.4s -add v24.4s, v24.4s, v28.4s -sqrdmulh v28.4S, v8.4S, v9.s[1] -mul v8.4S, v8.4S,v1.s[1] -sub v3.4s, v17.4s, v25.4s -add v17.4s, v17.4s, v25.4s -sqrdmulh v25.4S, v26.4S, v9.s[0] -mul v26.4S, v26.4S,v1.s[0] -sub v10.4s, v15.4s, v18.4s -add v15.4s, v15.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v7.s[3] -mla v6.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v27.4S, v7.s[2] -mla v11.4S, v20.4S, v31.s[0] -nop -nop -sqrdmulh v20.4S, v14.4S, v7.s[1] -mla v8.4S, v28.4S, v31.s[0] -nop -nop -sqrdmulh v28.4S, v0.4S, v7.s[0] -mla v26.4S, v25.4S, v31.s[0] -nop -nop -mul v27.4S, v27.4S,v2.s[2] -mul v12.4S, v12.4S,v2.s[3] -sub v25.4s, v13.4s, v6.4s -str q25, [x0, #992] -mla v27.4S, v19.4S, v31.s[0] -mla v12.4S, v18.4S, v31.s[0] -add v13.4s, v13.4s, v6.4s -str q13, [x0, #928] -mul v0.4S, v0.4S,v2.s[0] -mul v14.4S, v14.4S,v2.s[1] -sub v13.4s, v21.4s, v11.4s -str q13, [x0, #864] -mla v0.4S, v28.4S, v31.s[0] -mla v14.4S, v20.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -sub v11.4s, v16.4s, v8.4s -ldr q20, [x0, #1008] -sqrdmulh v28.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v16.4s, v16.4s, v8.4s -str q21, [x0, #800] -ldr q21, [x0, #944] -sqrdmulh v8.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -sub v13.4s, v24.4s, v26.4s -str q11, [x0, #736] -ldr q11, [x0, #880] -sqrdmulh v6.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -add v24.4s, v24.4s, v26.4s -str q16, [x0, #672] -ldr q16, [x0, #816] -sqrdmulh v26.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v18.4s, v3.4s, v12.4s -str q13, [x0, #608] -ldr q13, [x0, #752] -sqrdmulh v19.4S, v13.4S, v29.s[0] -mla v20.4S, v28.4S, v31.s[0] -add v3.4s, v3.4s, v12.4s -str q24, [x0, #544] -ldr q24, [x0, #688] -sqrdmulh v12.4S, v24.4S, v29.s[0] -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v17.4s, v27.4s -str q18, [x0, #480] -ldr q18, [x0, #624] -sqrdmulh v28.4S, v18.4S, v29.s[0] -mla v11.4S, v6.4S, v31.s[0] -add v17.4s, v17.4s, v27.4s -str q3, [x0, #416] -ldr q3, [x0, #560] -sqrdmulh v27.4S, v3.4S, v29.s[0] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v10.4s, v14.4s -str q8, [x0, #352] -ldr q8, [x0, #496] -add v10.4s, v10.4s, v14.4s -mul v24.4S, v24.4S,v30.s[0] -mul v13.4S, v13.4S,v30.s[0] -ldr q14, [x0, #432] -str q17, [x0, #288] -ldr q17, [x0, #368] -ldr q6, [x0, #304] -mla v24.4S, v12.4S, v31.s[0] -mla v13.4S, v19.4S, v31.s[0] -str q26, [x0, #224] -sub v26.4s, v15.4s, v0.4s -ldr q19, [x0, #240] -ldr q12, [x0, #176] -mul v3.4S, v3.4S,v30.s[0] -mul v18.4S, v18.4S,v30.s[0] -str q10, [x0, #160] -add v15.4s, v15.4s, v0.4s -ldr q0, [x0, #112] -ldr q10, [x0, #48] -mla v3.4S, v27.4S, v31.s[0] -mla v18.4S, v28.4S, v31.s[0] -sub v28.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -sqrdmulh v20.4S, v28.4S, v29.s[2] -mul v28.4S, v28.4S,v30.s[2] -sub v27.4s, v14.4s, v21.4s -add v14.4s, v14.4s, v21.4s -sqrdmulh v21.4S, v27.4S, v29.s[2] -mul v27.4S, v27.4S,v30.s[2] -sub v25.4s, v17.4s, v11.4s -add v17.4s, v17.4s, v11.4s -sqrdmulh v11.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sub v5.4s, v6.4s, v16.4s -add v6.4s, v6.4s, v16.4s -sqrdmulh v16.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -sub v4.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -sqrdmulh v13.4S, v25.4S, v29.s[2] -mla v28.4S, v20.4S, v31.s[0] -sub v20.4s, v12.4s, v24.4s -add v12.4s, v12.4s, v24.4s -sqrdmulh v24.4S, v5.4S, v29.s[2] -mla v27.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v18.4s -add v0.4s, v0.4s, v18.4s -sqrdmulh v18.4S, v17.4S, v29.s[1] -mla v8.4S, v11.4S, v31.s[0] -sub v11.4s, v10.4s, v3.4s -str q26, [x0, #96] -sqrdmulh v26.4S, v6.4S, v29.s[1] -mla v14.4S, v16.4S, v31.s[0] -add v10.4s, v10.4s, v3.4s -str q15, [x0, #32] -mul v5.4S, v5.4S,v30.s[2] -mul v25.4S, v25.4S,v30.s[2] -sub v15.4s, v4.4s, v28.4s -add v4.4s, v4.4s, v28.4s -mla v5.4S, v24.4S, v31.s[0] -mla v25.4S, v13.4S, v31.s[0] -sub v13.4s, v20.4s, v27.4s -add v20.4s, v20.4s, v27.4s -mul v6.4S, v6.4S,v30.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v27.4s, v19.4s, v8.4s -add v19.4s, v19.4s, v8.4s -mla v6.4S, v26.4S, v31.s[0] -mla v17.4S, v18.4S, v31.s[0] -sub v18.4s, v12.4s, v14.4s -add v12.4s, v12.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v22.s[3] -mul v15.4S, v15.4S,v23.s[3] -sub v26.4s, v21.4s, v25.4s -add v21.4s, v21.4s, v25.4s -sqrdmulh v25.4S, v4.4S, v22.s[2] -mul v4.4S, v4.4S,v23.s[2] -sub v8.4s, v11.4s, v5.4s -add v11.4s, v11.4s, v5.4s -sqrdmulh v5.4S, v27.4S, v22.s[1] -mul v27.4S, v27.4S,v23.s[1] -sub v24.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v28.4s, v10.4s, v6.4s -add v10.4s, v10.4s, v6.4s -sqrdmulh v6.4S, v13.4S, v22.s[3] -mla v15.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v20.4S, v22.s[2] -mla v4.4S, v25.4S, v31.s[0] -nop -nop -sqrdmulh v25.4S, v18.4S, v22.s[1] -mla v27.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v12.4S, v22.s[0] -mla v19.4S, v17.4S, v31.s[0] -nop -nop -mul v20.4S, v20.4S,v23.s[2] -mul v13.4S, v13.4S,v23.s[3] -sub v17.4s, v26.4s, v15.4s -add v26.4s, v26.4s, v15.4s -mla v20.4S, v14.4S, v31.s[0] -mla v13.4S, v6.4S, v31.s[0] -sub v6.4s, v21.4s, v4.4s -add v21.4s, v21.4s, v4.4s -mul v12.4S, v12.4S,v23.s[0] -mul v18.4S, v18.4S,v23.s[1] -sub v4.4s, v24.4s, v27.4s -add v24.4s, v24.4s, v27.4s -mla v12.4S, v5.4S, v31.s[0] -mla v18.4S, v25.4S, v31.s[0] -sub v25.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v17.4S, v9.s[3] -mul v17.4S, v17.4S,v1.s[3] -sub v5.4s, v8.4s, v13.4s -add v8.4s, v8.4s, v13.4s -sqrdmulh v13.4S, v26.4S, v9.s[2] -mul v26.4S, v26.4S,v1.s[2] -sub v27.4s, v11.4s, v20.4s -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v6.4S, v9.s[1] -mul v6.4S, v6.4S,v1.s[1] -sub v14.4s, v28.4s, v18.4s -add v28.4s, v28.4s, v18.4s -sqrdmulh v18.4S, v21.4S, v9.s[0] -mul v21.4S, v21.4S,v1.s[0] -sub v15.4s, v10.4s, v12.4s -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v4.4S, v7.s[3] -mla v17.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v24.4S, v7.s[2] -mla v26.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v25.4S, v7.s[1] -mla v6.4S, v20.4S, v31.s[0] -nop -nop -sqrdmulh v20.4S, v0.4S, v7.s[0] -mla v21.4S, v18.4S, v31.s[0] -nop -nop -mul v24.4S, v24.4S,v2.s[2] -mul v4.4S, v4.4S,v2.s[3] -sub v18.4s, v5.4s, v17.4s -str q18, [x0, #1008] -mla v24.4S, v19.4S, v31.s[0] -mla v4.4S, v12.4S, v31.s[0] -add v5.4s, v5.4s, v17.4s -str q5, [x0, #944] -mul v0.4S, v0.4S,v2.s[0] -mul v25.4S, v25.4S,v2.s[1] -sub v5.4s, v8.4s, v26.4s -str q5, [x0, #880] -mla v0.4S, v20.4S, v31.s[0] -mla v25.4S, v13.4S, v31.s[0] -add v8.4s, v8.4s, v26.4s -sub v26.4s, v27.4s, v6.4s -ldr q13, [x0, #960] -sqrdmulh v20.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -add v27.4s, v27.4s, v6.4s -str q8, [x0, #816] -ldr q8, [x0, #896] -sqrdmulh v6.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v5.4s, v11.4s, v21.4s -str q26, [x0, #752] -ldr q26, [x0, #832] -sqrdmulh v17.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -add v11.4s, v11.4s, v21.4s -str q27, [x0, #688] -ldr q27, [x0, #768] -sqrdmulh v21.4S, v27.4S, v29.s[0] -mul v27.4S, v27.4S,v30.s[0] -sub v12.4s, v14.4s, v4.4s -str q5, [x0, #624] -ldr q5, [x0, #704] -sqrdmulh v19.4S, v5.4S, v29.s[0] -mla v13.4S, v20.4S, v31.s[0] -add v14.4s, v14.4s, v4.4s -str q11, [x0, #560] -ldr q11, [x0, #640] -sqrdmulh v4.4S, v11.4S, v29.s[0] -mla v8.4S, v6.4S, v31.s[0] -sub v6.4s, v28.4s, v24.4s -str q12, [x0, #496] -ldr q12, [x0, #576] -sqrdmulh v20.4S, v12.4S, v29.s[0] -mla v26.4S, v17.4S, v31.s[0] -add v28.4s, v28.4s, v24.4s -str q14, [x0, #432] -ldr q14, [x0, #512] -sqrdmulh v24.4S, v14.4S, v29.s[0] -mla v27.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v25.4s -str q6, [x0, #368] -ldr q6, [x0, #448] -add v15.4s, v15.4s, v25.4s -mul v11.4S, v11.4S,v30.s[0] -mul v5.4S, v5.4S,v30.s[0] -ldr q25, [x0, #384] -str q28, [x0, #304] -ldr q28, [x0, #320] -ldr q17, [x0, #256] -mla v11.4S, v4.4S, v31.s[0] -mla v5.4S, v19.4S, v31.s[0] -str q21, [x0, #240] -sub v21.4s, v10.4s, v0.4s -ldr q19, [x0, #192] -ldr q4, [x0, #128] -mul v14.4S, v14.4S,v30.s[0] -mul v12.4S, v12.4S,v30.s[0] -str q15, [x0, #176] -add v10.4s, v10.4s, v0.4s -ldr q0, [x0, #64] -ldr q15, [x0, #0] -mla v14.4S, v24.4S, v31.s[0] -mla v12.4S, v20.4S, v31.s[0] -sub v20.4s, v6.4s, v13.4s -add v6.4s, v6.4s, v13.4s -sqrdmulh v13.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v24.4s, v25.4s, v8.4s -add v25.4s, v25.4s, v8.4s -sqrdmulh v8.4S, v24.4S, v29.s[2] -mul v24.4S, v24.4S,v30.s[2] -sub v18.4s, v28.4s, v26.4s -add v28.4s, v28.4s, v26.4s -sqrdmulh v26.4S, v6.4S, v29.s[1] -mul v6.4S, v6.4S,v30.s[1] -sub v3.4s, v17.4s, v27.4s -add v17.4s, v17.4s, v27.4s -sqrdmulh v27.4S, v25.4S, v29.s[1] -mul v25.4S, v25.4S,v30.s[1] -sub v16.4s, v19.4s, v5.4s -add v19.4s, v19.4s, v5.4s -sqrdmulh v5.4S, v18.4S, v29.s[2] -mla v20.4S, v13.4S, v31.s[0] -sub v13.4s, v4.4s, v11.4s -add v4.4s, v4.4s, v11.4s -sqrdmulh v11.4S, v3.4S, v29.s[2] -mla v24.4S, v8.4S, v31.s[0] -sub v8.4s, v0.4s, v12.4s -add v0.4s, v0.4s, v12.4s -sqrdmulh v12.4S, v28.4S, v29.s[1] -mla v6.4S, v26.4S, v31.s[0] -sub v26.4s, v15.4s, v14.4s -str q21, [x0, #112] -sqrdmulh v21.4S, v17.4S, v29.s[1] -mla v25.4S, v27.4S, v31.s[0] -add v15.4s, v15.4s, v14.4s -str q10, [x0, #48] -mul v3.4S, v3.4S,v30.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v10.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -mla v3.4S, v11.4S, v31.s[0] -mla v18.4S, v5.4S, v31.s[0] -sub v5.4s, v13.4s, v24.4s -add v13.4s, v13.4s, v24.4s -mul v17.4S, v17.4S,v30.s[1] -mul v28.4S, v28.4S,v30.s[1] -sub v24.4s, v19.4s, v6.4s -add v19.4s, v19.4s, v6.4s -mla v17.4S, v21.4S, v31.s[0] -mla v28.4S, v12.4S, v31.s[0] -sub v12.4s, v4.4s, v25.4s -add v4.4s, v4.4s, v25.4s -sqrdmulh v25.4S, v10.4S, v22.s[3] -mul v10.4S, v10.4S,v23.s[3] -sub v21.4s, v8.4s, v18.4s -add v8.4s, v8.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v22.s[2] -mul v16.4S, v16.4S,v23.s[2] -sub v6.4s, v26.4s, v3.4s -add v26.4s, v26.4s, v3.4s -sqrdmulh v3.4S, v24.4S, v22.s[1] -mul v24.4S, v24.4S,v23.s[1] -sub v11.4s, v0.4s, v28.4s -add v0.4s, v0.4s, v28.4s -sqrdmulh v28.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v20.4s, v15.4s, v17.4s -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v5.4S, v22.s[3] -mla v10.4S, v25.4S, v31.s[0] -nop -nop -sqrdmulh v25.4S, v13.4S, v22.s[2] -mla v16.4S, v18.4S, v31.s[0] -nop -nop -sqrdmulh v18.4S, v12.4S, v22.s[1] -mla v24.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v4.4S, v22.s[0] -mla v19.4S, v28.4S, v31.s[0] -nop -nop -mul v13.4S, v13.4S,v23.s[2] -mul v5.4S, v5.4S,v23.s[3] -sub v28.4s, v21.4s, v10.4s -add v21.4s, v21.4s, v10.4s -mla v13.4S, v25.4S, v31.s[0] -mla v5.4S, v17.4S, v31.s[0] -sub v17.4s, v8.4s, v16.4s -add v8.4s, v8.4s, v16.4s -mul v4.4S, v4.4S,v23.s[0] -mul v12.4S, v12.4S,v23.s[1] -sub v16.4s, v11.4s, v24.4s -add v11.4s, v11.4s, v24.4s -mla v4.4S, v3.4S, v31.s[0] -mla v12.4S, v18.4S, v31.s[0] -sub v18.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v28.4S, v9.s[3] -mul v28.4S, v28.4S,v1.s[3] -sub v3.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v21.4S, v9.s[2] -mul v21.4S, v21.4S,v1.s[2] -sub v24.4s, v26.4s, v13.4s -add v26.4s, v26.4s, v13.4s -sqrdmulh v13.4S, v17.4S, v9.s[1] -mul v17.4S, v17.4S,v1.s[1] -sub v25.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -sqrdmulh v12.4S, v8.4S, v9.s[0] -mul v8.4S, v8.4S,v1.s[0] -sub v10.4s, v15.4s, v4.4s -add v15.4s, v15.4s, v4.4s -sqrdmulh v4.4S, v16.4S, v7.s[3] -mla v28.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v11.4S, v7.s[2] -mla v21.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v18.4S, v7.s[1] -mla v17.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v0.4S, v7.s[0] -mla v8.4S, v12.4S, v31.s[0] -nop -nop -mul v11.4S, v11.4S,v2.s[2] -mul v16.4S, v16.4S,v2.s[3] -sub v12.4s, v3.4s, v28.4s -str q12, [x0, #960] -mla v11.4S, v19.4S, v31.s[0] -mla v16.4S, v4.4S, v31.s[0] -add v3.4s, v3.4s, v28.4s -str q3, [x0, #896] -mul v0.4S, v0.4S,v2.s[0] -mul v18.4S, v18.4S,v2.s[1] -sub v3.4s, v6.4s, v21.4s -str q3, [x0, #832] -mla v0.4S, v13.4S, v31.s[0] -mla v18.4S, v5.4S, v31.s[0] -add v6.4s, v6.4s, v21.4s -sub v21.4s, v24.4s, v17.4s -ldr q5, [x0, #976] -sqrdmulh v13.4S, v5.4S, v29.s[0] -mul v5.4S, v5.4S,v30.s[0] -add v24.4s, v24.4s, v17.4s -str q6, [x0, #768] -ldr q6, [x0, #912] -sqrdmulh v17.4S, v6.4S, v29.s[0] -mul v6.4S, v6.4S,v30.s[0] -sub v3.4s, v26.4s, v8.4s -str q21, [x0, #704] -ldr q21, [x0, #848] -sqrdmulh v28.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -add v26.4s, v26.4s, v8.4s -str q24, [x0, #640] -ldr q24, [x0, #784] -sqrdmulh v8.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -sub v4.4s, v25.4s, v16.4s -str q3, [x0, #576] -ldr q3, [x0, #720] -sqrdmulh v19.4S, v3.4S, v29.s[0] -mla v5.4S, v13.4S, v31.s[0] -add v25.4s, v25.4s, v16.4s -str q26, [x0, #512] -ldr q26, [x0, #656] -sqrdmulh v16.4S, v26.4S, v29.s[0] -mla v6.4S, v17.4S, v31.s[0] -sub v17.4s, v20.4s, v11.4s -str q4, [x0, #448] -ldr q4, [x0, #592] -sqrdmulh v13.4S, v4.4S, v29.s[0] -mla v21.4S, v28.4S, v31.s[0] -add v20.4s, v20.4s, v11.4s -str q25, [x0, #384] -ldr q25, [x0, #528] -sqrdmulh v11.4S, v25.4S, v29.s[0] -mla v24.4S, v8.4S, v31.s[0] -sub v8.4s, v10.4s, v18.4s -str q17, [x0, #320] -ldr q17, [x0, #464] -add v10.4s, v10.4s, v18.4s -mul v26.4S, v26.4S,v30.s[0] -mul v3.4S, v3.4S,v30.s[0] -ldr q18, [x0, #400] -str q20, [x0, #256] -ldr q20, [x0, #336] -ldr q28, [x0, #272] -mla v26.4S, v16.4S, v31.s[0] -mla v3.4S, v19.4S, v31.s[0] -str q8, [x0, #192] -sub v8.4s, v15.4s, v0.4s -ldr q19, [x0, #208] -ldr q16, [x0, #144] -mul v25.4S, v25.4S,v30.s[0] -mul v4.4S, v4.4S,v30.s[0] -str q10, [x0, #128] -add v15.4s, v15.4s, v0.4s -ldr q0, [x0, #80] -ldr q10, [x0, #16] -mla v25.4S, v11.4S, v31.s[0] -mla v4.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v5.4s -add v17.4s, v17.4s, v5.4s -sqrdmulh v5.4S, v13.4S, v29.s[2] -mul v13.4S, v13.4S,v30.s[2] -sub v11.4s, v18.4s, v6.4s -add v18.4s, v18.4s, v6.4s -sqrdmulh v6.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v12.4s, v20.4s, v21.4s -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v14.4s, v28.4s, v24.4s -add v28.4s, v28.4s, v24.4s -sqrdmulh v24.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v27.4s, v19.4s, v3.4s -add v19.4s, v19.4s, v3.4s -sqrdmulh v3.4S, v12.4S, v29.s[2] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v16.4s, v26.4s -add v16.4s, v16.4s, v26.4s -sqrdmulh v26.4S, v14.4S, v29.s[2] -mla v11.4S, v6.4S, v31.s[0] -sub v6.4s, v0.4s, v4.4s -add v0.4s, v0.4s, v4.4s -sqrdmulh v4.4S, v20.4S, v29.s[1] -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v10.4s, v25.4s -str q8, [x0, #64] -sqrdmulh v8.4S, v28.4S, v29.s[1] -mla v18.4S, v24.4S, v31.s[0] -add v10.4s, v10.4s, v25.4s -str q15, [x0, #0] -mul v14.4S, v14.4S,v30.s[2] -mul v12.4S, v12.4S,v30.s[2] -sub v15.4s, v27.4s, v13.4s -add v27.4s, v27.4s, v13.4s -mla v14.4S, v26.4S, v31.s[0] -mla v12.4S, v3.4S, v31.s[0] -sub v3.4s, v5.4s, v11.4s -add v5.4s, v5.4s, v11.4s -mul v28.4S, v28.4S,v30.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v11.4s, v19.4s, v17.4s -add v19.4s, v19.4s, v17.4s -mla v28.4S, v8.4S, v31.s[0] -mla v20.4S, v4.4S, v31.s[0] -sub v4.4s, v16.4s, v18.4s -add v16.4s, v16.4s, v18.4s -sqrdmulh v29.4S, v15.4S, v22.s[3] -mul v15.4S, v15.4S,v23.s[3] -sub v30.4s, v6.4s, v12.4s -add v6.4s, v6.4s, v12.4s -sqrdmulh v12.4S, v27.4S, v22.s[2] -mul v27.4S, v27.4S,v23.s[2] -sub v18.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v22.s[1] -mul v11.4S, v11.4S,v23.s[1] -sub v8.4s, v0.4s, v20.4s -add v0.4s, v0.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v17.4s, v10.4s, v28.4s -add v10.4s, v10.4s, v28.4s -sqrdmulh v28.4S, v3.4S, v22.s[3] -mla v15.4S, v29.4S, v31.s[0] -nop -nop -sqrdmulh v29.4S, v5.4S, v22.s[2] -mla v27.4S, v12.4S, v31.s[0] -nop -nop -sqrdmulh v12.4S, v4.4S, v22.s[1] -mla v11.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v16.4S, v22.s[0] -mla v19.4S, v20.4S, v31.s[0] -nop -nop -mul v5.4S, v5.4S,v23.s[2] -mul v3.4S, v3.4S,v23.s[3] -sub v20.4s, v30.4s, v15.4s -add v30.4s, v30.4s, v15.4s -mla v5.4S, v29.4S, v31.s[0] -mla v3.4S, v28.4S, v31.s[0] -sub v28.4s, v6.4s, v27.4s -add v6.4s, v6.4s, v27.4s -mul v16.4S, v16.4S,v23.s[0] -mul v4.4S, v4.4S,v23.s[1] -sub v27.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -mla v16.4S, v14.4S, v31.s[0] -mla v4.4S, v12.4S, v31.s[0] -sub v12.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v22.4S, v20.4S, v9.s[3] -mul v20.4S, v20.4S,v1.s[3] -sub v23.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v30.4S, v9.s[2] -mul v30.4S, v30.4S,v1.s[2] -sub v19.4s, v21.4s, v5.4s -add v21.4s, v21.4s, v5.4s -sqrdmulh v5.4S, v28.4S, v9.s[1] -mul v28.4S, v28.4S,v1.s[1] -sub v14.4s, v17.4s, v4.4s -add v17.4s, v17.4s, v4.4s -sqrdmulh v4.4S, v6.4S, v9.s[0] -mul v6.4S, v6.4S,v1.s[0] -sub v11.4s, v10.4s, v16.4s -add v10.4s, v10.4s, v16.4s -sqrdmulh v9.4S, v27.4S, v7.s[3] -mla v20.4S, v22.4S, v31.s[0] -nop -nop -sqrdmulh v22.4S, v8.4S, v7.s[2] -mla v30.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v12.4S, v7.s[1] -mla v28.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v0.4S, v7.s[0] -mla v6.4S, v4.4S, v31.s[0] -nop -nop -mul v8.4S, v8.4S,v2.s[2] -mul v27.4S, v27.4S,v2.s[3] -sub v4.4s, v23.4s, v20.4s -str q4, [x0, #976] -mla v8.4S, v22.4S, v31.s[0] -mla v27.4S, v9.4S, v31.s[0] -add v23.4s, v23.4s, v20.4s -str q23, [x0, #912] -mul v0.4S, v0.4S,v2.s[0] -mul v12.4S, v12.4S,v2.s[1] -sub v23.4s, v18.4s, v30.4s -str q23, [x0, #848] -mla v0.4S, v5.4S, v31.s[0] -mla v12.4S, v3.4S, v31.s[0] -add v18.4s, v18.4s, v30.4s -sub v30.4s, v19.4s, v28.4s -add v19.4s, v19.4s, v28.4s -str q18, [x0, #784] -sub v18.4s, v21.4s, v6.4s -str q30, [x0, #720] -add v21.4s, v21.4s, v6.4s -str q19, [x0, #656] -sub v19.4s, v14.4s, v27.4s -str q18, [x0, #592] -add v14.4s, v14.4s, v27.4s -str q21, [x0, #528] -sub v21.4s, v17.4s, v8.4s -str q19, [x0, #464] -add v17.4s, v17.4s, v8.4s -str q14, [x0, #400] -sub v14.4s, v11.4s, v12.4s -str q21, [x0, #336] -add v11.4s, v11.4s, v12.4s -str q17, [x0, #272] -sub v17.4s, v10.4s, v0.4s -add v10.4s, v10.4s, v0.4s -ldr q24, [x0, #224] -ldr q25, [x0, #160] -ldr q13, [x0, #32] -ldr q26, [x17, #+128] -ldr q15, [x17, #+144] -sqrdmulh v29.4S, v13.4S, v15.s[0] -mul v13.4S, v13.4S,v26.s[0] -ldr q16, [x0, #48] -sqrdmulh v1.4S, v16.4S, v15.s[0] -mul v16.4S, v16.4S,v26.s[0] -ldr q4, [x17, #+160] -ldr q22, [x17, #+176] -ldr q9, [x0, #96] -sqrdmulh v20.4S, v9.4S, v22.s[0] -mul v9.4S, v9.4S,v4.s[0] -ldr q23, [x0, #112] -sqrdmulh v5.4S, v23.4S, v22.s[0] -mul v23.4S, v23.4S,v4.s[0] -ldr q3, [x17, #+192] -ldr q2, [x17, #+208] -mla v13.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v25.4S, v2.s[0] -ldr q7, [x0, #176] -mla v16.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v7.4S, v2.s[0] -ldr q28, [x17, #+224] -ldr q30, [x17, #+240] -mla v9.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v24.4S, v30.s[0] -ldr q6, [x0, #240] -mla v23.4S, v5.4S, v31.s[0] -sqrdmulh v5.4S, v6.4S, v30.s[0] -ldr q18, [x0, #0] -ldr q27, [x0, #128] -mul v25.4S, v25.4S,v3.s[0] -sub v19.4s, v18.4s, v13.4s -mul v7.4S, v7.4S,v3.s[0] -add v18.4s, v18.4s, v13.4s -mla v25.4S, v29.4S, v31.s[0] -sub v29.4s, v10.4s, v16.4s -ldr q13, [x0, #64] -mla v7.4S, v1.4S, v31.s[0] -add v10.4s, v10.4s, v16.4s -ldr q16, [x0, #192] -mul v24.4S, v24.4S,v28.s[0] -sub v1.4s, v13.4s, v9.4s -mul v6.4S, v6.4S,v28.s[0] -add v13.4s, v13.4s, v9.4s -mla v24.4S, v20.4S, v31.s[0] -mla v6.4S, v5.4S, v31.s[0] -sub v5.4s, v17.4s, v23.4s -sqrdmulh v20.4S, v10.4S, v15.s[1] -add v17.4s, v17.4s, v23.4s -mul v10.4S, v10.4S,v26.s[1] -sqrdmulh v23.4S, v29.4S, v15.s[2] -sub v9.4s, v27.4s, v25.4s -mul v29.4S, v29.4S,v26.s[2] -add v27.4s, v27.4s, v25.4s -sqrdmulh v15.4S, v17.4S, v22.s[1] -sub v26.4s, v11.4s, v7.4s -mul v17.4S, v17.4S,v4.s[1] -add v11.4s, v11.4s, v7.4s -sqrdmulh v7.4S, v5.4S, v22.s[2] -sub v25.4s, v16.4s, v24.4s -mul v5.4S, v5.4S,v4.s[2] -add v16.4s, v16.4s, v24.4s -mla v10.4S, v20.4S, v31.s[0] -sub v20.4s, v14.4s, v6.4s -ldr q22, [x0, #480] -sqrdmulh v4.4S, v11.4S, v2.s[1] -add v14.4s, v14.4s, v6.4s -mla v29.4S, v23.4S, v31.s[0] -ldr q23, [x0, #416] -sqrdmulh v6.4S, v26.4S, v2.s[2] -sub v24.4s, v18.4s, v10.4s -mla v17.4S, v15.4S, v31.s[0] -ldr q15, [x0, #288] -sqrdmulh v8.4S, v14.4S, v30.s[1] -add v18.4s, v18.4s, v10.4s -str q24, [x0, #16] -mla v5.4S, v7.4S, v31.s[0] -ldr q7, [x17, #+256] -ldr q24, [x17, #+272] -sqrdmulh v10.4S, v20.4S, v30.s[2] -sub v21.4s, v19.4s, v29.4s -str q18, [x0, #0] -mul v11.4S, v11.4S,v3.s[1] -add v19.4s, v19.4s, v29.4s -mul v26.4S, v26.4S,v3.s[2] -str q21, [x0, #48] -mla v11.4S, v4.4S, v31.s[0] -sub v4.4s, v13.4s, v17.4s -mla v26.4S, v6.4S, v31.s[0] -str q19, [x0, #32] -mul v14.4S, v14.4S,v28.s[1] -str q4, [x0, #80] -mul v20.4S, v20.4S,v28.s[2] -add v13.4s, v13.4s, v17.4s -str q13, [x0, #64] -mla v14.4S, v8.4S, v31.s[0] -sub v8.4s, v1.4s, v5.4s -str q8, [x0, #112] -mla v20.4S, v10.4S, v31.s[0] -add v1.4s, v1.4s, v5.4s -str q1, [x0, #96] -sqrdmulh v30.4S, v15.4S, v24.s[0] -sub v28.4s, v27.4s, v11.4s -mul v15.4S, v15.4S,v7.s[0] -str q28, [x0, #144] -ldr q28, [x0, #304] -sqrdmulh v1.4S, v28.4S, v24.s[0] -add v27.4s, v27.4s, v11.4s -mul v28.4S, v28.4S,v7.s[0] -str q27, [x0, #128] -ldr q27, [x17, #+288] -ldr q11, [x17, #+304] -ldr q5, [x0, #352] -sqrdmulh v10.4S, v5.4S, v11.s[0] -sub v8.4s, v9.4s, v26.4s -mul v5.4S, v5.4S,v27.s[0] -str q8, [x0, #176] -ldr q8, [x0, #368] -sqrdmulh v13.4S, v8.4S, v11.s[0] -add v9.4s, v9.4s, v26.4s -mul v8.4S, v8.4S,v27.s[0] -str q9, [x0, #160] -ldr q9, [x17, #+320] -ldr q26, [x17, #+336] -mla v15.4S, v30.4S, v31.s[0] -sub v30.4s, v16.4s, v14.4s -sqrdmulh v17.4S, v23.4S, v26.s[0] -str q30, [x0, #208] -ldr q30, [x0, #432] -mla v28.4S, v1.4S, v31.s[0] -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v30.4S, v26.s[0] -str q16, [x0, #192] -ldr q16, [x17, #+352] -ldr q1, [x17, #+368] -mla v5.4S, v10.4S, v31.s[0] -sub v10.4s, v25.4s, v20.4s -sqrdmulh v4.4S, v22.4S, v1.s[0] -str q10, [x0, #240] -ldr q10, [x0, #496] -mla v8.4S, v13.4S, v31.s[0] -add v25.4s, v25.4s, v20.4s -sqrdmulh v20.4S, v10.4S, v1.s[0] -str q25, [x0, #224] -ldr q25, [x0, #256] -ldr q13, [x0, #384] -mul v23.4S, v23.4S,v9.s[0] -sub v2.4s, v25.4s, v15.4s -ldr q3, [x0, #272] -mul v30.4S, v30.4S,v9.s[0] -add v25.4s, v25.4s, v15.4s -ldr q15, [x0, #400] -mla v23.4S, v17.4S, v31.s[0] -sub v17.4s, v3.4s, v28.4s -ldr q19, [x0, #320] -mla v30.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v28.4s -ldr q28, [x0, #448] -mul v22.4S, v22.4S,v16.s[0] -sub v14.4s, v19.4s, v5.4s -ldr q6, [x0, #336] -mul v10.4S, v10.4S,v16.s[0] -add v19.4s, v19.4s, v5.4s -ldr q5, [x0, #464] -mla v22.4S, v4.4S, v31.s[0] -mla v10.4S, v20.4S, v31.s[0] -sub v20.4s, v6.4s, v8.4s -sqrdmulh v4.4S, v3.4S, v24.s[1] -add v6.4s, v6.4s, v8.4s -mul v3.4S, v3.4S,v7.s[1] -sqrdmulh v8.4S, v17.4S, v24.s[2] -sub v21.4s, v13.4s, v23.4s -mul v17.4S, v17.4S,v7.s[2] -add v13.4s, v13.4s, v23.4s -sqrdmulh v24.4S, v6.4S, v11.s[1] -sub v7.4s, v15.4s, v30.4s -mul v6.4S, v6.4S,v27.s[1] -add v15.4s, v15.4s, v30.4s -sqrdmulh v30.4S, v20.4S, v11.s[2] -sub v23.4s, v28.4s, v22.4s -mul v20.4S, v20.4S,v27.s[2] -add v28.4s, v28.4s, v22.4s -mla v3.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v10.4s -ldr q11, [x0, #736] -sqrdmulh v27.4S, v15.4S, v26.s[1] -add v5.4s, v5.4s, v10.4s -mla v17.4S, v8.4S, v31.s[0] -ldr q8, [x0, #672] -sqrdmulh v10.4S, v7.4S, v26.s[2] -sub v22.4s, v25.4s, v3.4s -mla v6.4S, v24.4S, v31.s[0] -ldr q24, [x0, #544] -sqrdmulh v29.4S, v5.4S, v1.s[1] -add v25.4s, v25.4s, v3.4s -str q22, [x0, #272] -mla v20.4S, v30.4S, v31.s[0] -ldr q30, [x17, #+384] -ldr q22, [x17, #+400] -sqrdmulh v3.4S, v4.4S, v1.s[2] -sub v18.4s, v2.4s, v17.4s -str q25, [x0, #256] -mul v15.4S, v15.4S,v9.s[1] -add v2.4s, v2.4s, v17.4s -mul v7.4S, v7.4S,v9.s[2] -str q18, [x0, #304] -mla v15.4S, v27.4S, v31.s[0] -sub v27.4s, v19.4s, v6.4s -mla v7.4S, v10.4S, v31.s[0] -str q2, [x0, #288] -mul v5.4S, v5.4S,v16.s[1] -str q27, [x0, #336] -mul v4.4S, v4.4S,v16.s[2] -add v19.4s, v19.4s, v6.4s -str q19, [x0, #320] -mla v5.4S, v29.4S, v31.s[0] -sub v29.4s, v14.4s, v20.4s -str q29, [x0, #368] -mla v4.4S, v3.4S, v31.s[0] -add v14.4s, v14.4s, v20.4s -str q14, [x0, #352] -sqrdmulh v1.4S, v24.4S, v22.s[0] -sub v16.4s, v13.4s, v15.4s -mul v24.4S, v24.4S,v30.s[0] -str q16, [x0, #400] -ldr q16, [x0, #560] -sqrdmulh v14.4S, v16.4S, v22.s[0] -add v13.4s, v13.4s, v15.4s -mul v16.4S, v16.4S,v30.s[0] -str q13, [x0, #384] -ldr q13, [x17, #+416] -ldr q15, [x17, #+432] -ldr q20, [x0, #608] -sqrdmulh v3.4S, v20.4S, v15.s[0] -sub v29.4s, v21.4s, v7.4s -mul v20.4S, v20.4S,v13.s[0] -str q29, [x0, #432] -ldr q29, [x0, #624] -sqrdmulh v19.4S, v29.4S, v15.s[0] -add v21.4s, v21.4s, v7.4s -mul v29.4S, v29.4S,v13.s[0] -str q21, [x0, #416] -ldr q21, [x17, #+448] -ldr q7, [x17, #+464] -mla v24.4S, v1.4S, v31.s[0] -sub v1.4s, v28.4s, v5.4s -sqrdmulh v6.4S, v8.4S, v7.s[0] -str q1, [x0, #464] -ldr q1, [x0, #688] -mla v16.4S, v14.4S, v31.s[0] -add v28.4s, v28.4s, v5.4s -sqrdmulh v5.4S, v1.4S, v7.s[0] -str q28, [x0, #448] -ldr q28, [x17, #+480] -ldr q14, [x17, #+496] -mla v20.4S, v3.4S, v31.s[0] -sub v3.4s, v23.4s, v4.4s -sqrdmulh v27.4S, v11.4S, v14.s[0] -str q3, [x0, #496] -ldr q3, [x0, #752] -mla v29.4S, v19.4S, v31.s[0] -add v23.4s, v23.4s, v4.4s -sqrdmulh v4.4S, v3.4S, v14.s[0] -str q23, [x0, #480] -ldr q23, [x0, #512] -ldr q19, [x0, #640] -mul v8.4S, v8.4S,v21.s[0] -sub v26.4s, v23.4s, v24.4s -ldr q9, [x0, #528] -mul v1.4S, v1.4S,v21.s[0] -add v23.4s, v23.4s, v24.4s -ldr q24, [x0, #656] -mla v8.4S, v6.4S, v31.s[0] -sub v6.4s, v9.4s, v16.4s -ldr q2, [x0, #576] -mla v1.4S, v5.4S, v31.s[0] -add v9.4s, v9.4s, v16.4s -ldr q16, [x0, #704] -mul v11.4S, v11.4S,v28.s[0] -sub v5.4s, v2.4s, v20.4s -ldr q10, [x0, #592] -mul v3.4S, v3.4S,v28.s[0] -add v2.4s, v2.4s, v20.4s -ldr q20, [x0, #720] -mla v11.4S, v27.4S, v31.s[0] -mla v3.4S, v4.4S, v31.s[0] -sub v4.4s, v10.4s, v29.4s -sqrdmulh v27.4S, v9.4S, v22.s[1] -add v10.4s, v10.4s, v29.4s -mul v9.4S, v9.4S,v30.s[1] -sqrdmulh v29.4S, v6.4S, v22.s[2] -sub v18.4s, v19.4s, v8.4s -mul v6.4S, v6.4S,v30.s[2] -add v19.4s, v19.4s, v8.4s -sqrdmulh v22.4S, v10.4S, v15.s[1] -sub v30.4s, v24.4s, v1.4s -mul v10.4S, v10.4S,v13.s[1] -add v24.4s, v24.4s, v1.4s -sqrdmulh v1.4S, v4.4S, v15.s[2] -sub v8.4s, v16.4s, v11.4s -mul v4.4S, v4.4S,v13.s[2] -add v16.4s, v16.4s, v11.4s -mla v9.4S, v27.4S, v31.s[0] -sub v27.4s, v20.4s, v3.4s -ldr q15, [x0, #992] -sqrdmulh v13.4S, v24.4S, v7.s[1] -add v20.4s, v20.4s, v3.4s -mla v6.4S, v29.4S, v31.s[0] -ldr q29, [x0, #928] -sqrdmulh v3.4S, v30.4S, v7.s[2] -sub v11.4s, v23.4s, v9.4s -mla v10.4S, v22.4S, v31.s[0] -ldr q22, [x0, #800] -sqrdmulh v17.4S, v20.4S, v14.s[1] -add v23.4s, v23.4s, v9.4s -str q11, [x0, #528] -mla v4.4S, v1.4S, v31.s[0] -ldr q1, [x17, #+512] -ldr q11, [x17, #+528] -sqrdmulh v9.4S, v27.4S, v14.s[2] -sub v25.4s, v26.4s, v6.4s -str q23, [x0, #512] -mul v24.4S, v24.4S,v21.s[1] -add v26.4s, v26.4s, v6.4s -mul v30.4S, v30.4S,v21.s[2] -str q25, [x0, #560] -mla v24.4S, v13.4S, v31.s[0] -sub v13.4s, v2.4s, v10.4s -mla v30.4S, v3.4S, v31.s[0] -str q26, [x0, #544] -mul v20.4S, v20.4S,v28.s[1] -str q13, [x0, #592] -mul v27.4S, v27.4S,v28.s[2] -add v2.4s, v2.4s, v10.4s -str q2, [x0, #576] -mla v20.4S, v17.4S, v31.s[0] -sub v17.4s, v5.4s, v4.4s -str q17, [x0, #624] -mla v27.4S, v9.4S, v31.s[0] -add v5.4s, v5.4s, v4.4s -str q5, [x0, #608] -sqrdmulh v14.4S, v22.4S, v11.s[0] -sub v28.4s, v19.4s, v24.4s -mul v22.4S, v22.4S,v1.s[0] -str q28, [x0, #656] -ldr q28, [x0, #816] -sqrdmulh v5.4S, v28.4S, v11.s[0] -add v19.4s, v19.4s, v24.4s -mul v28.4S, v28.4S,v1.s[0] -str q19, [x0, #640] -ldr q19, [x17, #+544] -ldr q24, [x17, #+560] -ldr q4, [x0, #864] -sqrdmulh v9.4S, v4.4S, v24.s[0] -sub v17.4s, v18.4s, v30.4s -mul v4.4S, v4.4S,v19.s[0] -str q17, [x0, #688] -ldr q17, [x0, #880] -sqrdmulh v2.4S, v17.4S, v24.s[0] -add v18.4s, v18.4s, v30.4s -mul v17.4S, v17.4S,v19.s[0] -str q18, [x0, #672] -ldr q18, [x17, #+576] -ldr q30, [x17, #+592] -mla v22.4S, v14.4S, v31.s[0] -sub v14.4s, v16.4s, v20.4s -sqrdmulh v10.4S, v29.4S, v30.s[0] -str q14, [x0, #720] -ldr q14, [x0, #944] -mla v28.4S, v5.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v14.4S, v30.s[0] -str q16, [x0, #704] -ldr q16, [x17, #+608] -ldr q5, [x17, #+624] -mla v4.4S, v9.4S, v31.s[0] -sub v9.4s, v8.4s, v27.4s -sqrdmulh v13.4S, v15.4S, v5.s[0] -str q9, [x0, #752] -ldr q9, [x0, #1008] -mla v17.4S, v2.4S, v31.s[0] -add v8.4s, v8.4s, v27.4s -sqrdmulh v27.4S, v9.4S, v5.s[0] -str q8, [x0, #736] -ldr q8, [x0, #768] -ldr q2, [x0, #896] -mul v29.4S, v29.4S,v18.s[0] -sub v7.4s, v8.4s, v22.4s -ldr q21, [x0, #784] -mul v14.4S, v14.4S,v18.s[0] -add v8.4s, v8.4s, v22.4s -ldr q22, [x0, #912] -mla v29.4S, v10.4S, v31.s[0] -sub v10.4s, v21.4s, v28.4s -ldr q26, [x0, #832] -mla v14.4S, v20.4S, v31.s[0] -add v21.4s, v21.4s, v28.4s -ldr q28, [x0, #960] -mul v15.4S, v15.4S,v16.s[0] -sub v20.4s, v26.4s, v4.4s -ldr q3, [x0, #848] -mul v9.4S, v9.4S,v16.s[0] -add v26.4s, v26.4s, v4.4s -ldr q4, [x0, #976] -mla v15.4S, v13.4S, v31.s[0] -mla v9.4S, v27.4S, v31.s[0] -sub v27.4s, v3.4s, v17.4s -sqrdmulh v13.4S, v21.4S, v11.s[1] -add v3.4s, v3.4s, v17.4s -mul v21.4S, v21.4S,v1.s[1] -sqrdmulh v17.4S, v10.4S, v11.s[2] -sub v25.4s, v2.4s, v29.4s -mul v10.4S, v10.4S,v1.s[2] -add v2.4s, v2.4s, v29.4s -sqrdmulh v11.4S, v3.4S, v24.s[1] -sub v1.4s, v22.4s, v14.4s -mul v3.4S, v3.4S,v19.s[1] -add v22.4s, v22.4s, v14.4s -sqrdmulh v14.4S, v27.4S, v24.s[2] -sub v29.4s, v28.4s, v15.4s -mul v27.4S, v27.4S,v19.s[2] -add v28.4s, v28.4s, v15.4s -mla v21.4S, v13.4S, v31.s[0] -sub v13.4s, v4.4s, v9.4s -sqrdmulh v24.4S, v22.4S, v30.s[1] -add v4.4s, v4.4s, v9.4s -mla v10.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v1.4S, v30.s[2] -sub v9.4s, v8.4s, v21.4s -mla v3.4S, v11.4S, v31.s[0] -sqrdmulh v11.4S, v4.4S, v5.s[1] -add v8.4s, v8.4s, v21.4s -str q9, [x0, #784] -mla v27.4S, v14.4S, v31.s[0] -sqrdmulh v14.4S, v13.4S, v5.s[2] -sub v9.4s, v7.4s, v10.4s -str q8, [x0, #768] -mul v22.4S, v22.4S,v18.s[1] -add v7.4s, v7.4s, v10.4s -mul v1.4S, v1.4S,v18.s[2] -str q9, [x0, #816] -mla v22.4S, v24.4S, v31.s[0] -sub v24.4s, v26.4s, v3.4s -mla v1.4S, v17.4S, v31.s[0] -str q7, [x0, #800] -mul v4.4S, v4.4S,v16.s[1] -str q24, [x0, #848] -mul v13.4S, v13.4S,v16.s[2] -add v26.4s, v26.4s, v3.4s -str q26, [x0, #832] -mla v4.4S, v11.4S, v31.s[0] -sub v11.4s, v20.4s, v27.4s -str q11, [x0, #880] -mla v13.4S, v14.4S, v31.s[0] -add v20.4s, v20.4s, v27.4s -str q20, [x0, #864] -sub v5.4s, v2.4s, v22.4s -str q5, [x0, #912] -add v2.4s, v2.4s, v22.4s -str q2, [x0, #896] -sub v2.4s, v25.4s, v1.4s -str q2, [x0, #944] -add v25.4s, v25.4s, v1.4s -str q25, [x0, #928] -sub v25.4s, v28.4s, v4.4s -str q25, [x0, #976] -add v28.4s, v28.4s, v4.4s -str q28, [x0, #960] -sub v28.4s, v29.4s, v13.4s -str q28, [x0, #1008] -add v29.4s, v29.4s, v13.4s -str q29, [x0, #992] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1520 -// Instruction count: 1516 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_8.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_8.s deleted file mode 100644 index a3ac527..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_8.s +++ /dev/null @@ -1,1550 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_22_z4_8 -.global _ntt_u32_incomplete_neon_asm_var_4_2_22_z4_8 -ntt_u32_incomplete_neon_asm_var_4_2_22_z4_8: -_ntt_u32_incomplete_neon_asm_var_4_2_22_z4_8: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x0, #992] -sqrdmulh v27.4S, v28.4S, v29.s[0] -mul v28.4S, v28.4S,v30.s[0] -ldr q26, [x0, #928] -sqrdmulh v25.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -ldr q24, [x0, #864] -sqrdmulh v23.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -ldr q22, [x0, #800] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #736] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mla v28.4S, v27.4S, v31.s[0] -ldr q27, [x0, #672] -sqrdmulh v18.4S, v27.4S, v29.s[0] -mla v26.4S, v25.4S, v31.s[0] -ldr q25, [x0, #608] -sqrdmulh v17.4S, v25.4S, v29.s[0] -mla v24.4S, v23.4S, v31.s[0] -ldr q23, [x0, #544] -sqrdmulh v16.4S, v23.4S, v29.s[0] -mla v22.4S, v21.4S, v31.s[0] -ldr q21, [x0, #480] -mul v27.4S, v27.4S,v30.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q3, [x0, #416] -ldr q2, [x0, #352] -ldr q1, [x0, #288] -mla v27.4S, v18.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -ldr q19, [x0, #224] -ldr q18, [x0, #160] -mul v23.4S, v23.4S,v30.s[0] -mul v25.4S, v25.4S,v30.s[0] -ldr q0, [x0, #96] -ldr q15, [x0, #32] -mla v23.4S, v16.4S, v31.s[0] -mla v25.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v28.4s -add v21.4s, v21.4s, v28.4s -sqrdmulh v28.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v16.4s, v3.4s, v26.4s -add v3.4s, v3.4s, v26.4s -sqrdmulh v26.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -sub v14.4s, v2.4s, v24.4s -add v2.4s, v2.4s, v24.4s -sqrdmulh v24.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v13.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v12.4s, v19.4s, v20.4s -add v19.4s, v19.4s, v20.4s -sqrdmulh v20.4S, v14.4S, v29.s[2] -mla v17.4S, v28.4S, v31.s[0] -sub v28.4s, v18.4s, v27.4s -add v18.4s, v18.4s, v27.4s -sqrdmulh v27.4S, v13.4S, v29.s[2] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v0.4s, v25.4s -add v0.4s, v0.4s, v25.4s -sqrdmulh v25.4S, v2.4S, v29.s[1] -mla v21.4S, v24.4S, v31.s[0] -sub v24.4s, v15.4s, v23.4s -sqrdmulh v11.4S, v1.4S, v29.s[1] -mla v3.4S, v22.4S, v31.s[0] -add v15.4s, v15.4s, v23.4s -ldr q23, [x17, #+32] -ldr q22, [x17, #+48] -mul v13.4S, v13.4S,v30.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v10.4s, v12.4s, v17.4s -add v12.4s, v12.4s, v17.4s -mla v13.4S, v27.4S, v31.s[0] -mla v14.4S, v20.4S, v31.s[0] -sub v20.4s, v28.4s, v16.4s -add v28.4s, v28.4s, v16.4s -mul v1.4S, v1.4S,v30.s[1] -mul v2.4S, v2.4S,v30.s[1] -sub v16.4s, v19.4s, v21.4s -add v19.4s, v19.4s, v21.4s -mla v1.4S, v11.4S, v31.s[0] -mla v2.4S, v25.4S, v31.s[0] -sub v25.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v10.4S, v22.s[3] -mul v10.4S, v10.4S,v23.s[3] -sub v11.4s, v26.4s, v14.4s -add v26.4s, v26.4s, v14.4s -sqrdmulh v14.4S, v12.4S, v22.s[2] -mul v12.4S, v12.4S,v23.s[2] -sub v21.4s, v24.4s, v13.4s -add v24.4s, v24.4s, v13.4s -sqrdmulh v13.4S, v16.4S, v22.s[1] -mul v16.4S, v16.4S,v23.s[1] -sub v27.4s, v0.4s, v2.4s -add v0.4s, v0.4s, v2.4s -sqrdmulh v2.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v17.4s, v15.4s, v1.4s -add v15.4s, v15.4s, v1.4s -ldr q1, [x17, #+96] -ldr q9, [x17, #+112] -sqrdmulh v8.4S, v20.4S, v22.s[3] -mla v10.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v28.4S, v22.s[2] -mla v12.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v25.4S, v22.s[1] -mla v16.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v18.4S, v22.s[0] -mla v19.4S, v2.4S, v31.s[0] -nop -nop -ldr q2, [x17, #+64] -ldr q7, [x17, #+80] -mul v28.4S, v28.4S,v23.s[2] -mul v20.4S, v20.4S,v23.s[3] -sub v6.4s, v11.4s, v10.4s -add v11.4s, v11.4s, v10.4s -mla v28.4S, v3.4S, v31.s[0] -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v26.4s, v12.4s -add v26.4s, v26.4s, v12.4s -mul v18.4S, v18.4S,v23.s[0] -mul v25.4S, v25.4S,v23.s[1] -sub v12.4s, v27.4s, v16.4s -add v27.4s, v27.4s, v16.4s -mla v18.4S, v13.4S, v31.s[0] -mla v25.4S, v14.4S, v31.s[0] -sub v14.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v9.s[3] -mul v6.4S, v6.4S,v1.s[3] -sub v13.4s, v21.4s, v20.4s -add v21.4s, v21.4s, v20.4s -sqrdmulh v20.4S, v11.4S, v9.s[2] -mul v11.4S, v11.4S,v1.s[2] -sub v16.4s, v24.4s, v28.4s -add v24.4s, v24.4s, v28.4s -sqrdmulh v28.4S, v8.4S, v9.s[1] -mul v8.4S, v8.4S,v1.s[1] -sub v3.4s, v17.4s, v25.4s -add v17.4s, v17.4s, v25.4s -sqrdmulh v25.4S, v26.4S, v9.s[0] -mul v26.4S, v26.4S,v1.s[0] -sub v10.4s, v15.4s, v18.4s -add v15.4s, v15.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v7.s[3] -mla v6.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v27.4S, v7.s[2] -mla v11.4S, v20.4S, v31.s[0] -nop -nop -sqrdmulh v20.4S, v14.4S, v7.s[1] -mla v8.4S, v28.4S, v31.s[0] -nop -nop -sqrdmulh v28.4S, v0.4S, v7.s[0] -mla v26.4S, v25.4S, v31.s[0] -nop -nop -mul v27.4S, v27.4S,v2.s[2] -mul v12.4S, v12.4S,v2.s[3] -sub v25.4s, v13.4s, v6.4s -str q25, [x0, #992] -mla v27.4S, v19.4S, v31.s[0] -mla v12.4S, v18.4S, v31.s[0] -add v13.4s, v13.4s, v6.4s -str q13, [x0, #928] -mul v0.4S, v0.4S,v2.s[0] -mul v14.4S, v14.4S,v2.s[1] -sub v13.4s, v21.4s, v11.4s -str q13, [x0, #864] -mla v0.4S, v28.4S, v31.s[0] -mla v14.4S, v20.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -sub v11.4s, v16.4s, v8.4s -ldr q20, [x0, #1008] -sqrdmulh v28.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v16.4s, v16.4s, v8.4s -str q21, [x0, #800] -ldr q21, [x0, #944] -sqrdmulh v8.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -sub v13.4s, v24.4s, v26.4s -str q11, [x0, #736] -ldr q11, [x0, #880] -sqrdmulh v6.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -add v24.4s, v24.4s, v26.4s -str q16, [x0, #672] -ldr q16, [x0, #816] -sqrdmulh v26.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v18.4s, v3.4s, v12.4s -str q13, [x0, #608] -ldr q13, [x0, #752] -sqrdmulh v19.4S, v13.4S, v29.s[0] -mla v20.4S, v28.4S, v31.s[0] -add v3.4s, v3.4s, v12.4s -str q24, [x0, #544] -ldr q24, [x0, #688] -sqrdmulh v12.4S, v24.4S, v29.s[0] -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v17.4s, v27.4s -str q18, [x0, #480] -ldr q18, [x0, #624] -sqrdmulh v28.4S, v18.4S, v29.s[0] -mla v11.4S, v6.4S, v31.s[0] -add v17.4s, v17.4s, v27.4s -str q3, [x0, #416] -ldr q3, [x0, #560] -sqrdmulh v27.4S, v3.4S, v29.s[0] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v10.4s, v14.4s -str q8, [x0, #352] -ldr q8, [x0, #496] -add v10.4s, v10.4s, v14.4s -mul v24.4S, v24.4S,v30.s[0] -mul v13.4S, v13.4S,v30.s[0] -ldr q14, [x0, #432] -str q17, [x0, #288] -ldr q17, [x0, #368] -ldr q6, [x0, #304] -mla v24.4S, v12.4S, v31.s[0] -mla v13.4S, v19.4S, v31.s[0] -str q26, [x0, #224] -sub v26.4s, v15.4s, v0.4s -ldr q19, [x0, #240] -ldr q12, [x0, #176] -mul v3.4S, v3.4S,v30.s[0] -mul v18.4S, v18.4S,v30.s[0] -str q10, [x0, #160] -add v15.4s, v15.4s, v0.4s -ldr q0, [x0, #112] -ldr q10, [x0, #48] -mla v3.4S, v27.4S, v31.s[0] -mla v18.4S, v28.4S, v31.s[0] -sub v28.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -sqrdmulh v20.4S, v28.4S, v29.s[2] -mul v28.4S, v28.4S,v30.s[2] -sub v27.4s, v14.4s, v21.4s -add v14.4s, v14.4s, v21.4s -sqrdmulh v21.4S, v27.4S, v29.s[2] -mul v27.4S, v27.4S,v30.s[2] -sub v25.4s, v17.4s, v11.4s -add v17.4s, v17.4s, v11.4s -sqrdmulh v11.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sub v5.4s, v6.4s, v16.4s -add v6.4s, v6.4s, v16.4s -sqrdmulh v16.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -sub v4.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -sqrdmulh v13.4S, v25.4S, v29.s[2] -mla v28.4S, v20.4S, v31.s[0] -sub v20.4s, v12.4s, v24.4s -add v12.4s, v12.4s, v24.4s -sqrdmulh v24.4S, v5.4S, v29.s[2] -mla v27.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v18.4s -add v0.4s, v0.4s, v18.4s -sqrdmulh v18.4S, v17.4S, v29.s[1] -mla v8.4S, v11.4S, v31.s[0] -sub v11.4s, v10.4s, v3.4s -str q26, [x0, #96] -sqrdmulh v26.4S, v6.4S, v29.s[1] -mla v14.4S, v16.4S, v31.s[0] -add v10.4s, v10.4s, v3.4s -str q15, [x0, #32] -mul v5.4S, v5.4S,v30.s[2] -mul v25.4S, v25.4S,v30.s[2] -sub v15.4s, v4.4s, v28.4s -add v4.4s, v4.4s, v28.4s -mla v5.4S, v24.4S, v31.s[0] -mla v25.4S, v13.4S, v31.s[0] -sub v13.4s, v20.4s, v27.4s -add v20.4s, v20.4s, v27.4s -mul v6.4S, v6.4S,v30.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v27.4s, v19.4s, v8.4s -add v19.4s, v19.4s, v8.4s -mla v6.4S, v26.4S, v31.s[0] -mla v17.4S, v18.4S, v31.s[0] -sub v18.4s, v12.4s, v14.4s -add v12.4s, v12.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v22.s[3] -mul v15.4S, v15.4S,v23.s[3] -sub v26.4s, v21.4s, v25.4s -add v21.4s, v21.4s, v25.4s -sqrdmulh v25.4S, v4.4S, v22.s[2] -mul v4.4S, v4.4S,v23.s[2] -sub v8.4s, v11.4s, v5.4s -add v11.4s, v11.4s, v5.4s -sqrdmulh v5.4S, v27.4S, v22.s[1] -mul v27.4S, v27.4S,v23.s[1] -sub v24.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v28.4s, v10.4s, v6.4s -add v10.4s, v10.4s, v6.4s -sqrdmulh v6.4S, v13.4S, v22.s[3] -mla v15.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v20.4S, v22.s[2] -mla v4.4S, v25.4S, v31.s[0] -nop -nop -sqrdmulh v25.4S, v18.4S, v22.s[1] -mla v27.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v12.4S, v22.s[0] -mla v19.4S, v17.4S, v31.s[0] -nop -nop -mul v20.4S, v20.4S,v23.s[2] -mul v13.4S, v13.4S,v23.s[3] -sub v17.4s, v26.4s, v15.4s -add v26.4s, v26.4s, v15.4s -mla v20.4S, v14.4S, v31.s[0] -mla v13.4S, v6.4S, v31.s[0] -sub v6.4s, v21.4s, v4.4s -add v21.4s, v21.4s, v4.4s -mul v12.4S, v12.4S,v23.s[0] -mul v18.4S, v18.4S,v23.s[1] -sub v4.4s, v24.4s, v27.4s -add v24.4s, v24.4s, v27.4s -mla v12.4S, v5.4S, v31.s[0] -mla v18.4S, v25.4S, v31.s[0] -sub v25.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v17.4S, v9.s[3] -mul v17.4S, v17.4S,v1.s[3] -sub v5.4s, v8.4s, v13.4s -add v8.4s, v8.4s, v13.4s -sqrdmulh v13.4S, v26.4S, v9.s[2] -mul v26.4S, v26.4S,v1.s[2] -sub v27.4s, v11.4s, v20.4s -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v6.4S, v9.s[1] -mul v6.4S, v6.4S,v1.s[1] -sub v14.4s, v28.4s, v18.4s -add v28.4s, v28.4s, v18.4s -sqrdmulh v18.4S, v21.4S, v9.s[0] -mul v21.4S, v21.4S,v1.s[0] -sub v15.4s, v10.4s, v12.4s -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v4.4S, v7.s[3] -mla v17.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v24.4S, v7.s[2] -mla v26.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v25.4S, v7.s[1] -mla v6.4S, v20.4S, v31.s[0] -nop -nop -sqrdmulh v20.4S, v0.4S, v7.s[0] -mla v21.4S, v18.4S, v31.s[0] -nop -nop -mul v24.4S, v24.4S,v2.s[2] -mul v4.4S, v4.4S,v2.s[3] -sub v18.4s, v5.4s, v17.4s -str q18, [x0, #1008] -mla v24.4S, v19.4S, v31.s[0] -mla v4.4S, v12.4S, v31.s[0] -add v5.4s, v5.4s, v17.4s -str q5, [x0, #944] -mul v0.4S, v0.4S,v2.s[0] -mul v25.4S, v25.4S,v2.s[1] -sub v5.4s, v8.4s, v26.4s -str q5, [x0, #880] -mla v0.4S, v20.4S, v31.s[0] -mla v25.4S, v13.4S, v31.s[0] -add v8.4s, v8.4s, v26.4s -sub v26.4s, v27.4s, v6.4s -ldr q13, [x0, #960] -sqrdmulh v20.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -add v27.4s, v27.4s, v6.4s -str q8, [x0, #816] -ldr q8, [x0, #896] -sqrdmulh v6.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v5.4s, v11.4s, v21.4s -str q26, [x0, #752] -ldr q26, [x0, #832] -sqrdmulh v17.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -add v11.4s, v11.4s, v21.4s -str q27, [x0, #688] -ldr q27, [x0, #768] -sqrdmulh v21.4S, v27.4S, v29.s[0] -mul v27.4S, v27.4S,v30.s[0] -sub v12.4s, v14.4s, v4.4s -str q5, [x0, #624] -ldr q5, [x0, #704] -sqrdmulh v19.4S, v5.4S, v29.s[0] -mla v13.4S, v20.4S, v31.s[0] -add v14.4s, v14.4s, v4.4s -str q11, [x0, #560] -ldr q11, [x0, #640] -sqrdmulh v4.4S, v11.4S, v29.s[0] -mla v8.4S, v6.4S, v31.s[0] -sub v6.4s, v28.4s, v24.4s -str q12, [x0, #496] -ldr q12, [x0, #576] -sqrdmulh v20.4S, v12.4S, v29.s[0] -mla v26.4S, v17.4S, v31.s[0] -add v28.4s, v28.4s, v24.4s -str q14, [x0, #432] -ldr q14, [x0, #512] -sqrdmulh v24.4S, v14.4S, v29.s[0] -mla v27.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v25.4s -str q6, [x0, #368] -ldr q6, [x0, #448] -add v15.4s, v15.4s, v25.4s -mul v11.4S, v11.4S,v30.s[0] -mul v5.4S, v5.4S,v30.s[0] -ldr q25, [x0, #384] -str q28, [x0, #304] -ldr q28, [x0, #320] -ldr q17, [x0, #256] -mla v11.4S, v4.4S, v31.s[0] -mla v5.4S, v19.4S, v31.s[0] -str q21, [x0, #240] -sub v21.4s, v10.4s, v0.4s -ldr q19, [x0, #192] -ldr q4, [x0, #128] -mul v14.4S, v14.4S,v30.s[0] -mul v12.4S, v12.4S,v30.s[0] -str q15, [x0, #176] -add v10.4s, v10.4s, v0.4s -ldr q0, [x0, #64] -ldr q15, [x0, #0] -mla v14.4S, v24.4S, v31.s[0] -mla v12.4S, v20.4S, v31.s[0] -sub v20.4s, v6.4s, v13.4s -add v6.4s, v6.4s, v13.4s -sqrdmulh v13.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v24.4s, v25.4s, v8.4s -add v25.4s, v25.4s, v8.4s -sqrdmulh v8.4S, v24.4S, v29.s[2] -mul v24.4S, v24.4S,v30.s[2] -sub v18.4s, v28.4s, v26.4s -add v28.4s, v28.4s, v26.4s -sqrdmulh v26.4S, v6.4S, v29.s[1] -mul v6.4S, v6.4S,v30.s[1] -sub v3.4s, v17.4s, v27.4s -add v17.4s, v17.4s, v27.4s -sqrdmulh v27.4S, v25.4S, v29.s[1] -mul v25.4S, v25.4S,v30.s[1] -sub v16.4s, v19.4s, v5.4s -add v19.4s, v19.4s, v5.4s -sqrdmulh v5.4S, v18.4S, v29.s[2] -mla v20.4S, v13.4S, v31.s[0] -sub v13.4s, v4.4s, v11.4s -add v4.4s, v4.4s, v11.4s -sqrdmulh v11.4S, v3.4S, v29.s[2] -mla v24.4S, v8.4S, v31.s[0] -sub v8.4s, v0.4s, v12.4s -add v0.4s, v0.4s, v12.4s -sqrdmulh v12.4S, v28.4S, v29.s[1] -mla v6.4S, v26.4S, v31.s[0] -sub v26.4s, v15.4s, v14.4s -str q21, [x0, #112] -sqrdmulh v21.4S, v17.4S, v29.s[1] -mla v25.4S, v27.4S, v31.s[0] -add v15.4s, v15.4s, v14.4s -str q10, [x0, #48] -mul v3.4S, v3.4S,v30.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v10.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -mla v3.4S, v11.4S, v31.s[0] -mla v18.4S, v5.4S, v31.s[0] -sub v5.4s, v13.4s, v24.4s -add v13.4s, v13.4s, v24.4s -mul v17.4S, v17.4S,v30.s[1] -mul v28.4S, v28.4S,v30.s[1] -sub v24.4s, v19.4s, v6.4s -add v19.4s, v19.4s, v6.4s -mla v17.4S, v21.4S, v31.s[0] -mla v28.4S, v12.4S, v31.s[0] -sub v12.4s, v4.4s, v25.4s -add v4.4s, v4.4s, v25.4s -sqrdmulh v25.4S, v10.4S, v22.s[3] -mul v10.4S, v10.4S,v23.s[3] -sub v21.4s, v8.4s, v18.4s -add v8.4s, v8.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v22.s[2] -mul v16.4S, v16.4S,v23.s[2] -sub v6.4s, v26.4s, v3.4s -add v26.4s, v26.4s, v3.4s -sqrdmulh v3.4S, v24.4S, v22.s[1] -mul v24.4S, v24.4S,v23.s[1] -sub v11.4s, v0.4s, v28.4s -add v0.4s, v0.4s, v28.4s -sqrdmulh v28.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v20.4s, v15.4s, v17.4s -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v5.4S, v22.s[3] -mla v10.4S, v25.4S, v31.s[0] -nop -nop -sqrdmulh v25.4S, v13.4S, v22.s[2] -mla v16.4S, v18.4S, v31.s[0] -nop -nop -sqrdmulh v18.4S, v12.4S, v22.s[1] -mla v24.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v4.4S, v22.s[0] -mla v19.4S, v28.4S, v31.s[0] -nop -nop -mul v13.4S, v13.4S,v23.s[2] -mul v5.4S, v5.4S,v23.s[3] -sub v28.4s, v21.4s, v10.4s -add v21.4s, v21.4s, v10.4s -mla v13.4S, v25.4S, v31.s[0] -mla v5.4S, v17.4S, v31.s[0] -sub v17.4s, v8.4s, v16.4s -add v8.4s, v8.4s, v16.4s -mul v4.4S, v4.4S,v23.s[0] -mul v12.4S, v12.4S,v23.s[1] -sub v16.4s, v11.4s, v24.4s -add v11.4s, v11.4s, v24.4s -mla v4.4S, v3.4S, v31.s[0] -mla v12.4S, v18.4S, v31.s[0] -sub v18.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v28.4S, v9.s[3] -mul v28.4S, v28.4S,v1.s[3] -sub v3.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v21.4S, v9.s[2] -mul v21.4S, v21.4S,v1.s[2] -sub v24.4s, v26.4s, v13.4s -add v26.4s, v26.4s, v13.4s -sqrdmulh v13.4S, v17.4S, v9.s[1] -mul v17.4S, v17.4S,v1.s[1] -sub v25.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -sqrdmulh v12.4S, v8.4S, v9.s[0] -mul v8.4S, v8.4S,v1.s[0] -sub v10.4s, v15.4s, v4.4s -add v15.4s, v15.4s, v4.4s -sqrdmulh v4.4S, v16.4S, v7.s[3] -mla v28.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v11.4S, v7.s[2] -mla v21.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v18.4S, v7.s[1] -mla v17.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v0.4S, v7.s[0] -mla v8.4S, v12.4S, v31.s[0] -nop -nop -mul v11.4S, v11.4S,v2.s[2] -mul v16.4S, v16.4S,v2.s[3] -sub v12.4s, v3.4s, v28.4s -str q12, [x0, #960] -mla v11.4S, v19.4S, v31.s[0] -mla v16.4S, v4.4S, v31.s[0] -add v3.4s, v3.4s, v28.4s -str q3, [x0, #896] -mul v0.4S, v0.4S,v2.s[0] -mul v18.4S, v18.4S,v2.s[1] -sub v3.4s, v6.4s, v21.4s -str q3, [x0, #832] -mla v0.4S, v13.4S, v31.s[0] -mla v18.4S, v5.4S, v31.s[0] -add v6.4s, v6.4s, v21.4s -sub v21.4s, v24.4s, v17.4s -ldr q5, [x0, #976] -sqrdmulh v13.4S, v5.4S, v29.s[0] -mul v5.4S, v5.4S,v30.s[0] -add v24.4s, v24.4s, v17.4s -str q6, [x0, #768] -ldr q6, [x0, #912] -sqrdmulh v17.4S, v6.4S, v29.s[0] -mul v6.4S, v6.4S,v30.s[0] -sub v3.4s, v26.4s, v8.4s -str q21, [x0, #704] -ldr q21, [x0, #848] -sqrdmulh v28.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -add v26.4s, v26.4s, v8.4s -str q24, [x0, #640] -ldr q24, [x0, #784] -sqrdmulh v8.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -sub v4.4s, v25.4s, v16.4s -str q3, [x0, #576] -ldr q3, [x0, #720] -sqrdmulh v19.4S, v3.4S, v29.s[0] -mla v5.4S, v13.4S, v31.s[0] -add v25.4s, v25.4s, v16.4s -str q26, [x0, #512] -ldr q26, [x0, #656] -sqrdmulh v16.4S, v26.4S, v29.s[0] -mla v6.4S, v17.4S, v31.s[0] -sub v17.4s, v20.4s, v11.4s -str q4, [x0, #448] -ldr q4, [x0, #592] -sqrdmulh v13.4S, v4.4S, v29.s[0] -mla v21.4S, v28.4S, v31.s[0] -add v20.4s, v20.4s, v11.4s -str q25, [x0, #384] -ldr q25, [x0, #528] -sqrdmulh v11.4S, v25.4S, v29.s[0] -mla v24.4S, v8.4S, v31.s[0] -sub v8.4s, v10.4s, v18.4s -str q17, [x0, #320] -ldr q17, [x0, #464] -add v10.4s, v10.4s, v18.4s -mul v26.4S, v26.4S,v30.s[0] -mul v3.4S, v3.4S,v30.s[0] -ldr q18, [x0, #400] -str q20, [x0, #256] -ldr q20, [x0, #336] -ldr q28, [x0, #272] -mla v26.4S, v16.4S, v31.s[0] -mla v3.4S, v19.4S, v31.s[0] -str q8, [x0, #192] -sub v8.4s, v15.4s, v0.4s -ldr q19, [x0, #208] -ldr q16, [x0, #144] -mul v25.4S, v25.4S,v30.s[0] -mul v4.4S, v4.4S,v30.s[0] -str q10, [x0, #128] -add v15.4s, v15.4s, v0.4s -ldr q0, [x0, #80] -ldr q10, [x0, #16] -mla v25.4S, v11.4S, v31.s[0] -mla v4.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v5.4s -add v17.4s, v17.4s, v5.4s -sqrdmulh v5.4S, v13.4S, v29.s[2] -mul v13.4S, v13.4S,v30.s[2] -sub v11.4s, v18.4s, v6.4s -add v18.4s, v18.4s, v6.4s -sqrdmulh v6.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v12.4s, v20.4s, v21.4s -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v14.4s, v28.4s, v24.4s -add v28.4s, v28.4s, v24.4s -sqrdmulh v24.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v27.4s, v19.4s, v3.4s -add v19.4s, v19.4s, v3.4s -sqrdmulh v3.4S, v12.4S, v29.s[2] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v16.4s, v26.4s -add v16.4s, v16.4s, v26.4s -sqrdmulh v26.4S, v14.4S, v29.s[2] -mla v11.4S, v6.4S, v31.s[0] -sub v6.4s, v0.4s, v4.4s -add v0.4s, v0.4s, v4.4s -sqrdmulh v4.4S, v20.4S, v29.s[1] -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v10.4s, v25.4s -str q8, [x0, #64] -sqrdmulh v8.4S, v28.4S, v29.s[1] -mla v18.4S, v24.4S, v31.s[0] -add v10.4s, v10.4s, v25.4s -str q15, [x0, #0] -mul v14.4S, v14.4S,v30.s[2] -mul v12.4S, v12.4S,v30.s[2] -sub v15.4s, v27.4s, v13.4s -add v27.4s, v27.4s, v13.4s -mla v14.4S, v26.4S, v31.s[0] -mla v12.4S, v3.4S, v31.s[0] -sub v3.4s, v5.4s, v11.4s -add v5.4s, v5.4s, v11.4s -mul v28.4S, v28.4S,v30.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v11.4s, v19.4s, v17.4s -add v19.4s, v19.4s, v17.4s -mla v28.4S, v8.4S, v31.s[0] -mla v20.4S, v4.4S, v31.s[0] -sub v4.4s, v16.4s, v18.4s -add v16.4s, v16.4s, v18.4s -sqrdmulh v29.4S, v15.4S, v22.s[3] -mul v15.4S, v15.4S,v23.s[3] -sub v30.4s, v6.4s, v12.4s -add v6.4s, v6.4s, v12.4s -sqrdmulh v12.4S, v27.4S, v22.s[2] -mul v27.4S, v27.4S,v23.s[2] -sub v18.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v22.s[1] -mul v11.4S, v11.4S,v23.s[1] -sub v8.4s, v0.4s, v20.4s -add v0.4s, v0.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v17.4s, v10.4s, v28.4s -add v10.4s, v10.4s, v28.4s -sqrdmulh v28.4S, v3.4S, v22.s[3] -mla v15.4S, v29.4S, v31.s[0] -nop -nop -sqrdmulh v29.4S, v5.4S, v22.s[2] -mla v27.4S, v12.4S, v31.s[0] -nop -nop -sqrdmulh v12.4S, v4.4S, v22.s[1] -mla v11.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v16.4S, v22.s[0] -mla v19.4S, v20.4S, v31.s[0] -nop -nop -mul v5.4S, v5.4S,v23.s[2] -mul v3.4S, v3.4S,v23.s[3] -sub v20.4s, v30.4s, v15.4s -add v30.4s, v30.4s, v15.4s -mla v5.4S, v29.4S, v31.s[0] -mla v3.4S, v28.4S, v31.s[0] -sub v28.4s, v6.4s, v27.4s -add v6.4s, v6.4s, v27.4s -mul v16.4S, v16.4S,v23.s[0] -mul v4.4S, v4.4S,v23.s[1] -sub v27.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -mla v16.4S, v14.4S, v31.s[0] -mla v4.4S, v12.4S, v31.s[0] -sub v12.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v22.4S, v20.4S, v9.s[3] -mul v20.4S, v20.4S,v1.s[3] -sub v23.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v30.4S, v9.s[2] -mul v30.4S, v30.4S,v1.s[2] -sub v19.4s, v21.4s, v5.4s -add v21.4s, v21.4s, v5.4s -sqrdmulh v5.4S, v28.4S, v9.s[1] -mul v28.4S, v28.4S,v1.s[1] -sub v14.4s, v17.4s, v4.4s -add v17.4s, v17.4s, v4.4s -sqrdmulh v4.4S, v6.4S, v9.s[0] -mul v6.4S, v6.4S,v1.s[0] -sub v11.4s, v10.4s, v16.4s -add v10.4s, v10.4s, v16.4s -sqrdmulh v9.4S, v27.4S, v7.s[3] -mla v20.4S, v22.4S, v31.s[0] -nop -nop -sqrdmulh v22.4S, v8.4S, v7.s[2] -mla v30.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v12.4S, v7.s[1] -mla v28.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v0.4S, v7.s[0] -mla v6.4S, v4.4S, v31.s[0] -nop -nop -mul v8.4S, v8.4S,v2.s[2] -mul v27.4S, v27.4S,v2.s[3] -sub v4.4s, v23.4s, v20.4s -str q4, [x0, #976] -mla v8.4S, v22.4S, v31.s[0] -mla v27.4S, v9.4S, v31.s[0] -add v23.4s, v23.4s, v20.4s -str q23, [x0, #912] -mul v0.4S, v0.4S,v2.s[0] -mul v12.4S, v12.4S,v2.s[1] -sub v23.4s, v18.4s, v30.4s -str q23, [x0, #848] -mla v0.4S, v5.4S, v31.s[0] -mla v12.4S, v3.4S, v31.s[0] -add v18.4s, v18.4s, v30.4s -sub v30.4s, v19.4s, v28.4s -add v19.4s, v19.4s, v28.4s -str q18, [x0, #784] -sub v18.4s, v21.4s, v6.4s -str q30, [x0, #720] -add v21.4s, v21.4s, v6.4s -str q19, [x0, #656] -sub v19.4s, v14.4s, v27.4s -str q18, [x0, #592] -add v14.4s, v14.4s, v27.4s -str q21, [x0, #528] -sub v21.4s, v17.4s, v8.4s -str q19, [x0, #464] -add v17.4s, v17.4s, v8.4s -str q14, [x0, #400] -sub v14.4s, v11.4s, v12.4s -str q21, [x0, #336] -add v11.4s, v11.4s, v12.4s -str q17, [x0, #272] -sub v17.4s, v10.4s, v0.4s -add v10.4s, v10.4s, v0.4s -ldr q24, [x0, #224] -ldr q25, [x0, #160] -ldr q13, [x0, #32] -ldr q26, [x17, #+128] -ldr q15, [x17, #+144] -sqrdmulh v29.4S, v13.4S, v15.s[0] -mul v13.4S, v13.4S,v26.s[0] -ldr q16, [x0, #48] -sqrdmulh v1.4S, v16.4S, v15.s[0] -ldr q4, [x17, #+160] -mul v16.4S, v16.4S,v26.s[0] -ldr q22, [x17, #+176] -ldr q9, [x0, #96] -sqrdmulh v20.4S, v9.4S, v22.s[0] -mul v9.4S, v9.4S,v4.s[0] -ldr q23, [x0, #112] -sqrdmulh v5.4S, v23.4S, v22.s[0] -mul v23.4S, v23.4S,v4.s[0] -ldr q3, [x17, #+192] -mla v13.4S, v29.4S, v31.s[0] -ldr q29, [x17, #+208] -sqrdmulh v2.4S, v25.4S, v29.s[0] -ldr q7, [x0, #176] -mla v16.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v7.4S, v29.s[0] -ldr q28, [x17, #+224] -mla v9.4S, v20.4S, v31.s[0] -ldr q20, [x17, #+240] -sqrdmulh v30.4S, v24.4S, v20.s[0] -ldr q6, [x0, #240] -mla v23.4S, v5.4S, v31.s[0] -sqrdmulh v5.4S, v6.4S, v20.s[0] -ldr q18, [x0, #0] -ldr q27, [x0, #128] -mul v25.4S, v25.4S,v3.s[0] -sub v19.4s, v18.4s, v13.4s -mul v7.4S, v7.4S,v3.s[0] -add v18.4s, v18.4s, v13.4s -mla v25.4S, v2.4S, v31.s[0] -sub v2.4s, v10.4s, v16.4s -ldr q13, [x0, #64] -mla v7.4S, v1.4S, v31.s[0] -add v10.4s, v10.4s, v16.4s -ldr q16, [x0, #192] -mul v24.4S, v24.4S,v28.s[0] -sub v1.4s, v13.4s, v9.4s -mul v6.4S, v6.4S,v28.s[0] -add v13.4s, v13.4s, v9.4s -mla v24.4S, v30.4S, v31.s[0] -mla v6.4S, v5.4S, v31.s[0] -sub v5.4s, v17.4s, v23.4s -sqrdmulh v30.4S, v10.4S, v15.s[1] -add v17.4s, v17.4s, v23.4s -mul v10.4S, v10.4S,v26.s[1] -sqrdmulh v23.4S, v2.4S, v15.s[2] -sub v9.4s, v27.4s, v25.4s -mul v2.4S, v2.4S,v26.s[2] -add v27.4s, v27.4s, v25.4s -sqrdmulh v15.4S, v17.4S, v22.s[1] -sub v26.4s, v11.4s, v7.4s -mul v17.4S, v17.4S,v4.s[1] -add v11.4s, v11.4s, v7.4s -sqrdmulh v7.4S, v5.4S, v22.s[2] -sub v25.4s, v16.4s, v24.4s -mul v5.4S, v5.4S,v4.s[2] -add v16.4s, v16.4s, v24.4s -mla v10.4S, v30.4S, v31.s[0] -sub v30.4s, v14.4s, v6.4s -ldr q22, [x0, #480] -sqrdmulh v4.4S, v11.4S, v29.s[1] -add v14.4s, v14.4s, v6.4s -mla v2.4S, v23.4S, v31.s[0] -ldr q23, [x0, #416] -sqrdmulh v6.4S, v26.4S, v29.s[2] -sub v24.4s, v18.4s, v10.4s -mla v17.4S, v15.4S, v31.s[0] -ldr q15, [x0, #288] -sqrdmulh v8.4S, v14.4S, v20.s[1] -add v18.4s, v18.4s, v10.4s -str q24, [x0, #16] -mla v5.4S, v7.4S, v31.s[0] -ldr q7, [x17, #+256] -sqrdmulh v24.4S, v30.4S, v20.s[2] -sub v10.4s, v19.4s, v2.4s -str q18, [x0, #0] -mul v11.4S, v11.4S,v3.s[1] -add v19.4s, v19.4s, v2.4s -ldr q2, [x17, #+272] -mul v26.4S, v26.4S,v3.s[2] -str q10, [x0, #48] -mla v11.4S, v4.4S, v31.s[0] -sub v4.4s, v13.4s, v17.4s -mla v26.4S, v6.4S, v31.s[0] -str q19, [x0, #32] -mul v14.4S, v14.4S,v28.s[1] -str q4, [x0, #80] -mul v30.4S, v30.4S,v28.s[2] -add v13.4s, v13.4s, v17.4s -str q13, [x0, #64] -mla v14.4S, v8.4S, v31.s[0] -sub v8.4s, v1.4s, v5.4s -str q8, [x0, #112] -mla v30.4S, v24.4S, v31.s[0] -add v1.4s, v1.4s, v5.4s -str q1, [x0, #96] -sqrdmulh v20.4S, v15.4S, v2.s[0] -sub v28.4s, v27.4s, v11.4s -mul v15.4S, v15.4S,v7.s[0] -str q28, [x0, #144] -ldr q28, [x0, #304] -sqrdmulh v1.4S, v28.4S, v2.s[0] -add v27.4s, v27.4s, v11.4s -ldr q11, [x17, #+288] -mul v28.4S, v28.4S,v7.s[0] -str q27, [x0, #128] -ldr q27, [x17, #+304] -ldr q5, [x0, #352] -sqrdmulh v24.4S, v5.4S, v27.s[0] -sub v8.4s, v9.4s, v26.4s -mul v5.4S, v5.4S,v11.s[0] -str q8, [x0, #176] -ldr q8, [x0, #368] -sqrdmulh v13.4S, v8.4S, v27.s[0] -add v9.4s, v9.4s, v26.4s -mul v8.4S, v8.4S,v11.s[0] -str q9, [x0, #160] -ldr q9, [x17, #+320] -mla v15.4S, v20.4S, v31.s[0] -sub v20.4s, v16.4s, v14.4s -ldr q26, [x17, #+336] -sqrdmulh v17.4S, v23.4S, v26.s[0] -str q20, [x0, #208] -ldr q20, [x0, #432] -mla v28.4S, v1.4S, v31.s[0] -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v20.4S, v26.s[0] -str q16, [x0, #192] -ldr q16, [x17, #+352] -mla v5.4S, v24.4S, v31.s[0] -sub v24.4s, v25.4s, v30.4s -ldr q1, [x17, #+368] -sqrdmulh v4.4S, v22.4S, v1.s[0] -str q24, [x0, #240] -ldr q24, [x0, #496] -mla v8.4S, v13.4S, v31.s[0] -add v25.4s, v25.4s, v30.4s -sqrdmulh v30.4S, v24.4S, v1.s[0] -str q25, [x0, #224] -ldr q25, [x0, #256] -ldr q13, [x0, #384] -mul v23.4S, v23.4S,v9.s[0] -sub v29.4s, v25.4s, v15.4s -ldr q3, [x0, #272] -mul v20.4S, v20.4S,v9.s[0] -add v25.4s, v25.4s, v15.4s -ldr q15, [x0, #400] -mla v23.4S, v17.4S, v31.s[0] -sub v17.4s, v3.4s, v28.4s -ldr q19, [x0, #320] -mla v20.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v28.4s -ldr q28, [x0, #448] -mul v22.4S, v22.4S,v16.s[0] -sub v14.4s, v19.4s, v5.4s -ldr q6, [x0, #336] -mul v24.4S, v24.4S,v16.s[0] -add v19.4s, v19.4s, v5.4s -ldr q5, [x0, #464] -mla v22.4S, v4.4S, v31.s[0] -mla v24.4S, v30.4S, v31.s[0] -sub v30.4s, v6.4s, v8.4s -sqrdmulh v4.4S, v3.4S, v2.s[1] -add v6.4s, v6.4s, v8.4s -mul v3.4S, v3.4S,v7.s[1] -sqrdmulh v8.4S, v17.4S, v2.s[2] -sub v10.4s, v13.4s, v23.4s -mul v17.4S, v17.4S,v7.s[2] -add v13.4s, v13.4s, v23.4s -sqrdmulh v2.4S, v6.4S, v27.s[1] -sub v7.4s, v15.4s, v20.4s -mul v6.4S, v6.4S,v11.s[1] -add v15.4s, v15.4s, v20.4s -sqrdmulh v20.4S, v30.4S, v27.s[2] -sub v23.4s, v28.4s, v22.4s -mul v30.4S, v30.4S,v11.s[2] -add v28.4s, v28.4s, v22.4s -mla v3.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v24.4s -ldr q27, [x0, #736] -sqrdmulh v11.4S, v15.4S, v26.s[1] -add v5.4s, v5.4s, v24.4s -mla v17.4S, v8.4S, v31.s[0] -ldr q8, [x0, #672] -sqrdmulh v24.4S, v7.4S, v26.s[2] -sub v22.4s, v25.4s, v3.4s -mla v6.4S, v2.4S, v31.s[0] -ldr q2, [x0, #544] -sqrdmulh v18.4S, v5.4S, v1.s[1] -add v25.4s, v25.4s, v3.4s -str q22, [x0, #272] -mla v30.4S, v20.4S, v31.s[0] -ldr q20, [x17, #+384] -sqrdmulh v22.4S, v4.4S, v1.s[2] -sub v3.4s, v29.4s, v17.4s -str q25, [x0, #256] -mul v15.4S, v15.4S,v9.s[1] -add v29.4s, v29.4s, v17.4s -ldr q17, [x17, #+400] -mul v7.4S, v7.4S,v9.s[2] -str q3, [x0, #304] -mla v15.4S, v11.4S, v31.s[0] -sub v11.4s, v19.4s, v6.4s -mla v7.4S, v24.4S, v31.s[0] -str q29, [x0, #288] -mul v5.4S, v5.4S,v16.s[1] -str q11, [x0, #336] -mul v4.4S, v4.4S,v16.s[2] -add v19.4s, v19.4s, v6.4s -str q19, [x0, #320] -mla v5.4S, v18.4S, v31.s[0] -sub v18.4s, v14.4s, v30.4s -str q18, [x0, #368] -mla v4.4S, v22.4S, v31.s[0] -add v14.4s, v14.4s, v30.4s -str q14, [x0, #352] -sqrdmulh v1.4S, v2.4S, v17.s[0] -sub v16.4s, v13.4s, v15.4s -mul v2.4S, v2.4S,v20.s[0] -str q16, [x0, #400] -ldr q16, [x0, #560] -sqrdmulh v14.4S, v16.4S, v17.s[0] -add v13.4s, v13.4s, v15.4s -ldr q15, [x17, #+416] -mul v16.4S, v16.4S,v20.s[0] -str q13, [x0, #384] -ldr q13, [x17, #+432] -ldr q30, [x0, #608] -sqrdmulh v22.4S, v30.4S, v13.s[0] -sub v18.4s, v10.4s, v7.4s -mul v30.4S, v30.4S,v15.s[0] -str q18, [x0, #432] -ldr q18, [x0, #624] -sqrdmulh v19.4S, v18.4S, v13.s[0] -add v10.4s, v10.4s, v7.4s -mul v18.4S, v18.4S,v15.s[0] -str q10, [x0, #416] -ldr q10, [x17, #+448] -mla v2.4S, v1.4S, v31.s[0] -sub v1.4s, v28.4s, v5.4s -ldr q7, [x17, #+464] -sqrdmulh v6.4S, v8.4S, v7.s[0] -str q1, [x0, #464] -ldr q1, [x0, #688] -mla v16.4S, v14.4S, v31.s[0] -add v28.4s, v28.4s, v5.4s -sqrdmulh v5.4S, v1.4S, v7.s[0] -str q28, [x0, #448] -ldr q28, [x17, #+480] -mla v30.4S, v22.4S, v31.s[0] -sub v22.4s, v23.4s, v4.4s -ldr q14, [x17, #+496] -sqrdmulh v11.4S, v27.4S, v14.s[0] -str q22, [x0, #496] -ldr q22, [x0, #752] -mla v18.4S, v19.4S, v31.s[0] -add v23.4s, v23.4s, v4.4s -sqrdmulh v4.4S, v22.4S, v14.s[0] -str q23, [x0, #480] -ldr q23, [x0, #512] -ldr q19, [x0, #640] -mul v8.4S, v8.4S,v10.s[0] -sub v26.4s, v23.4s, v2.4s -ldr q9, [x0, #528] -mul v1.4S, v1.4S,v10.s[0] -add v23.4s, v23.4s, v2.4s -ldr q2, [x0, #656] -mla v8.4S, v6.4S, v31.s[0] -sub v6.4s, v9.4s, v16.4s -ldr q29, [x0, #576] -mla v1.4S, v5.4S, v31.s[0] -add v9.4s, v9.4s, v16.4s -ldr q16, [x0, #704] -mul v27.4S, v27.4S,v28.s[0] -sub v5.4s, v29.4s, v30.4s -ldr q24, [x0, #592] -mul v22.4S, v22.4S,v28.s[0] -add v29.4s, v29.4s, v30.4s -ldr q30, [x0, #720] -mla v27.4S, v11.4S, v31.s[0] -mla v22.4S, v4.4S, v31.s[0] -sub v4.4s, v24.4s, v18.4s -sqrdmulh v11.4S, v9.4S, v17.s[1] -add v24.4s, v24.4s, v18.4s -mul v9.4S, v9.4S,v20.s[1] -sqrdmulh v18.4S, v6.4S, v17.s[2] -sub v3.4s, v19.4s, v8.4s -mul v6.4S, v6.4S,v20.s[2] -add v19.4s, v19.4s, v8.4s -sqrdmulh v17.4S, v24.4S, v13.s[1] -sub v20.4s, v2.4s, v1.4s -mul v24.4S, v24.4S,v15.s[1] -add v2.4s, v2.4s, v1.4s -sqrdmulh v1.4S, v4.4S, v13.s[2] -sub v8.4s, v16.4s, v27.4s -mul v4.4S, v4.4S,v15.s[2] -add v16.4s, v16.4s, v27.4s -mla v9.4S, v11.4S, v31.s[0] -sub v11.4s, v30.4s, v22.4s -ldr q13, [x0, #992] -sqrdmulh v15.4S, v2.4S, v7.s[1] -add v30.4s, v30.4s, v22.4s -mla v6.4S, v18.4S, v31.s[0] -ldr q18, [x0, #928] -sqrdmulh v22.4S, v20.4S, v7.s[2] -sub v27.4s, v23.4s, v9.4s -mla v24.4S, v17.4S, v31.s[0] -ldr q17, [x0, #800] -sqrdmulh v25.4S, v30.4S, v14.s[1] -add v23.4s, v23.4s, v9.4s -str q27, [x0, #528] -mla v4.4S, v1.4S, v31.s[0] -ldr q1, [x17, #+512] -sqrdmulh v27.4S, v11.4S, v14.s[2] -sub v9.4s, v26.4s, v6.4s -str q23, [x0, #512] -mul v2.4S, v2.4S,v10.s[1] -add v26.4s, v26.4s, v6.4s -ldr q6, [x17, #+528] -mul v20.4S, v20.4S,v10.s[2] -str q9, [x0, #560] -mla v2.4S, v15.4S, v31.s[0] -sub v15.4s, v29.4s, v24.4s -mla v20.4S, v22.4S, v31.s[0] -str q26, [x0, #544] -mul v30.4S, v30.4S,v28.s[1] -str q15, [x0, #592] -mul v11.4S, v11.4S,v28.s[2] -add v29.4s, v29.4s, v24.4s -str q29, [x0, #576] -mla v30.4S, v25.4S, v31.s[0] -sub v25.4s, v5.4s, v4.4s -str q25, [x0, #624] -mla v11.4S, v27.4S, v31.s[0] -add v5.4s, v5.4s, v4.4s -str q5, [x0, #608] -sqrdmulh v14.4S, v17.4S, v6.s[0] -sub v28.4s, v19.4s, v2.4s -mul v17.4S, v17.4S,v1.s[0] -str q28, [x0, #656] -ldr q28, [x0, #816] -sqrdmulh v5.4S, v28.4S, v6.s[0] -add v19.4s, v19.4s, v2.4s -ldr q2, [x17, #+544] -mul v28.4S, v28.4S,v1.s[0] -str q19, [x0, #640] -ldr q19, [x17, #+560] -ldr q4, [x0, #864] -sqrdmulh v27.4S, v4.4S, v19.s[0] -sub v25.4s, v3.4s, v20.4s -mul v4.4S, v4.4S,v2.s[0] -str q25, [x0, #688] -ldr q25, [x0, #880] -sqrdmulh v29.4S, v25.4S, v19.s[0] -add v3.4s, v3.4s, v20.4s -mul v25.4S, v25.4S,v2.s[0] -str q3, [x0, #672] -ldr q3, [x17, #+576] -mla v17.4S, v14.4S, v31.s[0] -sub v14.4s, v16.4s, v30.4s -ldr q20, [x17, #+592] -sqrdmulh v24.4S, v18.4S, v20.s[0] -str q14, [x0, #720] -ldr q14, [x0, #944] -mla v28.4S, v5.4S, v31.s[0] -add v16.4s, v16.4s, v30.4s -sqrdmulh v30.4S, v14.4S, v20.s[0] -str q16, [x0, #704] -ldr q16, [x17, #+608] -mla v4.4S, v27.4S, v31.s[0] -sub v27.4s, v8.4s, v11.4s -ldr q5, [x17, #+624] -sqrdmulh v15.4S, v13.4S, v5.s[0] -str q27, [x0, #752] -ldr q27, [x0, #1008] -mla v25.4S, v29.4S, v31.s[0] -add v8.4s, v8.4s, v11.4s -sqrdmulh v11.4S, v27.4S, v5.s[0] -str q8, [x0, #736] -ldr q8, [x0, #768] -ldr q29, [x0, #896] -mul v18.4S, v18.4S,v3.s[0] -sub v7.4s, v8.4s, v17.4s -ldr q10, [x0, #784] -mul v14.4S, v14.4S,v3.s[0] -add v8.4s, v8.4s, v17.4s -ldr q17, [x0, #912] -mla v18.4S, v24.4S, v31.s[0] -sub v24.4s, v10.4s, v28.4s -ldr q26, [x0, #832] -mla v14.4S, v30.4S, v31.s[0] -add v10.4s, v10.4s, v28.4s -ldr q28, [x0, #960] -mul v13.4S, v13.4S,v16.s[0] -sub v30.4s, v26.4s, v4.4s -ldr q22, [x0, #848] -mul v27.4S, v27.4S,v16.s[0] -add v26.4s, v26.4s, v4.4s -ldr q4, [x0, #976] -mla v13.4S, v15.4S, v31.s[0] -mla v27.4S, v11.4S, v31.s[0] -sub v11.4s, v22.4s, v25.4s -sqrdmulh v15.4S, v10.4S, v6.s[1] -add v22.4s, v22.4s, v25.4s -mul v10.4S, v10.4S,v1.s[1] -sqrdmulh v25.4S, v24.4S, v6.s[2] -sub v9.4s, v29.4s, v18.4s -mul v24.4S, v24.4S,v1.s[2] -add v29.4s, v29.4s, v18.4s -sqrdmulh v6.4S, v22.4S, v19.s[1] -sub v1.4s, v17.4s, v14.4s -mul v22.4S, v22.4S,v2.s[1] -add v17.4s, v17.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v19.s[2] -sub v18.4s, v28.4s, v13.4s -mul v11.4S, v11.4S,v2.s[2] -add v28.4s, v28.4s, v13.4s -mla v10.4S, v15.4S, v31.s[0] -sub v15.4s, v4.4s, v27.4s -sqrdmulh v19.4S, v17.4S, v20.s[1] -add v4.4s, v4.4s, v27.4s -mla v24.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v1.4S, v20.s[2] -sub v27.4s, v8.4s, v10.4s -mla v22.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v4.4S, v5.s[1] -add v8.4s, v8.4s, v10.4s -str q27, [x0, #784] -mla v11.4S, v14.4S, v31.s[0] -sqrdmulh v14.4S, v15.4S, v5.s[2] -sub v27.4s, v7.4s, v24.4s -str q8, [x0, #768] -mul v17.4S, v17.4S,v3.s[1] -add v7.4s, v7.4s, v24.4s -mul v1.4S, v1.4S,v3.s[2] -str q27, [x0, #816] -mla v17.4S, v19.4S, v31.s[0] -sub v19.4s, v26.4s, v22.4s -mla v1.4S, v25.4S, v31.s[0] -str q7, [x0, #800] -mul v4.4S, v4.4S,v16.s[1] -str q19, [x0, #848] -mul v15.4S, v15.4S,v16.s[2] -add v26.4s, v26.4s, v22.4s -str q26, [x0, #832] -mla v4.4S, v6.4S, v31.s[0] -sub v6.4s, v30.4s, v11.4s -str q6, [x0, #880] -mla v15.4S, v14.4S, v31.s[0] -add v30.4s, v30.4s, v11.4s -str q30, [x0, #864] -sub v5.4s, v29.4s, v17.4s -str q5, [x0, #912] -add v29.4s, v29.4s, v17.4s -str q29, [x0, #896] -sub v29.4s, v9.4s, v1.4s -str q29, [x0, #944] -add v9.4s, v9.4s, v1.4s -str q9, [x0, #928] -sub v9.4s, v28.4s, v4.4s -str q9, [x0, #976] -add v28.4s, v28.4s, v4.4s -str q28, [x0, #960] -sub v28.4s, v18.4s, v15.4s -str q28, [x0, #1008] -add v18.4s, v18.4s, v15.4s -str q18, [x0, #992] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1520 -// Instruction count: 1516 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_9.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_9.s deleted file mode 100644 index 6c246f1..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_22_z4_9.s +++ /dev/null @@ -1,1558 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_22_z4_9 -.global _ntt_u32_incomplete_neon_asm_var_4_2_22_z4_9 -ntt_u32_incomplete_neon_asm_var_4_2_22_z4_9: -_ntt_u32_incomplete_neon_asm_var_4_2_22_z4_9: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x0, #992] -sqrdmulh v27.4S, v28.4S, v29.s[0] -mul v28.4S, v28.4S,v30.s[0] -ldr q26, [x0, #928] -sqrdmulh v25.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -ldr q24, [x0, #864] -sqrdmulh v23.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -ldr q22, [x0, #800] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #736] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mla v28.4S, v27.4S, v31.s[0] -ldr q27, [x0, #672] -sqrdmulh v18.4S, v27.4S, v29.s[0] -mla v26.4S, v25.4S, v31.s[0] -ldr q25, [x0, #608] -sqrdmulh v17.4S, v25.4S, v29.s[0] -mla v24.4S, v23.4S, v31.s[0] -ldr q23, [x0, #544] -sqrdmulh v16.4S, v23.4S, v29.s[0] -mla v22.4S, v21.4S, v31.s[0] -ldr q21, [x0, #480] -mul v27.4S, v27.4S,v30.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q3, [x0, #416] -ldr q2, [x0, #352] -ldr q1, [x0, #288] -mla v27.4S, v18.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -ldr q19, [x0, #224] -ldr q18, [x0, #160] -mul v23.4S, v23.4S,v30.s[0] -mul v25.4S, v25.4S,v30.s[0] -ldr q0, [x0, #96] -ldr q15, [x0, #32] -mla v23.4S, v16.4S, v31.s[0] -mla v25.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v28.4s -add v21.4s, v21.4s, v28.4s -sqrdmulh v28.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v16.4s, v3.4s, v26.4s -add v3.4s, v3.4s, v26.4s -sqrdmulh v26.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -sub v14.4s, v2.4s, v24.4s -add v2.4s, v2.4s, v24.4s -sqrdmulh v24.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v13.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v12.4s, v19.4s, v20.4s -add v19.4s, v19.4s, v20.4s -sqrdmulh v20.4S, v14.4S, v29.s[2] -mla v17.4S, v28.4S, v31.s[0] -sub v28.4s, v18.4s, v27.4s -add v18.4s, v18.4s, v27.4s -sqrdmulh v27.4S, v13.4S, v29.s[2] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v0.4s, v25.4s -add v0.4s, v0.4s, v25.4s -sqrdmulh v25.4S, v2.4S, v29.s[1] -mla v21.4S, v24.4S, v31.s[0] -sub v24.4s, v15.4s, v23.4s -sqrdmulh v11.4S, v1.4S, v29.s[1] -mla v3.4S, v22.4S, v31.s[0] -add v15.4s, v15.4s, v23.4s -ldr q23, [x17, #+32] -ldr q22, [x17, #+48] -mul v13.4S, v13.4S,v30.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v10.4s, v12.4s, v17.4s -add v12.4s, v12.4s, v17.4s -mla v13.4S, v27.4S, v31.s[0] -mla v14.4S, v20.4S, v31.s[0] -sub v20.4s, v28.4s, v16.4s -add v28.4s, v28.4s, v16.4s -mul v1.4S, v1.4S,v30.s[1] -mul v2.4S, v2.4S,v30.s[1] -sub v16.4s, v19.4s, v21.4s -add v19.4s, v19.4s, v21.4s -mla v1.4S, v11.4S, v31.s[0] -mla v2.4S, v25.4S, v31.s[0] -sub v25.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v10.4S, v22.s[3] -mul v10.4S, v10.4S,v23.s[3] -sub v11.4s, v26.4s, v14.4s -add v26.4s, v26.4s, v14.4s -sqrdmulh v14.4S, v12.4S, v22.s[2] -mul v12.4S, v12.4S,v23.s[2] -sub v21.4s, v24.4s, v13.4s -add v24.4s, v24.4s, v13.4s -sqrdmulh v13.4S, v16.4S, v22.s[1] -mul v16.4S, v16.4S,v23.s[1] -sub v27.4s, v0.4s, v2.4s -add v0.4s, v0.4s, v2.4s -sqrdmulh v2.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v17.4s, v15.4s, v1.4s -add v15.4s, v15.4s, v1.4s -ldr q1, [x17, #+96] -ldr q9, [x17, #+112] -sqrdmulh v8.4S, v20.4S, v22.s[3] -mla v10.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v28.4S, v22.s[2] -mla v12.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v25.4S, v22.s[1] -mla v16.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v18.4S, v22.s[0] -mla v19.4S, v2.4S, v31.s[0] -nop -nop -ldr q2, [x17, #+64] -ldr q7, [x17, #+80] -mul v28.4S, v28.4S,v23.s[2] -mul v20.4S, v20.4S,v23.s[3] -sub v6.4s, v11.4s, v10.4s -add v11.4s, v11.4s, v10.4s -mla v28.4S, v3.4S, v31.s[0] -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v26.4s, v12.4s -add v26.4s, v26.4s, v12.4s -mul v18.4S, v18.4S,v23.s[0] -mul v25.4S, v25.4S,v23.s[1] -sub v12.4s, v27.4s, v16.4s -add v27.4s, v27.4s, v16.4s -mla v18.4S, v13.4S, v31.s[0] -mla v25.4S, v14.4S, v31.s[0] -sub v14.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v9.s[3] -mul v6.4S, v6.4S,v1.s[3] -sub v13.4s, v21.4s, v20.4s -add v21.4s, v21.4s, v20.4s -sqrdmulh v20.4S, v11.4S, v9.s[2] -mul v11.4S, v11.4S,v1.s[2] -sub v16.4s, v24.4s, v28.4s -add v24.4s, v24.4s, v28.4s -sqrdmulh v28.4S, v8.4S, v9.s[1] -mul v8.4S, v8.4S,v1.s[1] -sub v3.4s, v17.4s, v25.4s -add v17.4s, v17.4s, v25.4s -sqrdmulh v25.4S, v26.4S, v9.s[0] -mul v26.4S, v26.4S,v1.s[0] -sub v10.4s, v15.4s, v18.4s -add v15.4s, v15.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v7.s[3] -mla v6.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v27.4S, v7.s[2] -mla v11.4S, v20.4S, v31.s[0] -nop -nop -sqrdmulh v20.4S, v14.4S, v7.s[1] -mla v8.4S, v28.4S, v31.s[0] -nop -nop -sqrdmulh v28.4S, v0.4S, v7.s[0] -mla v26.4S, v25.4S, v31.s[0] -nop -nop -mul v27.4S, v27.4S,v2.s[2] -mul v12.4S, v12.4S,v2.s[3] -sub v25.4s, v13.4s, v6.4s -str q25, [x0, #992] -mla v27.4S, v19.4S, v31.s[0] -mla v12.4S, v18.4S, v31.s[0] -add v13.4s, v13.4s, v6.4s -str q13, [x0, #928] -mul v0.4S, v0.4S,v2.s[0] -mul v14.4S, v14.4S,v2.s[1] -sub v13.4s, v21.4s, v11.4s -str q13, [x0, #864] -mla v0.4S, v28.4S, v31.s[0] -mla v14.4S, v20.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -sub v11.4s, v16.4s, v8.4s -ldr q20, [x0, #1008] -sqrdmulh v28.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v16.4s, v16.4s, v8.4s -str q21, [x0, #800] -ldr q21, [x0, #944] -sqrdmulh v8.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -sub v13.4s, v24.4s, v26.4s -str q11, [x0, #736] -ldr q11, [x0, #880] -sqrdmulh v6.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -add v24.4s, v24.4s, v26.4s -str q16, [x0, #672] -ldr q16, [x0, #816] -sqrdmulh v26.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v18.4s, v3.4s, v12.4s -str q13, [x0, #608] -ldr q13, [x0, #752] -sqrdmulh v19.4S, v13.4S, v29.s[0] -mla v20.4S, v28.4S, v31.s[0] -add v3.4s, v3.4s, v12.4s -str q24, [x0, #544] -ldr q24, [x0, #688] -sqrdmulh v12.4S, v24.4S, v29.s[0] -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v17.4s, v27.4s -str q18, [x0, #480] -ldr q18, [x0, #624] -sqrdmulh v28.4S, v18.4S, v29.s[0] -mla v11.4S, v6.4S, v31.s[0] -add v17.4s, v17.4s, v27.4s -str q3, [x0, #416] -ldr q3, [x0, #560] -sqrdmulh v27.4S, v3.4S, v29.s[0] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v10.4s, v14.4s -str q8, [x0, #352] -ldr q8, [x0, #496] -add v10.4s, v10.4s, v14.4s -mul v24.4S, v24.4S,v30.s[0] -mul v13.4S, v13.4S,v30.s[0] -ldr q14, [x0, #432] -str q17, [x0, #288] -ldr q17, [x0, #368] -ldr q6, [x0, #304] -mla v24.4S, v12.4S, v31.s[0] -mla v13.4S, v19.4S, v31.s[0] -str q26, [x0, #224] -sub v26.4s, v15.4s, v0.4s -ldr q19, [x0, #240] -ldr q12, [x0, #176] -mul v3.4S, v3.4S,v30.s[0] -mul v18.4S, v18.4S,v30.s[0] -str q10, [x0, #160] -add v15.4s, v15.4s, v0.4s -ldr q0, [x0, #112] -ldr q10, [x0, #48] -mla v3.4S, v27.4S, v31.s[0] -mla v18.4S, v28.4S, v31.s[0] -sub v28.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -sqrdmulh v20.4S, v28.4S, v29.s[2] -mul v28.4S, v28.4S,v30.s[2] -sub v27.4s, v14.4s, v21.4s -add v14.4s, v14.4s, v21.4s -sqrdmulh v21.4S, v27.4S, v29.s[2] -mul v27.4S, v27.4S,v30.s[2] -sub v25.4s, v17.4s, v11.4s -add v17.4s, v17.4s, v11.4s -sqrdmulh v11.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sub v5.4s, v6.4s, v16.4s -add v6.4s, v6.4s, v16.4s -sqrdmulh v16.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -sub v4.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -sqrdmulh v13.4S, v25.4S, v29.s[2] -mla v28.4S, v20.4S, v31.s[0] -sub v20.4s, v12.4s, v24.4s -add v12.4s, v12.4s, v24.4s -sqrdmulh v24.4S, v5.4S, v29.s[2] -mla v27.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v18.4s -add v0.4s, v0.4s, v18.4s -sqrdmulh v18.4S, v17.4S, v29.s[1] -mla v8.4S, v11.4S, v31.s[0] -sub v11.4s, v10.4s, v3.4s -str q26, [x0, #96] -sqrdmulh v26.4S, v6.4S, v29.s[1] -mla v14.4S, v16.4S, v31.s[0] -add v10.4s, v10.4s, v3.4s -str q15, [x0, #32] -mul v5.4S, v5.4S,v30.s[2] -mul v25.4S, v25.4S,v30.s[2] -sub v15.4s, v4.4s, v28.4s -add v4.4s, v4.4s, v28.4s -mla v5.4S, v24.4S, v31.s[0] -mla v25.4S, v13.4S, v31.s[0] -sub v13.4s, v20.4s, v27.4s -add v20.4s, v20.4s, v27.4s -mul v6.4S, v6.4S,v30.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v27.4s, v19.4s, v8.4s -add v19.4s, v19.4s, v8.4s -mla v6.4S, v26.4S, v31.s[0] -mla v17.4S, v18.4S, v31.s[0] -sub v18.4s, v12.4s, v14.4s -add v12.4s, v12.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v22.s[3] -mul v15.4S, v15.4S,v23.s[3] -sub v26.4s, v21.4s, v25.4s -add v21.4s, v21.4s, v25.4s -sqrdmulh v25.4S, v4.4S, v22.s[2] -mul v4.4S, v4.4S,v23.s[2] -sub v8.4s, v11.4s, v5.4s -add v11.4s, v11.4s, v5.4s -sqrdmulh v5.4S, v27.4S, v22.s[1] -mul v27.4S, v27.4S,v23.s[1] -sub v24.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v28.4s, v10.4s, v6.4s -add v10.4s, v10.4s, v6.4s -sqrdmulh v6.4S, v13.4S, v22.s[3] -mla v15.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v20.4S, v22.s[2] -mla v4.4S, v25.4S, v31.s[0] -nop -nop -sqrdmulh v25.4S, v18.4S, v22.s[1] -mla v27.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v12.4S, v22.s[0] -mla v19.4S, v17.4S, v31.s[0] -nop -nop -mul v20.4S, v20.4S,v23.s[2] -mul v13.4S, v13.4S,v23.s[3] -sub v17.4s, v26.4s, v15.4s -add v26.4s, v26.4s, v15.4s -mla v20.4S, v14.4S, v31.s[0] -mla v13.4S, v6.4S, v31.s[0] -sub v6.4s, v21.4s, v4.4s -add v21.4s, v21.4s, v4.4s -mul v12.4S, v12.4S,v23.s[0] -mul v18.4S, v18.4S,v23.s[1] -sub v4.4s, v24.4s, v27.4s -add v24.4s, v24.4s, v27.4s -mla v12.4S, v5.4S, v31.s[0] -mla v18.4S, v25.4S, v31.s[0] -sub v25.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v17.4S, v9.s[3] -mul v17.4S, v17.4S,v1.s[3] -sub v5.4s, v8.4s, v13.4s -add v8.4s, v8.4s, v13.4s -sqrdmulh v13.4S, v26.4S, v9.s[2] -mul v26.4S, v26.4S,v1.s[2] -sub v27.4s, v11.4s, v20.4s -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v6.4S, v9.s[1] -mul v6.4S, v6.4S,v1.s[1] -sub v14.4s, v28.4s, v18.4s -add v28.4s, v28.4s, v18.4s -sqrdmulh v18.4S, v21.4S, v9.s[0] -mul v21.4S, v21.4S,v1.s[0] -sub v15.4s, v10.4s, v12.4s -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v4.4S, v7.s[3] -mla v17.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v24.4S, v7.s[2] -mla v26.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v25.4S, v7.s[1] -mla v6.4S, v20.4S, v31.s[0] -nop -nop -sqrdmulh v20.4S, v0.4S, v7.s[0] -mla v21.4S, v18.4S, v31.s[0] -nop -nop -mul v24.4S, v24.4S,v2.s[2] -mul v4.4S, v4.4S,v2.s[3] -sub v18.4s, v5.4s, v17.4s -str q18, [x0, #1008] -mla v24.4S, v19.4S, v31.s[0] -mla v4.4S, v12.4S, v31.s[0] -add v5.4s, v5.4s, v17.4s -str q5, [x0, #944] -mul v0.4S, v0.4S,v2.s[0] -mul v25.4S, v25.4S,v2.s[1] -sub v5.4s, v8.4s, v26.4s -str q5, [x0, #880] -mla v0.4S, v20.4S, v31.s[0] -mla v25.4S, v13.4S, v31.s[0] -add v8.4s, v8.4s, v26.4s -sub v26.4s, v27.4s, v6.4s -ldr q13, [x0, #960] -sqrdmulh v20.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -add v27.4s, v27.4s, v6.4s -str q8, [x0, #816] -ldr q8, [x0, #896] -sqrdmulh v6.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v5.4s, v11.4s, v21.4s -str q26, [x0, #752] -ldr q26, [x0, #832] -sqrdmulh v17.4S, v26.4S, v29.s[0] -mul v26.4S, v26.4S,v30.s[0] -add v11.4s, v11.4s, v21.4s -str q27, [x0, #688] -ldr q27, [x0, #768] -sqrdmulh v21.4S, v27.4S, v29.s[0] -mul v27.4S, v27.4S,v30.s[0] -sub v12.4s, v14.4s, v4.4s -str q5, [x0, #624] -ldr q5, [x0, #704] -sqrdmulh v19.4S, v5.4S, v29.s[0] -mla v13.4S, v20.4S, v31.s[0] -add v14.4s, v14.4s, v4.4s -str q11, [x0, #560] -ldr q11, [x0, #640] -sqrdmulh v4.4S, v11.4S, v29.s[0] -mla v8.4S, v6.4S, v31.s[0] -sub v6.4s, v28.4s, v24.4s -str q12, [x0, #496] -ldr q12, [x0, #576] -sqrdmulh v20.4S, v12.4S, v29.s[0] -mla v26.4S, v17.4S, v31.s[0] -add v28.4s, v28.4s, v24.4s -str q14, [x0, #432] -ldr q14, [x0, #512] -sqrdmulh v24.4S, v14.4S, v29.s[0] -mla v27.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v25.4s -str q6, [x0, #368] -ldr q6, [x0, #448] -add v15.4s, v15.4s, v25.4s -mul v11.4S, v11.4S,v30.s[0] -mul v5.4S, v5.4S,v30.s[0] -ldr q25, [x0, #384] -str q28, [x0, #304] -ldr q28, [x0, #320] -ldr q17, [x0, #256] -mla v11.4S, v4.4S, v31.s[0] -mla v5.4S, v19.4S, v31.s[0] -str q21, [x0, #240] -sub v21.4s, v10.4s, v0.4s -ldr q19, [x0, #192] -ldr q4, [x0, #128] -mul v14.4S, v14.4S,v30.s[0] -mul v12.4S, v12.4S,v30.s[0] -str q15, [x0, #176] -add v10.4s, v10.4s, v0.4s -ldr q0, [x0, #64] -ldr q15, [x0, #0] -mla v14.4S, v24.4S, v31.s[0] -mla v12.4S, v20.4S, v31.s[0] -sub v20.4s, v6.4s, v13.4s -add v6.4s, v6.4s, v13.4s -sqrdmulh v13.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v24.4s, v25.4s, v8.4s -add v25.4s, v25.4s, v8.4s -sqrdmulh v8.4S, v24.4S, v29.s[2] -mul v24.4S, v24.4S,v30.s[2] -sub v18.4s, v28.4s, v26.4s -add v28.4s, v28.4s, v26.4s -sqrdmulh v26.4S, v6.4S, v29.s[1] -mul v6.4S, v6.4S,v30.s[1] -sub v3.4s, v17.4s, v27.4s -add v17.4s, v17.4s, v27.4s -sqrdmulh v27.4S, v25.4S, v29.s[1] -mul v25.4S, v25.4S,v30.s[1] -sub v16.4s, v19.4s, v5.4s -add v19.4s, v19.4s, v5.4s -sqrdmulh v5.4S, v18.4S, v29.s[2] -mla v20.4S, v13.4S, v31.s[0] -sub v13.4s, v4.4s, v11.4s -add v4.4s, v4.4s, v11.4s -sqrdmulh v11.4S, v3.4S, v29.s[2] -mla v24.4S, v8.4S, v31.s[0] -sub v8.4s, v0.4s, v12.4s -add v0.4s, v0.4s, v12.4s -sqrdmulh v12.4S, v28.4S, v29.s[1] -mla v6.4S, v26.4S, v31.s[0] -sub v26.4s, v15.4s, v14.4s -str q21, [x0, #112] -sqrdmulh v21.4S, v17.4S, v29.s[1] -mla v25.4S, v27.4S, v31.s[0] -add v15.4s, v15.4s, v14.4s -str q10, [x0, #48] -mul v3.4S, v3.4S,v30.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v10.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -mla v3.4S, v11.4S, v31.s[0] -mla v18.4S, v5.4S, v31.s[0] -sub v5.4s, v13.4s, v24.4s -add v13.4s, v13.4s, v24.4s -mul v17.4S, v17.4S,v30.s[1] -mul v28.4S, v28.4S,v30.s[1] -sub v24.4s, v19.4s, v6.4s -add v19.4s, v19.4s, v6.4s -mla v17.4S, v21.4S, v31.s[0] -mla v28.4S, v12.4S, v31.s[0] -sub v12.4s, v4.4s, v25.4s -add v4.4s, v4.4s, v25.4s -sqrdmulh v25.4S, v10.4S, v22.s[3] -mul v10.4S, v10.4S,v23.s[3] -sub v21.4s, v8.4s, v18.4s -add v8.4s, v8.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v22.s[2] -mul v16.4S, v16.4S,v23.s[2] -sub v6.4s, v26.4s, v3.4s -add v26.4s, v26.4s, v3.4s -sqrdmulh v3.4S, v24.4S, v22.s[1] -mul v24.4S, v24.4S,v23.s[1] -sub v11.4s, v0.4s, v28.4s -add v0.4s, v0.4s, v28.4s -sqrdmulh v28.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v20.4s, v15.4s, v17.4s -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v5.4S, v22.s[3] -mla v10.4S, v25.4S, v31.s[0] -nop -nop -sqrdmulh v25.4S, v13.4S, v22.s[2] -mla v16.4S, v18.4S, v31.s[0] -nop -nop -sqrdmulh v18.4S, v12.4S, v22.s[1] -mla v24.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v4.4S, v22.s[0] -mla v19.4S, v28.4S, v31.s[0] -nop -nop -mul v13.4S, v13.4S,v23.s[2] -mul v5.4S, v5.4S,v23.s[3] -sub v28.4s, v21.4s, v10.4s -add v21.4s, v21.4s, v10.4s -mla v13.4S, v25.4S, v31.s[0] -mla v5.4S, v17.4S, v31.s[0] -sub v17.4s, v8.4s, v16.4s -add v8.4s, v8.4s, v16.4s -mul v4.4S, v4.4S,v23.s[0] -mul v12.4S, v12.4S,v23.s[1] -sub v16.4s, v11.4s, v24.4s -add v11.4s, v11.4s, v24.4s -mla v4.4S, v3.4S, v31.s[0] -mla v12.4S, v18.4S, v31.s[0] -sub v18.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v28.4S, v9.s[3] -mul v28.4S, v28.4S,v1.s[3] -sub v3.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v21.4S, v9.s[2] -mul v21.4S, v21.4S,v1.s[2] -sub v24.4s, v26.4s, v13.4s -add v26.4s, v26.4s, v13.4s -sqrdmulh v13.4S, v17.4S, v9.s[1] -mul v17.4S, v17.4S,v1.s[1] -sub v25.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -sqrdmulh v12.4S, v8.4S, v9.s[0] -mul v8.4S, v8.4S,v1.s[0] -sub v10.4s, v15.4s, v4.4s -add v15.4s, v15.4s, v4.4s -sqrdmulh v4.4S, v16.4S, v7.s[3] -mla v28.4S, v19.4S, v31.s[0] -nop -nop -sqrdmulh v19.4S, v11.4S, v7.s[2] -mla v21.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v18.4S, v7.s[1] -mla v17.4S, v13.4S, v31.s[0] -nop -nop -sqrdmulh v13.4S, v0.4S, v7.s[0] -mla v8.4S, v12.4S, v31.s[0] -nop -nop -mul v11.4S, v11.4S,v2.s[2] -mul v16.4S, v16.4S,v2.s[3] -sub v12.4s, v3.4s, v28.4s -str q12, [x0, #960] -mla v11.4S, v19.4S, v31.s[0] -mla v16.4S, v4.4S, v31.s[0] -add v3.4s, v3.4s, v28.4s -str q3, [x0, #896] -mul v0.4S, v0.4S,v2.s[0] -mul v18.4S, v18.4S,v2.s[1] -sub v3.4s, v6.4s, v21.4s -str q3, [x0, #832] -mla v0.4S, v13.4S, v31.s[0] -mla v18.4S, v5.4S, v31.s[0] -add v6.4s, v6.4s, v21.4s -sub v21.4s, v24.4s, v17.4s -ldr q5, [x0, #976] -sqrdmulh v13.4S, v5.4S, v29.s[0] -mul v5.4S, v5.4S,v30.s[0] -add v24.4s, v24.4s, v17.4s -str q6, [x0, #768] -ldr q6, [x0, #912] -sqrdmulh v17.4S, v6.4S, v29.s[0] -mul v6.4S, v6.4S,v30.s[0] -sub v3.4s, v26.4s, v8.4s -str q21, [x0, #704] -ldr q21, [x0, #848] -sqrdmulh v28.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -add v26.4s, v26.4s, v8.4s -str q24, [x0, #640] -ldr q24, [x0, #784] -sqrdmulh v8.4S, v24.4S, v29.s[0] -mul v24.4S, v24.4S,v30.s[0] -sub v4.4s, v25.4s, v16.4s -str q3, [x0, #576] -ldr q3, [x0, #720] -sqrdmulh v19.4S, v3.4S, v29.s[0] -mla v5.4S, v13.4S, v31.s[0] -add v25.4s, v25.4s, v16.4s -str q26, [x0, #512] -ldr q26, [x0, #656] -sqrdmulh v16.4S, v26.4S, v29.s[0] -mla v6.4S, v17.4S, v31.s[0] -sub v17.4s, v20.4s, v11.4s -str q4, [x0, #448] -ldr q4, [x0, #592] -sqrdmulh v13.4S, v4.4S, v29.s[0] -mla v21.4S, v28.4S, v31.s[0] -add v20.4s, v20.4s, v11.4s -str q25, [x0, #384] -ldr q25, [x0, #528] -sqrdmulh v11.4S, v25.4S, v29.s[0] -mla v24.4S, v8.4S, v31.s[0] -sub v8.4s, v10.4s, v18.4s -str q17, [x0, #320] -ldr q17, [x0, #464] -add v10.4s, v10.4s, v18.4s -mul v26.4S, v26.4S,v30.s[0] -mul v3.4S, v3.4S,v30.s[0] -ldr q18, [x0, #400] -str q20, [x0, #256] -ldr q20, [x0, #336] -ldr q28, [x0, #272] -mla v26.4S, v16.4S, v31.s[0] -mla v3.4S, v19.4S, v31.s[0] -str q8, [x0, #192] -sub v8.4s, v15.4s, v0.4s -ldr q19, [x0, #208] -ldr q16, [x0, #144] -mul v25.4S, v25.4S,v30.s[0] -mul v4.4S, v4.4S,v30.s[0] -str q10, [x0, #128] -add v15.4s, v15.4s, v0.4s -ldr q0, [x0, #80] -ldr q10, [x0, #16] -mla v25.4S, v11.4S, v31.s[0] -mla v4.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v5.4s -add v17.4s, v17.4s, v5.4s -sqrdmulh v5.4S, v13.4S, v29.s[2] -mul v13.4S, v13.4S,v30.s[2] -sub v11.4s, v18.4s, v6.4s -add v18.4s, v18.4s, v6.4s -sqrdmulh v6.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v12.4s, v20.4s, v21.4s -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v14.4s, v28.4s, v24.4s -add v28.4s, v28.4s, v24.4s -sqrdmulh v24.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v27.4s, v19.4s, v3.4s -add v19.4s, v19.4s, v3.4s -sqrdmulh v3.4S, v12.4S, v29.s[2] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v16.4s, v26.4s -add v16.4s, v16.4s, v26.4s -sqrdmulh v26.4S, v14.4S, v29.s[2] -mla v11.4S, v6.4S, v31.s[0] -sub v6.4s, v0.4s, v4.4s -add v0.4s, v0.4s, v4.4s -sqrdmulh v4.4S, v20.4S, v29.s[1] -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v10.4s, v25.4s -str q8, [x0, #64] -sqrdmulh v8.4S, v28.4S, v29.s[1] -mla v18.4S, v24.4S, v31.s[0] -add v10.4s, v10.4s, v25.4s -str q15, [x0, #0] -mul v14.4S, v14.4S,v30.s[2] -mul v12.4S, v12.4S,v30.s[2] -sub v15.4s, v27.4s, v13.4s -add v27.4s, v27.4s, v13.4s -mla v14.4S, v26.4S, v31.s[0] -mla v12.4S, v3.4S, v31.s[0] -sub v3.4s, v5.4s, v11.4s -add v5.4s, v5.4s, v11.4s -mul v28.4S, v28.4S,v30.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v11.4s, v19.4s, v17.4s -add v19.4s, v19.4s, v17.4s -mla v28.4S, v8.4S, v31.s[0] -mla v20.4S, v4.4S, v31.s[0] -sub v4.4s, v16.4s, v18.4s -add v16.4s, v16.4s, v18.4s -sqrdmulh v29.4S, v15.4S, v22.s[3] -mul v15.4S, v15.4S,v23.s[3] -sub v30.4s, v6.4s, v12.4s -add v6.4s, v6.4s, v12.4s -sqrdmulh v12.4S, v27.4S, v22.s[2] -mul v27.4S, v27.4S,v23.s[2] -sub v18.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v11.4S, v22.s[1] -mul v11.4S, v11.4S,v23.s[1] -sub v8.4s, v0.4s, v20.4s -add v0.4s, v0.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v22.s[0] -mul v19.4S, v19.4S,v23.s[0] -sub v17.4s, v10.4s, v28.4s -add v10.4s, v10.4s, v28.4s -sqrdmulh v28.4S, v3.4S, v22.s[3] -mla v15.4S, v29.4S, v31.s[0] -nop -nop -sqrdmulh v29.4S, v5.4S, v22.s[2] -mla v27.4S, v12.4S, v31.s[0] -nop -nop -sqrdmulh v12.4S, v4.4S, v22.s[1] -mla v11.4S, v14.4S, v31.s[0] -nop -nop -sqrdmulh v14.4S, v16.4S, v22.s[0] -mla v19.4S, v20.4S, v31.s[0] -nop -nop -mul v5.4S, v5.4S,v23.s[2] -mul v3.4S, v3.4S,v23.s[3] -sub v20.4s, v30.4s, v15.4s -add v30.4s, v30.4s, v15.4s -mla v5.4S, v29.4S, v31.s[0] -mla v3.4S, v28.4S, v31.s[0] -sub v28.4s, v6.4s, v27.4s -add v6.4s, v6.4s, v27.4s -mul v16.4S, v16.4S,v23.s[0] -mul v4.4S, v4.4S,v23.s[1] -sub v27.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -mla v16.4S, v14.4S, v31.s[0] -mla v4.4S, v12.4S, v31.s[0] -sub v12.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v22.4S, v20.4S, v9.s[3] -mul v20.4S, v20.4S,v1.s[3] -sub v23.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v30.4S, v9.s[2] -mul v30.4S, v30.4S,v1.s[2] -sub v19.4s, v21.4s, v5.4s -add v21.4s, v21.4s, v5.4s -sqrdmulh v5.4S, v28.4S, v9.s[1] -mul v28.4S, v28.4S,v1.s[1] -sub v14.4s, v17.4s, v4.4s -add v17.4s, v17.4s, v4.4s -sqrdmulh v4.4S, v6.4S, v9.s[0] -mul v6.4S, v6.4S,v1.s[0] -sub v11.4s, v10.4s, v16.4s -add v10.4s, v10.4s, v16.4s -sqrdmulh v9.4S, v27.4S, v7.s[3] -mla v20.4S, v22.4S, v31.s[0] -nop -nop -sqrdmulh v22.4S, v8.4S, v7.s[2] -mla v30.4S, v3.4S, v31.s[0] -nop -nop -sqrdmulh v3.4S, v12.4S, v7.s[1] -mla v28.4S, v5.4S, v31.s[0] -nop -nop -sqrdmulh v5.4S, v0.4S, v7.s[0] -mla v6.4S, v4.4S, v31.s[0] -nop -nop -mul v8.4S, v8.4S,v2.s[2] -mul v27.4S, v27.4S,v2.s[3] -sub v4.4s, v23.4s, v20.4s -str q4, [x0, #976] -mla v8.4S, v22.4S, v31.s[0] -mla v27.4S, v9.4S, v31.s[0] -add v23.4s, v23.4s, v20.4s -str q23, [x0, #912] -mul v0.4S, v0.4S,v2.s[0] -mul v12.4S, v12.4S,v2.s[1] -sub v23.4s, v18.4s, v30.4s -str q23, [x0, #848] -mla v0.4S, v5.4S, v31.s[0] -mla v12.4S, v3.4S, v31.s[0] -add v18.4s, v18.4s, v30.4s -sub v30.4s, v19.4s, v28.4s -add v19.4s, v19.4s, v28.4s -str q18, [x0, #784] -sub v18.4s, v21.4s, v6.4s -str q30, [x0, #720] -add v21.4s, v21.4s, v6.4s -str q19, [x0, #656] -sub v19.4s, v14.4s, v27.4s -str q18, [x0, #592] -add v14.4s, v14.4s, v27.4s -str q21, [x0, #528] -sub v21.4s, v17.4s, v8.4s -str q19, [x0, #464] -add v17.4s, v17.4s, v8.4s -str q14, [x0, #400] -sub v14.4s, v11.4s, v12.4s -str q21, [x0, #336] -add v11.4s, v11.4s, v12.4s -str q17, [x0, #272] -sub v17.4s, v10.4s, v0.4s -add v10.4s, v10.4s, v0.4s -ldr q24, [x0, #224] -ldr q25, [x0, #160] -ldr q13, [x0, #32] -ldr q26, [x17, #+128] -ldr q15, [x17, #+144] -sqrdmulh v29.4S, v13.4S, v15.s[0] -mul v13.4S, v13.4S,v26.s[0] -ldr q16, [x0, #48] -sqrdmulh v1.4S, v16.4S, v15.s[0] -mul v16.4S, v16.4S,v26.s[0] -ldr q4, [x17, #+160] -ldr q22, [x17, #+176] -ldr q9, [x0, #96] -sqrdmulh v20.4S, v9.4S, v22.s[0] -mul v9.4S, v9.4S,v4.s[0] -ldr q23, [x0, #112] -sqrdmulh v5.4S, v23.4S, v22.s[0] -mul v23.4S, v23.4S,v4.s[0] -ldr q3, [x17, #+192] -ldr q2, [x17, #+208] -mla v13.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v25.4S, v2.s[0] -ldr q7, [x0, #176] -mla v16.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v7.4S, v2.s[0] -ldr q28, [x17, #+224] -ldr q30, [x17, #+240] -mla v9.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v24.4S, v30.s[0] -ldr q6, [x0, #240] -mla v23.4S, v5.4S, v31.s[0] -sqrdmulh v5.4S, v6.4S, v30.s[0] -ldr q18, [x0, #0] -ldr q27, [x0, #128] -mul v25.4S, v25.4S,v3.s[0] -sub v19.4s, v18.4s, v13.4s -mul v7.4S, v7.4S,v3.s[0] -add v18.4s, v18.4s, v13.4s -mla v25.4S, v29.4S, v31.s[0] -sub v29.4s, v10.4s, v16.4s -ldr q13, [x0, #64] -mla v7.4S, v1.4S, v31.s[0] -add v10.4s, v10.4s, v16.4s -ldr q16, [x0, #192] -mul v24.4S, v24.4S,v28.s[0] -sub v1.4s, v13.4s, v9.4s -mul v6.4S, v6.4S,v28.s[0] -add v13.4s, v13.4s, v9.4s -mla v24.4S, v20.4S, v31.s[0] -nop -mla v6.4S, v5.4S, v31.s[0] -sub v5.4s, v17.4s, v23.4s -sqrdmulh v20.4S, v10.4S, v15.s[1] -add v17.4s, v17.4s, v23.4s -mul v10.4S, v10.4S,v26.s[1] -nop -sqrdmulh v23.4S, v29.4S, v15.s[2] -sub v9.4s, v27.4s, v25.4s -mul v29.4S, v29.4S,v26.s[2] -add v27.4s, v27.4s, v25.4s -sqrdmulh v15.4S, v17.4S, v22.s[1] -sub v26.4s, v11.4s, v7.4s -mul v17.4S, v17.4S,v4.s[1] -add v11.4s, v11.4s, v7.4s -sqrdmulh v7.4S, v5.4S, v22.s[2] -sub v25.4s, v16.4s, v24.4s -mul v5.4S, v5.4S,v4.s[2] -add v16.4s, v16.4s, v24.4s -mla v10.4S, v20.4S, v31.s[0] -sub v20.4s, v14.4s, v6.4s -ldr q22, [x0, #480] -sqrdmulh v4.4S, v11.4S, v2.s[1] -add v14.4s, v14.4s, v6.4s -mla v29.4S, v23.4S, v31.s[0] -ldr q23, [x0, #416] -sqrdmulh v6.4S, v26.4S, v2.s[2] -sub v24.4s, v18.4s, v10.4s -mla v17.4S, v15.4S, v31.s[0] -ldr q15, [x0, #288] -sqrdmulh v8.4S, v14.4S, v30.s[1] -add v18.4s, v18.4s, v10.4s -str q24, [x0, #16] -mla v5.4S, v7.4S, v31.s[0] -ldr q7, [x17, #+256] -ldr q24, [x17, #+272] -sqrdmulh v10.4S, v20.4S, v30.s[2] -sub v21.4s, v19.4s, v29.4s -str q18, [x0, #0] -mul v11.4S, v11.4S,v3.s[1] -add v19.4s, v19.4s, v29.4s -mul v26.4S, v26.4S,v3.s[2] -str q21, [x0, #48] -mla v11.4S, v4.4S, v31.s[0] -sub v4.4s, v13.4s, v17.4s -mla v26.4S, v6.4S, v31.s[0] -str q19, [x0, #32] -mul v14.4S, v14.4S,v28.s[1] -str q4, [x0, #80] -mul v20.4S, v20.4S,v28.s[2] -add v13.4s, v13.4s, v17.4s -str q13, [x0, #64] -mla v14.4S, v8.4S, v31.s[0] -sub v8.4s, v1.4s, v5.4s -str q8, [x0, #112] -mla v20.4S, v10.4S, v31.s[0] -add v1.4s, v1.4s, v5.4s -str q1, [x0, #96] -sqrdmulh v30.4S, v15.4S, v24.s[0] -sub v28.4s, v27.4s, v11.4s -mul v15.4S, v15.4S,v7.s[0] -str q28, [x0, #144] -ldr q28, [x0, #304] -sqrdmulh v1.4S, v28.4S, v24.s[0] -add v27.4s, v27.4s, v11.4s -mul v28.4S, v28.4S,v7.s[0] -str q27, [x0, #128] -ldr q27, [x17, #+288] -ldr q11, [x17, #+304] -ldr q5, [x0, #352] -sqrdmulh v10.4S, v5.4S, v11.s[0] -sub v8.4s, v9.4s, v26.4s -mul v5.4S, v5.4S,v27.s[0] -str q8, [x0, #176] -ldr q8, [x0, #368] -sqrdmulh v13.4S, v8.4S, v11.s[0] -add v9.4s, v9.4s, v26.4s -mul v8.4S, v8.4S,v27.s[0] -str q9, [x0, #160] -ldr q9, [x17, #+320] -ldr q26, [x17, #+336] -mla v15.4S, v30.4S, v31.s[0] -sub v30.4s, v16.4s, v14.4s -sqrdmulh v17.4S, v23.4S, v26.s[0] -str q30, [x0, #208] -ldr q30, [x0, #432] -mla v28.4S, v1.4S, v31.s[0] -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v30.4S, v26.s[0] -str q16, [x0, #192] -ldr q16, [x17, #+352] -ldr q1, [x17, #+368] -mla v5.4S, v10.4S, v31.s[0] -sub v10.4s, v25.4s, v20.4s -sqrdmulh v4.4S, v22.4S, v1.s[0] -str q10, [x0, #240] -ldr q10, [x0, #496] -mla v8.4S, v13.4S, v31.s[0] -add v25.4s, v25.4s, v20.4s -sqrdmulh v20.4S, v10.4S, v1.s[0] -str q25, [x0, #224] -ldr q25, [x0, #256] -ldr q13, [x0, #384] -mul v23.4S, v23.4S,v9.s[0] -sub v2.4s, v25.4s, v15.4s -ldr q3, [x0, #272] -mul v30.4S, v30.4S,v9.s[0] -add v25.4s, v25.4s, v15.4s -ldr q15, [x0, #400] -mla v23.4S, v17.4S, v31.s[0] -sub v17.4s, v3.4s, v28.4s -ldr q19, [x0, #320] -mla v30.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v28.4s -ldr q28, [x0, #448] -mul v22.4S, v22.4S,v16.s[0] -sub v14.4s, v19.4s, v5.4s -ldr q6, [x0, #336] -mul v10.4S, v10.4S,v16.s[0] -add v19.4s, v19.4s, v5.4s -ldr q5, [x0, #464] -mla v22.4S, v4.4S, v31.s[0] -nop -mla v10.4S, v20.4S, v31.s[0] -sub v20.4s, v6.4s, v8.4s -sqrdmulh v4.4S, v3.4S, v24.s[1] -add v6.4s, v6.4s, v8.4s -mul v3.4S, v3.4S,v7.s[1] -nop -sqrdmulh v8.4S, v17.4S, v24.s[2] -sub v21.4s, v13.4s, v23.4s -mul v17.4S, v17.4S,v7.s[2] -add v13.4s, v13.4s, v23.4s -sqrdmulh v24.4S, v6.4S, v11.s[1] -sub v7.4s, v15.4s, v30.4s -mul v6.4S, v6.4S,v27.s[1] -add v15.4s, v15.4s, v30.4s -sqrdmulh v30.4S, v20.4S, v11.s[2] -sub v23.4s, v28.4s, v22.4s -mul v20.4S, v20.4S,v27.s[2] -add v28.4s, v28.4s, v22.4s -mla v3.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v10.4s -ldr q11, [x0, #736] -sqrdmulh v27.4S, v15.4S, v26.s[1] -add v5.4s, v5.4s, v10.4s -mla v17.4S, v8.4S, v31.s[0] -ldr q8, [x0, #672] -sqrdmulh v10.4S, v7.4S, v26.s[2] -sub v22.4s, v25.4s, v3.4s -mla v6.4S, v24.4S, v31.s[0] -ldr q24, [x0, #544] -sqrdmulh v29.4S, v5.4S, v1.s[1] -add v25.4s, v25.4s, v3.4s -str q22, [x0, #272] -mla v20.4S, v30.4S, v31.s[0] -ldr q30, [x17, #+384] -ldr q22, [x17, #+400] -sqrdmulh v3.4S, v4.4S, v1.s[2] -sub v18.4s, v2.4s, v17.4s -str q25, [x0, #256] -mul v15.4S, v15.4S,v9.s[1] -add v2.4s, v2.4s, v17.4s -mul v7.4S, v7.4S,v9.s[2] -str q18, [x0, #304] -mla v15.4S, v27.4S, v31.s[0] -sub v27.4s, v19.4s, v6.4s -mla v7.4S, v10.4S, v31.s[0] -str q2, [x0, #288] -mul v5.4S, v5.4S,v16.s[1] -str q27, [x0, #336] -mul v4.4S, v4.4S,v16.s[2] -add v19.4s, v19.4s, v6.4s -str q19, [x0, #320] -mla v5.4S, v29.4S, v31.s[0] -sub v29.4s, v14.4s, v20.4s -str q29, [x0, #368] -mla v4.4S, v3.4S, v31.s[0] -add v14.4s, v14.4s, v20.4s -str q14, [x0, #352] -sqrdmulh v1.4S, v24.4S, v22.s[0] -sub v16.4s, v13.4s, v15.4s -mul v24.4S, v24.4S,v30.s[0] -str q16, [x0, #400] -ldr q16, [x0, #560] -sqrdmulh v14.4S, v16.4S, v22.s[0] -add v13.4s, v13.4s, v15.4s -mul v16.4S, v16.4S,v30.s[0] -str q13, [x0, #384] -ldr q13, [x17, #+416] -ldr q15, [x17, #+432] -ldr q20, [x0, #608] -sqrdmulh v3.4S, v20.4S, v15.s[0] -sub v29.4s, v21.4s, v7.4s -mul v20.4S, v20.4S,v13.s[0] -str q29, [x0, #432] -ldr q29, [x0, #624] -sqrdmulh v19.4S, v29.4S, v15.s[0] -add v21.4s, v21.4s, v7.4s -mul v29.4S, v29.4S,v13.s[0] -str q21, [x0, #416] -ldr q21, [x17, #+448] -ldr q7, [x17, #+464] -mla v24.4S, v1.4S, v31.s[0] -sub v1.4s, v28.4s, v5.4s -sqrdmulh v6.4S, v8.4S, v7.s[0] -str q1, [x0, #464] -ldr q1, [x0, #688] -mla v16.4S, v14.4S, v31.s[0] -add v28.4s, v28.4s, v5.4s -sqrdmulh v5.4S, v1.4S, v7.s[0] -str q28, [x0, #448] -ldr q28, [x17, #+480] -ldr q14, [x17, #+496] -mla v20.4S, v3.4S, v31.s[0] -sub v3.4s, v23.4s, v4.4s -sqrdmulh v27.4S, v11.4S, v14.s[0] -str q3, [x0, #496] -ldr q3, [x0, #752] -mla v29.4S, v19.4S, v31.s[0] -add v23.4s, v23.4s, v4.4s -sqrdmulh v4.4S, v3.4S, v14.s[0] -str q23, [x0, #480] -ldr q23, [x0, #512] -ldr q19, [x0, #640] -mul v8.4S, v8.4S,v21.s[0] -sub v26.4s, v23.4s, v24.4s -ldr q9, [x0, #528] -mul v1.4S, v1.4S,v21.s[0] -add v23.4s, v23.4s, v24.4s -ldr q24, [x0, #656] -mla v8.4S, v6.4S, v31.s[0] -sub v6.4s, v9.4s, v16.4s -ldr q2, [x0, #576] -mla v1.4S, v5.4S, v31.s[0] -add v9.4s, v9.4s, v16.4s -ldr q16, [x0, #704] -mul v11.4S, v11.4S,v28.s[0] -sub v5.4s, v2.4s, v20.4s -ldr q10, [x0, #592] -mul v3.4S, v3.4S,v28.s[0] -add v2.4s, v2.4s, v20.4s -ldr q20, [x0, #720] -mla v11.4S, v27.4S, v31.s[0] -nop -mla v3.4S, v4.4S, v31.s[0] -sub v4.4s, v10.4s, v29.4s -sqrdmulh v27.4S, v9.4S, v22.s[1] -add v10.4s, v10.4s, v29.4s -mul v9.4S, v9.4S,v30.s[1] -nop -sqrdmulh v29.4S, v6.4S, v22.s[2] -sub v18.4s, v19.4s, v8.4s -mul v6.4S, v6.4S,v30.s[2] -add v19.4s, v19.4s, v8.4s -sqrdmulh v22.4S, v10.4S, v15.s[1] -sub v30.4s, v24.4s, v1.4s -mul v10.4S, v10.4S,v13.s[1] -add v24.4s, v24.4s, v1.4s -sqrdmulh v1.4S, v4.4S, v15.s[2] -sub v8.4s, v16.4s, v11.4s -mul v4.4S, v4.4S,v13.s[2] -add v16.4s, v16.4s, v11.4s -mla v9.4S, v27.4S, v31.s[0] -sub v27.4s, v20.4s, v3.4s -ldr q15, [x0, #992] -sqrdmulh v13.4S, v24.4S, v7.s[1] -add v20.4s, v20.4s, v3.4s -mla v6.4S, v29.4S, v31.s[0] -ldr q29, [x0, #928] -sqrdmulh v3.4S, v30.4S, v7.s[2] -sub v11.4s, v23.4s, v9.4s -mla v10.4S, v22.4S, v31.s[0] -ldr q22, [x0, #800] -sqrdmulh v17.4S, v20.4S, v14.s[1] -add v23.4s, v23.4s, v9.4s -str q11, [x0, #528] -mla v4.4S, v1.4S, v31.s[0] -ldr q1, [x17, #+512] -ldr q11, [x17, #+528] -sqrdmulh v9.4S, v27.4S, v14.s[2] -sub v25.4s, v26.4s, v6.4s -str q23, [x0, #512] -mul v24.4S, v24.4S,v21.s[1] -add v26.4s, v26.4s, v6.4s -mul v30.4S, v30.4S,v21.s[2] -str q25, [x0, #560] -mla v24.4S, v13.4S, v31.s[0] -sub v13.4s, v2.4s, v10.4s -mla v30.4S, v3.4S, v31.s[0] -str q26, [x0, #544] -mul v20.4S, v20.4S,v28.s[1] -str q13, [x0, #592] -mul v27.4S, v27.4S,v28.s[2] -add v2.4s, v2.4s, v10.4s -str q2, [x0, #576] -mla v20.4S, v17.4S, v31.s[0] -sub v17.4s, v5.4s, v4.4s -str q17, [x0, #624] -mla v27.4S, v9.4S, v31.s[0] -add v5.4s, v5.4s, v4.4s -str q5, [x0, #608] -sqrdmulh v14.4S, v22.4S, v11.s[0] -sub v28.4s, v19.4s, v24.4s -mul v22.4S, v22.4S,v1.s[0] -str q28, [x0, #656] -ldr q28, [x0, #816] -sqrdmulh v5.4S, v28.4S, v11.s[0] -add v19.4s, v19.4s, v24.4s -mul v28.4S, v28.4S,v1.s[0] -str q19, [x0, #640] -ldr q19, [x17, #+544] -ldr q24, [x17, #+560] -ldr q4, [x0, #864] -sqrdmulh v9.4S, v4.4S, v24.s[0] -sub v17.4s, v18.4s, v30.4s -mul v4.4S, v4.4S,v19.s[0] -str q17, [x0, #688] -ldr q17, [x0, #880] -sqrdmulh v2.4S, v17.4S, v24.s[0] -add v18.4s, v18.4s, v30.4s -mul v17.4S, v17.4S,v19.s[0] -str q18, [x0, #672] -ldr q18, [x17, #+576] -ldr q30, [x17, #+592] -mla v22.4S, v14.4S, v31.s[0] -sub v14.4s, v16.4s, v20.4s -sqrdmulh v10.4S, v29.4S, v30.s[0] -str q14, [x0, #720] -ldr q14, [x0, #944] -mla v28.4S, v5.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v14.4S, v30.s[0] -str q16, [x0, #704] -ldr q16, [x17, #+608] -ldr q5, [x17, #+624] -mla v4.4S, v9.4S, v31.s[0] -sub v9.4s, v8.4s, v27.4s -sqrdmulh v13.4S, v15.4S, v5.s[0] -str q9, [x0, #752] -ldr q9, [x0, #1008] -mla v17.4S, v2.4S, v31.s[0] -add v8.4s, v8.4s, v27.4s -sqrdmulh v27.4S, v9.4S, v5.s[0] -str q8, [x0, #736] -ldr q8, [x0, #768] -ldr q2, [x0, #896] -mul v29.4S, v29.4S,v18.s[0] -sub v7.4s, v8.4s, v22.4s -ldr q21, [x0, #784] -mul v14.4S, v14.4S,v18.s[0] -add v8.4s, v8.4s, v22.4s -ldr q22, [x0, #912] -mla v29.4S, v10.4S, v31.s[0] -sub v10.4s, v21.4s, v28.4s -ldr q26, [x0, #832] -mla v14.4S, v20.4S, v31.s[0] -add v21.4s, v21.4s, v28.4s -ldr q28, [x0, #960] -mul v15.4S, v15.4S,v16.s[0] -sub v20.4s, v26.4s, v4.4s -ldr q3, [x0, #848] -mul v9.4S, v9.4S,v16.s[0] -add v26.4s, v26.4s, v4.4s -ldr q4, [x0, #976] -mla v15.4S, v13.4S, v31.s[0] -nop -mla v9.4S, v27.4S, v31.s[0] -sub v27.4s, v3.4s, v17.4s -sqrdmulh v13.4S, v21.4S, v11.s[1] -add v3.4s, v3.4s, v17.4s -mul v21.4S, v21.4S,v1.s[1] -nop -sqrdmulh v17.4S, v10.4S, v11.s[2] -sub v25.4s, v2.4s, v29.4s -mul v10.4S, v10.4S,v1.s[2] -add v2.4s, v2.4s, v29.4s -sqrdmulh v11.4S, v3.4S, v24.s[1] -sub v1.4s, v22.4s, v14.4s -mul v3.4S, v3.4S,v19.s[1] -add v22.4s, v22.4s, v14.4s -sqrdmulh v14.4S, v27.4S, v24.s[2] -sub v29.4s, v28.4s, v15.4s -mul v27.4S, v27.4S,v19.s[2] -add v28.4s, v28.4s, v15.4s -mla v21.4S, v13.4S, v31.s[0] -sub v13.4s, v4.4s, v9.4s -sqrdmulh v24.4S, v22.4S, v30.s[1] -add v4.4s, v4.4s, v9.4s -mla v10.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v1.4S, v30.s[2] -sub v9.4s, v8.4s, v21.4s -mla v3.4S, v11.4S, v31.s[0] -sqrdmulh v11.4S, v4.4S, v5.s[1] -add v8.4s, v8.4s, v21.4s -str q9, [x0, #784] -mla v27.4S, v14.4S, v31.s[0] -sqrdmulh v14.4S, v13.4S, v5.s[2] -sub v9.4s, v7.4s, v10.4s -str q8, [x0, #768] -mul v22.4S, v22.4S,v18.s[1] -add v7.4s, v7.4s, v10.4s -mul v1.4S, v1.4S,v18.s[2] -str q9, [x0, #816] -mla v22.4S, v24.4S, v31.s[0] -sub v24.4s, v26.4s, v3.4s -mla v1.4S, v17.4S, v31.s[0] -str q7, [x0, #800] -mul v4.4S, v4.4S,v16.s[1] -str q24, [x0, #848] -mul v13.4S, v13.4S,v16.s[2] -add v26.4s, v26.4s, v3.4s -str q26, [x0, #832] -mla v4.4S, v11.4S, v31.s[0] -sub v11.4s, v20.4s, v27.4s -str q11, [x0, #880] -mla v13.4S, v14.4S, v31.s[0] -add v20.4s, v20.4s, v27.4s -str q20, [x0, #864] -sub v5.4s, v2.4s, v22.4s -str q5, [x0, #912] -add v2.4s, v2.4s, v22.4s -str q2, [x0, #896] -sub v2.4s, v25.4s, v1.4s -str q2, [x0, #944] -add v25.4s, v25.4s, v1.4s -str q25, [x0, #928] -sub v25.4s, v28.4s, v4.4s -str q25, [x0, #976] -add v28.4s, v28.4s, v4.4s -str q28, [x0, #960] -sub v28.4s, v29.4s, v13.4s -str q28, [x0, #1008] -add v29.4s, v29.4s, v13.4s -str q29, [x0, #992] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1528 -// Instruction count: 1524 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_0.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_0.s deleted file mode 100644 index cf16f9a..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_0.s +++ /dev/null @@ -1,1494 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_24_z4_0 -.global _ntt_u32_incomplete_neon_asm_var_4_2_24_z4_0 -ntt_u32_incomplete_neon_asm_var_4_2_24_z4_0: -_ntt_u32_incomplete_neon_asm_var_4_2_24_z4_0: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #800] -ldr q21, [x0, #864] -ldr q20, [x0, #928] -ldr q19, [x0, #992] -ldr q18, [x0, #288] -ldr q17, [x0, #352] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -ldr q2, [x0, #544] -ldr q1, [x0, #608] -ldr q0, [x0, #672] -ldr q15, [x0, #736] -ldr q14, [x0, #32] -ldr q13, [x0, #96] -ldr q12, [x0, #160] -ldr q11, [x0, #224] -sqrdmulh v10.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -mla v22.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -mla v21.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -mla v20.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -mla v19.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -mla v2.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -mla v1.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v0.4S, v29.s[0] -mul v0.4S, v0.4S,v30.s[0] -mla v0.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v15.4S, v29.s[0] -mul v15.4S, v15.4S,v30.s[0] -mla v15.4S, v10.4S, v31.s[0] -sub v10.4s, v18.4s, v22.4s -add v18.4s, v18.4s, v22.4s -sub v22.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -sub v21.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -sub v20.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sub v19.4s, v14.4s, v2.4s -add v14.4s, v14.4s, v2.4s -sub v2.4s, v13.4s, v1.4s -add v13.4s, v13.4s, v1.4s -sub v1.4s, v12.4s, v0.4s -add v12.4s, v12.4s, v0.4s -sub v0.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -mla v16.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -mla v3.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -mla v18.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -mla v17.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -mla v21.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -mla v20.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -mla v10.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -mla v22.4S, v15.4S, v31.s[0] -sub v15.4s, v12.4s, v16.4s -add v12.4s, v12.4s, v16.4s -sub v16.4s, v11.4s, v3.4s -add v11.4s, v11.4s, v3.4s -sub v3.4s, v14.4s, v18.4s -add v14.4s, v14.4s, v18.4s -sub v18.4s, v13.4s, v17.4s -add v13.4s, v13.4s, v17.4s -sub v17.4s, v1.4s, v21.4s -add v1.4s, v1.4s, v21.4s -sub v21.4s, v0.4s, v20.4s -add v0.4s, v0.4s, v20.4s -sub v20.4s, v19.4s, v10.4s -add v19.4s, v19.4s, v10.4s -sub v10.4s, v2.4s, v22.4s -add v2.4s, v2.4s, v22.4s -sqrdmulh v22.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -mla v12.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v11.4S, v27.s[0] -mul v11.4S, v11.4S,v28.s[0] -mla v11.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -mla v15.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v16.4S, v27.s[1] -mul v16.4S, v16.4S,v28.s[1] -mla v16.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -mla v1.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v0.4S, v27.s[2] -mul v0.4S, v0.4S,v28.s[2] -mla v0.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v17.4S, v27.s[3] -mul v17.4S, v17.4S,v28.s[3] -mla v17.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -sub v12.4s, v13.4s, v11.4s -add v13.4s, v13.4s, v11.4s -sub v11.4s, v3.4s, v15.4s -add v3.4s, v3.4s, v15.4s -sub v15.4s, v18.4s, v16.4s -add v18.4s, v18.4s, v16.4s -sub v16.4s, v19.4s, v1.4s -add v19.4s, v19.4s, v1.4s -sub v1.4s, v2.4s, v0.4s -add v2.4s, v2.4s, v0.4s -sub v0.4s, v20.4s, v17.4s -add v20.4s, v20.4s, v17.4s -sub v17.4s, v10.4s, v21.4s -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v13.4S, v25.s[0] -mul v13.4S, v13.4S,v26.s[0] -mla v13.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v12.4S, v25.s[1] -mul v12.4S, v12.4S,v26.s[1] -mla v12.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v18.4S, v25.s[2] -mul v18.4S, v18.4S,v26.s[2] -mla v18.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v15.4S, v25.s[3] -mul v15.4S, v15.4S,v26.s[3] -mla v15.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v2.4S, v23.s[0] -mul v2.4S, v2.4S,v24.s[0] -mla v2.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v1.4S, v23.s[1] -mul v1.4S, v1.4S,v24.s[1] -mla v1.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v10.4S, v23.s[2] -mul v10.4S, v10.4S,v24.s[2] -mla v10.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v17.4S, v23.s[3] -mul v17.4S, v17.4S,v24.s[3] -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v13.4s -add v14.4s, v14.4s, v13.4s -sub v13.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sub v12.4s, v3.4s, v18.4s -add v3.4s, v3.4s, v18.4s -sub v18.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sub v15.4s, v19.4s, v2.4s -add v19.4s, v19.4s, v2.4s -sub v2.4s, v16.4s, v1.4s -add v16.4s, v16.4s, v1.4s -sub v1.4s, v20.4s, v10.4s -add v20.4s, v20.4s, v10.4s -sub v10.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -str q14, [x0, #32] -str q21, [x0, #96] -str q22, [x0, #160] -str q13, [x0, #224] -str q3, [x0, #288] -str q12, [x0, #352] -str q11, [x0, #416] -str q18, [x0, #480] -str q19, [x0, #544] -str q15, [x0, #608] -str q16, [x0, #672] -str q2, [x0, #736] -str q20, [x0, #800] -str q1, [x0, #864] -str q0, [x0, #928] -str q10, [x0, #992] -ldr q10, [x0, #816] -ldr q0, [x0, #880] -ldr q1, [x0, #944] -ldr q20, [x0, #1008] -ldr q2, [x0, #304] -ldr q16, [x0, #368] -ldr q15, [x0, #432] -ldr q19, [x0, #496] -ldr q18, [x0, #560] -ldr q11, [x0, #624] -ldr q12, [x0, #688] -ldr q3, [x0, #752] -ldr q13, [x0, #48] -ldr q22, [x0, #112] -ldr q21, [x0, #176] -ldr q14, [x0, #240] -sqrdmulh v17.4S, v10.4S, v29.s[0] -mul v10.4S, v10.4S,v30.s[0] -mla v10.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v0.4S, v29.s[0] -mul v0.4S, v0.4S,v30.s[0] -mla v0.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -mla v1.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -mla v20.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -mla v18.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -mla v11.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -mla v12.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -mla v3.4S, v17.4S, v31.s[0] -sub v17.4s, v2.4s, v10.4s -add v2.4s, v2.4s, v10.4s -sub v10.4s, v16.4s, v0.4s -add v16.4s, v16.4s, v0.4s -sub v0.4s, v15.4s, v1.4s -add v15.4s, v15.4s, v1.4s -sub v1.4s, v19.4s, v20.4s -add v19.4s, v19.4s, v20.4s -sub v20.4s, v13.4s, v18.4s -add v13.4s, v13.4s, v18.4s -sub v18.4s, v22.4s, v11.4s -add v22.4s, v22.4s, v11.4s -sub v11.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -sub v12.4s, v14.4s, v3.4s -add v14.4s, v14.4s, v3.4s -sqrdmulh v3.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -mla v15.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -mla v19.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v2.4S, v29.s[1] -mul v2.4S, v2.4S,v30.s[1] -mla v2.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -mla v16.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v0.4S, v29.s[2] -mul v0.4S, v0.4S,v30.s[2] -mla v0.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -mla v1.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -mla v17.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -mla v10.4S, v3.4S, v31.s[0] -sub v3.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -sub v15.4s, v14.4s, v19.4s -add v14.4s, v14.4s, v19.4s -sub v19.4s, v13.4s, v2.4s -add v13.4s, v13.4s, v2.4s -sub v2.4s, v22.4s, v16.4s -add v22.4s, v22.4s, v16.4s -sub v16.4s, v11.4s, v0.4s -add v11.4s, v11.4s, v0.4s -sub v0.4s, v12.4s, v1.4s -add v12.4s, v12.4s, v1.4s -sub v1.4s, v20.4s, v17.4s -add v20.4s, v20.4s, v17.4s -sub v17.4s, v18.4s, v10.4s -add v18.4s, v18.4s, v10.4s -sqrdmulh v10.4S, v21.4S, v27.s[0] -mul v21.4S, v21.4S,v28.s[0] -mla v21.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -mla v14.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -mla v3.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -mla v15.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v11.4S, v27.s[2] -mul v11.4S, v11.4S,v28.s[2] -mla v11.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v12.4S, v27.s[2] -mul v12.4S, v12.4S,v28.s[2] -mla v12.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v16.4S, v27.s[3] -mul v16.4S, v16.4S,v28.s[3] -mla v16.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v0.4S, v27.s[3] -mul v0.4S, v0.4S,v28.s[3] -mla v0.4S, v10.4S, v31.s[0] -sub v10.4s, v13.4s, v21.4s -add v13.4s, v13.4s, v21.4s -sub v21.4s, v22.4s, v14.4s -add v22.4s, v22.4s, v14.4s -sub v14.4s, v19.4s, v3.4s -add v19.4s, v19.4s, v3.4s -sub v3.4s, v2.4s, v15.4s -add v2.4s, v2.4s, v15.4s -sub v15.4s, v20.4s, v11.4s -add v20.4s, v20.4s, v11.4s -sub v11.4s, v18.4s, v12.4s -add v18.4s, v18.4s, v12.4s -sub v12.4s, v1.4s, v16.4s -add v1.4s, v1.4s, v16.4s -sub v16.4s, v17.4s, v0.4s -add v17.4s, v17.4s, v0.4s -sqrdmulh v0.4S, v22.4S, v25.s[0] -mul v22.4S, v22.4S,v26.s[0] -mla v22.4S, v0.4S, v31.s[0] -sqrdmulh v0.4S, v21.4S, v25.s[1] -mul v21.4S, v21.4S,v26.s[1] -mla v21.4S, v0.4S, v31.s[0] -sqrdmulh v0.4S, v2.4S, v25.s[2] -mul v2.4S, v2.4S,v26.s[2] -mla v2.4S, v0.4S, v31.s[0] -sqrdmulh v0.4S, v3.4S, v25.s[3] -mul v3.4S, v3.4S,v26.s[3] -mla v3.4S, v0.4S, v31.s[0] -sqrdmulh v0.4S, v18.4S, v23.s[0] -mul v18.4S, v18.4S,v24.s[0] -mla v18.4S, v0.4S, v31.s[0] -sqrdmulh v0.4S, v11.4S, v23.s[1] -mul v11.4S, v11.4S,v24.s[1] -mla v11.4S, v0.4S, v31.s[0] -sqrdmulh v0.4S, v17.4S, v23.s[2] -mul v17.4S, v17.4S,v24.s[2] -mla v17.4S, v0.4S, v31.s[0] -sqrdmulh v0.4S, v16.4S, v23.s[3] -mul v16.4S, v16.4S,v24.s[3] -mla v16.4S, v0.4S, v31.s[0] -sub v0.4s, v13.4s, v22.4s -add v13.4s, v13.4s, v22.4s -sub v22.4s, v10.4s, v21.4s -add v10.4s, v10.4s, v21.4s -sub v21.4s, v19.4s, v2.4s -add v19.4s, v19.4s, v2.4s -sub v2.4s, v14.4s, v3.4s -add v14.4s, v14.4s, v3.4s -sub v3.4s, v20.4s, v18.4s -add v20.4s, v20.4s, v18.4s -sub v18.4s, v15.4s, v11.4s -add v15.4s, v15.4s, v11.4s -sub v11.4s, v1.4s, v17.4s -add v1.4s, v1.4s, v17.4s -sub v17.4s, v12.4s, v16.4s -add v12.4s, v12.4s, v16.4s -str q13, [x0, #48] -str q0, [x0, #112] -str q10, [x0, #176] -str q22, [x0, #240] -str q19, [x0, #304] -str q21, [x0, #368] -str q14, [x0, #432] -str q2, [x0, #496] -str q20, [x0, #560] -str q3, [x0, #624] -str q15, [x0, #688] -str q18, [x0, #752] -str q1, [x0, #816] -str q11, [x0, #880] -str q12, [x0, #944] -str q17, [x0, #1008] -ldr q17, [x0, #768] -ldr q12, [x0, #832] -ldr q11, [x0, #896] -ldr q1, [x0, #960] -ldr q18, [x0, #256] -ldr q15, [x0, #320] -ldr q3, [x0, #384] -ldr q20, [x0, #448] -ldr q2, [x0, #512] -ldr q14, [x0, #576] -ldr q21, [x0, #640] -ldr q19, [x0, #704] -ldr q22, [x0, #0] -ldr q10, [x0, #64] -ldr q0, [x0, #128] -ldr q13, [x0, #192] -sqrdmulh v16.4S, v17.4S, v29.s[0] -mul v17.4S, v17.4S,v30.s[0] -mla v17.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -mla v12.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -mla v11.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -mla v1.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -mla v2.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -mla v14.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -mla v21.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -mla v19.4S, v16.4S, v31.s[0] -sub v16.4s, v18.4s, v17.4s -add v18.4s, v18.4s, v17.4s -sub v17.4s, v15.4s, v12.4s -add v15.4s, v15.4s, v12.4s -sub v12.4s, v3.4s, v11.4s -add v3.4s, v3.4s, v11.4s -sub v11.4s, v20.4s, v1.4s -add v20.4s, v20.4s, v1.4s -sub v1.4s, v22.4s, v2.4s -add v22.4s, v22.4s, v2.4s -sub v2.4s, v10.4s, v14.4s -add v10.4s, v10.4s, v14.4s -sub v14.4s, v0.4s, v21.4s -add v0.4s, v0.4s, v21.4s -sub v21.4s, v13.4s, v19.4s -add v13.4s, v13.4s, v19.4s -sqrdmulh v19.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -mla v3.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -mla v20.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -mla v18.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -mla v15.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v12.4S, v29.s[2] -mul v12.4S, v12.4S,v30.s[2] -mla v12.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -mla v11.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -mla v16.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -mla v17.4S, v19.4S, v31.s[0] -sub v19.4s, v0.4s, v3.4s -add v0.4s, v0.4s, v3.4s -sub v3.4s, v13.4s, v20.4s -add v13.4s, v13.4s, v20.4s -sub v20.4s, v22.4s, v18.4s -add v22.4s, v22.4s, v18.4s -sub v18.4s, v10.4s, v15.4s -add v10.4s, v10.4s, v15.4s -sub v15.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -sub v12.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sub v11.4s, v1.4s, v16.4s -add v1.4s, v1.4s, v16.4s -sub v16.4s, v2.4s, v17.4s -add v2.4s, v2.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v27.s[0] -mul v0.4S, v0.4S,v28.s[0] -mla v0.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v13.4S, v27.s[0] -mul v13.4S, v13.4S,v28.s[0] -mla v13.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -mla v19.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -mla v3.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v14.4S, v27.s[2] -mul v14.4S, v14.4S,v28.s[2] -mla v14.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v21.4S, v27.s[2] -mul v21.4S, v21.4S,v28.s[2] -mla v21.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -mla v15.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v12.4S, v27.s[3] -mul v12.4S, v12.4S,v28.s[3] -mla v12.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v0.4s -add v22.4s, v22.4s, v0.4s -sub v0.4s, v10.4s, v13.4s -add v10.4s, v10.4s, v13.4s -sub v13.4s, v20.4s, v19.4s -add v20.4s, v20.4s, v19.4s -sub v19.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sub v3.4s, v1.4s, v14.4s -add v1.4s, v1.4s, v14.4s -sub v14.4s, v2.4s, v21.4s -add v2.4s, v2.4s, v21.4s -sub v21.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sub v15.4s, v16.4s, v12.4s -add v16.4s, v16.4s, v12.4s -sqrdmulh v12.4S, v10.4S, v25.s[0] -mul v10.4S, v10.4S,v26.s[0] -mla v10.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v0.4S, v25.s[1] -mul v0.4S, v0.4S,v26.s[1] -mla v0.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v18.4S, v25.s[2] -mul v18.4S, v18.4S,v26.s[2] -mla v18.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v19.4S, v25.s[3] -mul v19.4S, v19.4S,v26.s[3] -mla v19.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v2.4S, v23.s[0] -mul v2.4S, v2.4S,v24.s[0] -mla v2.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v14.4S, v23.s[1] -mul v14.4S, v14.4S,v24.s[1] -mla v14.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v16.4S, v23.s[2] -mul v16.4S, v16.4S,v24.s[2] -mla v16.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v15.4S, v23.s[3] -mul v15.4S, v15.4S,v24.s[3] -mla v15.4S, v12.4S, v31.s[0] -sub v12.4s, v22.4s, v10.4s -add v22.4s, v22.4s, v10.4s -sub v10.4s, v17.4s, v0.4s -add v17.4s, v17.4s, v0.4s -sub v0.4s, v20.4s, v18.4s -add v20.4s, v20.4s, v18.4s -sub v18.4s, v13.4s, v19.4s -add v13.4s, v13.4s, v19.4s -sub v19.4s, v1.4s, v2.4s -add v1.4s, v1.4s, v2.4s -sub v2.4s, v3.4s, v14.4s -add v3.4s, v3.4s, v14.4s -sub v14.4s, v11.4s, v16.4s -add v11.4s, v11.4s, v16.4s -sub v16.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -str q22, [x0, #0] -str q12, [x0, #64] -str q17, [x0, #128] -str q10, [x0, #192] -str q20, [x0, #256] -str q0, [x0, #320] -str q13, [x0, #384] -str q18, [x0, #448] -str q1, [x0, #512] -str q19, [x0, #576] -str q3, [x0, #640] -str q2, [x0, #704] -str q11, [x0, #768] -str q14, [x0, #832] -str q21, [x0, #896] -str q16, [x0, #960] -ldr q16, [x0, #784] -ldr q21, [x0, #848] -ldr q14, [x0, #912] -ldr q11, [x0, #976] -ldr q2, [x0, #272] -ldr q3, [x0, #336] -ldr q19, [x0, #400] -ldr q1, [x0, #464] -ldr q18, [x0, #528] -ldr q13, [x0, #592] -ldr q0, [x0, #656] -ldr q20, [x0, #720] -ldr q10, [x0, #16] -ldr q17, [x0, #80] -ldr q12, [x0, #144] -ldr q22, [x0, #208] -sqrdmulh v15.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -mla v16.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -mla v21.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -mla v14.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -mla v11.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -mla v18.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -mla v13.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v0.4S, v29.s[0] -mul v0.4S, v0.4S,v30.s[0] -mla v0.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -mla v20.4S, v15.4S, v31.s[0] -sub v15.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sub v16.4s, v3.4s, v21.4s -add v3.4s, v3.4s, v21.4s -sub v21.4s, v19.4s, v14.4s -add v19.4s, v19.4s, v14.4s -sub v14.4s, v1.4s, v11.4s -add v1.4s, v1.4s, v11.4s -sub v11.4s, v10.4s, v18.4s -add v10.4s, v10.4s, v18.4s -sub v18.4s, v17.4s, v13.4s -add v17.4s, v17.4s, v13.4s -sub v13.4s, v12.4s, v0.4s -add v12.4s, v12.4s, v0.4s -sub v0.4s, v22.4s, v20.4s -add v22.4s, v22.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -mla v19.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v1.4S, v29.s[1] -mul v1.4S, v1.4S,v30.s[1] -mla v1.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v2.4S, v29.s[1] -mul v2.4S, v2.4S,v30.s[1] -mla v2.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -mla v3.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -mla v21.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -mla v14.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -mla v15.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -mla v16.4S, v20.4S, v31.s[0] -sub v20.4s, v12.4s, v19.4s -add v12.4s, v12.4s, v19.4s -sub v19.4s, v22.4s, v1.4s -add v22.4s, v22.4s, v1.4s -sub v1.4s, v10.4s, v2.4s -add v10.4s, v10.4s, v2.4s -sub v2.4s, v17.4s, v3.4s -add v17.4s, v17.4s, v3.4s -sub v3.4s, v13.4s, v21.4s -add v13.4s, v13.4s, v21.4s -sub v21.4s, v0.4s, v14.4s -add v0.4s, v0.4s, v14.4s -sub v14.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sub v15.4s, v18.4s, v16.4s -add v18.4s, v18.4s, v16.4s -sqrdmulh v16.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -mla v12.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v22.4S, v27.s[0] -mul v22.4S, v22.4S,v28.s[0] -mla v22.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v20.4S, v27.s[1] -mul v20.4S, v20.4S,v28.s[1] -mla v20.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -mla v19.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v13.4S, v27.s[2] -mul v13.4S, v13.4S,v28.s[2] -mla v13.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v0.4S, v27.s[2] -mul v0.4S, v0.4S,v28.s[2] -mla v0.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v3.4S, v27.s[3] -mul v3.4S, v3.4S,v28.s[3] -mla v3.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -mla v21.4S, v16.4S, v31.s[0] -sub v16.4s, v10.4s, v12.4s -add v10.4s, v10.4s, v12.4s -sub v12.4s, v17.4s, v22.4s -add v17.4s, v17.4s, v22.4s -sub v22.4s, v1.4s, v20.4s -add v1.4s, v1.4s, v20.4s -sub v20.4s, v2.4s, v19.4s -add v2.4s, v2.4s, v19.4s -sub v19.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sub v13.4s, v18.4s, v0.4s -add v18.4s, v18.4s, v0.4s -sub v0.4s, v14.4s, v3.4s -add v14.4s, v14.4s, v3.4s -sub v3.4s, v15.4s, v21.4s -add v15.4s, v15.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v25.s[0] -mul v17.4S, v17.4S,v26.s[0] -mla v17.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v12.4S, v25.s[1] -mul v12.4S, v12.4S,v26.s[1] -mla v12.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v2.4S, v25.s[2] -mul v2.4S, v2.4S,v26.s[2] -mla v2.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v20.4S, v25.s[3] -mul v20.4S, v20.4S,v26.s[3] -mla v20.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v18.4S, v23.s[0] -mul v18.4S, v18.4S,v24.s[0] -mla v18.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v13.4S, v23.s[1] -mul v13.4S, v13.4S,v24.s[1] -mla v13.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v15.4S, v23.s[2] -mul v15.4S, v15.4S,v24.s[2] -mla v15.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v3.4S, v23.s[3] -mul v3.4S, v3.4S,v24.s[3] -mla v3.4S, v21.4S, v31.s[0] -sub v21.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -sub v17.4s, v16.4s, v12.4s -add v16.4s, v16.4s, v12.4s -sub v12.4s, v1.4s, v2.4s -add v1.4s, v1.4s, v2.4s -sub v2.4s, v22.4s, v20.4s -add v22.4s, v22.4s, v20.4s -sub v20.4s, v11.4s, v18.4s -add v11.4s, v11.4s, v18.4s -sub v18.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -sub v13.4s, v14.4s, v15.4s -add v14.4s, v14.4s, v15.4s -sub v15.4s, v0.4s, v3.4s -add v0.4s, v0.4s, v3.4s -str q10, [x0, #16] -str q21, [x0, #80] -str q16, [x0, #144] -str q17, [x0, #208] -str q1, [x0, #272] -str q12, [x0, #336] -str q22, [x0, #400] -str q2, [x0, #464] -str q11, [x0, #528] -str q20, [x0, #592] -str q19, [x0, #656] -str q18, [x0, #720] -str q14, [x0, #784] -str q13, [x0, #848] -str q0, [x0, #912] -str q15, [x0, #976] -ldr q4, [x17, #+128] -ldr q5, [x17, #+144] -ldr q6, [x17, #+160] -ldr q7, [x17, #+176] -ldr q8, [x17, #+192] -ldr q9, [x17, #+208] -ldr q3, [x17, #+224] -ldr q10, [x17, #+240] -ldr q21, [x0, #32] -ldr q16, [x0, #48] -ldr q17, [x0, #0] -ldr q1, [x0, #16] -sqrdmulh v12.4S, v21.4S, v5.s[0] -mul v21.4S, v21.4S,v4.s[0] -mla v21.4S, v12.4S, v31.s[0] -sub v12.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v16.4S, v5.s[0] -mul v16.4S, v16.4S,v4.s[0] -mla v16.4S, v21.4S, v31.s[0] -sub v21.4s, v1.4s, v16.4s -add v1.4s, v1.4s, v16.4s -ldr q16, [x17, #+256] -ldr q22, [x17, #+272] -sqrdmulh v2.4S, v1.4S, v5.s[1] -mul v1.4S, v1.4S,v4.s[1] -mla v1.4S, v2.4S, v31.s[0] -sub v2.4s, v17.4s, v1.4s -add v17.4s, v17.4s, v1.4s -sqrdmulh v1.4S, v21.4S, v5.s[2] -mul v21.4S, v21.4S,v4.s[2] -mla v21.4S, v1.4S, v31.s[0] -sub v1.4s, v12.4s, v21.4s -add v12.4s, v12.4s, v21.4s -str q17, [x0, #0] -str q2, [x0, #16] -str q12, [x0, #32] -str q1, [x0, #48] -ldr q1, [x0, #96] -ldr q12, [x0, #112] -ldr q2, [x0, #64] -ldr q17, [x0, #80] -sqrdmulh v21.4S, v1.4S, v7.s[0] -mul v1.4S, v1.4S,v6.s[0] -mla v1.4S, v21.4S, v31.s[0] -sub v21.4s, v2.4s, v1.4s -add v2.4s, v2.4s, v1.4s -sqrdmulh v1.4S, v12.4S, v7.s[0] -mul v12.4S, v12.4S,v6.s[0] -mla v12.4S, v1.4S, v31.s[0] -sub v1.4s, v17.4s, v12.4s -add v17.4s, v17.4s, v12.4s -ldr q12, [x17, #+288] -ldr q11, [x17, #+304] -sqrdmulh v20.4S, v17.4S, v7.s[1] -mul v17.4S, v17.4S,v6.s[1] -mla v17.4S, v20.4S, v31.s[0] -sub v20.4s, v2.4s, v17.4s -add v2.4s, v2.4s, v17.4s -sqrdmulh v17.4S, v1.4S, v7.s[2] -mul v1.4S, v1.4S,v6.s[2] -mla v1.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v1.4s -add v21.4s, v21.4s, v1.4s -str q2, [x0, #64] -str q20, [x0, #80] -str q21, [x0, #96] -str q17, [x0, #112] -ldr q17, [x0, #160] -ldr q21, [x0, #176] -ldr q20, [x0, #128] -ldr q2, [x0, #144] -sqrdmulh v1.4S, v17.4S, v9.s[0] -mul v17.4S, v17.4S,v8.s[0] -mla v17.4S, v1.4S, v31.s[0] -sub v1.4s, v20.4s, v17.4s -add v20.4s, v20.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v9.s[0] -mul v21.4S, v21.4S,v8.s[0] -mla v21.4S, v17.4S, v31.s[0] -sub v17.4s, v2.4s, v21.4s -add v2.4s, v2.4s, v21.4s -ldr q21, [x17, #+320] -ldr q19, [x17, #+336] -sqrdmulh v18.4S, v2.4S, v9.s[1] -mul v2.4S, v2.4S,v8.s[1] -mla v2.4S, v18.4S, v31.s[0] -sub v18.4s, v20.4s, v2.4s -add v20.4s, v20.4s, v2.4s -sqrdmulh v2.4S, v17.4S, v9.s[2] -mul v17.4S, v17.4S,v8.s[2] -mla v17.4S, v2.4S, v31.s[0] -sub v2.4s, v1.4s, v17.4s -add v1.4s, v1.4s, v17.4s -str q20, [x0, #128] -str q18, [x0, #144] -str q1, [x0, #160] -str q2, [x0, #176] -ldr q2, [x0, #224] -ldr q1, [x0, #240] -ldr q18, [x0, #192] -ldr q20, [x0, #208] -sqrdmulh v17.4S, v2.4S, v10.s[0] -mul v2.4S, v2.4S,v3.s[0] -mla v2.4S, v17.4S, v31.s[0] -sub v17.4s, v18.4s, v2.4s -add v18.4s, v18.4s, v2.4s -sqrdmulh v2.4S, v1.4S, v10.s[0] -mul v1.4S, v1.4S,v3.s[0] -mla v1.4S, v2.4S, v31.s[0] -sub v2.4s, v20.4s, v1.4s -add v20.4s, v20.4s, v1.4s -ldr q1, [x17, #+352] -ldr q14, [x17, #+368] -sqrdmulh v13.4S, v20.4S, v10.s[1] -mul v20.4S, v20.4S,v3.s[1] -mla v20.4S, v13.4S, v31.s[0] -sub v13.4s, v18.4s, v20.4s -add v18.4s, v18.4s, v20.4s -sqrdmulh v20.4S, v2.4S, v10.s[2] -mul v2.4S, v2.4S,v3.s[2] -mla v2.4S, v20.4S, v31.s[0] -sub v20.4s, v17.4s, v2.4s -add v17.4s, v17.4s, v2.4s -str q18, [x0, #192] -str q13, [x0, #208] -str q17, [x0, #224] -str q20, [x0, #240] -ldr q20, [x0, #288] -ldr q17, [x0, #304] -ldr q13, [x0, #256] -ldr q18, [x0, #272] -sqrdmulh v2.4S, v20.4S, v22.s[0] -mul v20.4S, v20.4S,v16.s[0] -mla v20.4S, v2.4S, v31.s[0] -sub v2.4s, v13.4s, v20.4s -add v13.4s, v13.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v22.s[0] -mul v17.4S, v17.4S,v16.s[0] -mla v17.4S, v20.4S, v31.s[0] -sub v20.4s, v18.4s, v17.4s -add v18.4s, v18.4s, v17.4s -ldr q17, [x17, #+384] -ldr q0, [x17, #+400] -sqrdmulh v15.4S, v18.4S, v22.s[1] -mul v18.4S, v18.4S,v16.s[1] -mla v18.4S, v15.4S, v31.s[0] -sub v15.4s, v13.4s, v18.4s -add v13.4s, v13.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v22.s[2] -mul v20.4S, v20.4S,v16.s[2] -mla v20.4S, v18.4S, v31.s[0] -sub v18.4s, v2.4s, v20.4s -add v2.4s, v2.4s, v20.4s -str q13, [x0, #256] -str q15, [x0, #272] -str q2, [x0, #288] -str q18, [x0, #304] -ldr q5, [x0, #352] -ldr q4, [x0, #368] -ldr q18, [x0, #320] -ldr q2, [x0, #336] -sqrdmulh v15.4S, v5.4S, v11.s[0] -mul v5.4S, v5.4S,v12.s[0] -mla v5.4S, v15.4S, v31.s[0] -sub v15.4s, v18.4s, v5.4s -add v18.4s, v18.4s, v5.4s -sqrdmulh v5.4S, v4.4S, v11.s[0] -mul v4.4S, v4.4S,v12.s[0] -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v2.4s, v4.4s -add v2.4s, v2.4s, v4.4s -ldr q4, [x17, #+416] -ldr q13, [x17, #+432] -sqrdmulh v20.4S, v2.4S, v11.s[1] -mul v2.4S, v2.4S,v12.s[1] -mla v2.4S, v20.4S, v31.s[0] -sub v20.4s, v18.4s, v2.4s -add v18.4s, v18.4s, v2.4s -sqrdmulh v2.4S, v5.4S, v11.s[2] -mul v5.4S, v5.4S,v12.s[2] -mla v5.4S, v2.4S, v31.s[0] -sub v2.4s, v15.4s, v5.4s -add v15.4s, v15.4s, v5.4s -str q18, [x0, #320] -str q20, [x0, #336] -str q15, [x0, #352] -str q2, [x0, #368] -ldr q7, [x0, #416] -ldr q6, [x0, #432] -ldr q2, [x0, #384] -ldr q15, [x0, #400] -sqrdmulh v20.4S, v7.4S, v19.s[0] -mul v7.4S, v7.4S,v21.s[0] -mla v7.4S, v20.4S, v31.s[0] -sub v20.4s, v2.4s, v7.4s -add v2.4s, v2.4s, v7.4s -sqrdmulh v7.4S, v6.4S, v19.s[0] -mul v6.4S, v6.4S,v21.s[0] -mla v6.4S, v7.4S, v31.s[0] -sub v7.4s, v15.4s, v6.4s -add v15.4s, v15.4s, v6.4s -ldr q6, [x17, #+448] -ldr q18, [x17, #+464] -sqrdmulh v5.4S, v15.4S, v19.s[1] -mul v15.4S, v15.4S,v21.s[1] -mla v15.4S, v5.4S, v31.s[0] -sub v5.4s, v2.4s, v15.4s -add v2.4s, v2.4s, v15.4s -sqrdmulh v15.4S, v7.4S, v19.s[2] -mul v7.4S, v7.4S,v21.s[2] -mla v7.4S, v15.4S, v31.s[0] -sub v15.4s, v20.4s, v7.4s -add v20.4s, v20.4s, v7.4s -str q2, [x0, #384] -str q5, [x0, #400] -str q20, [x0, #416] -str q15, [x0, #432] -ldr q9, [x0, #480] -ldr q8, [x0, #496] -ldr q15, [x0, #448] -ldr q20, [x0, #464] -sqrdmulh v5.4S, v9.4S, v14.s[0] -mul v9.4S, v9.4S,v1.s[0] -mla v9.4S, v5.4S, v31.s[0] -sub v5.4s, v15.4s, v9.4s -add v15.4s, v15.4s, v9.4s -sqrdmulh v9.4S, v8.4S, v14.s[0] -mul v8.4S, v8.4S,v1.s[0] -mla v8.4S, v9.4S, v31.s[0] -sub v9.4s, v20.4s, v8.4s -add v20.4s, v20.4s, v8.4s -ldr q8, [x17, #+480] -ldr q2, [x17, #+496] -sqrdmulh v7.4S, v20.4S, v14.s[1] -mul v20.4S, v20.4S,v1.s[1] -mla v20.4S, v7.4S, v31.s[0] -sub v7.4s, v15.4s, v20.4s -add v15.4s, v15.4s, v20.4s -sqrdmulh v20.4S, v9.4S, v14.s[2] -mul v9.4S, v9.4S,v1.s[2] -mla v9.4S, v20.4S, v31.s[0] -sub v20.4s, v5.4s, v9.4s -add v5.4s, v5.4s, v9.4s -str q15, [x0, #448] -str q7, [x0, #464] -str q5, [x0, #480] -str q20, [x0, #496] -ldr q10, [x0, #544] -ldr q3, [x0, #560] -ldr q20, [x0, #512] -ldr q5, [x0, #528] -sqrdmulh v7.4S, v10.4S, v0.s[0] -mul v10.4S, v10.4S,v17.s[0] -mla v10.4S, v7.4S, v31.s[0] -sub v7.4s, v20.4s, v10.4s -add v20.4s, v20.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v0.s[0] -mul v3.4S, v3.4S,v17.s[0] -mla v3.4S, v10.4S, v31.s[0] -sub v10.4s, v5.4s, v3.4s -add v5.4s, v5.4s, v3.4s -ldr q3, [x17, #+512] -ldr q15, [x17, #+528] -sqrdmulh v9.4S, v5.4S, v0.s[1] -mul v5.4S, v5.4S,v17.s[1] -mla v5.4S, v9.4S, v31.s[0] -sub v9.4s, v20.4s, v5.4s -add v20.4s, v20.4s, v5.4s -sqrdmulh v5.4S, v10.4S, v0.s[2] -mul v10.4S, v10.4S,v17.s[2] -mla v10.4S, v5.4S, v31.s[0] -sub v5.4s, v7.4s, v10.4s -add v7.4s, v7.4s, v10.4s -str q20, [x0, #512] -str q9, [x0, #528] -str q7, [x0, #544] -str q5, [x0, #560] -ldr q22, [x0, #608] -ldr q16, [x0, #624] -ldr q5, [x0, #576] -ldr q7, [x0, #592] -sqrdmulh v9.4S, v22.4S, v13.s[0] -mul v22.4S, v22.4S,v4.s[0] -mla v22.4S, v9.4S, v31.s[0] -sub v9.4s, v5.4s, v22.4s -add v5.4s, v5.4s, v22.4s -sqrdmulh v22.4S, v16.4S, v13.s[0] -mul v16.4S, v16.4S,v4.s[0] -mla v16.4S, v22.4S, v31.s[0] -sub v22.4s, v7.4s, v16.4s -add v7.4s, v7.4s, v16.4s -ldr q16, [x17, #+544] -ldr q20, [x17, #+560] -sqrdmulh v10.4S, v7.4S, v13.s[1] -mul v7.4S, v7.4S,v4.s[1] -mla v7.4S, v10.4S, v31.s[0] -sub v10.4s, v5.4s, v7.4s -add v5.4s, v5.4s, v7.4s -sqrdmulh v7.4S, v22.4S, v13.s[2] -mul v22.4S, v22.4S,v4.s[2] -mla v22.4S, v7.4S, v31.s[0] -sub v7.4s, v9.4s, v22.4s -add v9.4s, v9.4s, v22.4s -str q5, [x0, #576] -str q10, [x0, #592] -str q9, [x0, #608] -str q7, [x0, #624] -ldr q11, [x0, #672] -ldr q12, [x0, #688] -ldr q7, [x0, #640] -ldr q9, [x0, #656] -sqrdmulh v10.4S, v11.4S, v18.s[0] -mul v11.4S, v11.4S,v6.s[0] -mla v11.4S, v10.4S, v31.s[0] -sub v10.4s, v7.4s, v11.4s -add v7.4s, v7.4s, v11.4s -sqrdmulh v11.4S, v12.4S, v18.s[0] -mul v12.4S, v12.4S,v6.s[0] -mla v12.4S, v11.4S, v31.s[0] -sub v11.4s, v9.4s, v12.4s -add v9.4s, v9.4s, v12.4s -ldr q12, [x17, #+576] -ldr q5, [x17, #+592] -sqrdmulh v22.4S, v9.4S, v18.s[1] -mul v9.4S, v9.4S,v6.s[1] -mla v9.4S, v22.4S, v31.s[0] -sub v22.4s, v7.4s, v9.4s -add v7.4s, v7.4s, v9.4s -sqrdmulh v9.4S, v11.4S, v18.s[2] -mul v11.4S, v11.4S,v6.s[2] -mla v11.4S, v9.4S, v31.s[0] -sub v9.4s, v10.4s, v11.4s -add v10.4s, v10.4s, v11.4s -str q7, [x0, #640] -str q22, [x0, #656] -str q10, [x0, #672] -str q9, [x0, #688] -ldr q19, [x0, #736] -ldr q21, [x0, #752] -ldr q9, [x0, #704] -ldr q10, [x0, #720] -sqrdmulh v22.4S, v19.4S, v2.s[0] -mul v19.4S, v19.4S,v8.s[0] -mla v19.4S, v22.4S, v31.s[0] -sub v22.4s, v9.4s, v19.4s -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v21.4S, v2.s[0] -mul v21.4S, v21.4S,v8.s[0] -mla v21.4S, v19.4S, v31.s[0] -sub v19.4s, v10.4s, v21.4s -add v10.4s, v10.4s, v21.4s -ldr q21, [x17, #+608] -ldr q7, [x17, #+624] -sqrdmulh v11.4S, v10.4S, v2.s[1] -mul v10.4S, v10.4S,v8.s[1] -mla v10.4S, v11.4S, v31.s[0] -sub v11.4s, v9.4s, v10.4s -add v9.4s, v9.4s, v10.4s -sqrdmulh v10.4S, v19.4S, v2.s[2] -mul v19.4S, v19.4S,v8.s[2] -mla v19.4S, v10.4S, v31.s[0] -sub v10.4s, v22.4s, v19.4s -add v22.4s, v22.4s, v19.4s -str q9, [x0, #704] -str q11, [x0, #720] -str q22, [x0, #736] -str q10, [x0, #752] -ldr q14, [x0, #800] -ldr q1, [x0, #816] -ldr q10, [x0, #768] -ldr q22, [x0, #784] -sqrdmulh v11.4S, v14.4S, v15.s[0] -mul v14.4S, v14.4S,v3.s[0] -mla v14.4S, v11.4S, v31.s[0] -sub v11.4s, v10.4s, v14.4s -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v1.4S, v15.s[0] -mul v1.4S, v1.4S,v3.s[0] -mla v1.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v1.4s -add v22.4s, v22.4s, v1.4s -sqrdmulh v1.4S, v22.4S, v15.s[1] -mul v22.4S, v22.4S,v3.s[1] -mla v22.4S, v1.4S, v31.s[0] -sub v1.4s, v10.4s, v22.4s -add v10.4s, v10.4s, v22.4s -sqrdmulh v22.4S, v14.4S, v15.s[2] -mul v14.4S, v14.4S,v3.s[2] -mla v14.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -str q10, [x0, #768] -str q1, [x0, #784] -str q11, [x0, #800] -str q22, [x0, #816] -ldr q0, [x0, #864] -ldr q17, [x0, #880] -ldr q22, [x0, #832] -ldr q11, [x0, #848] -sqrdmulh v1.4S, v0.4S, v20.s[0] -mul v0.4S, v0.4S,v16.s[0] -mla v0.4S, v1.4S, v31.s[0] -sub v1.4s, v22.4s, v0.4s -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v17.4S, v20.s[0] -mul v17.4S, v17.4S,v16.s[0] -mla v17.4S, v0.4S, v31.s[0] -sub v0.4s, v11.4s, v17.4s -add v11.4s, v11.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v20.s[1] -mul v11.4S, v11.4S,v16.s[1] -mla v11.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v11.4s -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v0.4S, v20.s[2] -mul v0.4S, v0.4S,v16.s[2] -mla v0.4S, v11.4S, v31.s[0] -sub v11.4s, v1.4s, v0.4s -add v1.4s, v1.4s, v0.4s -str q22, [x0, #832] -str q17, [x0, #848] -str q1, [x0, #864] -str q11, [x0, #880] -ldr q13, [x0, #928] -ldr q4, [x0, #944] -ldr q11, [x0, #896] -ldr q1, [x0, #912] -sqrdmulh v17.4S, v13.4S, v5.s[0] -mul v13.4S, v13.4S,v12.s[0] -mla v13.4S, v17.4S, v31.s[0] -sub v17.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sqrdmulh v13.4S, v4.4S, v5.s[0] -mul v4.4S, v4.4S,v12.s[0] -mla v4.4S, v13.4S, v31.s[0] -sub v13.4s, v1.4s, v4.4s -add v1.4s, v1.4s, v4.4s -sqrdmulh v4.4S, v1.4S, v5.s[1] -mul v1.4S, v1.4S,v12.s[1] -mla v1.4S, v4.4S, v31.s[0] -sub v4.4s, v11.4s, v1.4s -add v11.4s, v11.4s, v1.4s -sqrdmulh v1.4S, v13.4S, v5.s[2] -mul v13.4S, v13.4S,v12.s[2] -mla v13.4S, v1.4S, v31.s[0] -sub v1.4s, v17.4s, v13.4s -add v17.4s, v17.4s, v13.4s -str q11, [x0, #896] -str q4, [x0, #912] -str q17, [x0, #928] -str q1, [x0, #944] -ldr q18, [x0, #992] -ldr q6, [x0, #1008] -ldr q1, [x0, #960] -ldr q17, [x0, #976] -sqrdmulh v4.4S, v18.4S, v7.s[0] -mul v18.4S, v18.4S,v21.s[0] -mla v18.4S, v4.4S, v31.s[0] -sub v4.4s, v1.4s, v18.4s -add v1.4s, v1.4s, v18.4s -sqrdmulh v18.4S, v6.4S, v7.s[0] -mul v6.4S, v6.4S,v21.s[0] -mla v6.4S, v18.4S, v31.s[0] -sub v18.4s, v17.4s, v6.4s -add v17.4s, v17.4s, v6.4s -sqrdmulh v6.4S, v17.4S, v7.s[1] -mul v17.4S, v17.4S,v21.s[1] -mla v17.4S, v6.4S, v31.s[0] -sub v6.4s, v1.4s, v17.4s -add v1.4s, v1.4s, v17.4s -sqrdmulh v17.4S, v18.4S, v7.s[2] -mul v18.4S, v18.4S,v21.s[2] -mla v18.4S, v17.4S, v31.s[0] -sub v17.4s, v4.4s, v18.4s -add v4.4s, v4.4s, v18.4s -str q1, [x0, #960] -str q6, [x0, #976] -str q4, [x0, #992] -str q17, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1464 -// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_16.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_16.s deleted file mode 100644 index 70b872e..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_24_z4_16.s +++ /dev/null @@ -1,1494 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_24_z4_16 -.global _ntt_u32_incomplete_neon_asm_var_4_2_24_z4_16 -ntt_u32_incomplete_neon_asm_var_4_2_24_z4_16: -_ntt_u32_incomplete_neon_asm_var_4_2_24_z4_16: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #800] -ldr q21, [x0, #864] -ldr q20, [x0, #928] -ldr q19, [x0, #992] -ldr q18, [x0, #288] -ldr q17, [x0, #352] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -ldr q2, [x0, #544] -ldr q1, [x0, #608] -ldr q0, [x0, #672] -ldr q15, [x0, #736] -ldr q14, [x0, #32] -ldr q13, [x0, #96] -ldr q12, [x0, #160] -ldr q11, [x0, #224] -sqrdmulh v10.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -mla v22.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -mla v21.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -mla v20.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -mla v19.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -mla v2.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -mla v1.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v0.4S, v29.s[0] -mul v0.4S, v0.4S,v30.s[0] -mla v0.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v15.4S, v29.s[0] -mul v15.4S, v15.4S,v30.s[0] -mla v15.4S, v10.4S, v31.s[0] -sub v10.4s, v18.4s, v22.4s -add v18.4s, v18.4s, v22.4s -sub v22.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -sub v21.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -sub v20.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sub v19.4s, v14.4s, v2.4s -add v14.4s, v14.4s, v2.4s -sub v2.4s, v13.4s, v1.4s -add v13.4s, v13.4s, v1.4s -sub v1.4s, v12.4s, v0.4s -add v12.4s, v12.4s, v0.4s -sub v0.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -mla v16.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -mla v3.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -mla v18.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -mla v17.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -mla v21.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -mla v20.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -mla v10.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -mla v22.4S, v15.4S, v31.s[0] -sub v15.4s, v12.4s, v16.4s -add v12.4s, v12.4s, v16.4s -sub v16.4s, v11.4s, v3.4s -add v11.4s, v11.4s, v3.4s -sub v3.4s, v14.4s, v18.4s -add v14.4s, v14.4s, v18.4s -sub v18.4s, v13.4s, v17.4s -add v13.4s, v13.4s, v17.4s -sub v17.4s, v1.4s, v21.4s -add v1.4s, v1.4s, v21.4s -sub v21.4s, v0.4s, v20.4s -add v0.4s, v0.4s, v20.4s -sub v20.4s, v19.4s, v10.4s -add v19.4s, v19.4s, v10.4s -sub v10.4s, v2.4s, v22.4s -add v2.4s, v2.4s, v22.4s -sqrdmulh v22.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -mla v12.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v11.4S, v27.s[0] -mul v11.4S, v11.4S,v28.s[0] -mla v11.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -mla v15.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v16.4S, v27.s[1] -mul v16.4S, v16.4S,v28.s[1] -mla v16.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -mla v1.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v0.4S, v27.s[2] -mul v0.4S, v0.4S,v28.s[2] -mla v0.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v17.4S, v27.s[3] -mul v17.4S, v17.4S,v28.s[3] -mla v17.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -sub v12.4s, v13.4s, v11.4s -add v13.4s, v13.4s, v11.4s -sub v11.4s, v3.4s, v15.4s -add v3.4s, v3.4s, v15.4s -sub v15.4s, v18.4s, v16.4s -add v18.4s, v18.4s, v16.4s -sub v16.4s, v19.4s, v1.4s -add v19.4s, v19.4s, v1.4s -sub v1.4s, v2.4s, v0.4s -add v2.4s, v2.4s, v0.4s -sub v0.4s, v20.4s, v17.4s -add v20.4s, v20.4s, v17.4s -sub v17.4s, v10.4s, v21.4s -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v13.4S, v25.s[0] -mul v13.4S, v13.4S,v26.s[0] -mla v13.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v12.4S, v25.s[1] -mul v12.4S, v12.4S,v26.s[1] -mla v12.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v18.4S, v25.s[2] -mul v18.4S, v18.4S,v26.s[2] -mla v18.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v15.4S, v25.s[3] -mul v15.4S, v15.4S,v26.s[3] -mla v15.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v2.4S, v23.s[0] -mul v2.4S, v2.4S,v24.s[0] -mla v2.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v1.4S, v23.s[1] -mul v1.4S, v1.4S,v24.s[1] -mla v1.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v10.4S, v23.s[2] -mul v10.4S, v10.4S,v24.s[2] -mla v10.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v17.4S, v23.s[3] -mul v17.4S, v17.4S,v24.s[3] -mla v17.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v13.4s -add v14.4s, v14.4s, v13.4s -sub v13.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sub v12.4s, v3.4s, v18.4s -add v3.4s, v3.4s, v18.4s -sub v18.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sub v15.4s, v19.4s, v2.4s -add v19.4s, v19.4s, v2.4s -sub v2.4s, v16.4s, v1.4s -add v16.4s, v16.4s, v1.4s -sub v1.4s, v20.4s, v10.4s -add v20.4s, v20.4s, v10.4s -sub v10.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -str q14, [x0, #32] -str q21, [x0, #96] -str q22, [x0, #160] -str q13, [x0, #224] -str q3, [x0, #288] -str q12, [x0, #352] -str q11, [x0, #416] -str q18, [x0, #480] -str q19, [x0, #544] -str q15, [x0, #608] -str q16, [x0, #672] -str q2, [x0, #736] -str q20, [x0, #800] -str q1, [x0, #864] -str q0, [x0, #928] -str q10, [x0, #992] -ldr q10, [x0, #816] -ldr q0, [x0, #880] -ldr q1, [x0, #944] -ldr q20, [x0, #1008] -ldr q2, [x0, #304] -ldr q16, [x0, #368] -ldr q15, [x0, #432] -ldr q19, [x0, #496] -ldr q18, [x0, #560] -ldr q11, [x0, #624] -ldr q12, [x0, #688] -ldr q3, [x0, #752] -ldr q13, [x0, #48] -ldr q22, [x0, #112] -ldr q21, [x0, #176] -ldr q14, [x0, #240] -sqrdmulh v17.4S, v10.4S, v29.s[0] -mul v10.4S, v10.4S,v30.s[0] -mla v10.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v0.4S, v29.s[0] -mul v0.4S, v0.4S,v30.s[0] -mla v0.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -mla v1.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -mla v20.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -mla v18.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -mla v11.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -mla v12.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -mla v3.4S, v17.4S, v31.s[0] -sub v17.4s, v2.4s, v10.4s -add v2.4s, v2.4s, v10.4s -sub v10.4s, v16.4s, v0.4s -add v16.4s, v16.4s, v0.4s -sub v0.4s, v15.4s, v1.4s -add v15.4s, v15.4s, v1.4s -sub v1.4s, v19.4s, v20.4s -add v19.4s, v19.4s, v20.4s -sub v20.4s, v13.4s, v18.4s -add v13.4s, v13.4s, v18.4s -sub v18.4s, v22.4s, v11.4s -add v22.4s, v22.4s, v11.4s -sub v11.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -sub v12.4s, v14.4s, v3.4s -add v14.4s, v14.4s, v3.4s -sqrdmulh v3.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -mla v15.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -mla v19.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v2.4S, v29.s[1] -mul v2.4S, v2.4S,v30.s[1] -mla v2.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -mla v16.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v0.4S, v29.s[2] -mul v0.4S, v0.4S,v30.s[2] -mla v0.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -mla v1.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -mla v17.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -mla v10.4S, v3.4S, v31.s[0] -sub v3.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -sub v15.4s, v14.4s, v19.4s -add v14.4s, v14.4s, v19.4s -sub v19.4s, v13.4s, v2.4s -add v13.4s, v13.4s, v2.4s -sub v2.4s, v22.4s, v16.4s -add v22.4s, v22.4s, v16.4s -sub v16.4s, v11.4s, v0.4s -add v11.4s, v11.4s, v0.4s -sub v0.4s, v12.4s, v1.4s -add v12.4s, v12.4s, v1.4s -sub v1.4s, v20.4s, v17.4s -add v20.4s, v20.4s, v17.4s -sub v17.4s, v18.4s, v10.4s -add v18.4s, v18.4s, v10.4s -sqrdmulh v10.4S, v21.4S, v27.s[0] -mul v21.4S, v21.4S,v28.s[0] -mla v21.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -mla v14.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -mla v3.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -mla v15.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v11.4S, v27.s[2] -mul v11.4S, v11.4S,v28.s[2] -mla v11.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v12.4S, v27.s[2] -mul v12.4S, v12.4S,v28.s[2] -mla v12.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v16.4S, v27.s[3] -mul v16.4S, v16.4S,v28.s[3] -mla v16.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v0.4S, v27.s[3] -mul v0.4S, v0.4S,v28.s[3] -mla v0.4S, v10.4S, v31.s[0] -sub v10.4s, v13.4s, v21.4s -add v13.4s, v13.4s, v21.4s -sub v21.4s, v22.4s, v14.4s -add v22.4s, v22.4s, v14.4s -sub v14.4s, v19.4s, v3.4s -add v19.4s, v19.4s, v3.4s -sub v3.4s, v2.4s, v15.4s -add v2.4s, v2.4s, v15.4s -sub v15.4s, v20.4s, v11.4s -add v20.4s, v20.4s, v11.4s -sub v11.4s, v18.4s, v12.4s -add v18.4s, v18.4s, v12.4s -sub v12.4s, v1.4s, v16.4s -add v1.4s, v1.4s, v16.4s -sub v16.4s, v17.4s, v0.4s -add v17.4s, v17.4s, v0.4s -sqrdmulh v0.4S, v22.4S, v25.s[0] -mul v22.4S, v22.4S,v26.s[0] -mla v22.4S, v0.4S, v31.s[0] -sqrdmulh v0.4S, v21.4S, v25.s[1] -mul v21.4S, v21.4S,v26.s[1] -mla v21.4S, v0.4S, v31.s[0] -sqrdmulh v0.4S, v2.4S, v25.s[2] -mul v2.4S, v2.4S,v26.s[2] -mla v2.4S, v0.4S, v31.s[0] -sqrdmulh v0.4S, v3.4S, v25.s[3] -mul v3.4S, v3.4S,v26.s[3] -mla v3.4S, v0.4S, v31.s[0] -sqrdmulh v0.4S, v18.4S, v23.s[0] -mul v18.4S, v18.4S,v24.s[0] -mla v18.4S, v0.4S, v31.s[0] -sqrdmulh v0.4S, v11.4S, v23.s[1] -mul v11.4S, v11.4S,v24.s[1] -mla v11.4S, v0.4S, v31.s[0] -sqrdmulh v0.4S, v17.4S, v23.s[2] -mul v17.4S, v17.4S,v24.s[2] -mla v17.4S, v0.4S, v31.s[0] -sqrdmulh v0.4S, v16.4S, v23.s[3] -mul v16.4S, v16.4S,v24.s[3] -mla v16.4S, v0.4S, v31.s[0] -sub v0.4s, v13.4s, v22.4s -add v13.4s, v13.4s, v22.4s -sub v22.4s, v10.4s, v21.4s -add v10.4s, v10.4s, v21.4s -sub v21.4s, v19.4s, v2.4s -add v19.4s, v19.4s, v2.4s -sub v2.4s, v14.4s, v3.4s -add v14.4s, v14.4s, v3.4s -sub v3.4s, v20.4s, v18.4s -add v20.4s, v20.4s, v18.4s -sub v18.4s, v15.4s, v11.4s -add v15.4s, v15.4s, v11.4s -sub v11.4s, v1.4s, v17.4s -add v1.4s, v1.4s, v17.4s -sub v17.4s, v12.4s, v16.4s -add v12.4s, v12.4s, v16.4s -str q13, [x0, #48] -str q0, [x0, #112] -str q10, [x0, #176] -str q22, [x0, #240] -str q19, [x0, #304] -str q21, [x0, #368] -str q14, [x0, #432] -str q2, [x0, #496] -str q20, [x0, #560] -str q3, [x0, #624] -str q15, [x0, #688] -str q18, [x0, #752] -str q1, [x0, #816] -str q11, [x0, #880] -str q12, [x0, #944] -str q17, [x0, #1008] -ldr q17, [x0, #768] -ldr q12, [x0, #832] -ldr q11, [x0, #896] -ldr q1, [x0, #960] -ldr q18, [x0, #256] -ldr q15, [x0, #320] -ldr q3, [x0, #384] -ldr q20, [x0, #448] -ldr q2, [x0, #512] -ldr q14, [x0, #576] -ldr q21, [x0, #640] -ldr q19, [x0, #704] -ldr q22, [x0, #0] -ldr q10, [x0, #64] -ldr q0, [x0, #128] -ldr q13, [x0, #192] -sqrdmulh v16.4S, v17.4S, v29.s[0] -mul v17.4S, v17.4S,v30.s[0] -mla v17.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -mla v12.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -mla v11.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -mla v1.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -mla v2.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -mla v14.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -mla v21.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -mla v19.4S, v16.4S, v31.s[0] -sub v16.4s, v18.4s, v17.4s -add v18.4s, v18.4s, v17.4s -sub v17.4s, v15.4s, v12.4s -add v15.4s, v15.4s, v12.4s -sub v12.4s, v3.4s, v11.4s -add v3.4s, v3.4s, v11.4s -sub v11.4s, v20.4s, v1.4s -add v20.4s, v20.4s, v1.4s -sub v1.4s, v22.4s, v2.4s -add v22.4s, v22.4s, v2.4s -sub v2.4s, v10.4s, v14.4s -add v10.4s, v10.4s, v14.4s -sub v14.4s, v0.4s, v21.4s -add v0.4s, v0.4s, v21.4s -sub v21.4s, v13.4s, v19.4s -add v13.4s, v13.4s, v19.4s -sqrdmulh v19.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -mla v3.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -mla v20.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -mla v18.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -mla v15.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v12.4S, v29.s[2] -mul v12.4S, v12.4S,v30.s[2] -mla v12.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -mla v11.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -mla v16.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -mla v17.4S, v19.4S, v31.s[0] -sub v19.4s, v0.4s, v3.4s -add v0.4s, v0.4s, v3.4s -sub v3.4s, v13.4s, v20.4s -add v13.4s, v13.4s, v20.4s -sub v20.4s, v22.4s, v18.4s -add v22.4s, v22.4s, v18.4s -sub v18.4s, v10.4s, v15.4s -add v10.4s, v10.4s, v15.4s -sub v15.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -sub v12.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sub v11.4s, v1.4s, v16.4s -add v1.4s, v1.4s, v16.4s -sub v16.4s, v2.4s, v17.4s -add v2.4s, v2.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v27.s[0] -mul v0.4S, v0.4S,v28.s[0] -mla v0.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v13.4S, v27.s[0] -mul v13.4S, v13.4S,v28.s[0] -mla v13.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -mla v19.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -mla v3.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v14.4S, v27.s[2] -mul v14.4S, v14.4S,v28.s[2] -mla v14.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v21.4S, v27.s[2] -mul v21.4S, v21.4S,v28.s[2] -mla v21.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -mla v15.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v12.4S, v27.s[3] -mul v12.4S, v12.4S,v28.s[3] -mla v12.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v0.4s -add v22.4s, v22.4s, v0.4s -sub v0.4s, v10.4s, v13.4s -add v10.4s, v10.4s, v13.4s -sub v13.4s, v20.4s, v19.4s -add v20.4s, v20.4s, v19.4s -sub v19.4s, v18.4s, v3.4s -add v18.4s, v18.4s, v3.4s -sub v3.4s, v1.4s, v14.4s -add v1.4s, v1.4s, v14.4s -sub v14.4s, v2.4s, v21.4s -add v2.4s, v2.4s, v21.4s -sub v21.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sub v15.4s, v16.4s, v12.4s -add v16.4s, v16.4s, v12.4s -sqrdmulh v12.4S, v10.4S, v25.s[0] -mul v10.4S, v10.4S,v26.s[0] -mla v10.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v0.4S, v25.s[1] -mul v0.4S, v0.4S,v26.s[1] -mla v0.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v18.4S, v25.s[2] -mul v18.4S, v18.4S,v26.s[2] -mla v18.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v19.4S, v25.s[3] -mul v19.4S, v19.4S,v26.s[3] -mla v19.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v2.4S, v23.s[0] -mul v2.4S, v2.4S,v24.s[0] -mla v2.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v14.4S, v23.s[1] -mul v14.4S, v14.4S,v24.s[1] -mla v14.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v16.4S, v23.s[2] -mul v16.4S, v16.4S,v24.s[2] -mla v16.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v15.4S, v23.s[3] -mul v15.4S, v15.4S,v24.s[3] -mla v15.4S, v12.4S, v31.s[0] -sub v12.4s, v22.4s, v10.4s -add v22.4s, v22.4s, v10.4s -sub v10.4s, v17.4s, v0.4s -add v17.4s, v17.4s, v0.4s -sub v0.4s, v20.4s, v18.4s -add v20.4s, v20.4s, v18.4s -sub v18.4s, v13.4s, v19.4s -add v13.4s, v13.4s, v19.4s -sub v19.4s, v1.4s, v2.4s -add v1.4s, v1.4s, v2.4s -sub v2.4s, v3.4s, v14.4s -add v3.4s, v3.4s, v14.4s -sub v14.4s, v11.4s, v16.4s -add v11.4s, v11.4s, v16.4s -sub v16.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -str q22, [x0, #0] -str q12, [x0, #64] -str q17, [x0, #128] -str q10, [x0, #192] -str q20, [x0, #256] -str q0, [x0, #320] -str q13, [x0, #384] -str q18, [x0, #448] -str q1, [x0, #512] -str q19, [x0, #576] -str q3, [x0, #640] -str q2, [x0, #704] -str q11, [x0, #768] -str q14, [x0, #832] -str q21, [x0, #896] -str q16, [x0, #960] -ldr q16, [x0, #784] -ldr q21, [x0, #848] -ldr q14, [x0, #912] -ldr q11, [x0, #976] -ldr q2, [x0, #272] -ldr q3, [x0, #336] -ldr q19, [x0, #400] -ldr q1, [x0, #464] -ldr q18, [x0, #528] -ldr q13, [x0, #592] -ldr q0, [x0, #656] -ldr q20, [x0, #720] -ldr q10, [x0, #16] -ldr q17, [x0, #80] -ldr q12, [x0, #144] -ldr q22, [x0, #208] -sqrdmulh v15.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -mla v16.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -mla v21.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -mla v14.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -mla v11.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -mla v18.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -mla v13.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v0.4S, v29.s[0] -mul v0.4S, v0.4S,v30.s[0] -mla v0.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -mla v20.4S, v15.4S, v31.s[0] -sub v15.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sub v16.4s, v3.4s, v21.4s -add v3.4s, v3.4s, v21.4s -sub v21.4s, v19.4s, v14.4s -add v19.4s, v19.4s, v14.4s -sub v14.4s, v1.4s, v11.4s -add v1.4s, v1.4s, v11.4s -sub v11.4s, v10.4s, v18.4s -add v10.4s, v10.4s, v18.4s -sub v18.4s, v17.4s, v13.4s -add v17.4s, v17.4s, v13.4s -sub v13.4s, v12.4s, v0.4s -add v12.4s, v12.4s, v0.4s -sub v0.4s, v22.4s, v20.4s -add v22.4s, v22.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -mla v19.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v1.4S, v29.s[1] -mul v1.4S, v1.4S,v30.s[1] -mla v1.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v2.4S, v29.s[1] -mul v2.4S, v2.4S,v30.s[1] -mla v2.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -mla v3.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -mla v21.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -mla v14.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -mla v15.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -mla v16.4S, v20.4S, v31.s[0] -sub v20.4s, v12.4s, v19.4s -add v12.4s, v12.4s, v19.4s -sub v19.4s, v22.4s, v1.4s -add v22.4s, v22.4s, v1.4s -sub v1.4s, v10.4s, v2.4s -add v10.4s, v10.4s, v2.4s -sub v2.4s, v17.4s, v3.4s -add v17.4s, v17.4s, v3.4s -sub v3.4s, v13.4s, v21.4s -add v13.4s, v13.4s, v21.4s -sub v21.4s, v0.4s, v14.4s -add v0.4s, v0.4s, v14.4s -sub v14.4s, v11.4s, v15.4s -add v11.4s, v11.4s, v15.4s -sub v15.4s, v18.4s, v16.4s -add v18.4s, v18.4s, v16.4s -sqrdmulh v16.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -mla v12.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v22.4S, v27.s[0] -mul v22.4S, v22.4S,v28.s[0] -mla v22.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v20.4S, v27.s[1] -mul v20.4S, v20.4S,v28.s[1] -mla v20.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -mla v19.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v13.4S, v27.s[2] -mul v13.4S, v13.4S,v28.s[2] -mla v13.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v0.4S, v27.s[2] -mul v0.4S, v0.4S,v28.s[2] -mla v0.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v3.4S, v27.s[3] -mul v3.4S, v3.4S,v28.s[3] -mla v3.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -mla v21.4S, v16.4S, v31.s[0] -sub v16.4s, v10.4s, v12.4s -add v10.4s, v10.4s, v12.4s -sub v12.4s, v17.4s, v22.4s -add v17.4s, v17.4s, v22.4s -sub v22.4s, v1.4s, v20.4s -add v1.4s, v1.4s, v20.4s -sub v20.4s, v2.4s, v19.4s -add v2.4s, v2.4s, v19.4s -sub v19.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -sub v13.4s, v18.4s, v0.4s -add v18.4s, v18.4s, v0.4s -sub v0.4s, v14.4s, v3.4s -add v14.4s, v14.4s, v3.4s -sub v3.4s, v15.4s, v21.4s -add v15.4s, v15.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v25.s[0] -mul v17.4S, v17.4S,v26.s[0] -mla v17.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v12.4S, v25.s[1] -mul v12.4S, v12.4S,v26.s[1] -mla v12.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v2.4S, v25.s[2] -mul v2.4S, v2.4S,v26.s[2] -mla v2.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v20.4S, v25.s[3] -mul v20.4S, v20.4S,v26.s[3] -mla v20.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v18.4S, v23.s[0] -mul v18.4S, v18.4S,v24.s[0] -mla v18.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v13.4S, v23.s[1] -mul v13.4S, v13.4S,v24.s[1] -mla v13.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v15.4S, v23.s[2] -mul v15.4S, v15.4S,v24.s[2] -mla v15.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v3.4S, v23.s[3] -mul v3.4S, v3.4S,v24.s[3] -mla v3.4S, v21.4S, v31.s[0] -sub v21.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -sub v17.4s, v16.4s, v12.4s -add v16.4s, v16.4s, v12.4s -sub v12.4s, v1.4s, v2.4s -add v1.4s, v1.4s, v2.4s -sub v2.4s, v22.4s, v20.4s -add v22.4s, v22.4s, v20.4s -sub v20.4s, v11.4s, v18.4s -add v11.4s, v11.4s, v18.4s -sub v18.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -sub v13.4s, v14.4s, v15.4s -add v14.4s, v14.4s, v15.4s -sub v15.4s, v0.4s, v3.4s -add v0.4s, v0.4s, v3.4s -str q10, [x0, #16] -str q21, [x0, #80] -str q16, [x0, #144] -str q17, [x0, #208] -str q1, [x0, #272] -str q12, [x0, #336] -str q22, [x0, #400] -str q2, [x0, #464] -str q11, [x0, #528] -str q20, [x0, #592] -str q19, [x0, #656] -str q18, [x0, #720] -str q14, [x0, #784] -str q13, [x0, #848] -str q0, [x0, #912] -str q15, [x0, #976] -ldr q4, [x17, #+128] -ldr q5, [x17, #+144] -ldr q6, [x17, #+160] -ldr q7, [x17, #+176] -ldr q8, [x17, #+192] -ldr q9, [x17, #+208] -ldr q3, [x17, #+224] -ldr q10, [x17, #+240] -ldr q21, [x0, #32] -ldr q16, [x0, #48] -ldr q17, [x0, #0] -ldr q1, [x0, #16] -ldr q12, [x17, #+256] -ldr q22, [x17, #+272] -sqrdmulh v2.4S, v21.4S, v5.s[0] -mul v21.4S, v21.4S,v4.s[0] -mla v21.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v16.4S, v5.s[0] -mul v16.4S, v16.4S,v4.s[0] -mla v16.4S, v2.4S, v31.s[0] -sub v2.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -sub v21.4s, v1.4s, v16.4s -add v1.4s, v1.4s, v16.4s -sqrdmulh v16.4S, v1.4S, v5.s[1] -mul v1.4S, v1.4S,v4.s[1] -mla v1.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v21.4S, v5.s[2] -mul v21.4S, v21.4S,v4.s[2] -mla v21.4S, v16.4S, v31.s[0] -sub v16.4s, v17.4s, v1.4s -add v17.4s, v17.4s, v1.4s -sub v1.4s, v2.4s, v21.4s -add v2.4s, v2.4s, v21.4s -str q17, [x0, #0] -str q16, [x0, #16] -str q2, [x0, #32] -str q1, [x0, #48] -ldr q1, [x0, #96] -ldr q2, [x0, #112] -ldr q16, [x0, #64] -ldr q17, [x0, #80] -ldr q21, [x17, #+288] -ldr q11, [x17, #+304] -sqrdmulh v20.4S, v1.4S, v7.s[0] -mul v1.4S, v1.4S,v6.s[0] -mla v1.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v2.4S, v7.s[0] -mul v2.4S, v2.4S,v6.s[0] -mla v2.4S, v20.4S, v31.s[0] -sub v20.4s, v16.4s, v1.4s -add v16.4s, v16.4s, v1.4s -sub v1.4s, v17.4s, v2.4s -add v17.4s, v17.4s, v2.4s -sqrdmulh v2.4S, v17.4S, v7.s[1] -mul v17.4S, v17.4S,v6.s[1] -mla v17.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v1.4S, v7.s[2] -mul v1.4S, v1.4S,v6.s[2] -mla v1.4S, v2.4S, v31.s[0] -sub v2.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -sub v17.4s, v20.4s, v1.4s -add v20.4s, v20.4s, v1.4s -str q16, [x0, #64] -str q2, [x0, #80] -str q20, [x0, #96] -str q17, [x0, #112] -ldr q17, [x0, #160] -ldr q20, [x0, #176] -ldr q2, [x0, #128] -ldr q16, [x0, #144] -ldr q1, [x17, #+320] -ldr q19, [x17, #+336] -sqrdmulh v18.4S, v17.4S, v9.s[0] -mul v17.4S, v17.4S,v8.s[0] -mla v17.4S, v18.4S, v31.s[0] -sqrdmulh v18.4S, v20.4S, v9.s[0] -mul v20.4S, v20.4S,v8.s[0] -mla v20.4S, v18.4S, v31.s[0] -sub v18.4s, v2.4s, v17.4s -add v2.4s, v2.4s, v17.4s -sub v17.4s, v16.4s, v20.4s -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v16.4S, v9.s[1] -mul v16.4S, v16.4S,v8.s[1] -mla v16.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v17.4S, v9.s[2] -mul v17.4S, v17.4S,v8.s[2] -mla v17.4S, v20.4S, v31.s[0] -sub v20.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sub v16.4s, v18.4s, v17.4s -add v18.4s, v18.4s, v17.4s -str q2, [x0, #128] -str q20, [x0, #144] -str q18, [x0, #160] -str q16, [x0, #176] -ldr q16, [x0, #224] -ldr q18, [x0, #240] -ldr q20, [x0, #192] -ldr q2, [x0, #208] -ldr q17, [x17, #+352] -ldr q14, [x17, #+368] -sqrdmulh v13.4S, v16.4S, v10.s[0] -mul v16.4S, v16.4S,v3.s[0] -mla v16.4S, v13.4S, v31.s[0] -sqrdmulh v13.4S, v18.4S, v10.s[0] -mul v18.4S, v18.4S,v3.s[0] -mla v18.4S, v13.4S, v31.s[0] -sub v13.4s, v20.4s, v16.4s -add v20.4s, v20.4s, v16.4s -sub v16.4s, v2.4s, v18.4s -add v2.4s, v2.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v10.s[1] -mul v2.4S, v2.4S,v3.s[1] -mla v2.4S, v18.4S, v31.s[0] -sqrdmulh v18.4S, v16.4S, v10.s[2] -mul v16.4S, v16.4S,v3.s[2] -mla v16.4S, v18.4S, v31.s[0] -sub v18.4s, v20.4s, v2.4s -add v20.4s, v20.4s, v2.4s -sub v2.4s, v13.4s, v16.4s -add v13.4s, v13.4s, v16.4s -str q20, [x0, #192] -str q18, [x0, #208] -str q13, [x0, #224] -str q2, [x0, #240] -ldr q2, [x0, #288] -ldr q13, [x0, #304] -ldr q18, [x0, #256] -ldr q20, [x0, #272] -ldr q16, [x17, #+384] -ldr q0, [x17, #+400] -sqrdmulh v15.4S, v2.4S, v22.s[0] -mul v2.4S, v2.4S,v12.s[0] -mla v2.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v13.4S, v22.s[0] -mul v13.4S, v13.4S,v12.s[0] -mla v13.4S, v15.4S, v31.s[0] -sub v15.4s, v18.4s, v2.4s -add v18.4s, v18.4s, v2.4s -sub v2.4s, v20.4s, v13.4s -add v20.4s, v20.4s, v13.4s -sqrdmulh v13.4S, v20.4S, v22.s[1] -mul v20.4S, v20.4S,v12.s[1] -mla v20.4S, v13.4S, v31.s[0] -sqrdmulh v13.4S, v2.4S, v22.s[2] -mul v2.4S, v2.4S,v12.s[2] -mla v2.4S, v13.4S, v31.s[0] -sub v13.4s, v18.4s, v20.4s -add v18.4s, v18.4s, v20.4s -sub v20.4s, v15.4s, v2.4s -add v15.4s, v15.4s, v2.4s -str q18, [x0, #256] -str q13, [x0, #272] -str q15, [x0, #288] -str q20, [x0, #304] -ldr q5, [x0, #352] -ldr q4, [x0, #368] -ldr q20, [x0, #320] -ldr q15, [x0, #336] -ldr q13, [x17, #+416] -ldr q18, [x17, #+432] -sqrdmulh v2.4S, v5.4S, v11.s[0] -mul v5.4S, v5.4S,v21.s[0] -mla v5.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v4.4S, v11.s[0] -mul v4.4S, v4.4S,v21.s[0] -mla v4.4S, v2.4S, v31.s[0] -sub v2.4s, v20.4s, v5.4s -add v20.4s, v20.4s, v5.4s -sub v5.4s, v15.4s, v4.4s -add v15.4s, v15.4s, v4.4s -sqrdmulh v4.4S, v15.4S, v11.s[1] -mul v15.4S, v15.4S,v21.s[1] -mla v15.4S, v4.4S, v31.s[0] -sqrdmulh v4.4S, v5.4S, v11.s[2] -mul v5.4S, v5.4S,v21.s[2] -mla v5.4S, v4.4S, v31.s[0] -sub v4.4s, v20.4s, v15.4s -add v20.4s, v20.4s, v15.4s -sub v15.4s, v2.4s, v5.4s -add v2.4s, v2.4s, v5.4s -str q20, [x0, #320] -str q4, [x0, #336] -str q2, [x0, #352] -str q15, [x0, #368] -ldr q7, [x0, #416] -ldr q6, [x0, #432] -ldr q15, [x0, #384] -ldr q2, [x0, #400] -ldr q4, [x17, #+448] -ldr q20, [x17, #+464] -sqrdmulh v5.4S, v7.4S, v19.s[0] -mul v7.4S, v7.4S,v1.s[0] -mla v7.4S, v5.4S, v31.s[0] -sqrdmulh v5.4S, v6.4S, v19.s[0] -mul v6.4S, v6.4S,v1.s[0] -mla v6.4S, v5.4S, v31.s[0] -sub v5.4s, v15.4s, v7.4s -add v15.4s, v15.4s, v7.4s -sub v7.4s, v2.4s, v6.4s -add v2.4s, v2.4s, v6.4s -sqrdmulh v6.4S, v2.4S, v19.s[1] -mul v2.4S, v2.4S,v1.s[1] -mla v2.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v7.4S, v19.s[2] -mul v7.4S, v7.4S,v1.s[2] -mla v7.4S, v6.4S, v31.s[0] -sub v6.4s, v15.4s, v2.4s -add v15.4s, v15.4s, v2.4s -sub v2.4s, v5.4s, v7.4s -add v5.4s, v5.4s, v7.4s -str q15, [x0, #384] -str q6, [x0, #400] -str q5, [x0, #416] -str q2, [x0, #432] -ldr q9, [x0, #480] -ldr q8, [x0, #496] -ldr q2, [x0, #448] -ldr q5, [x0, #464] -ldr q6, [x17, #+480] -ldr q15, [x17, #+496] -sqrdmulh v7.4S, v9.4S, v14.s[0] -mul v9.4S, v9.4S,v17.s[0] -mla v9.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v8.4S, v14.s[0] -mul v8.4S, v8.4S,v17.s[0] -mla v8.4S, v7.4S, v31.s[0] -sub v7.4s, v2.4s, v9.4s -add v2.4s, v2.4s, v9.4s -sub v9.4s, v5.4s, v8.4s -add v5.4s, v5.4s, v8.4s -sqrdmulh v8.4S, v5.4S, v14.s[1] -mul v5.4S, v5.4S,v17.s[1] -mla v5.4S, v8.4S, v31.s[0] -sqrdmulh v8.4S, v9.4S, v14.s[2] -mul v9.4S, v9.4S,v17.s[2] -mla v9.4S, v8.4S, v31.s[0] -sub v8.4s, v2.4s, v5.4s -add v2.4s, v2.4s, v5.4s -sub v5.4s, v7.4s, v9.4s -add v7.4s, v7.4s, v9.4s -str q2, [x0, #448] -str q8, [x0, #464] -str q7, [x0, #480] -str q5, [x0, #496] -ldr q10, [x0, #544] -ldr q3, [x0, #560] -ldr q5, [x0, #512] -ldr q7, [x0, #528] -ldr q8, [x17, #+512] -ldr q2, [x17, #+528] -sqrdmulh v9.4S, v10.4S, v0.s[0] -mul v10.4S, v10.4S,v16.s[0] -mla v10.4S, v9.4S, v31.s[0] -sqrdmulh v9.4S, v3.4S, v0.s[0] -mul v3.4S, v3.4S,v16.s[0] -mla v3.4S, v9.4S, v31.s[0] -sub v9.4s, v5.4s, v10.4s -add v5.4s, v5.4s, v10.4s -sub v10.4s, v7.4s, v3.4s -add v7.4s, v7.4s, v3.4s -sqrdmulh v3.4S, v7.4S, v0.s[1] -mul v7.4S, v7.4S,v16.s[1] -mla v7.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v10.4S, v0.s[2] -mul v10.4S, v10.4S,v16.s[2] -mla v10.4S, v3.4S, v31.s[0] -sub v3.4s, v5.4s, v7.4s -add v5.4s, v5.4s, v7.4s -sub v7.4s, v9.4s, v10.4s -add v9.4s, v9.4s, v10.4s -str q5, [x0, #512] -str q3, [x0, #528] -str q9, [x0, #544] -str q7, [x0, #560] -ldr q22, [x0, #608] -ldr q12, [x0, #624] -ldr q7, [x0, #576] -ldr q9, [x0, #592] -ldr q3, [x17, #+544] -ldr q5, [x17, #+560] -sqrdmulh v10.4S, v22.4S, v18.s[0] -mul v22.4S, v22.4S,v13.s[0] -mla v22.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v12.4S, v18.s[0] -mul v12.4S, v12.4S,v13.s[0] -mla v12.4S, v10.4S, v31.s[0] -sub v10.4s, v7.4s, v22.4s -add v7.4s, v7.4s, v22.4s -sub v22.4s, v9.4s, v12.4s -add v9.4s, v9.4s, v12.4s -sqrdmulh v12.4S, v9.4S, v18.s[1] -mul v9.4S, v9.4S,v13.s[1] -mla v9.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v22.4S, v18.s[2] -mul v22.4S, v22.4S,v13.s[2] -mla v22.4S, v12.4S, v31.s[0] -sub v12.4s, v7.4s, v9.4s -add v7.4s, v7.4s, v9.4s -sub v9.4s, v10.4s, v22.4s -add v10.4s, v10.4s, v22.4s -str q7, [x0, #576] -str q12, [x0, #592] -str q10, [x0, #608] -str q9, [x0, #624] -ldr q11, [x0, #672] -ldr q21, [x0, #688] -ldr q9, [x0, #640] -ldr q10, [x0, #656] -ldr q12, [x17, #+576] -ldr q7, [x17, #+592] -sqrdmulh v22.4S, v11.4S, v20.s[0] -mul v11.4S, v11.4S,v4.s[0] -mla v11.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v21.4S, v20.s[0] -mul v21.4S, v21.4S,v4.s[0] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v9.4s, v11.4s -add v9.4s, v9.4s, v11.4s -sub v11.4s, v10.4s, v21.4s -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v10.4S, v20.s[1] -mul v10.4S, v10.4S,v4.s[1] -mla v10.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v11.4S, v20.s[2] -mul v11.4S, v11.4S,v4.s[2] -mla v11.4S, v21.4S, v31.s[0] -sub v21.4s, v9.4s, v10.4s -add v9.4s, v9.4s, v10.4s -sub v10.4s, v22.4s, v11.4s -add v22.4s, v22.4s, v11.4s -str q9, [x0, #640] -str q21, [x0, #656] -str q22, [x0, #672] -str q10, [x0, #688] -ldr q19, [x0, #736] -ldr q1, [x0, #752] -ldr q10, [x0, #704] -ldr q22, [x0, #720] -ldr q21, [x17, #+608] -ldr q9, [x17, #+624] -sqrdmulh v11.4S, v19.4S, v15.s[0] -mul v19.4S, v19.4S,v6.s[0] -mla v19.4S, v11.4S, v31.s[0] -sqrdmulh v11.4S, v1.4S, v15.s[0] -mul v1.4S, v1.4S,v6.s[0] -mla v1.4S, v11.4S, v31.s[0] -sub v11.4s, v10.4s, v19.4s -add v10.4s, v10.4s, v19.4s -sub v19.4s, v22.4s, v1.4s -add v22.4s, v22.4s, v1.4s -sqrdmulh v1.4S, v22.4S, v15.s[1] -mul v22.4S, v22.4S,v6.s[1] -mla v22.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v19.4S, v15.s[2] -mul v19.4S, v19.4S,v6.s[2] -mla v19.4S, v1.4S, v31.s[0] -sub v1.4s, v10.4s, v22.4s -add v10.4s, v10.4s, v22.4s -sub v22.4s, v11.4s, v19.4s -add v11.4s, v11.4s, v19.4s -str q10, [x0, #704] -str q1, [x0, #720] -str q11, [x0, #736] -str q22, [x0, #752] -ldr q14, [x0, #800] -ldr q17, [x0, #816] -ldr q22, [x0, #768] -ldr q11, [x0, #784] -sqrdmulh v1.4S, v14.4S, v2.s[0] -mul v14.4S, v14.4S,v8.s[0] -mla v14.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v17.4S, v2.s[0] -mul v17.4S, v17.4S,v8.s[0] -mla v17.4S, v1.4S, v31.s[0] -sub v1.4s, v22.4s, v14.4s -add v22.4s, v22.4s, v14.4s -sub v14.4s, v11.4s, v17.4s -add v11.4s, v11.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v2.s[1] -mul v11.4S, v11.4S,v8.s[1] -mla v11.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v14.4S, v2.s[2] -mul v14.4S, v14.4S,v8.s[2] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v22.4s, v11.4s -add v22.4s, v22.4s, v11.4s -sub v11.4s, v1.4s, v14.4s -add v1.4s, v1.4s, v14.4s -str q22, [x0, #768] -str q17, [x0, #784] -str q1, [x0, #800] -str q11, [x0, #816] -ldr q0, [x0, #864] -ldr q16, [x0, #880] -ldr q11, [x0, #832] -ldr q1, [x0, #848] -sqrdmulh v17.4S, v0.4S, v5.s[0] -mul v0.4S, v0.4S,v3.s[0] -mla v0.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v16.4S, v5.s[0] -mul v16.4S, v16.4S,v3.s[0] -mla v16.4S, v17.4S, v31.s[0] -sub v17.4s, v11.4s, v0.4s -add v11.4s, v11.4s, v0.4s -sub v0.4s, v1.4s, v16.4s -add v1.4s, v1.4s, v16.4s -sqrdmulh v16.4S, v1.4S, v5.s[1] -mul v1.4S, v1.4S,v3.s[1] -mla v1.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v0.4S, v5.s[2] -mul v0.4S, v0.4S,v3.s[2] -mla v0.4S, v16.4S, v31.s[0] -sub v16.4s, v11.4s, v1.4s -add v11.4s, v11.4s, v1.4s -sub v1.4s, v17.4s, v0.4s -add v17.4s, v17.4s, v0.4s -str q11, [x0, #832] -str q16, [x0, #848] -str q17, [x0, #864] -str q1, [x0, #880] -ldr q18, [x0, #928] -ldr q13, [x0, #944] -ldr q1, [x0, #896] -ldr q17, [x0, #912] -sqrdmulh v16.4S, v18.4S, v7.s[0] -mul v18.4S, v18.4S,v12.s[0] -mla v18.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v13.4S, v7.s[0] -mul v13.4S, v13.4S,v12.s[0] -mla v13.4S, v16.4S, v31.s[0] -sub v16.4s, v1.4s, v18.4s -add v1.4s, v1.4s, v18.4s -sub v18.4s, v17.4s, v13.4s -add v17.4s, v17.4s, v13.4s -sqrdmulh v13.4S, v17.4S, v7.s[1] -mul v17.4S, v17.4S,v12.s[1] -mla v17.4S, v13.4S, v31.s[0] -sqrdmulh v13.4S, v18.4S, v7.s[2] -mul v18.4S, v18.4S,v12.s[2] -mla v18.4S, v13.4S, v31.s[0] -sub v13.4s, v1.4s, v17.4s -add v1.4s, v1.4s, v17.4s -sub v17.4s, v16.4s, v18.4s -add v16.4s, v16.4s, v18.4s -str q1, [x0, #896] -str q13, [x0, #912] -str q16, [x0, #928] -str q17, [x0, #944] -ldr q20, [x0, #992] -ldr q4, [x0, #1008] -ldr q17, [x0, #960] -ldr q16, [x0, #976] -sqrdmulh v13.4S, v20.4S, v9.s[0] -mul v20.4S, v20.4S,v21.s[0] -mla v20.4S, v13.4S, v31.s[0] -sqrdmulh v13.4S, v4.4S, v9.s[0] -mul v4.4S, v4.4S,v21.s[0] -mla v4.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v20.4s -add v17.4s, v17.4s, v20.4s -sub v20.4s, v16.4s, v4.4s -add v16.4s, v16.4s, v4.4s -sqrdmulh v4.4S, v16.4S, v9.s[1] -mul v16.4S, v16.4S,v21.s[1] -mla v16.4S, v4.4S, v31.s[0] -sqrdmulh v4.4S, v20.4S, v9.s[2] -mul v20.4S, v20.4S,v21.s[2] -mla v20.4S, v4.4S, v31.s[0] -sub v4.4s, v17.4s, v16.4s -add v17.4s, v17.4s, v16.4s -sub v16.4s, v13.4s, v20.4s -add v13.4s, v13.4s, v20.4s -str q17, [x0, #960] -str q4, [x0, #976] -str q13, [x0, #992] -str q16, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1464 -// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_0.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_0.s deleted file mode 100644 index b9ef60e..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_0.s +++ /dev/null @@ -1,1494 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_3_z4_0 -.global _ntt_u32_incomplete_neon_asm_var_4_2_3_z4_0 -ntt_u32_incomplete_neon_asm_var_4_2_3_z4_0: -_ntt_u32_incomplete_neon_asm_var_4_2_3_z4_0: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #800] -ldr q21, [x0, #864] -ldr q20, [x0, #928] -ldr q19, [x0, #992] -ldr q18, [x0, #288] -ldr q17, [x0, #352] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -sqrdmulh v2.4S, v22.4S, v29.s[0] -ldr q1, [x0, #544] -mul v22.4S, v22.4S,v30.s[0] -ldr q0, [x0, #608] -sqrdmulh v15.4S, v21.4S, v29.s[0] -ldr q14, [x0, #672] -mul v21.4S, v21.4S,v30.s[0] -ldr q13, [x0, #736] -mla v22.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q12, [x0, #32] -sub v11.4s, v18.4s, v22.4s -mla v21.4S, v15.4S, v31.s[0] -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q15, [x0, #96] -sub v10.4s, v17.4s, v21.4s -mla v20.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v1.4S, v29.s[0] -ldr q2, [x0, #160] -mul v1.4S, v1.4S,v30.s[0] -sub v9.4s, v16.4s, v20.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v0.4S, v29.s[0] -ldr q22, [x0, #224] -mul v0.4S, v0.4S,v30.s[0] -sub v8.4s, v3.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v12.4s, v1.4s -mla v0.4S, v20.4S, v31.s[0] -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -sub v20.4s, v15.4s, v0.4s -mla v14.4S, v19.4S, v31.s[0] -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v19.4s, v2.4s, v14.4s -mla v13.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v1.4s, v22.4s, v13.4s -mla v16.4S, v0.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v0.4s, v2.4s, v16.4s -mla v3.4S, v14.4S, v31.s[0] -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v14.4s, v22.4s, v3.4s -mla v18.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v13.4s, v12.4s, v18.4s -mla v17.4S, v16.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v8.4S, v29.s[2] -mul v8.4S, v8.4S,v30.s[2] -sub v16.4s, v15.4s, v17.4s -mla v9.4S, v3.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v3.4s, v19.4s, v9.4s -mla v8.4S, v18.4S, v31.s[0] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v8.4s -mla v11.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v8.4s -sqrdmulh v8.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v17.4s, v21.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v27.s[0] -mul v22.4S, v22.4S,v28.s[0] -sub v9.4s, v20.4s, v10.4s -mla v2.4S, v8.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v8.4s, v12.4s, v2.4s -mla v22.4S, v11.4S, v31.s[0] -add v12.4s, v12.4s, v2.4s -sqrdmulh v2.4S, v14.4S, v27.s[1] -mul v14.4S, v14.4S,v28.s[1] -sub v11.4s, v15.4s, v22.4s -mla v0.4S, v10.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v27.s[2] -mul v19.4S, v19.4S,v28.s[2] -sub v10.4s, v13.4s, v0.4s -mla v14.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v2.4s, v16.4s, v14.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v27.s[3] -mul v3.4S, v3.4S,v28.s[3] -sub v22.4s, v21.4s, v19.4s -mla v1.4S, v0.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v0.4s, v20.4s, v1.4s -mla v3.4S, v14.4S, v31.s[0] -add v20.4s, v20.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v25.s[0] -mul v15.4S, v15.4S,v26.s[0] -sub v14.4s, v17.4s, v3.4s -mla v18.4S, v19.4S, v31.s[0] -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v11.4S, v25.s[1] -mul v11.4S, v11.4S,v26.s[1] -sub v19.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v1.4s, v12.4s, v15.4s -mla v11.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v25.s[3] -mul v2.4S, v2.4S,v26.s[3] -sub v3.4s, v8.4s, v11.4s -mla v16.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v11.4s -str q12, [x0, #32] -sqrdmulh v12.4S, v20.4S, v23.s[0] -str q1, [x0, #96] -mul v20.4S, v20.4S,v24.s[0] -ldr q1, [x0, #816] -sub v11.4s, v13.4s, v16.4s -ldr q18, [x0, #880] -mla v2.4S, v15.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -str q8, [x0, #160] -sqrdmulh v8.4S, v0.4S, v23.s[1] -str q3, [x0, #224] -mul v0.4S, v0.4S,v24.s[1] -ldr q3, [x0, #944] -sub v16.4s, v10.4s, v2.4s -ldr q15, [x0, #1008] -mla v20.4S, v12.4S, v31.s[0] -add v10.4s, v10.4s, v2.4s -str q13, [x0, #288] -sqrdmulh v13.4S, v9.4S, v23.s[2] -str q11, [x0, #352] -mul v9.4S, v9.4S,v24.s[2] -ldr q11, [x0, #304] -sub v2.4s, v21.4s, v20.4s -ldr q12, [x0, #368] -mla v0.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v20.4s -str q10, [x0, #416] -sqrdmulh v10.4S, v19.4S, v23.s[3] -str q16, [x0, #480] -mul v19.4S, v19.4S,v24.s[3] -ldr q16, [x0, #432] -sub v20.4s, v22.4s, v0.4s -ldr q8, [x0, #496] -mla v9.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -str q21, [x0, #544] -sqrdmulh v21.4S, v1.4S, v29.s[0] -str q2, [x0, #608] -ldr q2, [x0, #560] -mul v1.4S, v1.4S,v30.s[0] -ldr q0, [x0, #624] -sub v13.4s, v17.4s, v9.4s -mla v19.4S, v10.4S, v31.s[0] -add v17.4s, v17.4s, v9.4s -str q22, [x0, #672] -sqrdmulh v22.4S, v18.4S, v29.s[0] -str q20, [x0, #736] -ldr q20, [x0, #688] -mul v18.4S, v18.4S,v30.s[0] -ldr q9, [x0, #752] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -str q17, [x0, #800] -sqrdmulh v17.4S, v3.4S, v29.s[0] -str q13, [x0, #864] -mul v3.4S, v3.4S,v30.s[0] -ldr q13, [x0, #48] -sub v19.4s, v11.4s, v1.4s -mla v18.4S, v22.4S, v31.s[0] -add v11.4s, v11.4s, v1.4s -str q14, [x0, #928] -sqrdmulh v14.4S, v15.4S, v29.s[0] -str q10, [x0, #992] -mul v15.4S, v15.4S,v30.s[0] -ldr q10, [x0, #112] -sub v1.4s, v12.4s, v18.4s -mla v3.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v29.s[0] -ldr q17, [x0, #176] -mul v2.4S, v2.4S,v30.s[0] -sub v22.4s, v16.4s, v3.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v0.4S, v29.s[0] -ldr q14, [x0, #240] -mul v0.4S, v0.4S,v30.s[0] -sub v21.4s, v8.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -sub v18.4s, v13.4s, v2.4s -mla v0.4S, v3.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v9.4S, v29.s[0] -mul v9.4S, v9.4S,v30.s[0] -sub v3.4s, v10.4s, v0.4s -mla v20.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v15.4s, v17.4s, v20.4s -mla v9.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sub v2.4s, v14.4s, v9.4s -mla v16.4S, v0.4S, v31.s[0] -add v14.4s, v14.4s, v9.4s -sqrdmulh v9.4S, v11.4S, v29.s[1] -mul v11.4S, v11.4S,v30.s[1] -sub v0.4s, v17.4s, v16.4s -mla v8.4S, v20.4S, v31.s[0] -add v17.4s, v17.4s, v16.4s -sqrdmulh v16.4S, v12.4S, v29.s[1] -mul v12.4S, v12.4S,v30.s[1] -sub v20.4s, v14.4s, v8.4s -mla v11.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v8.4s -sqrdmulh v8.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v9.4s, v13.4s, v11.4s -mla v12.4S, v16.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v16.4s, v10.4s, v12.4s -mla v22.4S, v8.4S, v31.s[0] -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v8.4s, v15.4s, v22.4s -mla v21.4S, v11.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -sub v11.4s, v2.4s, v21.4s -mla v19.4S, v12.4S, v31.s[0] -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[0] -mul v17.4S, v17.4S,v28.s[0] -sub v12.4s, v18.4s, v19.4s -mla v1.4S, v22.4S, v31.s[0] -add v18.4s, v18.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v22.4s, v3.4s, v1.4s -mla v17.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v21.4s, v13.4s, v17.4s -mla v14.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v20.4S, v27.s[1] -mul v20.4S, v20.4S,v28.s[1] -sub v19.4s, v10.4s, v14.4s -mla v0.4S, v1.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v27.s[2] -mul v15.4S, v15.4S,v28.s[2] -sub v1.4s, v9.4s, v0.4s -mla v20.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v0.4s -sqrdmulh v0.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v17.4s, v16.4s, v20.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v14.4s, v18.4s, v15.4s -mla v2.4S, v0.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v27.s[3] -mul v11.4S, v11.4S,v28.s[3] -sub v0.4s, v3.4s, v2.4s -mla v8.4S, v20.4S, v31.s[0] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v10.4S, v25.s[0] -mul v10.4S, v10.4S,v26.s[0] -sub v20.4s, v12.4s, v8.4s -mla v11.4S, v15.4S, v31.s[0] -add v12.4s, v12.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v15.4s, v22.4s, v11.4s -mla v10.4S, v2.4S, v31.s[0] -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v2.4s, v13.4s, v10.4s -mla v19.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v10.4s -sqrdmulh v10.4S, v17.4S, v25.s[3] -mul v17.4S, v17.4S,v26.s[3] -sub v8.4s, v21.4s, v19.4s -mla v16.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -str q13, [x0, #48] -sqrdmulh v13.4S, v3.4S, v23.s[0] -str q2, [x0, #112] -mul v3.4S, v3.4S,v24.s[0] -ldr q2, [x0, #768] -sub v19.4s, v9.4s, v16.4s -ldr q11, [x0, #832] -mla v17.4S, v10.4S, v31.s[0] -add v9.4s, v9.4s, v16.4s -str q21, [x0, #176] -sqrdmulh v21.4S, v0.4S, v23.s[1] -str q8, [x0, #240] -mul v0.4S, v0.4S,v24.s[1] -ldr q8, [x0, #896] -sub v16.4s, v1.4s, v17.4s -ldr q10, [x0, #960] -mla v3.4S, v13.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -str q9, [x0, #304] -sqrdmulh v9.4S, v22.4S, v23.s[2] -str q19, [x0, #368] -mul v22.4S, v22.4S,v24.s[2] -ldr q19, [x0, #256] -sub v17.4s, v18.4s, v3.4s -ldr q13, [x0, #320] -mla v0.4S, v21.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -str q1, [x0, #432] -sqrdmulh v1.4S, v15.4S, v23.s[3] -str q16, [x0, #496] -mul v15.4S, v15.4S,v24.s[3] -ldr q16, [x0, #384] -sub v3.4s, v14.4s, v0.4s -ldr q21, [x0, #448] -mla v22.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -str q18, [x0, #560] -sqrdmulh v18.4S, v2.4S, v29.s[0] -str q17, [x0, #624] -ldr q17, [x0, #512] -mul v2.4S, v2.4S,v30.s[0] -ldr q0, [x0, #576] -sub v9.4s, v12.4s, v22.4s -mla v15.4S, v1.4S, v31.s[0] -add v12.4s, v12.4s, v22.4s -str q14, [x0, #688] -sqrdmulh v14.4S, v11.4S, v29.s[0] -str q3, [x0, #752] -ldr q3, [x0, #640] -mul v11.4S, v11.4S,v30.s[0] -ldr q22, [x0, #704] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -str q12, [x0, #816] -sqrdmulh v12.4S, v8.4S, v29.s[0] -str q9, [x0, #880] -mul v8.4S, v8.4S,v30.s[0] -ldr q9, [x0, #0] -sub v15.4s, v19.4s, v2.4s -mla v11.4S, v14.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -str q20, [x0, #944] -sqrdmulh v20.4S, v10.4S, v29.s[0] -str q1, [x0, #1008] -mul v10.4S, v10.4S,v30.s[0] -ldr q1, [x0, #64] -sub v2.4s, v13.4s, v11.4s -mla v8.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v29.s[0] -ldr q12, [x0, #128] -mul v17.4S, v17.4S,v30.s[0] -sub v14.4s, v16.4s, v8.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v0.4S, v29.s[0] -ldr q20, [x0, #192] -mul v0.4S, v0.4S,v30.s[0] -sub v18.4s, v21.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -sub v11.4s, v9.4s, v17.4s -mla v0.4S, v8.4S, v31.s[0] -add v9.4s, v9.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v8.4s, v1.4s, v0.4s -mla v3.4S, v10.4S, v31.s[0] -add v1.4s, v1.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v10.4s, v12.4s, v3.4s -mla v22.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v17.4s, v20.4s, v22.4s -mla v16.4S, v0.4S, v31.s[0] -add v20.4s, v20.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -sub v0.4s, v12.4s, v16.4s -mla v21.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v3.4s, v20.4s, v21.4s -mla v19.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v22.4s, v9.4s, v19.4s -mla v13.4S, v16.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v29.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v16.4s, v1.4s, v13.4s -mla v14.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v13.4s -sqrdmulh v13.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v21.4s, v10.4s, v14.4s -mla v18.4S, v19.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v19.4s, v17.4s, v18.4s -mla v15.4S, v13.4S, v31.s[0] -add v17.4s, v17.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v13.4s, v11.4s, v15.4s -mla v2.4S, v14.4S, v31.s[0] -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v27.s[0] -mul v20.4S, v20.4S,v28.s[0] -sub v14.4s, v8.4s, v2.4s -mla v12.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v2.4s -sqrdmulh v2.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v18.4s, v9.4s, v12.4s -mla v20.4S, v15.4S, v31.s[0] -add v9.4s, v9.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -sub v15.4s, v1.4s, v20.4s -mla v0.4S, v2.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -sub v2.4s, v22.4s, v0.4s -mla v3.4S, v12.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v12.4s, v16.4s, v3.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v11.4s, v10.4s -mla v17.4S, v0.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -sqrdmulh v10.4S, v19.4S, v27.s[3] -mul v19.4S, v19.4S,v28.s[3] -sub v0.4s, v8.4s, v17.4s -mla v21.4S, v3.4S, v31.s[0] -add v8.4s, v8.4s, v17.4s -sqrdmulh v17.4S, v1.4S, v25.s[0] -mul v1.4S, v1.4S,v26.s[0] -sub v3.4s, v13.4s, v21.4s -mla v19.4S, v10.4S, v31.s[0] -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v15.4S, v25.s[1] -mul v15.4S, v15.4S,v26.s[1] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v17.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v17.4s, v9.4s, v1.4s -mla v15.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -sub v21.4s, v18.4s, v15.4s -mla v16.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -str q9, [x0, #0] -sqrdmulh v9.4S, v8.4S, v23.s[0] -str q17, [x0, #64] -mul v8.4S, v8.4S,v24.s[0] -ldr q17, [x0, #784] -sub v15.4s, v22.4s, v16.4s -ldr q19, [x0, #848] -mla v12.4S, v1.4S, v31.s[0] -add v22.4s, v22.4s, v16.4s -str q18, [x0, #128] -sqrdmulh v18.4S, v0.4S, v23.s[1] -str q21, [x0, #192] -mul v0.4S, v0.4S,v24.s[1] -ldr q21, [x0, #912] -sub v16.4s, v2.4s, v12.4s -ldr q1, [x0, #976] -mla v8.4S, v9.4S, v31.s[0] -add v2.4s, v2.4s, v12.4s -str q22, [x0, #256] -sqrdmulh v22.4S, v14.4S, v23.s[2] -str q15, [x0, #320] -mul v14.4S, v14.4S,v24.s[2] -ldr q15, [x0, #272] -sub v12.4s, v11.4s, v8.4s -ldr q9, [x0, #336] -mla v0.4S, v18.4S, v31.s[0] -add v11.4s, v11.4s, v8.4s -str q2, [x0, #384] -sqrdmulh v2.4S, v10.4S, v23.s[3] -str q16, [x0, #448] -mul v10.4S, v10.4S,v24.s[3] -ldr q16, [x0, #400] -sub v8.4s, v20.4s, v0.4s -ldr q18, [x0, #464] -mla v14.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v0.4s -str q11, [x0, #512] -sqrdmulh v11.4S, v17.4S, v29.s[0] -str q12, [x0, #576] -ldr q12, [x0, #528] -mul v17.4S, v17.4S,v30.s[0] -ldr q0, [x0, #592] -sub v22.4s, v13.4s, v14.4s -mla v10.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v14.4s -str q20, [x0, #640] -sqrdmulh v20.4S, v19.4S, v29.s[0] -str q8, [x0, #704] -ldr q8, [x0, #656] -mul v19.4S, v19.4S,v30.s[0] -ldr q14, [x0, #720] -sub v2.4s, v3.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v3.4s, v3.4s, v10.4s -str q13, [x0, #768] -sqrdmulh v13.4S, v21.4S, v29.s[0] -str q22, [x0, #832] -mul v21.4S, v21.4S,v30.s[0] -ldr q22, [x0, #16] -sub v10.4s, v15.4s, v17.4s -mla v19.4S, v20.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -str q3, [x0, #896] -sqrdmulh v3.4S, v1.4S, v29.s[0] -str q2, [x0, #960] -mul v1.4S, v1.4S,v30.s[0] -ldr q2, [x0, #80] -sub v17.4s, v9.4s, v19.4s -mla v21.4S, v13.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v12.4S, v29.s[0] -ldr q13, [x0, #144] -mul v12.4S, v12.4S,v30.s[0] -sub v20.4s, v16.4s, v21.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v0.4S, v29.s[0] -ldr q3, [x0, #208] -mul v0.4S, v0.4S,v30.s[0] -sub v11.4s, v18.4s, v1.4s -mla v12.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v1.4s -sqrdmulh v1.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v19.4s, v22.4s, v12.4s -mla v0.4S, v21.4S, v31.s[0] -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v2.4s, v0.4s -mla v8.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v1.4s, v13.4s, v8.4s -mla v14.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v12.4s, v3.4s, v14.4s -mla v16.4S, v0.4S, v31.s[0] -add v3.4s, v3.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -sub v0.4s, v13.4s, v16.4s -mla v18.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -sqrdmulh v16.4S, v9.4S, v29.s[1] -mul v9.4S, v9.4S,v30.s[1] -sub v8.4s, v3.4s, v18.4s -mla v15.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v14.4s, v22.4s, v15.4s -mla v9.4S, v16.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v16.4s, v2.4s, v9.4s -mla v20.4S, v18.4S, v31.s[0] -add v2.4s, v2.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v20.4s -mla v11.4S, v15.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v15.4s, v12.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v12.4s, v12.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v27.s[0] -mul v13.4S, v13.4S,v28.s[0] -sub v9.4s, v19.4s, v10.4s -mla v17.4S, v20.4S, v31.s[0] -add v19.4s, v19.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v27.s[0] -mul v3.4S, v3.4S,v28.s[0] -sub v20.4s, v21.4s, v17.4s -mla v13.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v11.4s, v22.4s, v13.4s -mla v3.4S, v10.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -sub v10.4s, v2.4s, v3.4s -mla v0.4S, v17.4S, v31.s[0] -add v2.4s, v2.4s, v3.4s -sqrdmulh v3.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v17.4s, v14.4s, v0.4s -mla v8.4S, v13.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[2] -mul v12.4S, v12.4S,v28.s[2] -sub v13.4s, v16.4s, v8.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v3.4s, v19.4s, v1.4s -mla v12.4S, v0.4S, v31.s[0] -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v0.4s, v21.4s, v12.4s -mla v18.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v10.4S, v25.s[1] -mul v10.4S, v10.4S,v26.s[1] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v12.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v12.4s, v22.4s, v2.4s -mla v10.4S, v18.4S, v31.s[0] -add v22.4s, v22.4s, v2.4s -sqrdmulh v2.4S, v13.4S, v25.s[3] -mul v13.4S, v13.4S,v26.s[3] -sub v18.4s, v11.4s, v10.4s -mla v16.4S, v15.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -str q22, [x0, #16] -sqrdmulh v22.4S, v21.4S, v23.s[0] -str q12, [x0, #80] -mul v21.4S, v21.4S,v24.s[0] -sub v12.4s, v14.4s, v16.4s -mla v13.4S, v2.4S, v31.s[0] -add v14.4s, v14.4s, v16.4s -str q11, [x0, #144] -sqrdmulh v11.4S, v0.4S, v23.s[1] -str q18, [x0, #208] -mul v0.4S, v0.4S,v24.s[1] -sub v18.4s, v17.4s, v13.4s -mla v21.4S, v22.4S, v31.s[0] -add v17.4s, v17.4s, v13.4s -str q14, [x0, #272] -sqrdmulh v14.4S, v20.4S, v23.s[2] -str q12, [x0, #336] -mul v20.4S, v20.4S,v24.s[2] -sub v12.4s, v19.4s, v21.4s -mla v0.4S, v11.4S, v31.s[0] -add v19.4s, v19.4s, v21.4s -str q17, [x0, #400] -sqrdmulh v17.4S, v1.4S, v23.s[3] -str q18, [x0, #464] -mul v1.4S, v1.4S,v24.s[3] -sub v18.4s, v3.4s, v0.4s -mla v20.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v0.4s -str q19, [x0, #528] -str q12, [x0, #592] -sub v12.4s, v9.4s, v20.4s -mla v1.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v20.4s -str q3, [x0, #656] -str q18, [x0, #720] -sub v18.4s, v8.4s, v1.4s -add v8.4s, v8.4s, v1.4s -str q9, [x0, #784] -str q12, [x0, #848] -str q8, [x0, #912] -str q18, [x0, #976] -ldr q4, [x17, #+128] -ldr q5, [x17, #+144] -ldr q6, [x17, #+160] -ldr q7, [x17, #+176] -ldr q15, [x17, #+192] -ldr q10, [x17, #+208] -ldr q2, [x17, #+224] -ldr q16, [x17, #+240] -ldr q22, [x0, #32] -ldr q13, [x0, #48] -ldr q11, [x0, #0] -ldr q21, [x0, #16] -sqrdmulh v14.4S, v22.4S, v5.s[0] -mul v22.4S, v22.4S,v4.s[0] -mla v22.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v13.4S, v5.s[0] -mul v13.4S, v13.4S,v4.s[0] -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -ldr q13, [x17, #+256] -ldr q0, [x17, #+272] -sqrdmulh v19.4S, v21.4S, v5.s[1] -mul v21.4S, v21.4S,v4.s[1] -mla v21.4S, v19.4S, v31.s[0] -sub v19.4s, v11.4s, v21.4s -add v11.4s, v11.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v5.s[2] -mul v22.4S, v22.4S,v4.s[2] -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v22.4s -add v14.4s, v14.4s, v22.4s -str q11, [x0, #0] -str q19, [x0, #16] -str q14, [x0, #32] -str q21, [x0, #48] -ldr q21, [x0, #96] -ldr q14, [x0, #112] -ldr q19, [x0, #64] -ldr q11, [x0, #80] -sqrdmulh v22.4S, v21.4S, v7.s[0] -mul v21.4S, v21.4S,v6.s[0] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v19.4s, v21.4s -add v19.4s, v19.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v7.s[0] -mul v14.4S, v14.4S,v6.s[0] -mla v14.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -ldr q14, [x17, #+288] -ldr q17, [x17, #+304] -sqrdmulh v20.4S, v11.4S, v7.s[1] -mul v11.4S, v11.4S,v6.s[1] -mla v11.4S, v20.4S, v31.s[0] -sub v20.4s, v19.4s, v11.4s -add v19.4s, v19.4s, v11.4s -sqrdmulh v11.4S, v21.4S, v7.s[2] -mul v21.4S, v21.4S,v6.s[2] -mla v21.4S, v11.4S, v31.s[0] -sub v11.4s, v22.4s, v21.4s -add v22.4s, v22.4s, v21.4s -str q19, [x0, #64] -str q20, [x0, #80] -str q22, [x0, #96] -str q11, [x0, #112] -ldr q11, [x0, #160] -ldr q22, [x0, #176] -ldr q20, [x0, #128] -ldr q19, [x0, #144] -sqrdmulh v21.4S, v11.4S, v10.s[0] -mul v11.4S, v11.4S,v15.s[0] -mla v11.4S, v21.4S, v31.s[0] -sub v21.4s, v20.4s, v11.4s -add v20.4s, v20.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v10.s[0] -mul v22.4S, v22.4S,v15.s[0] -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v19.4s, v22.4s -add v19.4s, v19.4s, v22.4s -ldr q22, [x17, #+320] -ldr q3, [x17, #+336] -sqrdmulh v1.4S, v19.4S, v10.s[1] -mul v19.4S, v19.4S,v15.s[1] -mla v19.4S, v1.4S, v31.s[0] -sub v1.4s, v20.4s, v19.4s -add v20.4s, v20.4s, v19.4s -sqrdmulh v19.4S, v11.4S, v10.s[2] -mul v11.4S, v11.4S,v15.s[2] -mla v11.4S, v19.4S, v31.s[0] -sub v19.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -str q20, [x0, #128] -str q1, [x0, #144] -str q21, [x0, #160] -str q19, [x0, #176] -ldr q19, [x0, #224] -ldr q21, [x0, #240] -ldr q1, [x0, #192] -ldr q20, [x0, #208] -sqrdmulh v11.4S, v19.4S, v16.s[0] -mul v19.4S, v19.4S,v2.s[0] -mla v19.4S, v11.4S, v31.s[0] -sub v11.4s, v1.4s, v19.4s -add v1.4s, v1.4s, v19.4s -sqrdmulh v19.4S, v21.4S, v16.s[0] -mul v21.4S, v21.4S,v2.s[0] -mla v21.4S, v19.4S, v31.s[0] -sub v19.4s, v20.4s, v21.4s -add v20.4s, v20.4s, v21.4s -ldr q21, [x17, #+352] -ldr q9, [x17, #+368] -sqrdmulh v12.4S, v20.4S, v16.s[1] -mul v20.4S, v20.4S,v2.s[1] -mla v20.4S, v12.4S, v31.s[0] -sub v12.4s, v1.4s, v20.4s -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v16.s[2] -mul v19.4S, v19.4S,v2.s[2] -mla v19.4S, v20.4S, v31.s[0] -sub v20.4s, v11.4s, v19.4s -add v11.4s, v11.4s, v19.4s -str q1, [x0, #192] -str q12, [x0, #208] -str q11, [x0, #224] -str q20, [x0, #240] -ldr q20, [x0, #288] -ldr q11, [x0, #304] -ldr q12, [x0, #256] -ldr q1, [x0, #272] -sqrdmulh v19.4S, v20.4S, v0.s[0] -mul v20.4S, v20.4S,v13.s[0] -mla v20.4S, v19.4S, v31.s[0] -sub v19.4s, v12.4s, v20.4s -add v12.4s, v12.4s, v20.4s -sqrdmulh v20.4S, v11.4S, v0.s[0] -mul v11.4S, v11.4S,v13.s[0] -mla v11.4S, v20.4S, v31.s[0] -sub v20.4s, v1.4s, v11.4s -add v1.4s, v1.4s, v11.4s -ldr q11, [x17, #+384] -ldr q8, [x17, #+400] -sqrdmulh v18.4S, v1.4S, v0.s[1] -mul v1.4S, v1.4S,v13.s[1] -mla v1.4S, v18.4S, v31.s[0] -sub v18.4s, v12.4s, v1.4s -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v20.4S, v0.s[2] -mul v20.4S, v20.4S,v13.s[2] -mla v20.4S, v1.4S, v31.s[0] -sub v1.4s, v19.4s, v20.4s -add v19.4s, v19.4s, v20.4s -str q12, [x0, #256] -str q18, [x0, #272] -str q19, [x0, #288] -str q1, [x0, #304] -ldr q5, [x0, #352] -ldr q4, [x0, #368] -ldr q1, [x0, #320] -ldr q19, [x0, #336] -sqrdmulh v18.4S, v5.4S, v17.s[0] -mul v5.4S, v5.4S,v14.s[0] -mla v5.4S, v18.4S, v31.s[0] -sub v18.4s, v1.4s, v5.4s -add v1.4s, v1.4s, v5.4s -sqrdmulh v5.4S, v4.4S, v17.s[0] -mul v4.4S, v4.4S,v14.s[0] -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v19.4s, v4.4s -add v19.4s, v19.4s, v4.4s -ldr q4, [x17, #+416] -ldr q12, [x17, #+432] -sqrdmulh v20.4S, v19.4S, v17.s[1] -mul v19.4S, v19.4S,v14.s[1] -mla v19.4S, v20.4S, v31.s[0] -sub v20.4s, v1.4s, v19.4s -add v1.4s, v1.4s, v19.4s -sqrdmulh v19.4S, v5.4S, v17.s[2] -mul v5.4S, v5.4S,v14.s[2] -mla v5.4S, v19.4S, v31.s[0] -sub v19.4s, v18.4s, v5.4s -add v18.4s, v18.4s, v5.4s -str q1, [x0, #320] -str q20, [x0, #336] -str q18, [x0, #352] -str q19, [x0, #368] -ldr q7, [x0, #416] -ldr q6, [x0, #432] -ldr q19, [x0, #384] -ldr q18, [x0, #400] -sqrdmulh v20.4S, v7.4S, v3.s[0] -mul v7.4S, v7.4S,v22.s[0] -mla v7.4S, v20.4S, v31.s[0] -sub v20.4s, v19.4s, v7.4s -add v19.4s, v19.4s, v7.4s -sqrdmulh v7.4S, v6.4S, v3.s[0] -mul v6.4S, v6.4S,v22.s[0] -mla v6.4S, v7.4S, v31.s[0] -sub v7.4s, v18.4s, v6.4s -add v18.4s, v18.4s, v6.4s -ldr q6, [x17, #+448] -ldr q1, [x17, #+464] -sqrdmulh v5.4S, v18.4S, v3.s[1] -mul v18.4S, v18.4S,v22.s[1] -mla v18.4S, v5.4S, v31.s[0] -sub v5.4s, v19.4s, v18.4s -add v19.4s, v19.4s, v18.4s -sqrdmulh v18.4S, v7.4S, v3.s[2] -mul v7.4S, v7.4S,v22.s[2] -mla v7.4S, v18.4S, v31.s[0] -sub v18.4s, v20.4s, v7.4s -add v20.4s, v20.4s, v7.4s -str q19, [x0, #384] -str q5, [x0, #400] -str q20, [x0, #416] -str q18, [x0, #432] -ldr q10, [x0, #480] -ldr q15, [x0, #496] -ldr q18, [x0, #448] -ldr q20, [x0, #464] -sqrdmulh v5.4S, v10.4S, v9.s[0] -mul v10.4S, v10.4S,v21.s[0] -mla v10.4S, v5.4S, v31.s[0] -sub v5.4s, v18.4s, v10.4s -add v18.4s, v18.4s, v10.4s -sqrdmulh v10.4S, v15.4S, v9.s[0] -mul v15.4S, v15.4S,v21.s[0] -mla v15.4S, v10.4S, v31.s[0] -sub v10.4s, v20.4s, v15.4s -add v20.4s, v20.4s, v15.4s -ldr q15, [x17, #+480] -ldr q19, [x17, #+496] -sqrdmulh v7.4S, v20.4S, v9.s[1] -mul v20.4S, v20.4S,v21.s[1] -mla v20.4S, v7.4S, v31.s[0] -sub v7.4s, v18.4s, v20.4s -add v18.4s, v18.4s, v20.4s -sqrdmulh v20.4S, v10.4S, v9.s[2] -mul v10.4S, v10.4S,v21.s[2] -mla v10.4S, v20.4S, v31.s[0] -sub v20.4s, v5.4s, v10.4s -add v5.4s, v5.4s, v10.4s -str q18, [x0, #448] -str q7, [x0, #464] -str q5, [x0, #480] -str q20, [x0, #496] -ldr q16, [x0, #544] -ldr q2, [x0, #560] -ldr q20, [x0, #512] -ldr q5, [x0, #528] -sqrdmulh v7.4S, v16.4S, v8.s[0] -mul v16.4S, v16.4S,v11.s[0] -mla v16.4S, v7.4S, v31.s[0] -sub v7.4s, v20.4s, v16.4s -add v20.4s, v20.4s, v16.4s -sqrdmulh v16.4S, v2.4S, v8.s[0] -mul v2.4S, v2.4S,v11.s[0] -mla v2.4S, v16.4S, v31.s[0] -sub v16.4s, v5.4s, v2.4s -add v5.4s, v5.4s, v2.4s -ldr q2, [x17, #+512] -ldr q18, [x17, #+528] -sqrdmulh v10.4S, v5.4S, v8.s[1] -mul v5.4S, v5.4S,v11.s[1] -mla v5.4S, v10.4S, v31.s[0] -sub v10.4s, v20.4s, v5.4s -add v20.4s, v20.4s, v5.4s -sqrdmulh v5.4S, v16.4S, v8.s[2] -mul v16.4S, v16.4S,v11.s[2] -mla v16.4S, v5.4S, v31.s[0] -sub v5.4s, v7.4s, v16.4s -add v7.4s, v7.4s, v16.4s -str q20, [x0, #512] -str q10, [x0, #528] -str q7, [x0, #544] -str q5, [x0, #560] -ldr q0, [x0, #608] -ldr q13, [x0, #624] -ldr q5, [x0, #576] -ldr q7, [x0, #592] -sqrdmulh v10.4S, v0.4S, v12.s[0] -mul v0.4S, v0.4S,v4.s[0] -mla v0.4S, v10.4S, v31.s[0] -sub v10.4s, v5.4s, v0.4s -add v5.4s, v5.4s, v0.4s -sqrdmulh v0.4S, v13.4S, v12.s[0] -mul v13.4S, v13.4S,v4.s[0] -mla v13.4S, v0.4S, v31.s[0] -sub v0.4s, v7.4s, v13.4s -add v7.4s, v7.4s, v13.4s -ldr q13, [x17, #+544] -ldr q20, [x17, #+560] -sqrdmulh v16.4S, v7.4S, v12.s[1] -mul v7.4S, v7.4S,v4.s[1] -mla v7.4S, v16.4S, v31.s[0] -sub v16.4s, v5.4s, v7.4s -add v5.4s, v5.4s, v7.4s -sqrdmulh v7.4S, v0.4S, v12.s[2] -mul v0.4S, v0.4S,v4.s[2] -mla v0.4S, v7.4S, v31.s[0] -sub v7.4s, v10.4s, v0.4s -add v10.4s, v10.4s, v0.4s -str q5, [x0, #576] -str q16, [x0, #592] -str q10, [x0, #608] -str q7, [x0, #624] -ldr q17, [x0, #672] -ldr q14, [x0, #688] -ldr q7, [x0, #640] -ldr q10, [x0, #656] -sqrdmulh v16.4S, v17.4S, v1.s[0] -mul v17.4S, v17.4S,v6.s[0] -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v7.4s, v17.4s -add v7.4s, v7.4s, v17.4s -sqrdmulh v17.4S, v14.4S, v1.s[0] -mul v14.4S, v14.4S,v6.s[0] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v10.4s, v14.4s -add v10.4s, v10.4s, v14.4s -ldr q14, [x17, #+576] -ldr q5, [x17, #+592] -sqrdmulh v0.4S, v10.4S, v1.s[1] -mul v10.4S, v10.4S,v6.s[1] -mla v10.4S, v0.4S, v31.s[0] -sub v0.4s, v7.4s, v10.4s -add v7.4s, v7.4s, v10.4s -sqrdmulh v10.4S, v17.4S, v1.s[2] -mul v17.4S, v17.4S,v6.s[2] -mla v17.4S, v10.4S, v31.s[0] -sub v10.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -str q7, [x0, #640] -str q0, [x0, #656] -str q16, [x0, #672] -str q10, [x0, #688] -ldr q3, [x0, #736] -ldr q22, [x0, #752] -ldr q10, [x0, #704] -ldr q16, [x0, #720] -sqrdmulh v0.4S, v3.4S, v19.s[0] -mul v3.4S, v3.4S,v15.s[0] -mla v3.4S, v0.4S, v31.s[0] -sub v0.4s, v10.4s, v3.4s -add v10.4s, v10.4s, v3.4s -sqrdmulh v3.4S, v22.4S, v19.s[0] -mul v22.4S, v22.4S,v15.s[0] -mla v22.4S, v3.4S, v31.s[0] -sub v3.4s, v16.4s, v22.4s -add v16.4s, v16.4s, v22.4s -ldr q22, [x17, #+608] -ldr q7, [x17, #+624] -sqrdmulh v17.4S, v16.4S, v19.s[1] -mul v16.4S, v16.4S,v15.s[1] -mla v16.4S, v17.4S, v31.s[0] -sub v17.4s, v10.4s, v16.4s -add v10.4s, v10.4s, v16.4s -sqrdmulh v16.4S, v3.4S, v19.s[2] -mul v3.4S, v3.4S,v15.s[2] -mla v3.4S, v16.4S, v31.s[0] -sub v16.4s, v0.4s, v3.4s -add v0.4s, v0.4s, v3.4s -str q10, [x0, #704] -str q17, [x0, #720] -str q0, [x0, #736] -str q16, [x0, #752] -ldr q9, [x0, #800] -ldr q21, [x0, #816] -ldr q16, [x0, #768] -ldr q0, [x0, #784] -sqrdmulh v17.4S, v9.4S, v18.s[0] -mul v9.4S, v9.4S,v2.s[0] -mla v9.4S, v17.4S, v31.s[0] -sub v17.4s, v16.4s, v9.4s -add v16.4s, v16.4s, v9.4s -sqrdmulh v9.4S, v21.4S, v18.s[0] -mul v21.4S, v21.4S,v2.s[0] -mla v21.4S, v9.4S, v31.s[0] -sub v9.4s, v0.4s, v21.4s -add v0.4s, v0.4s, v21.4s -sqrdmulh v21.4S, v0.4S, v18.s[1] -mul v0.4S, v0.4S,v2.s[1] -mla v0.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v0.4s -add v16.4s, v16.4s, v0.4s -sqrdmulh v0.4S, v9.4S, v18.s[2] -mul v9.4S, v9.4S,v2.s[2] -mla v9.4S, v0.4S, v31.s[0] -sub v0.4s, v17.4s, v9.4s -add v17.4s, v17.4s, v9.4s -str q16, [x0, #768] -str q21, [x0, #784] -str q17, [x0, #800] -str q0, [x0, #816] -ldr q8, [x0, #864] -ldr q11, [x0, #880] -ldr q0, [x0, #832] -ldr q17, [x0, #848] -sqrdmulh v21.4S, v8.4S, v20.s[0] -mul v8.4S, v8.4S,v13.s[0] -mla v8.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v8.4s -add v0.4s, v0.4s, v8.4s -sqrdmulh v8.4S, v11.4S, v20.s[0] -mul v11.4S, v11.4S,v13.s[0] -mla v11.4S, v8.4S, v31.s[0] -sub v8.4s, v17.4s, v11.4s -add v17.4s, v17.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v20.s[1] -mul v17.4S, v17.4S,v13.s[1] -mla v17.4S, v11.4S, v31.s[0] -sub v11.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v8.4S, v20.s[2] -mul v8.4S, v8.4S,v13.s[2] -mla v8.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v8.4s -add v21.4s, v21.4s, v8.4s -str q0, [x0, #832] -str q11, [x0, #848] -str q21, [x0, #864] -str q17, [x0, #880] -ldr q12, [x0, #928] -ldr q4, [x0, #944] -ldr q17, [x0, #896] -ldr q21, [x0, #912] -sqrdmulh v11.4S, v12.4S, v5.s[0] -mul v12.4S, v12.4S,v14.s[0] -mla v12.4S, v11.4S, v31.s[0] -sub v11.4s, v17.4s, v12.4s -add v17.4s, v17.4s, v12.4s -sqrdmulh v12.4S, v4.4S, v5.s[0] -mul v4.4S, v4.4S,v14.s[0] -mla v4.4S, v12.4S, v31.s[0] -sub v12.4s, v21.4s, v4.4s -add v21.4s, v21.4s, v4.4s -sqrdmulh v4.4S, v21.4S, v5.s[1] -mul v21.4S, v21.4S,v14.s[1] -mla v21.4S, v4.4S, v31.s[0] -sub v4.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v12.4S, v5.s[2] -mul v12.4S, v12.4S,v14.s[2] -mla v12.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -str q17, [x0, #896] -str q4, [x0, #912] -str q11, [x0, #928] -str q21, [x0, #944] -ldr q1, [x0, #992] -ldr q6, [x0, #1008] -ldr q21, [x0, #960] -ldr q11, [x0, #976] -sqrdmulh v4.4S, v1.4S, v7.s[0] -mul v1.4S, v1.4S,v22.s[0] -mla v1.4S, v4.4S, v31.s[0] -sub v4.4s, v21.4s, v1.4s -add v21.4s, v21.4s, v1.4s -sqrdmulh v1.4S, v6.4S, v7.s[0] -mul v6.4S, v6.4S,v22.s[0] -mla v6.4S, v1.4S, v31.s[0] -sub v1.4s, v11.4s, v6.4s -add v11.4s, v11.4s, v6.4s -sqrdmulh v6.4S, v11.4S, v7.s[1] -mul v11.4S, v11.4S,v22.s[1] -mla v11.4S, v6.4S, v31.s[0] -sub v6.4s, v21.4s, v11.4s -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v1.4S, v7.s[2] -mul v1.4S, v1.4S,v22.s[2] -mla v1.4S, v11.4S, v31.s[0] -sub v11.4s, v4.4s, v1.4s -add v4.4s, v4.4s, v1.4s -str q21, [x0, #960] -str q6, [x0, #976] -str q4, [x0, #992] -str q11, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1464 -// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_1.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_1.s deleted file mode 100644 index 6da50a5..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_1.s +++ /dev/null @@ -1,1494 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_3_z4_1 -.global _ntt_u32_incomplete_neon_asm_var_4_2_3_z4_1 -ntt_u32_incomplete_neon_asm_var_4_2_3_z4_1: -_ntt_u32_incomplete_neon_asm_var_4_2_3_z4_1: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #800] -ldr q21, [x0, #864] -ldr q20, [x0, #928] -ldr q19, [x0, #992] -ldr q18, [x0, #288] -ldr q17, [x0, #352] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -sqrdmulh v2.4S, v22.4S, v29.s[0] -ldr q1, [x0, #544] -mul v22.4S, v22.4S,v30.s[0] -ldr q0, [x0, #608] -sqrdmulh v15.4S, v21.4S, v29.s[0] -ldr q14, [x0, #672] -mul v21.4S, v21.4S,v30.s[0] -ldr q13, [x0, #736] -mla v22.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q12, [x0, #32] -sub v11.4s, v18.4s, v22.4s -mla v21.4S, v15.4S, v31.s[0] -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q15, [x0, #96] -sub v10.4s, v17.4s, v21.4s -mla v20.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v1.4S, v29.s[0] -ldr q2, [x0, #160] -mul v1.4S, v1.4S,v30.s[0] -sub v9.4s, v16.4s, v20.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v0.4S, v29.s[0] -ldr q22, [x0, #224] -mul v0.4S, v0.4S,v30.s[0] -sub v8.4s, v3.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v12.4s, v1.4s -mla v0.4S, v20.4S, v31.s[0] -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -sub v20.4s, v15.4s, v0.4s -mla v14.4S, v19.4S, v31.s[0] -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v19.4s, v2.4s, v14.4s -mla v13.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v1.4s, v22.4s, v13.4s -mla v16.4S, v0.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v0.4s, v2.4s, v16.4s -mla v3.4S, v14.4S, v31.s[0] -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v14.4s, v22.4s, v3.4s -mla v18.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v13.4s, v12.4s, v18.4s -mla v17.4S, v16.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v8.4S, v29.s[2] -mul v8.4S, v8.4S,v30.s[2] -sub v16.4s, v15.4s, v17.4s -mla v9.4S, v3.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v3.4s, v19.4s, v9.4s -mla v8.4S, v18.4S, v31.s[0] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v8.4s -mla v11.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v8.4s -sqrdmulh v8.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v17.4s, v21.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v27.s[0] -mul v22.4S, v22.4S,v28.s[0] -sub v9.4s, v20.4s, v10.4s -mla v2.4S, v8.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v8.4s, v12.4s, v2.4s -mla v22.4S, v11.4S, v31.s[0] -add v12.4s, v12.4s, v2.4s -sqrdmulh v2.4S, v14.4S, v27.s[1] -mul v14.4S, v14.4S,v28.s[1] -sub v11.4s, v15.4s, v22.4s -mla v0.4S, v10.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v27.s[2] -mul v19.4S, v19.4S,v28.s[2] -sub v10.4s, v13.4s, v0.4s -mla v14.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v2.4s, v16.4s, v14.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v27.s[3] -mul v3.4S, v3.4S,v28.s[3] -sub v22.4s, v21.4s, v19.4s -mla v1.4S, v0.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v0.4s, v20.4s, v1.4s -mla v3.4S, v14.4S, v31.s[0] -add v20.4s, v20.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v25.s[0] -mul v15.4S, v15.4S,v26.s[0] -sub v14.4s, v17.4s, v3.4s -mla v18.4S, v19.4S, v31.s[0] -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v11.4S, v25.s[1] -mul v11.4S, v11.4S,v26.s[1] -sub v19.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v1.4s, v12.4s, v15.4s -mla v11.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v25.s[3] -mul v2.4S, v2.4S,v26.s[3] -sub v3.4s, v8.4s, v11.4s -mla v16.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v11.4s -str q12, [x0, #32] -sqrdmulh v12.4S, v20.4S, v23.s[0] -str q1, [x0, #96] -mul v20.4S, v20.4S,v24.s[0] -ldr q1, [x0, #816] -sub v11.4s, v13.4s, v16.4s -ldr q18, [x0, #880] -mla v2.4S, v15.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -str q8, [x0, #160] -sqrdmulh v8.4S, v0.4S, v23.s[1] -str q3, [x0, #224] -mul v0.4S, v0.4S,v24.s[1] -ldr q3, [x0, #944] -sub v16.4s, v10.4s, v2.4s -ldr q15, [x0, #1008] -mla v20.4S, v12.4S, v31.s[0] -add v10.4s, v10.4s, v2.4s -str q13, [x0, #288] -sqrdmulh v13.4S, v9.4S, v23.s[2] -str q11, [x0, #352] -mul v9.4S, v9.4S,v24.s[2] -ldr q11, [x0, #304] -sub v2.4s, v21.4s, v20.4s -ldr q12, [x0, #368] -mla v0.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v20.4s -str q10, [x0, #416] -sqrdmulh v10.4S, v19.4S, v23.s[3] -str q16, [x0, #480] -mul v19.4S, v19.4S,v24.s[3] -ldr q16, [x0, #432] -sub v20.4s, v22.4s, v0.4s -ldr q8, [x0, #496] -mla v9.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -str q21, [x0, #544] -sqrdmulh v21.4S, v1.4S, v29.s[0] -str q2, [x0, #608] -ldr q2, [x0, #560] -mul v1.4S, v1.4S,v30.s[0] -ldr q0, [x0, #624] -sub v13.4s, v17.4s, v9.4s -mla v19.4S, v10.4S, v31.s[0] -add v17.4s, v17.4s, v9.4s -str q22, [x0, #672] -sqrdmulh v22.4S, v18.4S, v29.s[0] -str q20, [x0, #736] -ldr q20, [x0, #688] -mul v18.4S, v18.4S,v30.s[0] -ldr q9, [x0, #752] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -str q17, [x0, #800] -sqrdmulh v17.4S, v3.4S, v29.s[0] -str q13, [x0, #864] -mul v3.4S, v3.4S,v30.s[0] -ldr q13, [x0, #48] -sub v19.4s, v11.4s, v1.4s -mla v18.4S, v22.4S, v31.s[0] -add v11.4s, v11.4s, v1.4s -str q14, [x0, #928] -sqrdmulh v14.4S, v15.4S, v29.s[0] -str q10, [x0, #992] -mul v15.4S, v15.4S,v30.s[0] -ldr q10, [x0, #112] -sub v1.4s, v12.4s, v18.4s -mla v3.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v29.s[0] -ldr q17, [x0, #176] -mul v2.4S, v2.4S,v30.s[0] -sub v22.4s, v16.4s, v3.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v0.4S, v29.s[0] -ldr q14, [x0, #240] -mul v0.4S, v0.4S,v30.s[0] -sub v21.4s, v8.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -sub v18.4s, v13.4s, v2.4s -mla v0.4S, v3.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v9.4S, v29.s[0] -mul v9.4S, v9.4S,v30.s[0] -sub v3.4s, v10.4s, v0.4s -mla v20.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v15.4s, v17.4s, v20.4s -mla v9.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sub v2.4s, v14.4s, v9.4s -mla v16.4S, v0.4S, v31.s[0] -add v14.4s, v14.4s, v9.4s -sqrdmulh v9.4S, v11.4S, v29.s[1] -mul v11.4S, v11.4S,v30.s[1] -sub v0.4s, v17.4s, v16.4s -mla v8.4S, v20.4S, v31.s[0] -add v17.4s, v17.4s, v16.4s -sqrdmulh v16.4S, v12.4S, v29.s[1] -mul v12.4S, v12.4S,v30.s[1] -sub v20.4s, v14.4s, v8.4s -mla v11.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v8.4s -sqrdmulh v8.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v9.4s, v13.4s, v11.4s -mla v12.4S, v16.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v16.4s, v10.4s, v12.4s -mla v22.4S, v8.4S, v31.s[0] -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v8.4s, v15.4s, v22.4s -mla v21.4S, v11.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -sub v11.4s, v2.4s, v21.4s -mla v19.4S, v12.4S, v31.s[0] -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[0] -mul v17.4S, v17.4S,v28.s[0] -sub v12.4s, v18.4s, v19.4s -mla v1.4S, v22.4S, v31.s[0] -add v18.4s, v18.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v22.4s, v3.4s, v1.4s -mla v17.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v21.4s, v13.4s, v17.4s -mla v14.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v20.4S, v27.s[1] -mul v20.4S, v20.4S,v28.s[1] -sub v19.4s, v10.4s, v14.4s -mla v0.4S, v1.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v27.s[2] -mul v15.4S, v15.4S,v28.s[2] -sub v1.4s, v9.4s, v0.4s -mla v20.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v0.4s -sqrdmulh v0.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v17.4s, v16.4s, v20.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v14.4s, v18.4s, v15.4s -mla v2.4S, v0.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v27.s[3] -mul v11.4S, v11.4S,v28.s[3] -sub v0.4s, v3.4s, v2.4s -mla v8.4S, v20.4S, v31.s[0] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v10.4S, v25.s[0] -mul v10.4S, v10.4S,v26.s[0] -sub v20.4s, v12.4s, v8.4s -mla v11.4S, v15.4S, v31.s[0] -add v12.4s, v12.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v15.4s, v22.4s, v11.4s -mla v10.4S, v2.4S, v31.s[0] -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v2.4s, v13.4s, v10.4s -mla v19.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v10.4s -sqrdmulh v10.4S, v17.4S, v25.s[3] -mul v17.4S, v17.4S,v26.s[3] -sub v8.4s, v21.4s, v19.4s -mla v16.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -str q13, [x0, #48] -sqrdmulh v13.4S, v3.4S, v23.s[0] -str q2, [x0, #112] -mul v3.4S, v3.4S,v24.s[0] -ldr q2, [x0, #768] -sub v19.4s, v9.4s, v16.4s -ldr q11, [x0, #832] -mla v17.4S, v10.4S, v31.s[0] -add v9.4s, v9.4s, v16.4s -str q21, [x0, #176] -sqrdmulh v21.4S, v0.4S, v23.s[1] -str q8, [x0, #240] -mul v0.4S, v0.4S,v24.s[1] -ldr q8, [x0, #896] -sub v16.4s, v1.4s, v17.4s -ldr q10, [x0, #960] -mla v3.4S, v13.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -str q9, [x0, #304] -sqrdmulh v9.4S, v22.4S, v23.s[2] -str q19, [x0, #368] -mul v22.4S, v22.4S,v24.s[2] -ldr q19, [x0, #256] -sub v17.4s, v18.4s, v3.4s -ldr q13, [x0, #320] -mla v0.4S, v21.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -str q1, [x0, #432] -sqrdmulh v1.4S, v15.4S, v23.s[3] -str q16, [x0, #496] -mul v15.4S, v15.4S,v24.s[3] -ldr q16, [x0, #384] -sub v3.4s, v14.4s, v0.4s -ldr q21, [x0, #448] -mla v22.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -str q18, [x0, #560] -sqrdmulh v18.4S, v2.4S, v29.s[0] -str q17, [x0, #624] -ldr q17, [x0, #512] -mul v2.4S, v2.4S,v30.s[0] -ldr q0, [x0, #576] -sub v9.4s, v12.4s, v22.4s -mla v15.4S, v1.4S, v31.s[0] -add v12.4s, v12.4s, v22.4s -str q14, [x0, #688] -sqrdmulh v14.4S, v11.4S, v29.s[0] -str q3, [x0, #752] -ldr q3, [x0, #640] -mul v11.4S, v11.4S,v30.s[0] -ldr q22, [x0, #704] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -str q12, [x0, #816] -sqrdmulh v12.4S, v8.4S, v29.s[0] -str q9, [x0, #880] -mul v8.4S, v8.4S,v30.s[0] -ldr q9, [x0, #0] -sub v15.4s, v19.4s, v2.4s -mla v11.4S, v14.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -str q20, [x0, #944] -sqrdmulh v20.4S, v10.4S, v29.s[0] -str q1, [x0, #1008] -mul v10.4S, v10.4S,v30.s[0] -ldr q1, [x0, #64] -sub v2.4s, v13.4s, v11.4s -mla v8.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v29.s[0] -ldr q12, [x0, #128] -mul v17.4S, v17.4S,v30.s[0] -sub v14.4s, v16.4s, v8.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v0.4S, v29.s[0] -ldr q20, [x0, #192] -mul v0.4S, v0.4S,v30.s[0] -sub v18.4s, v21.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -sub v11.4s, v9.4s, v17.4s -mla v0.4S, v8.4S, v31.s[0] -add v9.4s, v9.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v8.4s, v1.4s, v0.4s -mla v3.4S, v10.4S, v31.s[0] -add v1.4s, v1.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v10.4s, v12.4s, v3.4s -mla v22.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v17.4s, v20.4s, v22.4s -mla v16.4S, v0.4S, v31.s[0] -add v20.4s, v20.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -sub v0.4s, v12.4s, v16.4s -mla v21.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v3.4s, v20.4s, v21.4s -mla v19.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v22.4s, v9.4s, v19.4s -mla v13.4S, v16.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v29.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v16.4s, v1.4s, v13.4s -mla v14.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v13.4s -sqrdmulh v13.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v21.4s, v10.4s, v14.4s -mla v18.4S, v19.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v19.4s, v17.4s, v18.4s -mla v15.4S, v13.4S, v31.s[0] -add v17.4s, v17.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v13.4s, v11.4s, v15.4s -mla v2.4S, v14.4S, v31.s[0] -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v27.s[0] -mul v20.4S, v20.4S,v28.s[0] -sub v14.4s, v8.4s, v2.4s -mla v12.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v2.4s -sqrdmulh v2.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v18.4s, v9.4s, v12.4s -mla v20.4S, v15.4S, v31.s[0] -add v9.4s, v9.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -sub v15.4s, v1.4s, v20.4s -mla v0.4S, v2.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -sub v2.4s, v22.4s, v0.4s -mla v3.4S, v12.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v12.4s, v16.4s, v3.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v11.4s, v10.4s -mla v17.4S, v0.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -sqrdmulh v10.4S, v19.4S, v27.s[3] -mul v19.4S, v19.4S,v28.s[3] -sub v0.4s, v8.4s, v17.4s -mla v21.4S, v3.4S, v31.s[0] -add v8.4s, v8.4s, v17.4s -sqrdmulh v17.4S, v1.4S, v25.s[0] -mul v1.4S, v1.4S,v26.s[0] -sub v3.4s, v13.4s, v21.4s -mla v19.4S, v10.4S, v31.s[0] -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v15.4S, v25.s[1] -mul v15.4S, v15.4S,v26.s[1] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v17.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v17.4s, v9.4s, v1.4s -mla v15.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -sub v21.4s, v18.4s, v15.4s -mla v16.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -str q9, [x0, #0] -sqrdmulh v9.4S, v8.4S, v23.s[0] -str q17, [x0, #64] -mul v8.4S, v8.4S,v24.s[0] -ldr q17, [x0, #784] -sub v15.4s, v22.4s, v16.4s -ldr q19, [x0, #848] -mla v12.4S, v1.4S, v31.s[0] -add v22.4s, v22.4s, v16.4s -str q18, [x0, #128] -sqrdmulh v18.4S, v0.4S, v23.s[1] -str q21, [x0, #192] -mul v0.4S, v0.4S,v24.s[1] -ldr q21, [x0, #912] -sub v16.4s, v2.4s, v12.4s -ldr q1, [x0, #976] -mla v8.4S, v9.4S, v31.s[0] -add v2.4s, v2.4s, v12.4s -str q22, [x0, #256] -sqrdmulh v22.4S, v14.4S, v23.s[2] -str q15, [x0, #320] -mul v14.4S, v14.4S,v24.s[2] -ldr q15, [x0, #272] -sub v12.4s, v11.4s, v8.4s -ldr q9, [x0, #336] -mla v0.4S, v18.4S, v31.s[0] -add v11.4s, v11.4s, v8.4s -str q2, [x0, #384] -sqrdmulh v2.4S, v10.4S, v23.s[3] -str q16, [x0, #448] -mul v10.4S, v10.4S,v24.s[3] -ldr q16, [x0, #400] -sub v8.4s, v20.4s, v0.4s -ldr q18, [x0, #464] -mla v14.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v0.4s -str q11, [x0, #512] -sqrdmulh v11.4S, v17.4S, v29.s[0] -str q12, [x0, #576] -ldr q12, [x0, #528] -mul v17.4S, v17.4S,v30.s[0] -ldr q0, [x0, #592] -sub v22.4s, v13.4s, v14.4s -mla v10.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v14.4s -str q20, [x0, #640] -sqrdmulh v20.4S, v19.4S, v29.s[0] -str q8, [x0, #704] -ldr q8, [x0, #656] -mul v19.4S, v19.4S,v30.s[0] -ldr q14, [x0, #720] -sub v2.4s, v3.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v3.4s, v3.4s, v10.4s -str q13, [x0, #768] -sqrdmulh v13.4S, v21.4S, v29.s[0] -str q22, [x0, #832] -mul v21.4S, v21.4S,v30.s[0] -ldr q22, [x0, #16] -sub v10.4s, v15.4s, v17.4s -mla v19.4S, v20.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -str q3, [x0, #896] -sqrdmulh v3.4S, v1.4S, v29.s[0] -str q2, [x0, #960] -mul v1.4S, v1.4S,v30.s[0] -ldr q2, [x0, #80] -sub v17.4s, v9.4s, v19.4s -mla v21.4S, v13.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v12.4S, v29.s[0] -ldr q13, [x0, #144] -mul v12.4S, v12.4S,v30.s[0] -sub v20.4s, v16.4s, v21.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v0.4S, v29.s[0] -ldr q3, [x0, #208] -mul v0.4S, v0.4S,v30.s[0] -sub v11.4s, v18.4s, v1.4s -mla v12.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v1.4s -sqrdmulh v1.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v19.4s, v22.4s, v12.4s -mla v0.4S, v21.4S, v31.s[0] -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v2.4s, v0.4s -mla v8.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v1.4s, v13.4s, v8.4s -mla v14.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v12.4s, v3.4s, v14.4s -mla v16.4S, v0.4S, v31.s[0] -add v3.4s, v3.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -sub v0.4s, v13.4s, v16.4s -mla v18.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -sqrdmulh v16.4S, v9.4S, v29.s[1] -mul v9.4S, v9.4S,v30.s[1] -sub v8.4s, v3.4s, v18.4s -mla v15.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v14.4s, v22.4s, v15.4s -mla v9.4S, v16.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v16.4s, v2.4s, v9.4s -mla v20.4S, v18.4S, v31.s[0] -add v2.4s, v2.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v20.4s -mla v11.4S, v15.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v15.4s, v12.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v12.4s, v12.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v27.s[0] -mul v13.4S, v13.4S,v28.s[0] -sub v9.4s, v19.4s, v10.4s -mla v17.4S, v20.4S, v31.s[0] -add v19.4s, v19.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v27.s[0] -mul v3.4S, v3.4S,v28.s[0] -sub v20.4s, v21.4s, v17.4s -mla v13.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v11.4s, v22.4s, v13.4s -mla v3.4S, v10.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -sub v10.4s, v2.4s, v3.4s -mla v0.4S, v17.4S, v31.s[0] -add v2.4s, v2.4s, v3.4s -sqrdmulh v3.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v17.4s, v14.4s, v0.4s -mla v8.4S, v13.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[2] -mul v12.4S, v12.4S,v28.s[2] -sub v13.4s, v16.4s, v8.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v3.4s, v19.4s, v1.4s -mla v12.4S, v0.4S, v31.s[0] -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v0.4s, v21.4s, v12.4s -mla v18.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v10.4S, v25.s[1] -mul v10.4S, v10.4S,v26.s[1] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v12.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v12.4s, v22.4s, v2.4s -mla v10.4S, v18.4S, v31.s[0] -add v22.4s, v22.4s, v2.4s -sqrdmulh v2.4S, v13.4S, v25.s[3] -mul v13.4S, v13.4S,v26.s[3] -sub v18.4s, v11.4s, v10.4s -mla v16.4S, v15.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -str q22, [x0, #16] -sqrdmulh v22.4S, v21.4S, v23.s[0] -str q12, [x0, #80] -mul v21.4S, v21.4S,v24.s[0] -sub v12.4s, v14.4s, v16.4s -mla v13.4S, v2.4S, v31.s[0] -add v14.4s, v14.4s, v16.4s -str q11, [x0, #144] -sqrdmulh v11.4S, v0.4S, v23.s[1] -str q18, [x0, #208] -mul v0.4S, v0.4S,v24.s[1] -sub v18.4s, v17.4s, v13.4s -mla v21.4S, v22.4S, v31.s[0] -add v17.4s, v17.4s, v13.4s -str q14, [x0, #272] -sqrdmulh v14.4S, v20.4S, v23.s[2] -str q12, [x0, #336] -mul v20.4S, v20.4S,v24.s[2] -sub v12.4s, v19.4s, v21.4s -mla v0.4S, v11.4S, v31.s[0] -add v19.4s, v19.4s, v21.4s -str q17, [x0, #400] -sqrdmulh v17.4S, v1.4S, v23.s[3] -str q18, [x0, #464] -mul v1.4S, v1.4S,v24.s[3] -sub v18.4s, v3.4s, v0.4s -mla v20.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v0.4s -str q19, [x0, #528] -str q12, [x0, #592] -sub v12.4s, v9.4s, v20.4s -mla v1.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v20.4s -str q3, [x0, #656] -str q18, [x0, #720] -sub v18.4s, v8.4s, v1.4s -add v8.4s, v8.4s, v1.4s -str q9, [x0, #784] -str q12, [x0, #848] -str q8, [x0, #912] -str q18, [x0, #976] -ldr q4, [x0, #32] -ldr q5, [x0, #48] -ldr q6, [x0, #0] -ldr q7, [x0, #16] -ldr q15, [x0, #96] -ldr q10, [x0, #112] -ldr q2, [x0, #64] -ldr q16, [x0, #80] -ldr q22, [x0, #160] -ldr q13, [x0, #176] -ldr q11, [x0, #128] -ldr q21, [x0, #144] -ldr q14, [x0, #224] -ldr q0, [x0, #240] -ldr q19, [x0, #192] -ldr q17, [x0, #208] -ldr q20, [x17, #+128] -ldr q3, [x17, #+144] -ldr q1, [x17, #+160] -ldr q9, [x17, #+176] -ldr q12, [x17, #+192] -ldr q8, [x17, #+208] -ldr q18, [x17, #+224] -ldr q30, [x17, #+240] -sqrdmulh v29.4S, v4.4S, v3.s[0] -mul v4.4S, v4.4S,v20.s[0] -sqrdmulh v28.4S, v5.4S, v3.s[0] -mul v5.4S, v5.4S,v20.s[0] -mla v4.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v15.4S, v9.s[0] -mul v15.4S, v15.4S,v1.s[0] -mla v5.4S, v28.4S, v31.s[0] -sub v28.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -sqrdmulh v4.4S, v10.4S, v9.s[0] -mul v10.4S, v10.4S,v1.s[0] -mla v15.4S, v29.4S, v31.s[0] -sub v29.4s, v7.4s, v5.4s -add v7.4s, v7.4s, v5.4s -sqrdmulh v5.4S, v7.4S, v3.s[1] -mul v7.4S, v7.4S,v20.s[1] -mla v10.4S, v4.4S, v31.s[0] -sub v4.4s, v2.4s, v15.4s -add v2.4s, v2.4s, v15.4s -sqrdmulh v15.4S, v29.4S, v3.s[2] -mul v29.4S, v29.4S,v20.s[2] -mla v7.4S, v5.4S, v31.s[0] -sub v5.4s, v16.4s, v10.4s -add v16.4s, v16.4s, v10.4s -sqrdmulh v10.4S, v16.4S, v9.s[1] -mul v16.4S, v16.4S,v1.s[1] -mla v29.4S, v15.4S, v31.s[0] -sub v15.4s, v6.4s, v7.4s -add v6.4s, v6.4s, v7.4s -sqrdmulh v3.4S, v5.4S, v9.s[2] -mul v5.4S, v5.4S,v1.s[2] -mla v16.4S, v10.4S, v31.s[0] -sub v10.4s, v28.4s, v29.4s -add v28.4s, v28.4s, v29.4s -sqrdmulh v29.4S, v22.4S, v8.s[0] -mul v22.4S, v22.4S,v12.s[0] -mla v5.4S, v3.4S, v31.s[0] -sub v3.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sqrdmulh v9.4S, v13.4S, v8.s[0] -mul v13.4S, v13.4S,v12.s[0] -mla v22.4S, v29.4S, v31.s[0] -sub v29.4s, v4.4s, v5.4s -add v4.4s, v4.4s, v5.4s -sqrdmulh v5.4S, v14.4S, v30.s[0] -mul v14.4S, v14.4S,v18.s[0] -mla v13.4S, v9.4S, v31.s[0] -sub v9.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v0.4S, v30.s[0] -mul v0.4S, v0.4S,v18.s[0] -mla v14.4S, v5.4S, v31.s[0] -sub v5.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v21.4S, v8.s[1] -mul v21.4S, v21.4S,v12.s[1] -mla v0.4S, v22.4S, v31.s[0] -sub v22.4s, v19.4s, v14.4s -add v19.4s, v19.4s, v14.4s -sqrdmulh v14.4S, v5.4S, v8.s[2] -mul v5.4S, v5.4S,v12.s[2] -mla v21.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v0.4s -add v17.4s, v17.4s, v0.4s -sqrdmulh v0.4S, v17.4S, v30.s[1] -mul v17.4S, v17.4S,v18.s[1] -mla v5.4S, v14.4S, v31.s[0] -sub v14.4s, v11.4s, v21.4s -add v11.4s, v11.4s, v21.4s -sqrdmulh v8.4S, v13.4S, v30.s[2] -mul v13.4S, v13.4S,v18.s[2] -mla v17.4S, v0.4S, v31.s[0] -sub v0.4s, v9.4s, v5.4s -add v9.4s, v9.4s, v5.4s -mla v13.4S, v8.4S, v31.s[0] -sub v8.4s, v19.4s, v17.4s -add v19.4s, v19.4s, v17.4s -sub v30.4s, v22.4s, v13.4s -add v22.4s, v22.4s, v13.4s -str q6, [x0, #0] -str q15, [x0, #16] -str q28, [x0, #32] -str q10, [x0, #48] -str q2, [x0, #64] -str q3, [x0, #80] -str q4, [x0, #96] -str q29, [x0, #112] -str q11, [x0, #128] -str q14, [x0, #144] -str q9, [x0, #160] -str q0, [x0, #176] -str q19, [x0, #192] -str q8, [x0, #208] -str q22, [x0, #224] -str q30, [x0, #240] -ldr q30, [x0, #288] -ldr q22, [x0, #304] -ldr q8, [x0, #256] -ldr q19, [x0, #272] -ldr q0, [x0, #352] -ldr q9, [x0, #368] -ldr q14, [x0, #320] -ldr q11, [x0, #336] -ldr q29, [x0, #416] -ldr q4, [x0, #432] -ldr q3, [x0, #384] -ldr q2, [x0, #400] -ldr q10, [x0, #480] -ldr q28, [x0, #496] -ldr q15, [x0, #448] -ldr q6, [x0, #464] -ldr q13, [x17, #+256] -ldr q18, [x17, #+272] -ldr q17, [x17, #+288] -ldr q5, [x17, #+304] -ldr q12, [x17, #+320] -ldr q21, [x17, #+336] -ldr q1, [x17, #+352] -ldr q16, [x17, #+368] -sqrdmulh v20.4S, v30.4S, v18.s[0] -mul v30.4S, v30.4S,v13.s[0] -sqrdmulh v7.4S, v22.4S, v18.s[0] -mul v22.4S, v22.4S,v13.s[0] -mla v30.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v0.4S, v5.s[0] -mul v0.4S, v0.4S,v17.s[0] -mla v22.4S, v7.4S, v31.s[0] -sub v7.4s, v8.4s, v30.4s -add v8.4s, v8.4s, v30.4s -sqrdmulh v30.4S, v9.4S, v5.s[0] -mul v9.4S, v9.4S,v17.s[0] -mla v0.4S, v20.4S, v31.s[0] -sub v20.4s, v19.4s, v22.4s -add v19.4s, v19.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v18.s[1] -mul v19.4S, v19.4S,v13.s[1] -mla v9.4S, v30.4S, v31.s[0] -sub v30.4s, v14.4s, v0.4s -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v20.4S, v18.s[2] -mul v20.4S, v20.4S,v13.s[2] -mla v19.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v9.4s -add v11.4s, v11.4s, v9.4s -sqrdmulh v9.4S, v11.4S, v5.s[1] -mul v11.4S, v11.4S,v17.s[1] -mla v20.4S, v0.4S, v31.s[0] -sub v0.4s, v8.4s, v19.4s -add v8.4s, v8.4s, v19.4s -sqrdmulh v18.4S, v22.4S, v5.s[2] -mul v22.4S, v22.4S,v17.s[2] -mla v11.4S, v9.4S, v31.s[0] -sub v9.4s, v7.4s, v20.4s -add v7.4s, v7.4s, v20.4s -sqrdmulh v20.4S, v29.4S, v21.s[0] -mul v29.4S, v29.4S,v12.s[0] -mla v22.4S, v18.4S, v31.s[0] -sub v18.4s, v14.4s, v11.4s -add v14.4s, v14.4s, v11.4s -sqrdmulh v5.4S, v4.4S, v21.s[0] -mul v4.4S, v4.4S,v12.s[0] -mla v29.4S, v20.4S, v31.s[0] -sub v20.4s, v30.4s, v22.4s -add v30.4s, v30.4s, v22.4s -sqrdmulh v22.4S, v10.4S, v16.s[0] -mul v10.4S, v10.4S,v1.s[0] -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v3.4s, v29.4s -add v3.4s, v3.4s, v29.4s -sqrdmulh v29.4S, v28.4S, v16.s[0] -mul v28.4S, v28.4S,v1.s[0] -mla v10.4S, v22.4S, v31.s[0] -sub v22.4s, v2.4s, v4.4s -add v2.4s, v2.4s, v4.4s -sqrdmulh v4.4S, v2.4S, v21.s[1] -mul v2.4S, v2.4S,v12.s[1] -mla v28.4S, v29.4S, v31.s[0] -sub v29.4s, v15.4s, v10.4s -add v15.4s, v15.4s, v10.4s -sqrdmulh v10.4S, v22.4S, v21.s[2] -mul v22.4S, v22.4S,v12.s[2] -mla v2.4S, v4.4S, v31.s[0] -sub v4.4s, v6.4s, v28.4s -add v6.4s, v6.4s, v28.4s -sqrdmulh v28.4S, v6.4S, v16.s[1] -mul v6.4S, v6.4S,v1.s[1] -mla v22.4S, v10.4S, v31.s[0] -sub v10.4s, v3.4s, v2.4s -add v3.4s, v3.4s, v2.4s -sqrdmulh v21.4S, v4.4S, v16.s[2] -mul v4.4S, v4.4S,v1.s[2] -mla v6.4S, v28.4S, v31.s[0] -sub v28.4s, v5.4s, v22.4s -add v5.4s, v5.4s, v22.4s -mla v4.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v6.4s -add v15.4s, v15.4s, v6.4s -sub v16.4s, v29.4s, v4.4s -add v29.4s, v29.4s, v4.4s -str q8, [x0, #256] -str q0, [x0, #272] -str q7, [x0, #288] -str q9, [x0, #304] -str q14, [x0, #320] -str q18, [x0, #336] -str q30, [x0, #352] -str q20, [x0, #368] -str q3, [x0, #384] -str q10, [x0, #400] -str q5, [x0, #416] -str q28, [x0, #432] -str q15, [x0, #448] -str q21, [x0, #464] -str q29, [x0, #480] -str q16, [x0, #496] -ldr q16, [x0, #544] -ldr q29, [x0, #560] -ldr q21, [x0, #512] -ldr q15, [x0, #528] -ldr q28, [x0, #608] -ldr q5, [x0, #624] -ldr q10, [x0, #576] -ldr q3, [x0, #592] -ldr q20, [x0, #672] -ldr q30, [x0, #688] -ldr q18, [x0, #640] -ldr q14, [x0, #656] -ldr q9, [x0, #736] -ldr q7, [x0, #752] -ldr q0, [x0, #704] -ldr q8, [x0, #720] -ldr q4, [x17, #+384] -ldr q1, [x17, #+400] -ldr q6, [x17, #+416] -ldr q22, [x17, #+432] -ldr q12, [x17, #+448] -ldr q2, [x17, #+464] -ldr q17, [x17, #+480] -ldr q11, [x17, #+496] -sqrdmulh v13.4S, v16.4S, v1.s[0] -mul v16.4S, v16.4S,v4.s[0] -sqrdmulh v19.4S, v29.4S, v1.s[0] -mul v29.4S, v29.4S,v4.s[0] -mla v16.4S, v13.4S, v31.s[0] -sqrdmulh v13.4S, v28.4S, v22.s[0] -mul v28.4S, v28.4S,v6.s[0] -mla v29.4S, v19.4S, v31.s[0] -sub v19.4s, v21.4s, v16.4s -add v21.4s, v21.4s, v16.4s -sqrdmulh v16.4S, v5.4S, v22.s[0] -mul v5.4S, v5.4S,v6.s[0] -mla v28.4S, v13.4S, v31.s[0] -sub v13.4s, v15.4s, v29.4s -add v15.4s, v15.4s, v29.4s -sqrdmulh v29.4S, v15.4S, v1.s[1] -mul v15.4S, v15.4S,v4.s[1] -mla v5.4S, v16.4S, v31.s[0] -sub v16.4s, v10.4s, v28.4s -add v10.4s, v10.4s, v28.4s -sqrdmulh v28.4S, v13.4S, v1.s[2] -mul v13.4S, v13.4S,v4.s[2] -mla v15.4S, v29.4S, v31.s[0] -sub v29.4s, v3.4s, v5.4s -add v3.4s, v3.4s, v5.4s -sqrdmulh v5.4S, v3.4S, v22.s[1] -mul v3.4S, v3.4S,v6.s[1] -mla v13.4S, v28.4S, v31.s[0] -sub v28.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -sqrdmulh v1.4S, v29.4S, v22.s[2] -mul v29.4S, v29.4S,v6.s[2] -mla v3.4S, v5.4S, v31.s[0] -sub v5.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -sqrdmulh v13.4S, v20.4S, v2.s[0] -mul v20.4S, v20.4S,v12.s[0] -mla v29.4S, v1.4S, v31.s[0] -sub v1.4s, v10.4s, v3.4s -add v10.4s, v10.4s, v3.4s -sqrdmulh v22.4S, v30.4S, v2.s[0] -mul v30.4S, v30.4S,v12.s[0] -mla v20.4S, v13.4S, v31.s[0] -sub v13.4s, v16.4s, v29.4s -add v16.4s, v16.4s, v29.4s -sqrdmulh v29.4S, v9.4S, v11.s[0] -mul v9.4S, v9.4S,v17.s[0] -mla v30.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v20.4s -add v18.4s, v18.4s, v20.4s -sqrdmulh v20.4S, v7.4S, v11.s[0] -mul v7.4S, v7.4S,v17.s[0] -mla v9.4S, v29.4S, v31.s[0] -sub v29.4s, v14.4s, v30.4s -add v14.4s, v14.4s, v30.4s -sqrdmulh v30.4S, v14.4S, v2.s[1] -mul v14.4S, v14.4S,v12.s[1] -mla v7.4S, v20.4S, v31.s[0] -sub v20.4s, v0.4s, v9.4s -add v0.4s, v0.4s, v9.4s -sqrdmulh v9.4S, v29.4S, v2.s[2] -mul v29.4S, v29.4S,v12.s[2] -mla v14.4S, v30.4S, v31.s[0] -sub v30.4s, v8.4s, v7.4s -add v8.4s, v8.4s, v7.4s -sqrdmulh v7.4S, v8.4S, v11.s[1] -mul v8.4S, v8.4S,v17.s[1] -mla v29.4S, v9.4S, v31.s[0] -sub v9.4s, v18.4s, v14.4s -add v18.4s, v18.4s, v14.4s -sqrdmulh v2.4S, v30.4S, v11.s[2] -mul v30.4S, v30.4S,v17.s[2] -mla v8.4S, v7.4S, v31.s[0] -sub v7.4s, v22.4s, v29.4s -add v22.4s, v22.4s, v29.4s -mla v30.4S, v2.4S, v31.s[0] -sub v2.4s, v0.4s, v8.4s -add v0.4s, v0.4s, v8.4s -sub v11.4s, v20.4s, v30.4s -add v20.4s, v20.4s, v30.4s -str q21, [x0, #512] -str q28, [x0, #528] -str q19, [x0, #544] -str q5, [x0, #560] -str q10, [x0, #576] -str q1, [x0, #592] -str q16, [x0, #608] -str q13, [x0, #624] -str q18, [x0, #640] -str q9, [x0, #656] -str q22, [x0, #672] -str q7, [x0, #688] -str q0, [x0, #704] -str q2, [x0, #720] -str q20, [x0, #736] -str q11, [x0, #752] -ldr q11, [x0, #800] -ldr q20, [x0, #816] -ldr q2, [x0, #768] -ldr q0, [x0, #784] -ldr q7, [x0, #864] -ldr q22, [x0, #880] -ldr q9, [x0, #832] -ldr q18, [x0, #848] -ldr q13, [x0, #928] -ldr q16, [x0, #944] -ldr q1, [x0, #896] -ldr q10, [x0, #912] -ldr q5, [x0, #992] -ldr q19, [x0, #1008] -ldr q28, [x0, #960] -ldr q21, [x0, #976] -ldr q30, [x17, #+512] -ldr q17, [x17, #+528] -ldr q8, [x17, #+544] -ldr q29, [x17, #+560] -ldr q12, [x17, #+576] -ldr q14, [x17, #+592] -ldr q6, [x17, #+608] -ldr q3, [x17, #+624] -sqrdmulh v4.4S, v11.4S, v17.s[0] -mul v11.4S, v11.4S,v30.s[0] -sqrdmulh v15.4S, v20.4S, v17.s[0] -mul v20.4S, v20.4S,v30.s[0] -mla v11.4S, v4.4S, v31.s[0] -sqrdmulh v4.4S, v7.4S, v29.s[0] -mul v7.4S, v7.4S,v8.s[0] -mla v20.4S, v15.4S, v31.s[0] -sub v15.4s, v2.4s, v11.4s -add v2.4s, v2.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v8.s[0] -mla v7.4S, v4.4S, v31.s[0] -sub v4.4s, v0.4s, v20.4s -add v0.4s, v0.4s, v20.4s -sqrdmulh v20.4S, v0.4S, v17.s[1] -mul v0.4S, v0.4S,v30.s[1] -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v9.4s, v7.4s -add v9.4s, v9.4s, v7.4s -sqrdmulh v7.4S, v4.4S, v17.s[2] -mul v4.4S, v4.4S,v30.s[2] -mla v0.4S, v20.4S, v31.s[0] -sub v20.4s, v18.4s, v22.4s -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v8.s[1] -mla v4.4S, v7.4S, v31.s[0] -sub v7.4s, v2.4s, v0.4s -add v2.4s, v2.4s, v0.4s -sqrdmulh v17.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v8.s[2] -mla v18.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v4.4s -add v15.4s, v15.4s, v4.4s -sqrdmulh v4.4S, v13.4S, v14.s[0] -mul v13.4S, v13.4S,v12.s[0] -mla v20.4S, v17.4S, v31.s[0] -sub v17.4s, v9.4s, v18.4s -add v9.4s, v9.4s, v18.4s -sqrdmulh v29.4S, v16.4S, v14.s[0] -mul v16.4S, v16.4S,v12.s[0] -mla v13.4S, v4.4S, v31.s[0] -sub v4.4s, v11.4s, v20.4s -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v5.4S, v3.s[0] -mul v5.4S, v5.4S,v6.s[0] -mla v16.4S, v29.4S, v31.s[0] -sub v29.4s, v1.4s, v13.4s -add v1.4s, v1.4s, v13.4s -sqrdmulh v13.4S, v19.4S, v3.s[0] -mul v19.4S, v19.4S,v6.s[0] -mla v5.4S, v20.4S, v31.s[0] -sub v20.4s, v10.4s, v16.4s -add v10.4s, v10.4s, v16.4s -sqrdmulh v16.4S, v10.4S, v14.s[1] -mul v10.4S, v10.4S,v12.s[1] -mla v19.4S, v13.4S, v31.s[0] -sub v13.4s, v28.4s, v5.4s -add v28.4s, v28.4s, v5.4s -sqrdmulh v5.4S, v20.4S, v14.s[2] -mul v20.4S, v20.4S,v12.s[2] -mla v10.4S, v16.4S, v31.s[0] -sub v16.4s, v21.4s, v19.4s -add v21.4s, v21.4s, v19.4s -sqrdmulh v19.4S, v21.4S, v3.s[1] -mul v21.4S, v21.4S,v6.s[1] -mla v20.4S, v5.4S, v31.s[0] -sub v5.4s, v1.4s, v10.4s -add v1.4s, v1.4s, v10.4s -sqrdmulh v14.4S, v16.4S, v3.s[2] -mul v16.4S, v16.4S,v6.s[2] -mla v21.4S, v19.4S, v31.s[0] -sub v19.4s, v29.4s, v20.4s -add v29.4s, v29.4s, v20.4s -mla v16.4S, v14.4S, v31.s[0] -sub v14.4s, v28.4s, v21.4s -add v28.4s, v28.4s, v21.4s -sub v3.4s, v13.4s, v16.4s -add v13.4s, v13.4s, v16.4s -str q2, [x0, #768] -str q7, [x0, #784] -str q15, [x0, #800] -str q22, [x0, #816] -str q9, [x0, #832] -str q17, [x0, #848] -str q11, [x0, #864] -str q4, [x0, #880] -str q1, [x0, #896] -str q5, [x0, #912] -str q29, [x0, #928] -str q19, [x0, #944] -str q28, [x0, #960] -str q14, [x0, #976] -str q13, [x0, #992] -str q3, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1464 -// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_2.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_2.s deleted file mode 100644 index a72c6cd..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_2.s +++ /dev/null @@ -1,1494 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_3_z4_2 -.global _ntt_u32_incomplete_neon_asm_var_4_2_3_z4_2 -ntt_u32_incomplete_neon_asm_var_4_2_3_z4_2: -_ntt_u32_incomplete_neon_asm_var_4_2_3_z4_2: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #800] -ldr q21, [x0, #864] -ldr q20, [x0, #928] -ldr q19, [x0, #992] -ldr q18, [x0, #288] -ldr q17, [x0, #352] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -sqrdmulh v2.4S, v22.4S, v29.s[0] -ldr q1, [x0, #544] -mul v22.4S, v22.4S,v30.s[0] -ldr q0, [x0, #608] -sqrdmulh v15.4S, v21.4S, v29.s[0] -ldr q14, [x0, #672] -mul v21.4S, v21.4S,v30.s[0] -ldr q13, [x0, #736] -mla v22.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q12, [x0, #32] -sub v11.4s, v18.4s, v22.4s -mla v21.4S, v15.4S, v31.s[0] -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q15, [x0, #96] -sub v10.4s, v17.4s, v21.4s -mla v20.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v1.4S, v29.s[0] -ldr q2, [x0, #160] -mul v1.4S, v1.4S,v30.s[0] -sub v9.4s, v16.4s, v20.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v0.4S, v29.s[0] -ldr q22, [x0, #224] -mul v0.4S, v0.4S,v30.s[0] -sub v8.4s, v3.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v12.4s, v1.4s -mla v0.4S, v20.4S, v31.s[0] -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -sub v20.4s, v15.4s, v0.4s -mla v14.4S, v19.4S, v31.s[0] -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v19.4s, v2.4s, v14.4s -mla v13.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v1.4s, v22.4s, v13.4s -mla v16.4S, v0.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v0.4s, v2.4s, v16.4s -mla v3.4S, v14.4S, v31.s[0] -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v14.4s, v22.4s, v3.4s -mla v18.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v13.4s, v12.4s, v18.4s -mla v17.4S, v16.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v8.4S, v29.s[2] -mul v8.4S, v8.4S,v30.s[2] -sub v16.4s, v15.4s, v17.4s -mla v9.4S, v3.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v3.4s, v19.4s, v9.4s -mla v8.4S, v18.4S, v31.s[0] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v8.4s -mla v11.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v8.4s -sqrdmulh v8.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v17.4s, v21.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v27.s[0] -mul v22.4S, v22.4S,v28.s[0] -sub v9.4s, v20.4s, v10.4s -mla v2.4S, v8.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v8.4s, v12.4s, v2.4s -mla v22.4S, v11.4S, v31.s[0] -add v12.4s, v12.4s, v2.4s -sqrdmulh v2.4S, v14.4S, v27.s[1] -mul v14.4S, v14.4S,v28.s[1] -sub v11.4s, v15.4s, v22.4s -mla v0.4S, v10.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v27.s[2] -mul v19.4S, v19.4S,v28.s[2] -sub v10.4s, v13.4s, v0.4s -mla v14.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v2.4s, v16.4s, v14.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v27.s[3] -mul v3.4S, v3.4S,v28.s[3] -sub v22.4s, v21.4s, v19.4s -mla v1.4S, v0.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v0.4s, v20.4s, v1.4s -mla v3.4S, v14.4S, v31.s[0] -add v20.4s, v20.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v25.s[0] -mul v15.4S, v15.4S,v26.s[0] -sub v14.4s, v17.4s, v3.4s -mla v18.4S, v19.4S, v31.s[0] -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v11.4S, v25.s[1] -mul v11.4S, v11.4S,v26.s[1] -sub v19.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v1.4s, v12.4s, v15.4s -mla v11.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v25.s[3] -mul v2.4S, v2.4S,v26.s[3] -sub v3.4s, v8.4s, v11.4s -mla v16.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v11.4s -str q12, [x0, #32] -sqrdmulh v12.4S, v20.4S, v23.s[0] -str q1, [x0, #96] -mul v20.4S, v20.4S,v24.s[0] -ldr q1, [x0, #816] -sub v11.4s, v13.4s, v16.4s -ldr q18, [x0, #880] -mla v2.4S, v15.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -str q8, [x0, #160] -sqrdmulh v8.4S, v0.4S, v23.s[1] -str q3, [x0, #224] -mul v0.4S, v0.4S,v24.s[1] -ldr q3, [x0, #944] -sub v16.4s, v10.4s, v2.4s -ldr q15, [x0, #1008] -mla v20.4S, v12.4S, v31.s[0] -add v10.4s, v10.4s, v2.4s -str q13, [x0, #288] -sqrdmulh v13.4S, v9.4S, v23.s[2] -str q11, [x0, #352] -mul v9.4S, v9.4S,v24.s[2] -ldr q11, [x0, #304] -sub v2.4s, v21.4s, v20.4s -ldr q12, [x0, #368] -mla v0.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v20.4s -str q10, [x0, #416] -sqrdmulh v10.4S, v19.4S, v23.s[3] -str q16, [x0, #480] -mul v19.4S, v19.4S,v24.s[3] -ldr q16, [x0, #432] -sub v20.4s, v22.4s, v0.4s -ldr q8, [x0, #496] -mla v9.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -str q21, [x0, #544] -sqrdmulh v21.4S, v1.4S, v29.s[0] -str q2, [x0, #608] -ldr q2, [x0, #560] -mul v1.4S, v1.4S,v30.s[0] -ldr q0, [x0, #624] -sub v13.4s, v17.4s, v9.4s -mla v19.4S, v10.4S, v31.s[0] -add v17.4s, v17.4s, v9.4s -str q22, [x0, #672] -sqrdmulh v22.4S, v18.4S, v29.s[0] -str q20, [x0, #736] -ldr q20, [x0, #688] -mul v18.4S, v18.4S,v30.s[0] -ldr q9, [x0, #752] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -str q17, [x0, #800] -sqrdmulh v17.4S, v3.4S, v29.s[0] -str q13, [x0, #864] -mul v3.4S, v3.4S,v30.s[0] -ldr q13, [x0, #48] -sub v19.4s, v11.4s, v1.4s -mla v18.4S, v22.4S, v31.s[0] -add v11.4s, v11.4s, v1.4s -str q14, [x0, #928] -sqrdmulh v14.4S, v15.4S, v29.s[0] -str q10, [x0, #992] -mul v15.4S, v15.4S,v30.s[0] -ldr q10, [x0, #112] -sub v1.4s, v12.4s, v18.4s -mla v3.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v29.s[0] -ldr q17, [x0, #176] -mul v2.4S, v2.4S,v30.s[0] -sub v22.4s, v16.4s, v3.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v0.4S, v29.s[0] -ldr q14, [x0, #240] -mul v0.4S, v0.4S,v30.s[0] -sub v21.4s, v8.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -sub v18.4s, v13.4s, v2.4s -mla v0.4S, v3.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v9.4S, v29.s[0] -mul v9.4S, v9.4S,v30.s[0] -sub v3.4s, v10.4s, v0.4s -mla v20.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v15.4s, v17.4s, v20.4s -mla v9.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sub v2.4s, v14.4s, v9.4s -mla v16.4S, v0.4S, v31.s[0] -add v14.4s, v14.4s, v9.4s -sqrdmulh v9.4S, v11.4S, v29.s[1] -mul v11.4S, v11.4S,v30.s[1] -sub v0.4s, v17.4s, v16.4s -mla v8.4S, v20.4S, v31.s[0] -add v17.4s, v17.4s, v16.4s -sqrdmulh v16.4S, v12.4S, v29.s[1] -mul v12.4S, v12.4S,v30.s[1] -sub v20.4s, v14.4s, v8.4s -mla v11.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v8.4s -sqrdmulh v8.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v9.4s, v13.4s, v11.4s -mla v12.4S, v16.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v16.4s, v10.4s, v12.4s -mla v22.4S, v8.4S, v31.s[0] -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v8.4s, v15.4s, v22.4s -mla v21.4S, v11.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -sub v11.4s, v2.4s, v21.4s -mla v19.4S, v12.4S, v31.s[0] -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[0] -mul v17.4S, v17.4S,v28.s[0] -sub v12.4s, v18.4s, v19.4s -mla v1.4S, v22.4S, v31.s[0] -add v18.4s, v18.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v22.4s, v3.4s, v1.4s -mla v17.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v21.4s, v13.4s, v17.4s -mla v14.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v20.4S, v27.s[1] -mul v20.4S, v20.4S,v28.s[1] -sub v19.4s, v10.4s, v14.4s -mla v0.4S, v1.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v27.s[2] -mul v15.4S, v15.4S,v28.s[2] -sub v1.4s, v9.4s, v0.4s -mla v20.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v0.4s -sqrdmulh v0.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v17.4s, v16.4s, v20.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v14.4s, v18.4s, v15.4s -mla v2.4S, v0.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v27.s[3] -mul v11.4S, v11.4S,v28.s[3] -sub v0.4s, v3.4s, v2.4s -mla v8.4S, v20.4S, v31.s[0] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v10.4S, v25.s[0] -mul v10.4S, v10.4S,v26.s[0] -sub v20.4s, v12.4s, v8.4s -mla v11.4S, v15.4S, v31.s[0] -add v12.4s, v12.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v15.4s, v22.4s, v11.4s -mla v10.4S, v2.4S, v31.s[0] -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v2.4s, v13.4s, v10.4s -mla v19.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v10.4s -sqrdmulh v10.4S, v17.4S, v25.s[3] -mul v17.4S, v17.4S,v26.s[3] -sub v8.4s, v21.4s, v19.4s -mla v16.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -str q13, [x0, #48] -sqrdmulh v13.4S, v3.4S, v23.s[0] -str q2, [x0, #112] -mul v3.4S, v3.4S,v24.s[0] -ldr q2, [x0, #768] -sub v19.4s, v9.4s, v16.4s -ldr q11, [x0, #832] -mla v17.4S, v10.4S, v31.s[0] -add v9.4s, v9.4s, v16.4s -str q21, [x0, #176] -sqrdmulh v21.4S, v0.4S, v23.s[1] -str q8, [x0, #240] -mul v0.4S, v0.4S,v24.s[1] -ldr q8, [x0, #896] -sub v16.4s, v1.4s, v17.4s -ldr q10, [x0, #960] -mla v3.4S, v13.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -str q9, [x0, #304] -sqrdmulh v9.4S, v22.4S, v23.s[2] -str q19, [x0, #368] -mul v22.4S, v22.4S,v24.s[2] -ldr q19, [x0, #256] -sub v17.4s, v18.4s, v3.4s -ldr q13, [x0, #320] -mla v0.4S, v21.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -str q1, [x0, #432] -sqrdmulh v1.4S, v15.4S, v23.s[3] -str q16, [x0, #496] -mul v15.4S, v15.4S,v24.s[3] -ldr q16, [x0, #384] -sub v3.4s, v14.4s, v0.4s -ldr q21, [x0, #448] -mla v22.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -str q18, [x0, #560] -sqrdmulh v18.4S, v2.4S, v29.s[0] -str q17, [x0, #624] -ldr q17, [x0, #512] -mul v2.4S, v2.4S,v30.s[0] -ldr q0, [x0, #576] -sub v9.4s, v12.4s, v22.4s -mla v15.4S, v1.4S, v31.s[0] -add v12.4s, v12.4s, v22.4s -str q14, [x0, #688] -sqrdmulh v14.4S, v11.4S, v29.s[0] -str q3, [x0, #752] -ldr q3, [x0, #640] -mul v11.4S, v11.4S,v30.s[0] -ldr q22, [x0, #704] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -str q12, [x0, #816] -sqrdmulh v12.4S, v8.4S, v29.s[0] -str q9, [x0, #880] -mul v8.4S, v8.4S,v30.s[0] -ldr q9, [x0, #0] -sub v15.4s, v19.4s, v2.4s -mla v11.4S, v14.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -str q20, [x0, #944] -sqrdmulh v20.4S, v10.4S, v29.s[0] -str q1, [x0, #1008] -mul v10.4S, v10.4S,v30.s[0] -ldr q1, [x0, #64] -sub v2.4s, v13.4s, v11.4s -mla v8.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v29.s[0] -ldr q12, [x0, #128] -mul v17.4S, v17.4S,v30.s[0] -sub v14.4s, v16.4s, v8.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v0.4S, v29.s[0] -ldr q20, [x0, #192] -mul v0.4S, v0.4S,v30.s[0] -sub v18.4s, v21.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -sub v11.4s, v9.4s, v17.4s -mla v0.4S, v8.4S, v31.s[0] -add v9.4s, v9.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v8.4s, v1.4s, v0.4s -mla v3.4S, v10.4S, v31.s[0] -add v1.4s, v1.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v10.4s, v12.4s, v3.4s -mla v22.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v17.4s, v20.4s, v22.4s -mla v16.4S, v0.4S, v31.s[0] -add v20.4s, v20.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -sub v0.4s, v12.4s, v16.4s -mla v21.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v3.4s, v20.4s, v21.4s -mla v19.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v22.4s, v9.4s, v19.4s -mla v13.4S, v16.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v29.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v16.4s, v1.4s, v13.4s -mla v14.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v13.4s -sqrdmulh v13.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v21.4s, v10.4s, v14.4s -mla v18.4S, v19.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v19.4s, v17.4s, v18.4s -mla v15.4S, v13.4S, v31.s[0] -add v17.4s, v17.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v13.4s, v11.4s, v15.4s -mla v2.4S, v14.4S, v31.s[0] -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v27.s[0] -mul v20.4S, v20.4S,v28.s[0] -sub v14.4s, v8.4s, v2.4s -mla v12.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v2.4s -sqrdmulh v2.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v18.4s, v9.4s, v12.4s -mla v20.4S, v15.4S, v31.s[0] -add v9.4s, v9.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -sub v15.4s, v1.4s, v20.4s -mla v0.4S, v2.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -sub v2.4s, v22.4s, v0.4s -mla v3.4S, v12.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v12.4s, v16.4s, v3.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v11.4s, v10.4s -mla v17.4S, v0.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -sqrdmulh v10.4S, v19.4S, v27.s[3] -mul v19.4S, v19.4S,v28.s[3] -sub v0.4s, v8.4s, v17.4s -mla v21.4S, v3.4S, v31.s[0] -add v8.4s, v8.4s, v17.4s -sqrdmulh v17.4S, v1.4S, v25.s[0] -mul v1.4S, v1.4S,v26.s[0] -sub v3.4s, v13.4s, v21.4s -mla v19.4S, v10.4S, v31.s[0] -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v15.4S, v25.s[1] -mul v15.4S, v15.4S,v26.s[1] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v17.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v17.4s, v9.4s, v1.4s -mla v15.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -sub v21.4s, v18.4s, v15.4s -mla v16.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -str q9, [x0, #0] -sqrdmulh v9.4S, v8.4S, v23.s[0] -str q17, [x0, #64] -mul v8.4S, v8.4S,v24.s[0] -ldr q17, [x0, #784] -sub v15.4s, v22.4s, v16.4s -ldr q19, [x0, #848] -mla v12.4S, v1.4S, v31.s[0] -add v22.4s, v22.4s, v16.4s -str q18, [x0, #128] -sqrdmulh v18.4S, v0.4S, v23.s[1] -str q21, [x0, #192] -mul v0.4S, v0.4S,v24.s[1] -ldr q21, [x0, #912] -sub v16.4s, v2.4s, v12.4s -ldr q1, [x0, #976] -mla v8.4S, v9.4S, v31.s[0] -add v2.4s, v2.4s, v12.4s -str q22, [x0, #256] -sqrdmulh v22.4S, v14.4S, v23.s[2] -str q15, [x0, #320] -mul v14.4S, v14.4S,v24.s[2] -ldr q15, [x0, #272] -sub v12.4s, v11.4s, v8.4s -ldr q9, [x0, #336] -mla v0.4S, v18.4S, v31.s[0] -add v11.4s, v11.4s, v8.4s -str q2, [x0, #384] -sqrdmulh v2.4S, v10.4S, v23.s[3] -str q16, [x0, #448] -mul v10.4S, v10.4S,v24.s[3] -ldr q16, [x0, #400] -sub v8.4s, v20.4s, v0.4s -ldr q18, [x0, #464] -mla v14.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v0.4s -str q11, [x0, #512] -sqrdmulh v11.4S, v17.4S, v29.s[0] -str q12, [x0, #576] -ldr q12, [x0, #528] -mul v17.4S, v17.4S,v30.s[0] -ldr q0, [x0, #592] -sub v22.4s, v13.4s, v14.4s -mla v10.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v14.4s -str q20, [x0, #640] -sqrdmulh v20.4S, v19.4S, v29.s[0] -str q8, [x0, #704] -ldr q8, [x0, #656] -mul v19.4S, v19.4S,v30.s[0] -ldr q14, [x0, #720] -sub v2.4s, v3.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v3.4s, v3.4s, v10.4s -str q13, [x0, #768] -sqrdmulh v13.4S, v21.4S, v29.s[0] -str q22, [x0, #832] -mul v21.4S, v21.4S,v30.s[0] -ldr q22, [x0, #16] -sub v10.4s, v15.4s, v17.4s -mla v19.4S, v20.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -str q3, [x0, #896] -sqrdmulh v3.4S, v1.4S, v29.s[0] -str q2, [x0, #960] -mul v1.4S, v1.4S,v30.s[0] -ldr q2, [x0, #80] -sub v17.4s, v9.4s, v19.4s -mla v21.4S, v13.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v12.4S, v29.s[0] -ldr q13, [x0, #144] -mul v12.4S, v12.4S,v30.s[0] -sub v20.4s, v16.4s, v21.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v0.4S, v29.s[0] -ldr q3, [x0, #208] -mul v0.4S, v0.4S,v30.s[0] -sub v11.4s, v18.4s, v1.4s -mla v12.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v1.4s -sqrdmulh v1.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v19.4s, v22.4s, v12.4s -mla v0.4S, v21.4S, v31.s[0] -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v2.4s, v0.4s -mla v8.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v1.4s, v13.4s, v8.4s -mla v14.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v12.4s, v3.4s, v14.4s -mla v16.4S, v0.4S, v31.s[0] -add v3.4s, v3.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -sub v0.4s, v13.4s, v16.4s -mla v18.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -sqrdmulh v16.4S, v9.4S, v29.s[1] -mul v9.4S, v9.4S,v30.s[1] -sub v8.4s, v3.4s, v18.4s -mla v15.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v14.4s, v22.4s, v15.4s -mla v9.4S, v16.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v16.4s, v2.4s, v9.4s -mla v20.4S, v18.4S, v31.s[0] -add v2.4s, v2.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v20.4s -mla v11.4S, v15.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v15.4s, v12.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v12.4s, v12.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v27.s[0] -mul v13.4S, v13.4S,v28.s[0] -sub v9.4s, v19.4s, v10.4s -mla v17.4S, v20.4S, v31.s[0] -add v19.4s, v19.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v27.s[0] -mul v3.4S, v3.4S,v28.s[0] -sub v20.4s, v21.4s, v17.4s -mla v13.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v11.4s, v22.4s, v13.4s -mla v3.4S, v10.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -sub v10.4s, v2.4s, v3.4s -mla v0.4S, v17.4S, v31.s[0] -add v2.4s, v2.4s, v3.4s -sqrdmulh v3.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v17.4s, v14.4s, v0.4s -mla v8.4S, v13.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[2] -mul v12.4S, v12.4S,v28.s[2] -sub v13.4s, v16.4s, v8.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v3.4s, v19.4s, v1.4s -mla v12.4S, v0.4S, v31.s[0] -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v0.4s, v21.4s, v12.4s -mla v18.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v10.4S, v25.s[1] -mul v10.4S, v10.4S,v26.s[1] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v12.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v12.4s, v22.4s, v2.4s -mla v10.4S, v18.4S, v31.s[0] -add v22.4s, v22.4s, v2.4s -sqrdmulh v2.4S, v13.4S, v25.s[3] -mul v13.4S, v13.4S,v26.s[3] -sub v18.4s, v11.4s, v10.4s -mla v16.4S, v15.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -str q22, [x0, #16] -sqrdmulh v22.4S, v21.4S, v23.s[0] -str q12, [x0, #80] -mul v21.4S, v21.4S,v24.s[0] -sub v12.4s, v14.4s, v16.4s -mla v13.4S, v2.4S, v31.s[0] -add v14.4s, v14.4s, v16.4s -str q11, [x0, #144] -sqrdmulh v11.4S, v0.4S, v23.s[1] -str q18, [x0, #208] -mul v0.4S, v0.4S,v24.s[1] -sub v18.4s, v17.4s, v13.4s -mla v21.4S, v22.4S, v31.s[0] -add v17.4s, v17.4s, v13.4s -str q14, [x0, #272] -sqrdmulh v14.4S, v20.4S, v23.s[2] -str q12, [x0, #336] -mul v20.4S, v20.4S,v24.s[2] -sub v12.4s, v19.4s, v21.4s -mla v0.4S, v11.4S, v31.s[0] -add v19.4s, v19.4s, v21.4s -str q17, [x0, #400] -sqrdmulh v17.4S, v1.4S, v23.s[3] -str q18, [x0, #464] -mul v1.4S, v1.4S,v24.s[3] -sub v18.4s, v3.4s, v0.4s -mla v20.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v0.4s -str q19, [x0, #528] -str q12, [x0, #592] -sub v12.4s, v9.4s, v20.4s -mla v1.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v20.4s -str q3, [x0, #656] -str q18, [x0, #720] -sub v18.4s, v8.4s, v1.4s -add v8.4s, v8.4s, v1.4s -str q9, [x0, #784] -str q12, [x0, #848] -str q8, [x0, #912] -str q18, [x0, #976] -ldr q4, [x17, #+128] -ldr q5, [x17, #+144] -ldr q6, [x17, #+160] -ldr q7, [x17, #+176] -ldr q15, [x17, #+192] -ldr q10, [x17, #+208] -ldr q2, [x17, #+224] -ldr q16, [x17, #+240] -ldr q22, [x0, #32] -ldr q13, [x0, #48] -ldr q11, [x0, #0] -ldr q21, [x0, #96] -ldr q14, [x0, #112] -ldr q0, [x0, #64] -ldr q19, [x0, #160] -ldr q17, [x0, #176] -ldr q20, [x0, #128] -ldr q3, [x0, #224] -ldr q1, [x0, #240] -ldr q9, [x0, #192] -sqrdmulh v12.4S, v22.4S, v5.s[0] -mul v22.4S, v22.4S,v4.s[0] -mla v22.4S, v12.4S, v31.s[0] -sub v12.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -ldr q22, [x0, #16] -sqrdmulh v8.4S, v21.4S, v7.s[0] -mul v21.4S, v21.4S,v6.s[0] -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v0.4s, v21.4s -add v0.4s, v0.4s, v21.4s -ldr q21, [x0, #80] -sqrdmulh v18.4S, v19.4S, v10.s[0] -mul v19.4S, v19.4S,v15.s[0] -mla v19.4S, v18.4S, v31.s[0] -sub v18.4s, v20.4s, v19.4s -add v20.4s, v20.4s, v19.4s -ldr q19, [x0, #144] -sqrdmulh v30.4S, v3.4S, v16.s[0] -mul v3.4S, v3.4S,v2.s[0] -mla v3.4S, v30.4S, v31.s[0] -sub v30.4s, v9.4s, v3.4s -add v9.4s, v9.4s, v3.4s -ldr q3, [x0, #208] -sqrdmulh v29.4S, v13.4S, v5.s[0] -mul v13.4S, v13.4S,v4.s[0] -mla v13.4S, v29.4S, v31.s[0] -sub v29.4s, v22.4s, v13.4s -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v14.4S, v7.s[0] -mul v14.4S, v14.4S,v6.s[0] -mla v14.4S, v13.4S, v31.s[0] -sub v13.4s, v21.4s, v14.4s -add v21.4s, v21.4s, v14.4s -sqrdmulh v14.4S, v17.4S, v10.s[0] -mul v17.4S, v17.4S,v15.s[0] -mla v17.4S, v14.4S, v31.s[0] -sub v14.4s, v19.4s, v17.4s -add v19.4s, v19.4s, v17.4s -sqrdmulh v17.4S, v1.4S, v16.s[0] -mul v1.4S, v1.4S,v2.s[0] -mla v1.4S, v17.4S, v31.s[0] -sub v17.4s, v3.4s, v1.4s -add v3.4s, v3.4s, v1.4s -sqrdmulh v1.4S, v22.4S, v5.s[1] -mul v22.4S, v22.4S,v4.s[1] -mla v22.4S, v1.4S, v31.s[0] -sub v1.4s, v11.4s, v22.4s -add v11.4s, v11.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v7.s[1] -mul v21.4S, v21.4S,v6.s[1] -mla v21.4S, v22.4S, v31.s[0] -sub v22.4s, v0.4s, v21.4s -add v0.4s, v0.4s, v21.4s -str q11, [x0, #0] -str q1, [x0, #16] -sqrdmulh v1.4S, v19.4S, v10.s[1] -mul v19.4S, v19.4S,v15.s[1] -mla v19.4S, v1.4S, v31.s[0] -sub v1.4s, v20.4s, v19.4s -add v20.4s, v20.4s, v19.4s -str q0, [x0, #64] -str q22, [x0, #80] -sqrdmulh v22.4S, v3.4S, v16.s[1] -mul v3.4S, v3.4S,v2.s[1] -mla v3.4S, v22.4S, v31.s[0] -sub v22.4s, v9.4s, v3.4s -add v9.4s, v9.4s, v3.4s -str q20, [x0, #128] -str q1, [x0, #144] -sqrdmulh v1.4S, v29.4S, v5.s[2] -mul v29.4S, v29.4S,v4.s[2] -mla v29.4S, v1.4S, v31.s[0] -sub v1.4s, v12.4s, v29.4s -add v12.4s, v12.4s, v29.4s -str q9, [x0, #192] -str q22, [x0, #208] -ldr q5, [x17, #+256] -ldr q4, [x17, #+272] -sqrdmulh v22.4S, v13.4S, v7.s[2] -mul v13.4S, v13.4S,v6.s[2] -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v8.4s, v13.4s -add v8.4s, v8.4s, v13.4s -ldr q7, [x17, #+288] -ldr q6, [x17, #+304] -sqrdmulh v13.4S, v14.4S, v10.s[2] -mul v14.4S, v14.4S,v15.s[2] -mla v14.4S, v13.4S, v31.s[0] -sub v13.4s, v18.4s, v14.4s -add v18.4s, v18.4s, v14.4s -ldr q10, [x17, #+320] -ldr q15, [x17, #+336] -sqrdmulh v14.4S, v17.4S, v16.s[2] -mul v17.4S, v17.4S,v2.s[2] -mla v17.4S, v14.4S, v31.s[0] -sub v14.4s, v30.4s, v17.4s -add v30.4s, v30.4s, v17.4s -ldr q16, [x17, #+352] -ldr q2, [x17, #+368] -str q12, [x0, #32] -str q1, [x0, #48] -str q8, [x0, #96] -str q22, [x0, #112] -str q18, [x0, #160] -str q13, [x0, #176] -str q30, [x0, #224] -str q14, [x0, #240] -ldr q14, [x0, #288] -ldr q30, [x0, #304] -ldr q13, [x0, #256] -ldr q18, [x0, #352] -ldr q22, [x0, #368] -ldr q8, [x0, #320] -ldr q1, [x0, #416] -ldr q12, [x0, #432] -ldr q17, [x0, #384] -ldr q9, [x0, #480] -ldr q29, [x0, #496] -ldr q20, [x0, #448] -sqrdmulh v3.4S, v14.4S, v4.s[0] -mul v14.4S, v14.4S,v5.s[0] -mla v14.4S, v3.4S, v31.s[0] -sub v3.4s, v13.4s, v14.4s -add v13.4s, v13.4s, v14.4s -ldr q14, [x0, #272] -sqrdmulh v0.4S, v18.4S, v6.s[0] -mul v18.4S, v18.4S,v7.s[0] -mla v18.4S, v0.4S, v31.s[0] -sub v0.4s, v8.4s, v18.4s -add v8.4s, v8.4s, v18.4s -ldr q18, [x0, #336] -sqrdmulh v19.4S, v1.4S, v15.s[0] -mul v1.4S, v1.4S,v10.s[0] -mla v1.4S, v19.4S, v31.s[0] -sub v19.4s, v17.4s, v1.4s -add v17.4s, v17.4s, v1.4s -ldr q1, [x0, #400] -sqrdmulh v11.4S, v9.4S, v2.s[0] -mul v9.4S, v9.4S,v16.s[0] -mla v9.4S, v11.4S, v31.s[0] -sub v11.4s, v20.4s, v9.4s -add v20.4s, v20.4s, v9.4s -ldr q9, [x0, #464] -sqrdmulh v21.4S, v30.4S, v4.s[0] -mul v30.4S, v30.4S,v5.s[0] -mla v30.4S, v21.4S, v31.s[0] -sub v21.4s, v14.4s, v30.4s -add v14.4s, v14.4s, v30.4s -sqrdmulh v30.4S, v22.4S, v6.s[0] -mul v22.4S, v22.4S,v7.s[0] -mla v22.4S, v30.4S, v31.s[0] -sub v30.4s, v18.4s, v22.4s -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v12.4S, v15.s[0] -mul v12.4S, v12.4S,v10.s[0] -mla v12.4S, v22.4S, v31.s[0] -sub v22.4s, v1.4s, v12.4s -add v1.4s, v1.4s, v12.4s -sqrdmulh v12.4S, v29.4S, v2.s[0] -mul v29.4S, v29.4S,v16.s[0] -mla v29.4S, v12.4S, v31.s[0] -sub v12.4s, v9.4s, v29.4s -add v9.4s, v9.4s, v29.4s -sqrdmulh v29.4S, v14.4S, v4.s[1] -mul v14.4S, v14.4S,v5.s[1] -mla v14.4S, v29.4S, v31.s[0] -sub v29.4s, v13.4s, v14.4s -add v13.4s, v13.4s, v14.4s -sqrdmulh v14.4S, v18.4S, v6.s[1] -mul v18.4S, v18.4S,v7.s[1] -mla v18.4S, v14.4S, v31.s[0] -sub v14.4s, v8.4s, v18.4s -add v8.4s, v8.4s, v18.4s -str q13, [x0, #256] -str q29, [x0, #272] -sqrdmulh v29.4S, v1.4S, v15.s[1] -mul v1.4S, v1.4S,v10.s[1] -mla v1.4S, v29.4S, v31.s[0] -sub v29.4s, v17.4s, v1.4s -add v17.4s, v17.4s, v1.4s -str q8, [x0, #320] -str q14, [x0, #336] -sqrdmulh v14.4S, v9.4S, v2.s[1] -mul v9.4S, v9.4S,v16.s[1] -mla v9.4S, v14.4S, v31.s[0] -sub v14.4s, v20.4s, v9.4s -add v20.4s, v20.4s, v9.4s -str q17, [x0, #384] -str q29, [x0, #400] -sqrdmulh v29.4S, v21.4S, v4.s[2] -mul v21.4S, v21.4S,v5.s[2] -mla v21.4S, v29.4S, v31.s[0] -sub v29.4s, v3.4s, v21.4s -add v3.4s, v3.4s, v21.4s -str q20, [x0, #448] -str q14, [x0, #464] -ldr q4, [x17, #+384] -ldr q5, [x17, #+400] -sqrdmulh v14.4S, v30.4S, v6.s[2] -mul v30.4S, v30.4S,v7.s[2] -mla v30.4S, v14.4S, v31.s[0] -sub v14.4s, v0.4s, v30.4s -add v0.4s, v0.4s, v30.4s -ldr q6, [x17, #+416] -ldr q7, [x17, #+432] -sqrdmulh v30.4S, v22.4S, v15.s[2] -mul v22.4S, v22.4S,v10.s[2] -mla v22.4S, v30.4S, v31.s[0] -sub v30.4s, v19.4s, v22.4s -add v19.4s, v19.4s, v22.4s -ldr q15, [x17, #+448] -ldr q10, [x17, #+464] -sqrdmulh v22.4S, v12.4S, v2.s[2] -mul v12.4S, v12.4S,v16.s[2] -mla v12.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -ldr q2, [x17, #+480] -ldr q16, [x17, #+496] -str q3, [x0, #288] -str q29, [x0, #304] -str q0, [x0, #352] -str q14, [x0, #368] -str q19, [x0, #416] -str q30, [x0, #432] -str q11, [x0, #480] -str q22, [x0, #496] -ldr q22, [x0, #544] -ldr q11, [x0, #560] -ldr q30, [x0, #512] -ldr q19, [x0, #608] -ldr q14, [x0, #624] -ldr q0, [x0, #576] -ldr q29, [x0, #672] -ldr q3, [x0, #688] -ldr q12, [x0, #640] -ldr q20, [x0, #736] -ldr q21, [x0, #752] -ldr q17, [x0, #704] -sqrdmulh v9.4S, v22.4S, v5.s[0] -mul v22.4S, v22.4S,v4.s[0] -mla v22.4S, v9.4S, v31.s[0] -sub v9.4s, v30.4s, v22.4s -add v30.4s, v30.4s, v22.4s -ldr q22, [x0, #528] -sqrdmulh v8.4S, v19.4S, v7.s[0] -mul v19.4S, v19.4S,v6.s[0] -mla v19.4S, v8.4S, v31.s[0] -sub v8.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -ldr q19, [x0, #592] -sqrdmulh v1.4S, v29.4S, v10.s[0] -mul v29.4S, v29.4S,v15.s[0] -mla v29.4S, v1.4S, v31.s[0] -sub v1.4s, v12.4s, v29.4s -add v12.4s, v12.4s, v29.4s -ldr q29, [x0, #656] -sqrdmulh v13.4S, v20.4S, v16.s[0] -mul v20.4S, v20.4S,v2.s[0] -mla v20.4S, v13.4S, v31.s[0] -sub v13.4s, v17.4s, v20.4s -add v17.4s, v17.4s, v20.4s -ldr q20, [x0, #720] -sqrdmulh v18.4S, v11.4S, v5.s[0] -mul v11.4S, v11.4S,v4.s[0] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v22.4s, v11.4s -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v14.4S, v7.s[0] -mul v14.4S, v14.4S,v6.s[0] -mla v14.4S, v11.4S, v31.s[0] -sub v11.4s, v19.4s, v14.4s -add v19.4s, v19.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v10.s[0] -mul v3.4S, v3.4S,v15.s[0] -mla v3.4S, v14.4S, v31.s[0] -sub v14.4s, v29.4s, v3.4s -add v29.4s, v29.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v16.s[0] -mul v21.4S, v21.4S,v2.s[0] -mla v21.4S, v3.4S, v31.s[0] -sub v3.4s, v20.4s, v21.4s -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v5.s[1] -mul v22.4S, v22.4S,v4.s[1] -mla v22.4S, v21.4S, v31.s[0] -sub v21.4s, v30.4s, v22.4s -add v30.4s, v30.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v7.s[1] -mul v19.4S, v19.4S,v6.s[1] -mla v19.4S, v22.4S, v31.s[0] -sub v22.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -str q30, [x0, #512] -str q21, [x0, #528] -sqrdmulh v21.4S, v29.4S, v10.s[1] -mul v29.4S, v29.4S,v15.s[1] -mla v29.4S, v21.4S, v31.s[0] -sub v21.4s, v12.4s, v29.4s -add v12.4s, v12.4s, v29.4s -str q0, [x0, #576] -str q22, [x0, #592] -sqrdmulh v22.4S, v20.4S, v16.s[1] -mul v20.4S, v20.4S,v2.s[1] -mla v20.4S, v22.4S, v31.s[0] -sub v22.4s, v17.4s, v20.4s -add v17.4s, v17.4s, v20.4s -str q12, [x0, #640] -str q21, [x0, #656] -sqrdmulh v21.4S, v18.4S, v5.s[2] -mul v18.4S, v18.4S,v4.s[2] -mla v18.4S, v21.4S, v31.s[0] -sub v21.4s, v9.4s, v18.4s -add v9.4s, v9.4s, v18.4s -str q17, [x0, #704] -str q22, [x0, #720] -ldr q5, [x17, #+512] -ldr q4, [x17, #+528] -sqrdmulh v22.4S, v11.4S, v7.s[2] -mul v11.4S, v11.4S,v6.s[2] -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -ldr q7, [x17, #+544] -ldr q6, [x17, #+560] -sqrdmulh v11.4S, v14.4S, v10.s[2] -mul v14.4S, v14.4S,v15.s[2] -mla v14.4S, v11.4S, v31.s[0] -sub v11.4s, v1.4s, v14.4s -add v1.4s, v1.4s, v14.4s -ldr q10, [x17, #+576] -ldr q15, [x17, #+592] -sqrdmulh v14.4S, v3.4S, v16.s[2] -mul v3.4S, v3.4S,v2.s[2] -mla v3.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v3.4s -add v13.4s, v13.4s, v3.4s -ldr q16, [x17, #+608] -ldr q2, [x17, #+624] -str q9, [x0, #544] -str q21, [x0, #560] -str q8, [x0, #608] -str q22, [x0, #624] -str q1, [x0, #672] -str q11, [x0, #688] -str q13, [x0, #736] -str q14, [x0, #752] -ldr q14, [x0, #800] -ldr q13, [x0, #816] -ldr q11, [x0, #768] -ldr q1, [x0, #864] -ldr q22, [x0, #880] -ldr q8, [x0, #832] -ldr q21, [x0, #928] -ldr q9, [x0, #944] -ldr q3, [x0, #896] -ldr q17, [x0, #992] -ldr q18, [x0, #1008] -ldr q12, [x0, #960] -sqrdmulh v20.4S, v14.4S, v4.s[0] -mul v14.4S, v14.4S,v5.s[0] -mla v14.4S, v20.4S, v31.s[0] -sub v20.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -ldr q14, [x0, #784] -sqrdmulh v0.4S, v1.4S, v6.s[0] -mul v1.4S, v1.4S,v7.s[0] -mla v1.4S, v0.4S, v31.s[0] -sub v0.4s, v8.4s, v1.4s -add v8.4s, v8.4s, v1.4s -ldr q1, [x0, #848] -sqrdmulh v29.4S, v21.4S, v15.s[0] -mul v21.4S, v21.4S,v10.s[0] -mla v21.4S, v29.4S, v31.s[0] -sub v29.4s, v3.4s, v21.4s -add v3.4s, v3.4s, v21.4s -ldr q21, [x0, #912] -sqrdmulh v30.4S, v17.4S, v2.s[0] -mul v17.4S, v17.4S,v16.s[0] -mla v17.4S, v30.4S, v31.s[0] -sub v30.4s, v12.4s, v17.4s -add v12.4s, v12.4s, v17.4s -ldr q17, [x0, #976] -sqrdmulh v19.4S, v13.4S, v4.s[0] -mul v13.4S, v13.4S,v5.s[0] -mla v13.4S, v19.4S, v31.s[0] -sub v19.4s, v14.4s, v13.4s -add v14.4s, v14.4s, v13.4s -sqrdmulh v13.4S, v22.4S, v6.s[0] -mul v22.4S, v22.4S,v7.s[0] -mla v22.4S, v13.4S, v31.s[0] -sub v13.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -sqrdmulh v22.4S, v9.4S, v15.s[0] -mul v9.4S, v9.4S,v10.s[0] -mla v9.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v9.4s -add v21.4s, v21.4s, v9.4s -sqrdmulh v9.4S, v18.4S, v2.s[0] -mul v18.4S, v18.4S,v16.s[0] -mla v18.4S, v9.4S, v31.s[0] -sub v9.4s, v17.4s, v18.4s -add v17.4s, v17.4s, v18.4s -sqrdmulh v18.4S, v14.4S, v4.s[1] -mul v14.4S, v14.4S,v5.s[1] -mla v14.4S, v18.4S, v31.s[0] -sub v18.4s, v11.4s, v14.4s -add v11.4s, v11.4s, v14.4s -sqrdmulh v14.4S, v1.4S, v6.s[1] -mul v1.4S, v1.4S,v7.s[1] -mla v1.4S, v14.4S, v31.s[0] -sub v14.4s, v8.4s, v1.4s -add v8.4s, v8.4s, v1.4s -str q11, [x0, #768] -str q18, [x0, #784] -sqrdmulh v18.4S, v21.4S, v15.s[1] -mul v21.4S, v21.4S,v10.s[1] -mla v21.4S, v18.4S, v31.s[0] -sub v18.4s, v3.4s, v21.4s -add v3.4s, v3.4s, v21.4s -str q8, [x0, #832] -str q14, [x0, #848] -sqrdmulh v14.4S, v17.4S, v2.s[1] -mul v17.4S, v17.4S,v16.s[1] -mla v17.4S, v14.4S, v31.s[0] -sub v14.4s, v12.4s, v17.4s -add v12.4s, v12.4s, v17.4s -str q3, [x0, #896] -str q18, [x0, #912] -sqrdmulh v18.4S, v19.4S, v4.s[2] -mul v19.4S, v19.4S,v5.s[2] -mla v19.4S, v18.4S, v31.s[0] -sub v18.4s, v20.4s, v19.4s -add v20.4s, v20.4s, v19.4s -str q12, [x0, #960] -str q14, [x0, #976] -sqrdmulh v4.4S, v13.4S, v6.s[2] -mul v13.4S, v13.4S,v7.s[2] -mla v13.4S, v4.4S, v31.s[0] -sub v4.4s, v0.4s, v13.4s -add v0.4s, v0.4s, v13.4s -sqrdmulh v6.4S, v22.4S, v15.s[2] -mul v22.4S, v22.4S,v10.s[2] -mla v22.4S, v6.4S, v31.s[0] -sub v6.4s, v29.4s, v22.4s -add v29.4s, v29.4s, v22.4s -sqrdmulh v15.4S, v9.4S, v2.s[2] -mul v9.4S, v9.4S,v16.s[2] -mla v9.4S, v15.4S, v31.s[0] -sub v15.4s, v30.4s, v9.4s -add v30.4s, v30.4s, v9.4s -str q20, [x0, #800] -str q18, [x0, #816] -str q0, [x0, #864] -str q4, [x0, #880] -str q29, [x0, #928] -str q6, [x0, #944] -str q30, [x0, #992] -str q15, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1464 -// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_3.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_3.s deleted file mode 100644 index ed2fb5d..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_3.s +++ /dev/null @@ -1,1494 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_3_z4_3 -.global _ntt_u32_incomplete_neon_asm_var_4_2_3_z4_3 -ntt_u32_incomplete_neon_asm_var_4_2_3_z4_3: -_ntt_u32_incomplete_neon_asm_var_4_2_3_z4_3: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #800] -ldr q21, [x0, #864] -ldr q20, [x0, #928] -ldr q19, [x0, #992] -ldr q18, [x0, #288] -ldr q17, [x0, #352] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -sqrdmulh v2.4S, v22.4S, v29.s[0] -ldr q1, [x0, #544] -mul v22.4S, v22.4S,v30.s[0] -ldr q0, [x0, #608] -sqrdmulh v15.4S, v21.4S, v29.s[0] -ldr q14, [x0, #672] -mul v21.4S, v21.4S,v30.s[0] -ldr q13, [x0, #736] -mla v22.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q12, [x0, #32] -sub v11.4s, v18.4s, v22.4s -mla v21.4S, v15.4S, v31.s[0] -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q15, [x0, #96] -sub v10.4s, v17.4s, v21.4s -mla v20.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v1.4S, v29.s[0] -ldr q2, [x0, #160] -mul v1.4S, v1.4S,v30.s[0] -sub v9.4s, v16.4s, v20.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v0.4S, v29.s[0] -ldr q22, [x0, #224] -mul v0.4S, v0.4S,v30.s[0] -sub v8.4s, v3.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v12.4s, v1.4s -mla v0.4S, v20.4S, v31.s[0] -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -sub v20.4s, v15.4s, v0.4s -mla v14.4S, v19.4S, v31.s[0] -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v19.4s, v2.4s, v14.4s -mla v13.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v1.4s, v22.4s, v13.4s -mla v16.4S, v0.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v0.4s, v2.4s, v16.4s -mla v3.4S, v14.4S, v31.s[0] -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v14.4s, v22.4s, v3.4s -mla v18.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v13.4s, v12.4s, v18.4s -mla v17.4S, v16.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v8.4S, v29.s[2] -mul v8.4S, v8.4S,v30.s[2] -sub v16.4s, v15.4s, v17.4s -mla v9.4S, v3.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v3.4s, v19.4s, v9.4s -mla v8.4S, v18.4S, v31.s[0] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v8.4s -mla v11.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v8.4s -sqrdmulh v8.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v17.4s, v21.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v27.s[0] -mul v22.4S, v22.4S,v28.s[0] -sub v9.4s, v20.4s, v10.4s -mla v2.4S, v8.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v8.4s, v12.4s, v2.4s -mla v22.4S, v11.4S, v31.s[0] -add v12.4s, v12.4s, v2.4s -sqrdmulh v2.4S, v14.4S, v27.s[1] -mul v14.4S, v14.4S,v28.s[1] -sub v11.4s, v15.4s, v22.4s -mla v0.4S, v10.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v27.s[2] -mul v19.4S, v19.4S,v28.s[2] -sub v10.4s, v13.4s, v0.4s -mla v14.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v2.4s, v16.4s, v14.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v27.s[3] -mul v3.4S, v3.4S,v28.s[3] -sub v22.4s, v21.4s, v19.4s -mla v1.4S, v0.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v0.4s, v20.4s, v1.4s -mla v3.4S, v14.4S, v31.s[0] -add v20.4s, v20.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v25.s[0] -mul v15.4S, v15.4S,v26.s[0] -sub v14.4s, v17.4s, v3.4s -mla v18.4S, v19.4S, v31.s[0] -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v11.4S, v25.s[1] -mul v11.4S, v11.4S,v26.s[1] -sub v19.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v1.4s, v12.4s, v15.4s -mla v11.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v25.s[3] -mul v2.4S, v2.4S,v26.s[3] -sub v3.4s, v8.4s, v11.4s -mla v16.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v11.4s -str q12, [x0, #32] -sqrdmulh v12.4S, v20.4S, v23.s[0] -str q1, [x0, #96] -mul v20.4S, v20.4S,v24.s[0] -ldr q1, [x0, #816] -sub v11.4s, v13.4s, v16.4s -ldr q18, [x0, #880] -mla v2.4S, v15.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -str q8, [x0, #160] -sqrdmulh v8.4S, v0.4S, v23.s[1] -str q3, [x0, #224] -mul v0.4S, v0.4S,v24.s[1] -ldr q3, [x0, #944] -sub v16.4s, v10.4s, v2.4s -ldr q15, [x0, #1008] -mla v20.4S, v12.4S, v31.s[0] -add v10.4s, v10.4s, v2.4s -str q13, [x0, #288] -sqrdmulh v13.4S, v9.4S, v23.s[2] -str q11, [x0, #352] -mul v9.4S, v9.4S,v24.s[2] -ldr q11, [x0, #304] -sub v2.4s, v21.4s, v20.4s -ldr q12, [x0, #368] -mla v0.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v20.4s -str q10, [x0, #416] -sqrdmulh v10.4S, v19.4S, v23.s[3] -str q16, [x0, #480] -mul v19.4S, v19.4S,v24.s[3] -ldr q16, [x0, #432] -sub v20.4s, v22.4s, v0.4s -ldr q8, [x0, #496] -mla v9.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -str q21, [x0, #544] -sqrdmulh v21.4S, v1.4S, v29.s[0] -str q2, [x0, #608] -ldr q2, [x0, #560] -mul v1.4S, v1.4S,v30.s[0] -ldr q0, [x0, #624] -sub v13.4s, v17.4s, v9.4s -mla v19.4S, v10.4S, v31.s[0] -add v17.4s, v17.4s, v9.4s -str q22, [x0, #672] -sqrdmulh v22.4S, v18.4S, v29.s[0] -str q20, [x0, #736] -ldr q20, [x0, #688] -mul v18.4S, v18.4S,v30.s[0] -ldr q9, [x0, #752] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -str q17, [x0, #800] -sqrdmulh v17.4S, v3.4S, v29.s[0] -str q13, [x0, #864] -mul v3.4S, v3.4S,v30.s[0] -ldr q13, [x0, #48] -sub v19.4s, v11.4s, v1.4s -mla v18.4S, v22.4S, v31.s[0] -add v11.4s, v11.4s, v1.4s -str q14, [x0, #928] -sqrdmulh v14.4S, v15.4S, v29.s[0] -str q10, [x0, #992] -mul v15.4S, v15.4S,v30.s[0] -ldr q10, [x0, #112] -sub v1.4s, v12.4s, v18.4s -mla v3.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v29.s[0] -ldr q17, [x0, #176] -mul v2.4S, v2.4S,v30.s[0] -sub v22.4s, v16.4s, v3.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v0.4S, v29.s[0] -ldr q14, [x0, #240] -mul v0.4S, v0.4S,v30.s[0] -sub v21.4s, v8.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -sub v18.4s, v13.4s, v2.4s -mla v0.4S, v3.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v9.4S, v29.s[0] -mul v9.4S, v9.4S,v30.s[0] -sub v3.4s, v10.4s, v0.4s -mla v20.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v15.4s, v17.4s, v20.4s -mla v9.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sub v2.4s, v14.4s, v9.4s -mla v16.4S, v0.4S, v31.s[0] -add v14.4s, v14.4s, v9.4s -sqrdmulh v9.4S, v11.4S, v29.s[1] -mul v11.4S, v11.4S,v30.s[1] -sub v0.4s, v17.4s, v16.4s -mla v8.4S, v20.4S, v31.s[0] -add v17.4s, v17.4s, v16.4s -sqrdmulh v16.4S, v12.4S, v29.s[1] -mul v12.4S, v12.4S,v30.s[1] -sub v20.4s, v14.4s, v8.4s -mla v11.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v8.4s -sqrdmulh v8.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v9.4s, v13.4s, v11.4s -mla v12.4S, v16.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v16.4s, v10.4s, v12.4s -mla v22.4S, v8.4S, v31.s[0] -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v8.4s, v15.4s, v22.4s -mla v21.4S, v11.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -sub v11.4s, v2.4s, v21.4s -mla v19.4S, v12.4S, v31.s[0] -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[0] -mul v17.4S, v17.4S,v28.s[0] -sub v12.4s, v18.4s, v19.4s -mla v1.4S, v22.4S, v31.s[0] -add v18.4s, v18.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v22.4s, v3.4s, v1.4s -mla v17.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v21.4s, v13.4s, v17.4s -mla v14.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v20.4S, v27.s[1] -mul v20.4S, v20.4S,v28.s[1] -sub v19.4s, v10.4s, v14.4s -mla v0.4S, v1.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v27.s[2] -mul v15.4S, v15.4S,v28.s[2] -sub v1.4s, v9.4s, v0.4s -mla v20.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v0.4s -sqrdmulh v0.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v17.4s, v16.4s, v20.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v14.4s, v18.4s, v15.4s -mla v2.4S, v0.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v27.s[3] -mul v11.4S, v11.4S,v28.s[3] -sub v0.4s, v3.4s, v2.4s -mla v8.4S, v20.4S, v31.s[0] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v10.4S, v25.s[0] -mul v10.4S, v10.4S,v26.s[0] -sub v20.4s, v12.4s, v8.4s -mla v11.4S, v15.4S, v31.s[0] -add v12.4s, v12.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v15.4s, v22.4s, v11.4s -mla v10.4S, v2.4S, v31.s[0] -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v2.4s, v13.4s, v10.4s -mla v19.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v10.4s -sqrdmulh v10.4S, v17.4S, v25.s[3] -mul v17.4S, v17.4S,v26.s[3] -sub v8.4s, v21.4s, v19.4s -mla v16.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -str q13, [x0, #48] -sqrdmulh v13.4S, v3.4S, v23.s[0] -str q2, [x0, #112] -mul v3.4S, v3.4S,v24.s[0] -ldr q2, [x0, #768] -sub v19.4s, v9.4s, v16.4s -ldr q11, [x0, #832] -mla v17.4S, v10.4S, v31.s[0] -add v9.4s, v9.4s, v16.4s -str q21, [x0, #176] -sqrdmulh v21.4S, v0.4S, v23.s[1] -str q8, [x0, #240] -mul v0.4S, v0.4S,v24.s[1] -ldr q8, [x0, #896] -sub v16.4s, v1.4s, v17.4s -ldr q10, [x0, #960] -mla v3.4S, v13.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -str q9, [x0, #304] -sqrdmulh v9.4S, v22.4S, v23.s[2] -str q19, [x0, #368] -mul v22.4S, v22.4S,v24.s[2] -ldr q19, [x0, #256] -sub v17.4s, v18.4s, v3.4s -ldr q13, [x0, #320] -mla v0.4S, v21.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -str q1, [x0, #432] -sqrdmulh v1.4S, v15.4S, v23.s[3] -str q16, [x0, #496] -mul v15.4S, v15.4S,v24.s[3] -ldr q16, [x0, #384] -sub v3.4s, v14.4s, v0.4s -ldr q21, [x0, #448] -mla v22.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -str q18, [x0, #560] -sqrdmulh v18.4S, v2.4S, v29.s[0] -str q17, [x0, #624] -ldr q17, [x0, #512] -mul v2.4S, v2.4S,v30.s[0] -ldr q0, [x0, #576] -sub v9.4s, v12.4s, v22.4s -mla v15.4S, v1.4S, v31.s[0] -add v12.4s, v12.4s, v22.4s -str q14, [x0, #688] -sqrdmulh v14.4S, v11.4S, v29.s[0] -str q3, [x0, #752] -ldr q3, [x0, #640] -mul v11.4S, v11.4S,v30.s[0] -ldr q22, [x0, #704] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -str q12, [x0, #816] -sqrdmulh v12.4S, v8.4S, v29.s[0] -str q9, [x0, #880] -mul v8.4S, v8.4S,v30.s[0] -ldr q9, [x0, #0] -sub v15.4s, v19.4s, v2.4s -mla v11.4S, v14.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -str q20, [x0, #944] -sqrdmulh v20.4S, v10.4S, v29.s[0] -str q1, [x0, #1008] -mul v10.4S, v10.4S,v30.s[0] -ldr q1, [x0, #64] -sub v2.4s, v13.4s, v11.4s -mla v8.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v29.s[0] -ldr q12, [x0, #128] -mul v17.4S, v17.4S,v30.s[0] -sub v14.4s, v16.4s, v8.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v0.4S, v29.s[0] -ldr q20, [x0, #192] -mul v0.4S, v0.4S,v30.s[0] -sub v18.4s, v21.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -sub v11.4s, v9.4s, v17.4s -mla v0.4S, v8.4S, v31.s[0] -add v9.4s, v9.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v8.4s, v1.4s, v0.4s -mla v3.4S, v10.4S, v31.s[0] -add v1.4s, v1.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v10.4s, v12.4s, v3.4s -mla v22.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v17.4s, v20.4s, v22.4s -mla v16.4S, v0.4S, v31.s[0] -add v20.4s, v20.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -sub v0.4s, v12.4s, v16.4s -mla v21.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v3.4s, v20.4s, v21.4s -mla v19.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v22.4s, v9.4s, v19.4s -mla v13.4S, v16.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v29.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v16.4s, v1.4s, v13.4s -mla v14.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v13.4s -sqrdmulh v13.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v21.4s, v10.4s, v14.4s -mla v18.4S, v19.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v19.4s, v17.4s, v18.4s -mla v15.4S, v13.4S, v31.s[0] -add v17.4s, v17.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v13.4s, v11.4s, v15.4s -mla v2.4S, v14.4S, v31.s[0] -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v27.s[0] -mul v20.4S, v20.4S,v28.s[0] -sub v14.4s, v8.4s, v2.4s -mla v12.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v2.4s -sqrdmulh v2.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v18.4s, v9.4s, v12.4s -mla v20.4S, v15.4S, v31.s[0] -add v9.4s, v9.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -sub v15.4s, v1.4s, v20.4s -mla v0.4S, v2.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -sub v2.4s, v22.4s, v0.4s -mla v3.4S, v12.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v12.4s, v16.4s, v3.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v11.4s, v10.4s -mla v17.4S, v0.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -sqrdmulh v10.4S, v19.4S, v27.s[3] -mul v19.4S, v19.4S,v28.s[3] -sub v0.4s, v8.4s, v17.4s -mla v21.4S, v3.4S, v31.s[0] -add v8.4s, v8.4s, v17.4s -sqrdmulh v17.4S, v1.4S, v25.s[0] -mul v1.4S, v1.4S,v26.s[0] -sub v3.4s, v13.4s, v21.4s -mla v19.4S, v10.4S, v31.s[0] -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v15.4S, v25.s[1] -mul v15.4S, v15.4S,v26.s[1] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v17.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v17.4s, v9.4s, v1.4s -mla v15.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -sub v21.4s, v18.4s, v15.4s -mla v16.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -str q9, [x0, #0] -sqrdmulh v9.4S, v8.4S, v23.s[0] -str q17, [x0, #64] -mul v8.4S, v8.4S,v24.s[0] -ldr q17, [x0, #784] -sub v15.4s, v22.4s, v16.4s -ldr q19, [x0, #848] -mla v12.4S, v1.4S, v31.s[0] -add v22.4s, v22.4s, v16.4s -str q18, [x0, #128] -sqrdmulh v18.4S, v0.4S, v23.s[1] -str q21, [x0, #192] -mul v0.4S, v0.4S,v24.s[1] -ldr q21, [x0, #912] -sub v16.4s, v2.4s, v12.4s -ldr q1, [x0, #976] -mla v8.4S, v9.4S, v31.s[0] -add v2.4s, v2.4s, v12.4s -str q22, [x0, #256] -sqrdmulh v22.4S, v14.4S, v23.s[2] -str q15, [x0, #320] -mul v14.4S, v14.4S,v24.s[2] -ldr q15, [x0, #272] -sub v12.4s, v11.4s, v8.4s -ldr q9, [x0, #336] -mla v0.4S, v18.4S, v31.s[0] -add v11.4s, v11.4s, v8.4s -str q2, [x0, #384] -sqrdmulh v2.4S, v10.4S, v23.s[3] -str q16, [x0, #448] -mul v10.4S, v10.4S,v24.s[3] -ldr q16, [x0, #400] -sub v8.4s, v20.4s, v0.4s -ldr q18, [x0, #464] -mla v14.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v0.4s -str q11, [x0, #512] -sqrdmulh v11.4S, v17.4S, v29.s[0] -str q12, [x0, #576] -ldr q12, [x0, #528] -mul v17.4S, v17.4S,v30.s[0] -ldr q0, [x0, #592] -sub v22.4s, v13.4s, v14.4s -mla v10.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v14.4s -str q20, [x0, #640] -sqrdmulh v20.4S, v19.4S, v29.s[0] -str q8, [x0, #704] -ldr q8, [x0, #656] -mul v19.4S, v19.4S,v30.s[0] -ldr q14, [x0, #720] -sub v2.4s, v3.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v3.4s, v3.4s, v10.4s -str q13, [x0, #768] -sqrdmulh v13.4S, v21.4S, v29.s[0] -str q22, [x0, #832] -mul v21.4S, v21.4S,v30.s[0] -ldr q22, [x0, #16] -sub v10.4s, v15.4s, v17.4s -mla v19.4S, v20.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -str q3, [x0, #896] -sqrdmulh v3.4S, v1.4S, v29.s[0] -str q2, [x0, #960] -mul v1.4S, v1.4S,v30.s[0] -ldr q2, [x0, #80] -sub v17.4s, v9.4s, v19.4s -mla v21.4S, v13.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v12.4S, v29.s[0] -ldr q13, [x0, #144] -mul v12.4S, v12.4S,v30.s[0] -sub v20.4s, v16.4s, v21.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v0.4S, v29.s[0] -ldr q3, [x0, #208] -mul v0.4S, v0.4S,v30.s[0] -sub v11.4s, v18.4s, v1.4s -mla v12.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v1.4s -sqrdmulh v1.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v19.4s, v22.4s, v12.4s -mla v0.4S, v21.4S, v31.s[0] -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v2.4s, v0.4s -mla v8.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v1.4s, v13.4s, v8.4s -mla v14.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v12.4s, v3.4s, v14.4s -mla v16.4S, v0.4S, v31.s[0] -add v3.4s, v3.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -sub v0.4s, v13.4s, v16.4s -mla v18.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -sqrdmulh v16.4S, v9.4S, v29.s[1] -mul v9.4S, v9.4S,v30.s[1] -sub v8.4s, v3.4s, v18.4s -mla v15.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v14.4s, v22.4s, v15.4s -mla v9.4S, v16.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v16.4s, v2.4s, v9.4s -mla v20.4S, v18.4S, v31.s[0] -add v2.4s, v2.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v20.4s -mla v11.4S, v15.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v15.4s, v12.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v12.4s, v12.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v27.s[0] -mul v13.4S, v13.4S,v28.s[0] -sub v9.4s, v19.4s, v10.4s -mla v17.4S, v20.4S, v31.s[0] -add v19.4s, v19.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v27.s[0] -mul v3.4S, v3.4S,v28.s[0] -sub v20.4s, v21.4s, v17.4s -mla v13.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v11.4s, v22.4s, v13.4s -mla v3.4S, v10.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -sub v10.4s, v2.4s, v3.4s -mla v0.4S, v17.4S, v31.s[0] -add v2.4s, v2.4s, v3.4s -sqrdmulh v3.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v17.4s, v14.4s, v0.4s -mla v8.4S, v13.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[2] -mul v12.4S, v12.4S,v28.s[2] -sub v13.4s, v16.4s, v8.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v3.4s, v19.4s, v1.4s -mla v12.4S, v0.4S, v31.s[0] -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v0.4s, v21.4s, v12.4s -mla v18.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v10.4S, v25.s[1] -mul v10.4S, v10.4S,v26.s[1] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v12.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v12.4s, v22.4s, v2.4s -mla v10.4S, v18.4S, v31.s[0] -add v22.4s, v22.4s, v2.4s -sqrdmulh v2.4S, v13.4S, v25.s[3] -mul v13.4S, v13.4S,v26.s[3] -sub v18.4s, v11.4s, v10.4s -mla v16.4S, v15.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -str q22, [x0, #16] -sqrdmulh v22.4S, v21.4S, v23.s[0] -str q12, [x0, #80] -mul v21.4S, v21.4S,v24.s[0] -sub v12.4s, v14.4s, v16.4s -mla v13.4S, v2.4S, v31.s[0] -add v14.4s, v14.4s, v16.4s -str q11, [x0, #144] -sqrdmulh v11.4S, v0.4S, v23.s[1] -str q18, [x0, #208] -mul v0.4S, v0.4S,v24.s[1] -sub v18.4s, v17.4s, v13.4s -mla v21.4S, v22.4S, v31.s[0] -add v17.4s, v17.4s, v13.4s -str q14, [x0, #272] -sqrdmulh v14.4S, v20.4S, v23.s[2] -str q12, [x0, #336] -mul v20.4S, v20.4S,v24.s[2] -sub v12.4s, v19.4s, v21.4s -mla v0.4S, v11.4S, v31.s[0] -add v19.4s, v19.4s, v21.4s -str q17, [x0, #400] -sqrdmulh v17.4S, v1.4S, v23.s[3] -str q18, [x0, #464] -mul v1.4S, v1.4S,v24.s[3] -sub v18.4s, v3.4s, v0.4s -mla v20.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v0.4s -str q19, [x0, #528] -str q12, [x0, #592] -sub v12.4s, v9.4s, v20.4s -mla v1.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v20.4s -str q3, [x0, #656] -str q18, [x0, #720] -sub v18.4s, v8.4s, v1.4s -add v8.4s, v8.4s, v1.4s -str q9, [x0, #784] -str q12, [x0, #848] -str q8, [x0, #912] -str q18, [x0, #976] -ldr q4, [x17, #+128] -ldr q5, [x17, #+144] -ldr q6, [x17, #+160] -ldr q7, [x17, #+176] -ldr q15, [x17, #+192] -ldr q10, [x17, #+208] -ldr q2, [x17, #+224] -ldr q16, [x17, #+240] -ldr q22, [x0, #32] -ldr q13, [x0, #48] -ldr q11, [x0, #0] -ldr q21, [x0, #96] -ldr q14, [x0, #112] -ldr q0, [x0, #64] -ldr q19, [x0, #160] -ldr q17, [x0, #176] -ldr q20, [x0, #128] -ldr q3, [x0, #224] -ldr q1, [x0, #240] -ldr q9, [x0, #192] -sqrdmulh v12.4S, v22.4S, v5.s[0] -sqrdmulh v8.4S, v21.4S, v7.s[0] -sqrdmulh v18.4S, v19.4S, v10.s[0] -sqrdmulh v30.4S, v3.4S, v16.s[0] -mul v22.4S, v22.4S,v4.s[0] -mul v21.4S, v21.4S,v6.s[0] -mul v19.4S, v19.4S,v15.s[0] -mul v3.4S, v3.4S,v2.s[0] -mla v22.4S, v12.4S, v31.s[0] -mla v21.4S, v8.4S, v31.s[0] -mla v19.4S, v18.4S, v31.s[0] -mla v3.4S, v30.4S, v31.s[0] -sub v30.4s, v11.4s, v22.4s -sub v18.4s, v0.4s, v21.4s -sub v8.4s, v20.4s, v19.4s -sub v12.4s, v9.4s, v3.4s -add v11.4s, v11.4s, v22.4s -add v0.4s, v0.4s, v21.4s -add v20.4s, v20.4s, v19.4s -add v9.4s, v9.4s, v3.4s -ldr q3, [x0, #16] -ldr q19, [x0, #80] -ldr q21, [x0, #144] -ldr q22, [x0, #208] -sqrdmulh v29.4S, v13.4S, v5.s[0] -sqrdmulh v28.4S, v14.4S, v7.s[0] -sqrdmulh v27.4S, v17.4S, v10.s[0] -sqrdmulh v26.4S, v1.4S, v16.s[0] -mul v13.4S, v13.4S,v4.s[0] -mul v14.4S, v14.4S,v6.s[0] -mul v17.4S, v17.4S,v15.s[0] -mul v1.4S, v1.4S,v2.s[0] -mla v13.4S, v29.4S, v31.s[0] -mla v14.4S, v28.4S, v31.s[0] -mla v17.4S, v27.4S, v31.s[0] -mla v1.4S, v26.4S, v31.s[0] -sub v26.4s, v3.4s, v13.4s -sub v27.4s, v19.4s, v14.4s -sub v28.4s, v21.4s, v17.4s -sub v29.4s, v22.4s, v1.4s -add v3.4s, v3.4s, v13.4s -add v19.4s, v19.4s, v14.4s -add v21.4s, v21.4s, v17.4s -add v22.4s, v22.4s, v1.4s -sqrdmulh v1.4S, v3.4S, v5.s[1] -sqrdmulh v17.4S, v19.4S, v7.s[1] -sqrdmulh v14.4S, v21.4S, v10.s[1] -sqrdmulh v13.4S, v22.4S, v16.s[1] -mul v3.4S, v3.4S,v4.s[1] -mul v19.4S, v19.4S,v6.s[1] -mul v21.4S, v21.4S,v15.s[1] -mul v22.4S, v22.4S,v2.s[1] -mla v3.4S, v1.4S, v31.s[0] -mla v19.4S, v17.4S, v31.s[0] -mla v21.4S, v14.4S, v31.s[0] -mla v22.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v3.4s -sub v14.4s, v0.4s, v19.4s -sub v17.4s, v20.4s, v21.4s -sub v1.4s, v9.4s, v22.4s -add v11.4s, v11.4s, v3.4s -add v0.4s, v0.4s, v19.4s -add v20.4s, v20.4s, v21.4s -add v9.4s, v9.4s, v22.4s -sqrdmulh v22.4S, v26.4S, v5.s[2] -sqrdmulh v21.4S, v27.4S, v7.s[2] -sqrdmulh v19.4S, v28.4S, v10.s[2] -sqrdmulh v3.4S, v29.4S, v16.s[2] -str q11, [x0, #0] -str q13, [x0, #16] -mul v26.4S, v26.4S,v4.s[2] -mul v27.4S, v27.4S,v6.s[2] -mul v28.4S, v28.4S,v15.s[2] -mul v29.4S, v29.4S,v2.s[2] -str q0, [x0, #64] -str q14, [x0, #80] -ldr q16, [x17, #+256] -ldr q2, [x17, #+272] -ldr q10, [x17, #+288] -ldr q15, [x17, #+304] -mla v26.4S, v22.4S, v31.s[0] -mla v27.4S, v21.4S, v31.s[0] -mla v28.4S, v19.4S, v31.s[0] -mla v29.4S, v3.4S, v31.s[0] -str q20, [x0, #128] -str q17, [x0, #144] -ldr q17, [x17, #+320] -ldr q20, [x17, #+336] -sub v3.4s, v30.4s, v26.4s -sub v19.4s, v18.4s, v27.4s -sub v21.4s, v8.4s, v28.4s -sub v22.4s, v12.4s, v29.4s -str q9, [x0, #192] -str q1, [x0, #208] -ldr q1, [x17, #+352] -ldr q9, [x17, #+368] -add v30.4s, v30.4s, v26.4s -add v18.4s, v18.4s, v27.4s -add v8.4s, v8.4s, v28.4s -add v12.4s, v12.4s, v29.4s -str q30, [x0, #32] -str q18, [x0, #96] -str q8, [x0, #160] -str q12, [x0, #224] -ldr q12, [x0, #288] -ldr q8, [x0, #304] -ldr q18, [x0, #256] -ldr q30, [x0, #352] -ldr q29, [x0, #368] -ldr q28, [x0, #320] -ldr q27, [x0, #416] -ldr q26, [x0, #432] -ldr q7, [x0, #384] -ldr q6, [x0, #480] -ldr q5, [x0, #496] -ldr q4, [x0, #448] -sqrdmulh v14.4S, v12.4S, v2.s[0] -sqrdmulh v0.4S, v30.4S, v15.s[0] -sqrdmulh v13.4S, v27.4S, v20.s[0] -sqrdmulh v11.4S, v6.4S, v9.s[0] -str q3, [x0, #48] -mul v12.4S, v12.4S,v16.s[0] -mul v30.4S, v30.4S,v10.s[0] -mul v27.4S, v27.4S,v17.s[0] -mul v6.4S, v6.4S,v1.s[0] -str q19, [x0, #112] -mla v12.4S, v14.4S, v31.s[0] -mla v30.4S, v0.4S, v31.s[0] -mla v27.4S, v13.4S, v31.s[0] -mla v6.4S, v11.4S, v31.s[0] -str q21, [x0, #176] -sub v21.4s, v18.4s, v12.4s -sub v11.4s, v28.4s, v30.4s -sub v13.4s, v7.4s, v27.4s -sub v0.4s, v4.4s, v6.4s -str q22, [x0, #240] -add v18.4s, v18.4s, v12.4s -add v28.4s, v28.4s, v30.4s -add v7.4s, v7.4s, v27.4s -add v4.4s, v4.4s, v6.4s -ldr q6, [x0, #272] -ldr q27, [x0, #336] -ldr q30, [x0, #400] -ldr q12, [x0, #464] -sqrdmulh v22.4S, v8.4S, v2.s[0] -sqrdmulh v14.4S, v29.4S, v15.s[0] -sqrdmulh v19.4S, v26.4S, v20.s[0] -sqrdmulh v3.4S, v5.4S, v9.s[0] -mul v8.4S, v8.4S,v16.s[0] -mul v29.4S, v29.4S,v10.s[0] -mul v26.4S, v26.4S,v17.s[0] -mul v5.4S, v5.4S,v1.s[0] -mla v8.4S, v22.4S, v31.s[0] -mla v29.4S, v14.4S, v31.s[0] -mla v26.4S, v19.4S, v31.s[0] -mla v5.4S, v3.4S, v31.s[0] -sub v3.4s, v6.4s, v8.4s -sub v19.4s, v27.4s, v29.4s -sub v14.4s, v30.4s, v26.4s -sub v22.4s, v12.4s, v5.4s -add v6.4s, v6.4s, v8.4s -add v27.4s, v27.4s, v29.4s -add v30.4s, v30.4s, v26.4s -add v12.4s, v12.4s, v5.4s -sqrdmulh v5.4S, v6.4S, v2.s[1] -sqrdmulh v26.4S, v27.4S, v15.s[1] -sqrdmulh v29.4S, v30.4S, v20.s[1] -sqrdmulh v8.4S, v12.4S, v9.s[1] -mul v6.4S, v6.4S,v16.s[1] -mul v27.4S, v27.4S,v10.s[1] -mul v30.4S, v30.4S,v17.s[1] -mul v12.4S, v12.4S,v1.s[1] -mla v6.4S, v5.4S, v31.s[0] -mla v27.4S, v26.4S, v31.s[0] -mla v30.4S, v29.4S, v31.s[0] -mla v12.4S, v8.4S, v31.s[0] -sub v8.4s, v18.4s, v6.4s -sub v29.4s, v28.4s, v27.4s -sub v26.4s, v7.4s, v30.4s -sub v5.4s, v4.4s, v12.4s -add v18.4s, v18.4s, v6.4s -add v28.4s, v28.4s, v27.4s -add v7.4s, v7.4s, v30.4s -add v4.4s, v4.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v2.s[2] -sqrdmulh v30.4S, v19.4S, v15.s[2] -sqrdmulh v27.4S, v14.4S, v20.s[2] -sqrdmulh v6.4S, v22.4S, v9.s[2] -str q18, [x0, #256] -str q8, [x0, #272] -mul v3.4S, v3.4S,v16.s[2] -mul v19.4S, v19.4S,v10.s[2] -mul v14.4S, v14.4S,v17.s[2] -mul v22.4S, v22.4S,v1.s[2] -str q28, [x0, #320] -str q29, [x0, #336] -ldr q9, [x17, #+384] -ldr q1, [x17, #+400] -ldr q20, [x17, #+416] -ldr q17, [x17, #+432] -mla v3.4S, v12.4S, v31.s[0] -mla v19.4S, v30.4S, v31.s[0] -mla v14.4S, v27.4S, v31.s[0] -mla v22.4S, v6.4S, v31.s[0] -str q7, [x0, #384] -str q26, [x0, #400] -ldr q26, [x17, #+448] -ldr q7, [x17, #+464] -sub v6.4s, v21.4s, v3.4s -sub v27.4s, v11.4s, v19.4s -sub v30.4s, v13.4s, v14.4s -sub v12.4s, v0.4s, v22.4s -str q4, [x0, #448] -str q5, [x0, #464] -ldr q5, [x17, #+480] -ldr q4, [x17, #+496] -add v21.4s, v21.4s, v3.4s -add v11.4s, v11.4s, v19.4s -add v13.4s, v13.4s, v14.4s -add v0.4s, v0.4s, v22.4s -str q21, [x0, #288] -str q11, [x0, #352] -str q13, [x0, #416] -str q0, [x0, #480] -ldr q0, [x0, #544] -ldr q13, [x0, #560] -ldr q11, [x0, #512] -ldr q21, [x0, #608] -ldr q22, [x0, #624] -ldr q14, [x0, #576] -ldr q19, [x0, #672] -ldr q3, [x0, #688] -ldr q15, [x0, #640] -ldr q10, [x0, #736] -ldr q2, [x0, #752] -ldr q16, [x0, #704] -sqrdmulh v29.4S, v0.4S, v1.s[0] -sqrdmulh v28.4S, v21.4S, v17.s[0] -sqrdmulh v8.4S, v19.4S, v7.s[0] -sqrdmulh v18.4S, v10.4S, v4.s[0] -str q6, [x0, #304] -mul v0.4S, v0.4S,v9.s[0] -mul v21.4S, v21.4S,v20.s[0] -mul v19.4S, v19.4S,v26.s[0] -mul v10.4S, v10.4S,v5.s[0] -str q27, [x0, #368] -mla v0.4S, v29.4S, v31.s[0] -mla v21.4S, v28.4S, v31.s[0] -mla v19.4S, v8.4S, v31.s[0] -mla v10.4S, v18.4S, v31.s[0] -str q30, [x0, #432] -sub v30.4s, v11.4s, v0.4s -sub v18.4s, v14.4s, v21.4s -sub v8.4s, v15.4s, v19.4s -sub v28.4s, v16.4s, v10.4s -str q12, [x0, #496] -add v11.4s, v11.4s, v0.4s -add v14.4s, v14.4s, v21.4s -add v15.4s, v15.4s, v19.4s -add v16.4s, v16.4s, v10.4s -ldr q10, [x0, #528] -ldr q19, [x0, #592] -ldr q21, [x0, #656] -ldr q0, [x0, #720] -sqrdmulh v12.4S, v13.4S, v1.s[0] -sqrdmulh v29.4S, v22.4S, v17.s[0] -sqrdmulh v27.4S, v3.4S, v7.s[0] -sqrdmulh v6.4S, v2.4S, v4.s[0] -mul v13.4S, v13.4S,v9.s[0] -mul v22.4S, v22.4S,v20.s[0] -mul v3.4S, v3.4S,v26.s[0] -mul v2.4S, v2.4S,v5.s[0] -mla v13.4S, v12.4S, v31.s[0] -mla v22.4S, v29.4S, v31.s[0] -mla v3.4S, v27.4S, v31.s[0] -mla v2.4S, v6.4S, v31.s[0] -sub v6.4s, v10.4s, v13.4s -sub v27.4s, v19.4s, v22.4s -sub v29.4s, v21.4s, v3.4s -sub v12.4s, v0.4s, v2.4s -add v10.4s, v10.4s, v13.4s -add v19.4s, v19.4s, v22.4s -add v21.4s, v21.4s, v3.4s -add v0.4s, v0.4s, v2.4s -sqrdmulh v2.4S, v10.4S, v1.s[1] -sqrdmulh v3.4S, v19.4S, v17.s[1] -sqrdmulh v22.4S, v21.4S, v7.s[1] -sqrdmulh v13.4S, v0.4S, v4.s[1] -mul v10.4S, v10.4S,v9.s[1] -mul v19.4S, v19.4S,v20.s[1] -mul v21.4S, v21.4S,v26.s[1] -mul v0.4S, v0.4S,v5.s[1] -mla v10.4S, v2.4S, v31.s[0] -mla v19.4S, v3.4S, v31.s[0] -mla v21.4S, v22.4S, v31.s[0] -mla v0.4S, v13.4S, v31.s[0] -sub v13.4s, v11.4s, v10.4s -sub v22.4s, v14.4s, v19.4s -sub v3.4s, v15.4s, v21.4s -sub v2.4s, v16.4s, v0.4s -add v11.4s, v11.4s, v10.4s -add v14.4s, v14.4s, v19.4s -add v15.4s, v15.4s, v21.4s -add v16.4s, v16.4s, v0.4s -sqrdmulh v0.4S, v6.4S, v1.s[2] -sqrdmulh v21.4S, v27.4S, v17.s[2] -sqrdmulh v19.4S, v29.4S, v7.s[2] -sqrdmulh v10.4S, v12.4S, v4.s[2] -str q11, [x0, #512] -str q13, [x0, #528] -mul v6.4S, v6.4S,v9.s[2] -mul v27.4S, v27.4S,v20.s[2] -mul v29.4S, v29.4S,v26.s[2] -mul v12.4S, v12.4S,v5.s[2] -str q14, [x0, #576] -str q22, [x0, #592] -ldr q4, [x17, #+512] -ldr q5, [x17, #+528] -ldr q7, [x17, #+544] -ldr q26, [x17, #+560] -mla v6.4S, v0.4S, v31.s[0] -mla v27.4S, v21.4S, v31.s[0] -mla v29.4S, v19.4S, v31.s[0] -mla v12.4S, v10.4S, v31.s[0] -str q15, [x0, #640] -str q3, [x0, #656] -ldr q3, [x17, #+576] -ldr q15, [x17, #+592] -sub v10.4s, v30.4s, v6.4s -sub v19.4s, v18.4s, v27.4s -sub v21.4s, v8.4s, v29.4s -sub v0.4s, v28.4s, v12.4s -str q16, [x0, #704] -str q2, [x0, #720] -ldr q2, [x17, #+608] -ldr q16, [x17, #+624] -add v30.4s, v30.4s, v6.4s -add v18.4s, v18.4s, v27.4s -add v8.4s, v8.4s, v29.4s -add v28.4s, v28.4s, v12.4s -str q30, [x0, #544] -str q18, [x0, #608] -str q8, [x0, #672] -str q28, [x0, #736] -ldr q28, [x0, #800] -ldr q8, [x0, #816] -ldr q18, [x0, #768] -ldr q30, [x0, #864] -ldr q12, [x0, #880] -ldr q29, [x0, #832] -ldr q27, [x0, #928] -ldr q6, [x0, #944] -ldr q17, [x0, #896] -ldr q20, [x0, #992] -ldr q1, [x0, #1008] -ldr q9, [x0, #960] -sqrdmulh v22.4S, v28.4S, v5.s[0] -sqrdmulh v14.4S, v30.4S, v26.s[0] -sqrdmulh v13.4S, v27.4S, v15.s[0] -sqrdmulh v11.4S, v20.4S, v16.s[0] -str q10, [x0, #560] -mul v28.4S, v28.4S,v4.s[0] -mul v30.4S, v30.4S,v7.s[0] -mul v27.4S, v27.4S,v3.s[0] -mul v20.4S, v20.4S,v2.s[0] -str q19, [x0, #624] -mla v28.4S, v22.4S, v31.s[0] -mla v30.4S, v14.4S, v31.s[0] -mla v27.4S, v13.4S, v31.s[0] -mla v20.4S, v11.4S, v31.s[0] -str q21, [x0, #688] -sub v21.4s, v18.4s, v28.4s -sub v11.4s, v29.4s, v30.4s -sub v13.4s, v17.4s, v27.4s -sub v14.4s, v9.4s, v20.4s -str q0, [x0, #752] -add v18.4s, v18.4s, v28.4s -add v29.4s, v29.4s, v30.4s -add v17.4s, v17.4s, v27.4s -add v9.4s, v9.4s, v20.4s -ldr q20, [x0, #784] -ldr q27, [x0, #848] -ldr q30, [x0, #912] -ldr q28, [x0, #976] -sqrdmulh v0.4S, v8.4S, v5.s[0] -sqrdmulh v22.4S, v12.4S, v26.s[0] -sqrdmulh v19.4S, v6.4S, v15.s[0] -sqrdmulh v10.4S, v1.4S, v16.s[0] -mul v8.4S, v8.4S,v4.s[0] -mul v12.4S, v12.4S,v7.s[0] -mul v6.4S, v6.4S,v3.s[0] -mul v1.4S, v1.4S,v2.s[0] -mla v8.4S, v0.4S, v31.s[0] -mla v12.4S, v22.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -mla v1.4S, v10.4S, v31.s[0] -sub v10.4s, v20.4s, v8.4s -sub v19.4s, v27.4s, v12.4s -sub v22.4s, v30.4s, v6.4s -sub v0.4s, v28.4s, v1.4s -add v20.4s, v20.4s, v8.4s -add v27.4s, v27.4s, v12.4s -add v30.4s, v30.4s, v6.4s -add v28.4s, v28.4s, v1.4s -sqrdmulh v1.4S, v20.4S, v5.s[1] -sqrdmulh v6.4S, v27.4S, v26.s[1] -sqrdmulh v12.4S, v30.4S, v15.s[1] -sqrdmulh v8.4S, v28.4S, v16.s[1] -mul v20.4S, v20.4S,v4.s[1] -mul v27.4S, v27.4S,v7.s[1] -mul v30.4S, v30.4S,v3.s[1] -mul v28.4S, v28.4S,v2.s[1] -mla v20.4S, v1.4S, v31.s[0] -mla v27.4S, v6.4S, v31.s[0] -mla v30.4S, v12.4S, v31.s[0] -mla v28.4S, v8.4S, v31.s[0] -sub v8.4s, v18.4s, v20.4s -sub v12.4s, v29.4s, v27.4s -sub v6.4s, v17.4s, v30.4s -sub v1.4s, v9.4s, v28.4s -add v18.4s, v18.4s, v20.4s -add v29.4s, v29.4s, v27.4s -add v17.4s, v17.4s, v30.4s -add v9.4s, v9.4s, v28.4s -sqrdmulh v28.4S, v10.4S, v5.s[2] -sqrdmulh v30.4S, v19.4S, v26.s[2] -sqrdmulh v27.4S, v22.4S, v15.s[2] -sqrdmulh v20.4S, v0.4S, v16.s[2] -str q18, [x0, #768] -str q8, [x0, #784] -mul v10.4S, v10.4S,v4.s[2] -mul v19.4S, v19.4S,v7.s[2] -mul v22.4S, v22.4S,v3.s[2] -mul v0.4S, v0.4S,v2.s[2] -str q29, [x0, #832] -str q12, [x0, #848] -mla v10.4S, v28.4S, v31.s[0] -mla v19.4S, v30.4S, v31.s[0] -mla v22.4S, v27.4S, v31.s[0] -mla v0.4S, v20.4S, v31.s[0] -str q17, [x0, #896] -str q6, [x0, #912] -sub v6.4s, v21.4s, v10.4s -sub v17.4s, v11.4s, v19.4s -sub v20.4s, v13.4s, v22.4s -sub v27.4s, v14.4s, v0.4s -str q9, [x0, #960] -str q1, [x0, #976] -add v21.4s, v21.4s, v10.4s -add v11.4s, v11.4s, v19.4s -add v13.4s, v13.4s, v22.4s -add v14.4s, v14.4s, v0.4s -str q21, [x0, #800] -str q11, [x0, #864] -str q13, [x0, #928] -str q14, [x0, #992] -str q6, [x0, #816] -str q17, [x0, #880] -str q20, [x0, #944] -str q27, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1464 -// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_4.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_4.s deleted file mode 100644 index 1381d5b..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_4.s +++ /dev/null @@ -1,1494 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_3_z4_4 -.global _ntt_u32_incomplete_neon_asm_var_4_2_3_z4_4 -ntt_u32_incomplete_neon_asm_var_4_2_3_z4_4: -_ntt_u32_incomplete_neon_asm_var_4_2_3_z4_4: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #800] -ldr q21, [x0, #864] -ldr q20, [x0, #928] -ldr q19, [x0, #992] -ldr q18, [x0, #288] -ldr q17, [x0, #352] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -sqrdmulh v2.4S, v22.4S, v29.s[0] -ldr q1, [x0, #544] -mul v22.4S, v22.4S,v30.s[0] -ldr q0, [x0, #608] -sqrdmulh v15.4S, v21.4S, v29.s[0] -ldr q14, [x0, #672] -mul v21.4S, v21.4S,v30.s[0] -ldr q13, [x0, #736] -mla v22.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q12, [x0, #32] -sub v11.4s, v18.4s, v22.4s -mla v21.4S, v15.4S, v31.s[0] -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q15, [x0, #96] -sub v10.4s, v17.4s, v21.4s -mla v20.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v1.4S, v29.s[0] -ldr q2, [x0, #160] -mul v1.4S, v1.4S,v30.s[0] -sub v9.4s, v16.4s, v20.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v0.4S, v29.s[0] -ldr q22, [x0, #224] -mul v0.4S, v0.4S,v30.s[0] -sub v8.4s, v3.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v12.4s, v1.4s -mla v0.4S, v20.4S, v31.s[0] -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -sub v20.4s, v15.4s, v0.4s -mla v14.4S, v19.4S, v31.s[0] -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v19.4s, v2.4s, v14.4s -mla v13.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v1.4s, v22.4s, v13.4s -mla v16.4S, v0.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v0.4s, v2.4s, v16.4s -mla v3.4S, v14.4S, v31.s[0] -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v14.4s, v22.4s, v3.4s -mla v18.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v13.4s, v12.4s, v18.4s -mla v17.4S, v16.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v8.4S, v29.s[2] -mul v8.4S, v8.4S,v30.s[2] -sub v16.4s, v15.4s, v17.4s -mla v9.4S, v3.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v3.4s, v19.4s, v9.4s -mla v8.4S, v18.4S, v31.s[0] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v8.4s -mla v11.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v8.4s -sqrdmulh v8.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v17.4s, v21.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v27.s[0] -mul v22.4S, v22.4S,v28.s[0] -sub v9.4s, v20.4s, v10.4s -mla v2.4S, v8.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v8.4s, v12.4s, v2.4s -mla v22.4S, v11.4S, v31.s[0] -add v12.4s, v12.4s, v2.4s -sqrdmulh v2.4S, v14.4S, v27.s[1] -mul v14.4S, v14.4S,v28.s[1] -sub v11.4s, v15.4s, v22.4s -mla v0.4S, v10.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v27.s[2] -mul v19.4S, v19.4S,v28.s[2] -sub v10.4s, v13.4s, v0.4s -mla v14.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v2.4s, v16.4s, v14.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v27.s[3] -mul v3.4S, v3.4S,v28.s[3] -sub v22.4s, v21.4s, v19.4s -mla v1.4S, v0.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v0.4s, v20.4s, v1.4s -mla v3.4S, v14.4S, v31.s[0] -add v20.4s, v20.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v25.s[0] -mul v15.4S, v15.4S,v26.s[0] -sub v14.4s, v17.4s, v3.4s -mla v18.4S, v19.4S, v31.s[0] -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v11.4S, v25.s[1] -mul v11.4S, v11.4S,v26.s[1] -sub v19.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v1.4s, v12.4s, v15.4s -mla v11.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v25.s[3] -mul v2.4S, v2.4S,v26.s[3] -sub v3.4s, v8.4s, v11.4s -mla v16.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v11.4s -str q12, [x0, #32] -sqrdmulh v12.4S, v20.4S, v23.s[0] -str q1, [x0, #96] -mul v20.4S, v20.4S,v24.s[0] -ldr q1, [x0, #816] -sub v11.4s, v13.4s, v16.4s -ldr q18, [x0, #880] -mla v2.4S, v15.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -str q8, [x0, #160] -sqrdmulh v8.4S, v0.4S, v23.s[1] -str q3, [x0, #224] -mul v0.4S, v0.4S,v24.s[1] -ldr q3, [x0, #944] -sub v16.4s, v10.4s, v2.4s -ldr q15, [x0, #1008] -mla v20.4S, v12.4S, v31.s[0] -add v10.4s, v10.4s, v2.4s -str q13, [x0, #288] -sqrdmulh v13.4S, v9.4S, v23.s[2] -str q11, [x0, #352] -mul v9.4S, v9.4S,v24.s[2] -ldr q11, [x0, #304] -sub v2.4s, v21.4s, v20.4s -ldr q12, [x0, #368] -mla v0.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v20.4s -str q10, [x0, #416] -sqrdmulh v10.4S, v19.4S, v23.s[3] -str q16, [x0, #480] -mul v19.4S, v19.4S,v24.s[3] -ldr q16, [x0, #432] -sub v20.4s, v22.4s, v0.4s -ldr q8, [x0, #496] -mla v9.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -str q21, [x0, #544] -sqrdmulh v21.4S, v1.4S, v29.s[0] -str q2, [x0, #608] -ldr q2, [x0, #560] -mul v1.4S, v1.4S,v30.s[0] -ldr q0, [x0, #624] -sub v13.4s, v17.4s, v9.4s -mla v19.4S, v10.4S, v31.s[0] -add v17.4s, v17.4s, v9.4s -str q22, [x0, #672] -sqrdmulh v22.4S, v18.4S, v29.s[0] -str q20, [x0, #736] -ldr q20, [x0, #688] -mul v18.4S, v18.4S,v30.s[0] -ldr q9, [x0, #752] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -str q17, [x0, #800] -sqrdmulh v17.4S, v3.4S, v29.s[0] -str q13, [x0, #864] -mul v3.4S, v3.4S,v30.s[0] -ldr q13, [x0, #48] -sub v19.4s, v11.4s, v1.4s -mla v18.4S, v22.4S, v31.s[0] -add v11.4s, v11.4s, v1.4s -str q14, [x0, #928] -sqrdmulh v14.4S, v15.4S, v29.s[0] -str q10, [x0, #992] -mul v15.4S, v15.4S,v30.s[0] -ldr q10, [x0, #112] -sub v1.4s, v12.4s, v18.4s -mla v3.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v29.s[0] -ldr q17, [x0, #176] -mul v2.4S, v2.4S,v30.s[0] -sub v22.4s, v16.4s, v3.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v0.4S, v29.s[0] -ldr q14, [x0, #240] -mul v0.4S, v0.4S,v30.s[0] -sub v21.4s, v8.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -sub v18.4s, v13.4s, v2.4s -mla v0.4S, v3.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v9.4S, v29.s[0] -mul v9.4S, v9.4S,v30.s[0] -sub v3.4s, v10.4s, v0.4s -mla v20.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v15.4s, v17.4s, v20.4s -mla v9.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sub v2.4s, v14.4s, v9.4s -mla v16.4S, v0.4S, v31.s[0] -add v14.4s, v14.4s, v9.4s -sqrdmulh v9.4S, v11.4S, v29.s[1] -mul v11.4S, v11.4S,v30.s[1] -sub v0.4s, v17.4s, v16.4s -mla v8.4S, v20.4S, v31.s[0] -add v17.4s, v17.4s, v16.4s -sqrdmulh v16.4S, v12.4S, v29.s[1] -mul v12.4S, v12.4S,v30.s[1] -sub v20.4s, v14.4s, v8.4s -mla v11.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v8.4s -sqrdmulh v8.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v9.4s, v13.4s, v11.4s -mla v12.4S, v16.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v16.4s, v10.4s, v12.4s -mla v22.4S, v8.4S, v31.s[0] -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v8.4s, v15.4s, v22.4s -mla v21.4S, v11.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -sub v11.4s, v2.4s, v21.4s -mla v19.4S, v12.4S, v31.s[0] -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[0] -mul v17.4S, v17.4S,v28.s[0] -sub v12.4s, v18.4s, v19.4s -mla v1.4S, v22.4S, v31.s[0] -add v18.4s, v18.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v22.4s, v3.4s, v1.4s -mla v17.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v21.4s, v13.4s, v17.4s -mla v14.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v20.4S, v27.s[1] -mul v20.4S, v20.4S,v28.s[1] -sub v19.4s, v10.4s, v14.4s -mla v0.4S, v1.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v27.s[2] -mul v15.4S, v15.4S,v28.s[2] -sub v1.4s, v9.4s, v0.4s -mla v20.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v0.4s -sqrdmulh v0.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v17.4s, v16.4s, v20.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v14.4s, v18.4s, v15.4s -mla v2.4S, v0.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v27.s[3] -mul v11.4S, v11.4S,v28.s[3] -sub v0.4s, v3.4s, v2.4s -mla v8.4S, v20.4S, v31.s[0] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v10.4S, v25.s[0] -mul v10.4S, v10.4S,v26.s[0] -sub v20.4s, v12.4s, v8.4s -mla v11.4S, v15.4S, v31.s[0] -add v12.4s, v12.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v15.4s, v22.4s, v11.4s -mla v10.4S, v2.4S, v31.s[0] -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v2.4s, v13.4s, v10.4s -mla v19.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v10.4s -sqrdmulh v10.4S, v17.4S, v25.s[3] -mul v17.4S, v17.4S,v26.s[3] -sub v8.4s, v21.4s, v19.4s -mla v16.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -str q13, [x0, #48] -sqrdmulh v13.4S, v3.4S, v23.s[0] -str q2, [x0, #112] -mul v3.4S, v3.4S,v24.s[0] -ldr q2, [x0, #768] -sub v19.4s, v9.4s, v16.4s -ldr q11, [x0, #832] -mla v17.4S, v10.4S, v31.s[0] -add v9.4s, v9.4s, v16.4s -str q21, [x0, #176] -sqrdmulh v21.4S, v0.4S, v23.s[1] -str q8, [x0, #240] -mul v0.4S, v0.4S,v24.s[1] -ldr q8, [x0, #896] -sub v16.4s, v1.4s, v17.4s -ldr q10, [x0, #960] -mla v3.4S, v13.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -str q9, [x0, #304] -sqrdmulh v9.4S, v22.4S, v23.s[2] -str q19, [x0, #368] -mul v22.4S, v22.4S,v24.s[2] -ldr q19, [x0, #256] -sub v17.4s, v18.4s, v3.4s -ldr q13, [x0, #320] -mla v0.4S, v21.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -str q1, [x0, #432] -sqrdmulh v1.4S, v15.4S, v23.s[3] -str q16, [x0, #496] -mul v15.4S, v15.4S,v24.s[3] -ldr q16, [x0, #384] -sub v3.4s, v14.4s, v0.4s -ldr q21, [x0, #448] -mla v22.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -str q18, [x0, #560] -sqrdmulh v18.4S, v2.4S, v29.s[0] -str q17, [x0, #624] -ldr q17, [x0, #512] -mul v2.4S, v2.4S,v30.s[0] -ldr q0, [x0, #576] -sub v9.4s, v12.4s, v22.4s -mla v15.4S, v1.4S, v31.s[0] -add v12.4s, v12.4s, v22.4s -str q14, [x0, #688] -sqrdmulh v14.4S, v11.4S, v29.s[0] -str q3, [x0, #752] -ldr q3, [x0, #640] -mul v11.4S, v11.4S,v30.s[0] -ldr q22, [x0, #704] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -str q12, [x0, #816] -sqrdmulh v12.4S, v8.4S, v29.s[0] -str q9, [x0, #880] -mul v8.4S, v8.4S,v30.s[0] -ldr q9, [x0, #0] -sub v15.4s, v19.4s, v2.4s -mla v11.4S, v14.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -str q20, [x0, #944] -sqrdmulh v20.4S, v10.4S, v29.s[0] -str q1, [x0, #1008] -mul v10.4S, v10.4S,v30.s[0] -ldr q1, [x0, #64] -sub v2.4s, v13.4s, v11.4s -mla v8.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v29.s[0] -ldr q12, [x0, #128] -mul v17.4S, v17.4S,v30.s[0] -sub v14.4s, v16.4s, v8.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v0.4S, v29.s[0] -ldr q20, [x0, #192] -mul v0.4S, v0.4S,v30.s[0] -sub v18.4s, v21.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -sub v11.4s, v9.4s, v17.4s -mla v0.4S, v8.4S, v31.s[0] -add v9.4s, v9.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v8.4s, v1.4s, v0.4s -mla v3.4S, v10.4S, v31.s[0] -add v1.4s, v1.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v10.4s, v12.4s, v3.4s -mla v22.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v17.4s, v20.4s, v22.4s -mla v16.4S, v0.4S, v31.s[0] -add v20.4s, v20.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -sub v0.4s, v12.4s, v16.4s -mla v21.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v3.4s, v20.4s, v21.4s -mla v19.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v22.4s, v9.4s, v19.4s -mla v13.4S, v16.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v29.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v16.4s, v1.4s, v13.4s -mla v14.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v13.4s -sqrdmulh v13.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v21.4s, v10.4s, v14.4s -mla v18.4S, v19.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v19.4s, v17.4s, v18.4s -mla v15.4S, v13.4S, v31.s[0] -add v17.4s, v17.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v13.4s, v11.4s, v15.4s -mla v2.4S, v14.4S, v31.s[0] -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v27.s[0] -mul v20.4S, v20.4S,v28.s[0] -sub v14.4s, v8.4s, v2.4s -mla v12.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v2.4s -sqrdmulh v2.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v18.4s, v9.4s, v12.4s -mla v20.4S, v15.4S, v31.s[0] -add v9.4s, v9.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -sub v15.4s, v1.4s, v20.4s -mla v0.4S, v2.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -sub v2.4s, v22.4s, v0.4s -mla v3.4S, v12.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v12.4s, v16.4s, v3.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v11.4s, v10.4s -mla v17.4S, v0.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -sqrdmulh v10.4S, v19.4S, v27.s[3] -mul v19.4S, v19.4S,v28.s[3] -sub v0.4s, v8.4s, v17.4s -mla v21.4S, v3.4S, v31.s[0] -add v8.4s, v8.4s, v17.4s -sqrdmulh v17.4S, v1.4S, v25.s[0] -mul v1.4S, v1.4S,v26.s[0] -sub v3.4s, v13.4s, v21.4s -mla v19.4S, v10.4S, v31.s[0] -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v15.4S, v25.s[1] -mul v15.4S, v15.4S,v26.s[1] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v17.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v17.4s, v9.4s, v1.4s -mla v15.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -sub v21.4s, v18.4s, v15.4s -mla v16.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -str q9, [x0, #0] -sqrdmulh v9.4S, v8.4S, v23.s[0] -str q17, [x0, #64] -mul v8.4S, v8.4S,v24.s[0] -ldr q17, [x0, #784] -sub v15.4s, v22.4s, v16.4s -ldr q19, [x0, #848] -mla v12.4S, v1.4S, v31.s[0] -add v22.4s, v22.4s, v16.4s -str q18, [x0, #128] -sqrdmulh v18.4S, v0.4S, v23.s[1] -str q21, [x0, #192] -mul v0.4S, v0.4S,v24.s[1] -ldr q21, [x0, #912] -sub v16.4s, v2.4s, v12.4s -ldr q1, [x0, #976] -mla v8.4S, v9.4S, v31.s[0] -add v2.4s, v2.4s, v12.4s -str q22, [x0, #256] -sqrdmulh v22.4S, v14.4S, v23.s[2] -str q15, [x0, #320] -mul v14.4S, v14.4S,v24.s[2] -ldr q15, [x0, #272] -sub v12.4s, v11.4s, v8.4s -ldr q9, [x0, #336] -mla v0.4S, v18.4S, v31.s[0] -add v11.4s, v11.4s, v8.4s -str q2, [x0, #384] -sqrdmulh v2.4S, v10.4S, v23.s[3] -str q16, [x0, #448] -mul v10.4S, v10.4S,v24.s[3] -ldr q16, [x0, #400] -sub v8.4s, v20.4s, v0.4s -ldr q18, [x0, #464] -mla v14.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v0.4s -str q11, [x0, #512] -sqrdmulh v11.4S, v17.4S, v29.s[0] -str q12, [x0, #576] -ldr q12, [x0, #528] -mul v17.4S, v17.4S,v30.s[0] -ldr q0, [x0, #592] -sub v22.4s, v13.4s, v14.4s -mla v10.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v14.4s -str q20, [x0, #640] -sqrdmulh v20.4S, v19.4S, v29.s[0] -str q8, [x0, #704] -ldr q8, [x0, #656] -mul v19.4S, v19.4S,v30.s[0] -ldr q14, [x0, #720] -sub v2.4s, v3.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v3.4s, v3.4s, v10.4s -str q13, [x0, #768] -sqrdmulh v13.4S, v21.4S, v29.s[0] -str q22, [x0, #832] -mul v21.4S, v21.4S,v30.s[0] -ldr q22, [x0, #16] -sub v10.4s, v15.4s, v17.4s -mla v19.4S, v20.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -str q3, [x0, #896] -sqrdmulh v3.4S, v1.4S, v29.s[0] -str q2, [x0, #960] -mul v1.4S, v1.4S,v30.s[0] -ldr q2, [x0, #80] -sub v17.4s, v9.4s, v19.4s -mla v21.4S, v13.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v12.4S, v29.s[0] -ldr q13, [x0, #144] -mul v12.4S, v12.4S,v30.s[0] -sub v20.4s, v16.4s, v21.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v0.4S, v29.s[0] -ldr q3, [x0, #208] -mul v0.4S, v0.4S,v30.s[0] -sub v11.4s, v18.4s, v1.4s -mla v12.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v1.4s -sqrdmulh v1.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v19.4s, v22.4s, v12.4s -mla v0.4S, v21.4S, v31.s[0] -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v2.4s, v0.4s -mla v8.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v1.4s, v13.4s, v8.4s -mla v14.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v12.4s, v3.4s, v14.4s -mla v16.4S, v0.4S, v31.s[0] -add v3.4s, v3.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -sub v0.4s, v13.4s, v16.4s -mla v18.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -sqrdmulh v16.4S, v9.4S, v29.s[1] -mul v9.4S, v9.4S,v30.s[1] -sub v8.4s, v3.4s, v18.4s -mla v15.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v14.4s, v22.4s, v15.4s -mla v9.4S, v16.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v16.4s, v2.4s, v9.4s -mla v20.4S, v18.4S, v31.s[0] -add v2.4s, v2.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v20.4s -mla v11.4S, v15.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v15.4s, v12.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v12.4s, v12.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v27.s[0] -mul v13.4S, v13.4S,v28.s[0] -sub v9.4s, v19.4s, v10.4s -mla v17.4S, v20.4S, v31.s[0] -add v19.4s, v19.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v27.s[0] -mul v3.4S, v3.4S,v28.s[0] -sub v20.4s, v21.4s, v17.4s -mla v13.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v11.4s, v22.4s, v13.4s -mla v3.4S, v10.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -sub v10.4s, v2.4s, v3.4s -mla v0.4S, v17.4S, v31.s[0] -add v2.4s, v2.4s, v3.4s -sqrdmulh v3.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v17.4s, v14.4s, v0.4s -mla v8.4S, v13.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[2] -mul v12.4S, v12.4S,v28.s[2] -sub v13.4s, v16.4s, v8.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v3.4s, v19.4s, v1.4s -mla v12.4S, v0.4S, v31.s[0] -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v0.4s, v21.4s, v12.4s -mla v18.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v10.4S, v25.s[1] -mul v10.4S, v10.4S,v26.s[1] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v12.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v12.4s, v22.4s, v2.4s -mla v10.4S, v18.4S, v31.s[0] -add v22.4s, v22.4s, v2.4s -sqrdmulh v2.4S, v13.4S, v25.s[3] -mul v13.4S, v13.4S,v26.s[3] -sub v18.4s, v11.4s, v10.4s -mla v16.4S, v15.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -str q22, [x0, #16] -sqrdmulh v22.4S, v21.4S, v23.s[0] -str q12, [x0, #80] -mul v21.4S, v21.4S,v24.s[0] -sub v12.4s, v14.4s, v16.4s -mla v13.4S, v2.4S, v31.s[0] -add v14.4s, v14.4s, v16.4s -str q11, [x0, #144] -sqrdmulh v11.4S, v0.4S, v23.s[1] -str q18, [x0, #208] -mul v0.4S, v0.4S,v24.s[1] -sub v18.4s, v17.4s, v13.4s -mla v21.4S, v22.4S, v31.s[0] -add v17.4s, v17.4s, v13.4s -str q14, [x0, #272] -sqrdmulh v14.4S, v20.4S, v23.s[2] -str q12, [x0, #336] -mul v20.4S, v20.4S,v24.s[2] -sub v12.4s, v19.4s, v21.4s -mla v0.4S, v11.4S, v31.s[0] -add v19.4s, v19.4s, v21.4s -str q17, [x0, #400] -sqrdmulh v17.4S, v1.4S, v23.s[3] -str q18, [x0, #464] -mul v1.4S, v1.4S,v24.s[3] -sub v18.4s, v3.4s, v0.4s -mla v20.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v0.4s -str q19, [x0, #528] -str q12, [x0, #592] -sub v12.4s, v9.4s, v20.4s -mla v1.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v20.4s -str q3, [x0, #656] -str q18, [x0, #720] -sub v18.4s, v8.4s, v1.4s -add v8.4s, v8.4s, v1.4s -str q9, [x0, #784] -str q12, [x0, #848] -str q8, [x0, #912] -str q18, [x0, #976] -ldr q4, [x17, #+128] -ldr q5, [x17, #+144] -ldr q6, [x0, #32] -sqrdmulh v7.4S, v6.4S, v5.s[0] -mul v6.4S, v6.4S,v4.s[0] -ldr q15, [x0, #48] -sqrdmulh v10.4S, v15.4S, v5.s[0] -mul v15.4S, v15.4S,v4.s[0] -ldr q2, [x17, #+160] -ldr q16, [x17, #+176] -ldr q22, [x0, #96] -sqrdmulh v13.4S, v22.4S, v16.s[0] -mul v22.4S, v22.4S,v2.s[0] -ldr q11, [x0, #112] -sqrdmulh v21.4S, v11.4S, v16.s[0] -mul v11.4S, v11.4S,v2.s[0] -ldr q14, [x0, #160] -ldr q0, [x17, #+192] -ldr q19, [x17, #+208] -mla v6.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v14.4S, v19.s[0] -ldr q17, [x0, #176] -mla v15.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v17.4S, v19.s[0] -ldr q20, [x0, #224] -ldr q3, [x17, #+224] -ldr q1, [x17, #+240] -mla v22.4S, v13.4S, v31.s[0] -sqrdmulh v13.4S, v20.4S, v1.s[0] -ldr q9, [x0, #240] -mla v11.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v9.4S, v1.s[0] -ldr q12, [x0, #128] -ldr q8, [x0, #0] -mul v14.4S, v14.4S,v0.s[0] -sub v18.4s, v8.4s, v6.4s -mul v17.4S, v17.4S,v0.s[0] -add v8.4s, v8.4s, v6.4s -ldr q6, [x0, #144] -ldr q30, [x0, #16] -mla v14.4S, v7.4S, v31.s[0] -sub v7.4s, v30.4s, v15.4s -mla v17.4S, v10.4S, v31.s[0] -add v30.4s, v30.4s, v15.4s -ldr q15, [x0, #192] -ldr q10, [x0, #64] -mul v20.4S, v20.4S,v3.s[0] -sub v29.4s, v10.4s, v22.4s -mul v9.4S, v9.4S,v3.s[0] -add v10.4s, v10.4s, v22.4s -ldr q22, [x0, #208] -ldr q28, [x0, #80] -mla v20.4S, v13.4S, v31.s[0] -sub v13.4s, v28.4s, v11.4s -mla v9.4S, v21.4S, v31.s[0] -add v28.4s, v28.4s, v11.4s -sqrdmulh v11.4S, v30.4S, v5.s[1] -mul v30.4S, v30.4S,v4.s[1] -sqrdmulh v21.4S, v7.4S, v5.s[2] -sub v27.4s, v12.4s, v14.4s -mul v7.4S, v7.4S,v4.s[2] -add v12.4s, v12.4s, v14.4s -sqrdmulh v5.4S, v28.4S, v16.s[1] -sub v4.4s, v6.4s, v17.4s -mul v28.4S, v28.4S,v2.s[1] -add v6.4s, v6.4s, v17.4s -sqrdmulh v17.4S, v13.4S, v16.s[2] -sub v14.4s, v15.4s, v20.4s -mul v13.4S, v13.4S,v2.s[2] -add v15.4s, v15.4s, v20.4s -mla v30.4S, v11.4S, v31.s[0] -sub v11.4s, v22.4s, v9.4s -sqrdmulh v16.4S, v6.4S, v19.s[1] -add v22.4s, v22.4s, v9.4s -mla v7.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v4.4S, v19.s[2] -mla v28.4S, v5.4S, v31.s[0] -sqrdmulh v5.4S, v22.4S, v1.s[1] -mla v13.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v11.4S, v1.s[2] -mul v6.4S, v6.4S,v0.s[1] -sub v9.4s, v8.4s, v30.4s -mul v4.4S, v4.4S,v0.s[2] -add v8.4s, v8.4s, v30.4s -str q9, [x0, #16] -str q8, [x0, #0] -mla v6.4S, v16.4S, v31.s[0] -sub v16.4s, v18.4s, v7.4s -mla v4.4S, v21.4S, v31.s[0] -add v18.4s, v18.4s, v7.4s -str q16, [x0, #48] -str q18, [x0, #32] -mul v22.4S, v22.4S,v3.s[1] -sub v19.4s, v10.4s, v28.4s -mul v11.4S, v11.4S,v3.s[2] -add v10.4s, v10.4s, v28.4s -str q19, [x0, #80] -str q10, [x0, #64] -mla v22.4S, v5.4S, v31.s[0] -sub v5.4s, v29.4s, v13.4s -mla v11.4S, v17.4S, v31.s[0] -add v29.4s, v29.4s, v13.4s -str q5, [x0, #112] -str q29, [x0, #96] -ldr q1, [x17, #+256] -ldr q3, [x17, #+272] -ldr q29, [x0, #288] -sqrdmulh v5.4S, v29.4S, v3.s[0] -sub v13.4s, v12.4s, v6.4s -str q13, [x0, #144] -mul v29.4S, v29.4S,v1.s[0] -add v12.4s, v12.4s, v6.4s -str q12, [x0, #128] -ldr q12, [x0, #304] -sqrdmulh v6.4S, v12.4S, v3.s[0] -sub v13.4s, v27.4s, v4.4s -mul v12.4S, v12.4S,v1.s[0] -add v27.4s, v27.4s, v4.4s -str q13, [x0, #176] -str q27, [x0, #160] -ldr q27, [x17, #+288] -ldr q13, [x17, #+304] -ldr q4, [x0, #352] -sqrdmulh v17.4S, v4.4S, v13.s[0] -sub v10.4s, v15.4s, v22.4s -mul v4.4S, v4.4S,v27.s[0] -add v15.4s, v15.4s, v22.4s -str q10, [x0, #208] -str q15, [x0, #192] -ldr q15, [x0, #368] -sqrdmulh v10.4S, v15.4S, v13.s[0] -sub v22.4s, v14.4s, v11.4s -mul v15.4S, v15.4S,v27.s[0] -add v14.4s, v14.4s, v11.4s -str q22, [x0, #240] -str q14, [x0, #224] -ldr q14, [x0, #416] -ldr q22, [x17, #+320] -ldr q11, [x17, #+336] -mla v29.4S, v5.4S, v31.s[0] -sqrdmulh v5.4S, v14.4S, v11.s[0] -ldr q19, [x0, #432] -mla v12.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v19.4S, v11.s[0] -ldr q28, [x0, #480] -ldr q0, [x17, #+352] -ldr q18, [x17, #+368] -mla v4.4S, v17.4S, v31.s[0] -sqrdmulh v17.4S, v28.4S, v18.s[0] -ldr q16, [x0, #496] -mla v15.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v16.4S, v18.s[0] -ldr q7, [x0, #384] -ldr q21, [x0, #256] -mul v14.4S, v14.4S,v22.s[0] -sub v8.4s, v21.4s, v29.4s -mul v19.4S, v19.4S,v22.s[0] -add v21.4s, v21.4s, v29.4s -ldr q29, [x0, #400] -ldr q9, [x0, #272] -mla v14.4S, v5.4S, v31.s[0] -sub v5.4s, v9.4s, v12.4s -mla v19.4S, v6.4S, v31.s[0] -add v9.4s, v9.4s, v12.4s -ldr q12, [x0, #448] -ldr q6, [x0, #320] -mul v28.4S, v28.4S,v0.s[0] -sub v30.4s, v6.4s, v4.4s -mul v16.4S, v16.4S,v0.s[0] -add v6.4s, v6.4s, v4.4s -ldr q4, [x0, #464] -ldr q2, [x0, #336] -mla v28.4S, v17.4S, v31.s[0] -sub v17.4s, v2.4s, v15.4s -mla v16.4S, v10.4S, v31.s[0] -add v2.4s, v2.4s, v15.4s -sqrdmulh v15.4S, v9.4S, v3.s[1] -mul v9.4S, v9.4S,v1.s[1] -sqrdmulh v10.4S, v5.4S, v3.s[2] -sub v20.4s, v7.4s, v14.4s -mul v5.4S, v5.4S,v1.s[2] -add v7.4s, v7.4s, v14.4s -sqrdmulh v3.4S, v2.4S, v13.s[1] -sub v1.4s, v29.4s, v19.4s -mul v2.4S, v2.4S,v27.s[1] -add v29.4s, v29.4s, v19.4s -sqrdmulh v19.4S, v17.4S, v13.s[2] -sub v14.4s, v12.4s, v28.4s -mul v17.4S, v17.4S,v27.s[2] -add v12.4s, v12.4s, v28.4s -mla v9.4S, v15.4S, v31.s[0] -sub v15.4s, v4.4s, v16.4s -sqrdmulh v13.4S, v29.4S, v11.s[1] -add v4.4s, v4.4s, v16.4s -mla v5.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v1.4S, v11.s[2] -mla v2.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v4.4S, v18.s[1] -mla v17.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v15.4S, v18.s[2] -mul v29.4S, v29.4S,v22.s[1] -sub v16.4s, v21.4s, v9.4s -mul v1.4S, v1.4S,v22.s[2] -add v21.4s, v21.4s, v9.4s -str q16, [x0, #272] -str q21, [x0, #256] -mla v29.4S, v13.4S, v31.s[0] -sub v13.4s, v8.4s, v5.4s -mla v1.4S, v10.4S, v31.s[0] -add v8.4s, v8.4s, v5.4s -str q13, [x0, #304] -str q8, [x0, #288] -mul v4.4S, v4.4S,v0.s[1] -sub v11.4s, v6.4s, v2.4s -mul v15.4S, v15.4S,v0.s[2] -add v6.4s, v6.4s, v2.4s -str q11, [x0, #336] -str q6, [x0, #320] -mla v4.4S, v3.4S, v31.s[0] -sub v3.4s, v30.4s, v17.4s -mla v15.4S, v19.4S, v31.s[0] -add v30.4s, v30.4s, v17.4s -str q3, [x0, #368] -str q30, [x0, #352] -ldr q18, [x17, #+384] -ldr q0, [x17, #+400] -ldr q30, [x0, #544] -sqrdmulh v3.4S, v30.4S, v0.s[0] -sub v17.4s, v7.4s, v29.4s -str q17, [x0, #400] -mul v30.4S, v30.4S,v18.s[0] -add v7.4s, v7.4s, v29.4s -str q7, [x0, #384] -ldr q7, [x0, #560] -sqrdmulh v29.4S, v7.4S, v0.s[0] -sub v17.4s, v20.4s, v1.4s -mul v7.4S, v7.4S,v18.s[0] -add v20.4s, v20.4s, v1.4s -str q17, [x0, #432] -str q20, [x0, #416] -ldr q20, [x17, #+416] -ldr q17, [x17, #+432] -ldr q1, [x0, #608] -sqrdmulh v19.4S, v1.4S, v17.s[0] -sub v6.4s, v12.4s, v4.4s -mul v1.4S, v1.4S,v20.s[0] -add v12.4s, v12.4s, v4.4s -str q6, [x0, #464] -str q12, [x0, #448] -ldr q12, [x0, #624] -sqrdmulh v6.4S, v12.4S, v17.s[0] -sub v4.4s, v14.4s, v15.4s -mul v12.4S, v12.4S,v20.s[0] -add v14.4s, v14.4s, v15.4s -str q4, [x0, #496] -str q14, [x0, #480] -ldr q14, [x0, #672] -ldr q4, [x17, #+448] -ldr q15, [x17, #+464] -mla v30.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v14.4S, v15.s[0] -ldr q11, [x0, #688] -mla v7.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v11.4S, v15.s[0] -ldr q2, [x0, #736] -ldr q22, [x17, #+480] -ldr q8, [x17, #+496] -mla v1.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v2.4S, v8.s[0] -ldr q13, [x0, #752] -mla v12.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v13.4S, v8.s[0] -ldr q5, [x0, #640] -ldr q10, [x0, #512] -mul v14.4S, v14.4S,v4.s[0] -sub v21.4s, v10.4s, v30.4s -mul v11.4S, v11.4S,v4.s[0] -add v10.4s, v10.4s, v30.4s -ldr q30, [x0, #656] -ldr q16, [x0, #528] -mla v14.4S, v3.4S, v31.s[0] -sub v3.4s, v16.4s, v7.4s -mla v11.4S, v29.4S, v31.s[0] -add v16.4s, v16.4s, v7.4s -ldr q7, [x0, #704] -ldr q29, [x0, #576] -mul v2.4S, v2.4S,v22.s[0] -sub v9.4s, v29.4s, v1.4s -mul v13.4S, v13.4S,v22.s[0] -add v29.4s, v29.4s, v1.4s -ldr q1, [x0, #720] -ldr q27, [x0, #592] -mla v2.4S, v19.4S, v31.s[0] -sub v19.4s, v27.4s, v12.4s -mla v13.4S, v6.4S, v31.s[0] -add v27.4s, v27.4s, v12.4s -sqrdmulh v12.4S, v16.4S, v0.s[1] -mul v16.4S, v16.4S,v18.s[1] -sqrdmulh v6.4S, v3.4S, v0.s[2] -sub v28.4s, v5.4s, v14.4s -mul v3.4S, v3.4S,v18.s[2] -add v5.4s, v5.4s, v14.4s -sqrdmulh v0.4S, v27.4S, v17.s[1] -sub v18.4s, v30.4s, v11.4s -mul v27.4S, v27.4S,v20.s[1] -add v30.4s, v30.4s, v11.4s -sqrdmulh v11.4S, v19.4S, v17.s[2] -sub v14.4s, v7.4s, v2.4s -mul v19.4S, v19.4S,v20.s[2] -add v7.4s, v7.4s, v2.4s -mla v16.4S, v12.4S, v31.s[0] -sub v12.4s, v1.4s, v13.4s -sqrdmulh v17.4S, v30.4S, v15.s[1] -add v1.4s, v1.4s, v13.4s -mla v3.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v18.4S, v15.s[2] -mla v27.4S, v0.4S, v31.s[0] -sqrdmulh v0.4S, v1.4S, v8.s[1] -mla v19.4S, v11.4S, v31.s[0] -sqrdmulh v11.4S, v12.4S, v8.s[2] -mul v30.4S, v30.4S,v4.s[1] -sub v13.4s, v10.4s, v16.4s -mul v18.4S, v18.4S,v4.s[2] -add v10.4s, v10.4s, v16.4s -str q13, [x0, #528] -str q10, [x0, #512] -mla v30.4S, v17.4S, v31.s[0] -sub v17.4s, v21.4s, v3.4s -mla v18.4S, v6.4S, v31.s[0] -add v21.4s, v21.4s, v3.4s -str q17, [x0, #560] -str q21, [x0, #544] -mul v1.4S, v1.4S,v22.s[1] -sub v15.4s, v29.4s, v27.4s -mul v12.4S, v12.4S,v22.s[2] -add v29.4s, v29.4s, v27.4s -str q15, [x0, #592] -str q29, [x0, #576] -mla v1.4S, v0.4S, v31.s[0] -sub v0.4s, v9.4s, v19.4s -mla v12.4S, v11.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -str q0, [x0, #624] -str q9, [x0, #608] -ldr q8, [x17, #+512] -ldr q22, [x17, #+528] -ldr q9, [x0, #800] -sqrdmulh v0.4S, v9.4S, v22.s[0] -sub v19.4s, v5.4s, v30.4s -str q19, [x0, #656] -mul v9.4S, v9.4S,v8.s[0] -add v5.4s, v5.4s, v30.4s -str q5, [x0, #640] -ldr q5, [x0, #816] -sqrdmulh v30.4S, v5.4S, v22.s[0] -sub v19.4s, v28.4s, v18.4s -mul v5.4S, v5.4S,v8.s[0] -add v28.4s, v28.4s, v18.4s -str q19, [x0, #688] -str q28, [x0, #672] -ldr q28, [x17, #+544] -ldr q19, [x17, #+560] -ldr q18, [x0, #864] -sqrdmulh v11.4S, v18.4S, v19.s[0] -sub v29.4s, v7.4s, v1.4s -mul v18.4S, v18.4S,v28.s[0] -add v7.4s, v7.4s, v1.4s -str q29, [x0, #720] -str q7, [x0, #704] -ldr q7, [x0, #880] -sqrdmulh v29.4S, v7.4S, v19.s[0] -sub v1.4s, v14.4s, v12.4s -mul v7.4S, v7.4S,v28.s[0] -add v14.4s, v14.4s, v12.4s -str q1, [x0, #752] -str q14, [x0, #736] -ldr q14, [x0, #928] -ldr q1, [x17, #+576] -ldr q12, [x17, #+592] -mla v9.4S, v0.4S, v31.s[0] -sqrdmulh v0.4S, v14.4S, v12.s[0] -ldr q15, [x0, #944] -mla v5.4S, v30.4S, v31.s[0] -sqrdmulh v30.4S, v15.4S, v12.s[0] -ldr q27, [x0, #992] -ldr q4, [x17, #+608] -ldr q21, [x17, #+624] -mla v18.4S, v11.4S, v31.s[0] -sqrdmulh v11.4S, v27.4S, v21.s[0] -ldr q17, [x0, #1008] -mla v7.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v17.4S, v21.s[0] -ldr q3, [x0, #896] -ldr q6, [x0, #768] -mul v14.4S, v14.4S,v1.s[0] -sub v10.4s, v6.4s, v9.4s -mul v15.4S, v15.4S,v1.s[0] -add v6.4s, v6.4s, v9.4s -ldr q9, [x0, #912] -ldr q13, [x0, #784] -mla v14.4S, v0.4S, v31.s[0] -sub v0.4s, v13.4s, v5.4s -mla v15.4S, v30.4S, v31.s[0] -add v13.4s, v13.4s, v5.4s -ldr q5, [x0, #960] -ldr q30, [x0, #832] -mul v27.4S, v27.4S,v4.s[0] -sub v16.4s, v30.4s, v18.4s -mul v17.4S, v17.4S,v4.s[0] -add v30.4s, v30.4s, v18.4s -ldr q18, [x0, #976] -ldr q20, [x0, #848] -mla v27.4S, v11.4S, v31.s[0] -sub v11.4s, v20.4s, v7.4s -mla v17.4S, v29.4S, v31.s[0] -add v20.4s, v20.4s, v7.4s -sqrdmulh v7.4S, v13.4S, v22.s[1] -mul v13.4S, v13.4S,v8.s[1] -sqrdmulh v29.4S, v0.4S, v22.s[2] -sub v2.4s, v3.4s, v14.4s -mul v0.4S, v0.4S,v8.s[2] -add v3.4s, v3.4s, v14.4s -sqrdmulh v22.4S, v20.4S, v19.s[1] -sub v8.4s, v9.4s, v15.4s -mul v20.4S, v20.4S,v28.s[1] -add v9.4s, v9.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v19.s[2] -sub v14.4s, v5.4s, v27.4s -mul v11.4S, v11.4S,v28.s[2] -add v5.4s, v5.4s, v27.4s -mla v13.4S, v7.4S, v31.s[0] -sub v7.4s, v18.4s, v17.4s -sqrdmulh v19.4S, v9.4S, v12.s[1] -add v18.4s, v18.4s, v17.4s -mla v0.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v8.4S, v12.s[2] -mla v20.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v18.4S, v21.s[1] -mla v11.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v7.4S, v21.s[2] -mul v9.4S, v9.4S,v1.s[1] -sub v17.4s, v6.4s, v13.4s -mul v8.4S, v8.4S,v1.s[2] -add v6.4s, v6.4s, v13.4s -str q17, [x0, #784] -str q6, [x0, #768] -mla v9.4S, v19.4S, v31.s[0] -sub v19.4s, v10.4s, v0.4s -mla v8.4S, v29.4S, v31.s[0] -add v10.4s, v10.4s, v0.4s -str q19, [x0, #816] -str q10, [x0, #800] -mul v18.4S, v18.4S,v4.s[1] -sub v12.4s, v30.4s, v20.4s -mul v7.4S, v7.4S,v4.s[2] -add v30.4s, v30.4s, v20.4s -str q12, [x0, #848] -str q30, [x0, #832] -mla v18.4S, v22.4S, v31.s[0] -sub v22.4s, v16.4s, v11.4s -mla v7.4S, v15.4S, v31.s[0] -add v16.4s, v16.4s, v11.4s -str q22, [x0, #880] -str q16, [x0, #864] -sub v21.4s, v3.4s, v9.4s -str q21, [x0, #912] -add v3.4s, v3.4s, v9.4s -str q3, [x0, #896] -sub v3.4s, v2.4s, v8.4s -add v2.4s, v2.4s, v8.4s -str q3, [x0, #944] -str q2, [x0, #928] -sub v2.4s, v5.4s, v18.4s -add v5.4s, v5.4s, v18.4s -str q2, [x0, #976] -str q5, [x0, #960] -sub v5.4s, v14.4s, v7.4s -add v14.4s, v14.4s, v7.4s -str q5, [x0, #1008] -str q14, [x0, #992] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1464 -// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_5.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_5.s deleted file mode 100644 index d9c302a..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_3_z4_5.s +++ /dev/null @@ -1,1494 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_3_z4_5 -.global _ntt_u32_incomplete_neon_asm_var_4_2_3_z4_5 -ntt_u32_incomplete_neon_asm_var_4_2_3_z4_5: -_ntt_u32_incomplete_neon_asm_var_4_2_3_z4_5: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #800] -ldr q21, [x0, #864] -ldr q20, [x0, #928] -ldr q19, [x0, #992] -ldr q18, [x0, #288] -ldr q17, [x0, #352] -ldr q16, [x0, #416] -ldr q3, [x0, #480] -sqrdmulh v2.4S, v22.4S, v29.s[0] -ldr q1, [x0, #544] -mul v22.4S, v22.4S,v30.s[0] -ldr q0, [x0, #608] -sqrdmulh v15.4S, v21.4S, v29.s[0] -ldr q14, [x0, #672] -mul v21.4S, v21.4S,v30.s[0] -ldr q13, [x0, #736] -mla v22.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q12, [x0, #32] -sub v11.4s, v18.4s, v22.4s -mla v21.4S, v15.4S, v31.s[0] -add v18.4s, v18.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q15, [x0, #96] -sub v10.4s, v17.4s, v21.4s -mla v20.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v1.4S, v29.s[0] -ldr q2, [x0, #160] -mul v1.4S, v1.4S,v30.s[0] -sub v9.4s, v16.4s, v20.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v0.4S, v29.s[0] -ldr q22, [x0, #224] -mul v0.4S, v0.4S,v30.s[0] -sub v8.4s, v3.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v12.4s, v1.4s -mla v0.4S, v20.4S, v31.s[0] -add v12.4s, v12.4s, v1.4s -sqrdmulh v1.4S, v13.4S, v29.s[0] -mul v13.4S, v13.4S,v30.s[0] -sub v20.4s, v15.4s, v0.4s -mla v14.4S, v19.4S, v31.s[0] -add v15.4s, v15.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v19.4s, v2.4s, v14.4s -mla v13.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v1.4s, v22.4s, v13.4s -mla v16.4S, v0.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v0.4s, v2.4s, v16.4s -mla v3.4S, v14.4S, v31.s[0] -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v14.4s, v22.4s, v3.4s -mla v18.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v13.4s, v12.4s, v18.4s -mla v17.4S, v16.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v8.4S, v29.s[2] -mul v8.4S, v8.4S,v30.s[2] -sub v16.4s, v15.4s, v17.4s -mla v9.4S, v3.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v3.4s, v19.4s, v9.4s -mla v8.4S, v18.4S, v31.s[0] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v8.4s -mla v11.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v8.4s -sqrdmulh v8.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v17.4s, v21.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v21.4s, v21.4s, v11.4s -sqrdmulh v11.4S, v22.4S, v27.s[0] -mul v22.4S, v22.4S,v28.s[0] -sub v9.4s, v20.4s, v10.4s -mla v2.4S, v8.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -sqrdmulh v10.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v8.4s, v12.4s, v2.4s -mla v22.4S, v11.4S, v31.s[0] -add v12.4s, v12.4s, v2.4s -sqrdmulh v2.4S, v14.4S, v27.s[1] -mul v14.4S, v14.4S,v28.s[1] -sub v11.4s, v15.4s, v22.4s -mla v0.4S, v10.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v27.s[2] -mul v19.4S, v19.4S,v28.s[2] -sub v10.4s, v13.4s, v0.4s -mla v14.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v0.4s -sqrdmulh v0.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v2.4s, v16.4s, v14.4s -mla v19.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v3.4S, v27.s[3] -mul v3.4S, v3.4S,v28.s[3] -sub v22.4s, v21.4s, v19.4s -mla v1.4S, v0.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v0.4s, v20.4s, v1.4s -mla v3.4S, v14.4S, v31.s[0] -add v20.4s, v20.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v25.s[0] -mul v15.4S, v15.4S,v26.s[0] -sub v14.4s, v17.4s, v3.4s -mla v18.4S, v19.4S, v31.s[0] -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v11.4S, v25.s[1] -mul v11.4S, v11.4S,v26.s[1] -sub v19.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v1.4s, v12.4s, v15.4s -mla v11.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v25.s[3] -mul v2.4S, v2.4S,v26.s[3] -sub v3.4s, v8.4s, v11.4s -mla v16.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v11.4s -str q12, [x0, #32] -sqrdmulh v12.4S, v20.4S, v23.s[0] -str q1, [x0, #96] -mul v20.4S, v20.4S,v24.s[0] -ldr q1, [x0, #816] -sub v11.4s, v13.4s, v16.4s -ldr q18, [x0, #880] -mla v2.4S, v15.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -str q8, [x0, #160] -sqrdmulh v8.4S, v0.4S, v23.s[1] -str q3, [x0, #224] -mul v0.4S, v0.4S,v24.s[1] -ldr q3, [x0, #944] -sub v16.4s, v10.4s, v2.4s -ldr q15, [x0, #1008] -mla v20.4S, v12.4S, v31.s[0] -add v10.4s, v10.4s, v2.4s -str q13, [x0, #288] -sqrdmulh v13.4S, v9.4S, v23.s[2] -str q11, [x0, #352] -mul v9.4S, v9.4S,v24.s[2] -ldr q11, [x0, #304] -sub v2.4s, v21.4s, v20.4s -ldr q12, [x0, #368] -mla v0.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v20.4s -str q10, [x0, #416] -sqrdmulh v10.4S, v19.4S, v23.s[3] -str q16, [x0, #480] -mul v19.4S, v19.4S,v24.s[3] -ldr q16, [x0, #432] -sub v20.4s, v22.4s, v0.4s -ldr q8, [x0, #496] -mla v9.4S, v13.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -str q21, [x0, #544] -sqrdmulh v21.4S, v1.4S, v29.s[0] -str q2, [x0, #608] -ldr q2, [x0, #560] -mul v1.4S, v1.4S,v30.s[0] -ldr q0, [x0, #624] -sub v13.4s, v17.4s, v9.4s -mla v19.4S, v10.4S, v31.s[0] -add v17.4s, v17.4s, v9.4s -str q22, [x0, #672] -sqrdmulh v22.4S, v18.4S, v29.s[0] -str q20, [x0, #736] -ldr q20, [x0, #688] -mul v18.4S, v18.4S,v30.s[0] -ldr q9, [x0, #752] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v21.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -str q17, [x0, #800] -sqrdmulh v17.4S, v3.4S, v29.s[0] -str q13, [x0, #864] -mul v3.4S, v3.4S,v30.s[0] -ldr q13, [x0, #48] -sub v19.4s, v11.4s, v1.4s -mla v18.4S, v22.4S, v31.s[0] -add v11.4s, v11.4s, v1.4s -str q14, [x0, #928] -sqrdmulh v14.4S, v15.4S, v29.s[0] -str q10, [x0, #992] -mul v15.4S, v15.4S,v30.s[0] -ldr q10, [x0, #112] -sub v1.4s, v12.4s, v18.4s -mla v3.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v29.s[0] -ldr q17, [x0, #176] -mul v2.4S, v2.4S,v30.s[0] -sub v22.4s, v16.4s, v3.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v0.4S, v29.s[0] -ldr q14, [x0, #240] -mul v0.4S, v0.4S,v30.s[0] -sub v21.4s, v8.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -sub v18.4s, v13.4s, v2.4s -mla v0.4S, v3.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v9.4S, v29.s[0] -mul v9.4S, v9.4S,v30.s[0] -sub v3.4s, v10.4s, v0.4s -mla v20.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v15.4s, v17.4s, v20.4s -mla v9.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -sub v2.4s, v14.4s, v9.4s -mla v16.4S, v0.4S, v31.s[0] -add v14.4s, v14.4s, v9.4s -sqrdmulh v9.4S, v11.4S, v29.s[1] -mul v11.4S, v11.4S,v30.s[1] -sub v0.4s, v17.4s, v16.4s -mla v8.4S, v20.4S, v31.s[0] -add v17.4s, v17.4s, v16.4s -sqrdmulh v16.4S, v12.4S, v29.s[1] -mul v12.4S, v12.4S,v30.s[1] -sub v20.4s, v14.4s, v8.4s -mla v11.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v8.4s -sqrdmulh v8.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v9.4s, v13.4s, v11.4s -mla v12.4S, v16.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v16.4s, v10.4s, v12.4s -mla v22.4S, v8.4S, v31.s[0] -add v10.4s, v10.4s, v12.4s -sqrdmulh v12.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v8.4s, v15.4s, v22.4s -mla v21.4S, v11.4S, v31.s[0] -add v15.4s, v15.4s, v22.4s -sqrdmulh v22.4S, v1.4S, v29.s[2] -mul v1.4S, v1.4S,v30.s[2] -sub v11.4s, v2.4s, v21.4s -mla v19.4S, v12.4S, v31.s[0] -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[0] -mul v17.4S, v17.4S,v28.s[0] -sub v12.4s, v18.4s, v19.4s -mla v1.4S, v22.4S, v31.s[0] -add v18.4s, v18.4s, v19.4s -sqrdmulh v19.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v22.4s, v3.4s, v1.4s -mla v17.4S, v21.4S, v31.s[0] -add v3.4s, v3.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v21.4s, v13.4s, v17.4s -mla v14.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v17.4s -sqrdmulh v17.4S, v20.4S, v27.s[1] -mul v20.4S, v20.4S,v28.s[1] -sub v19.4s, v10.4s, v14.4s -mla v0.4S, v1.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v27.s[2] -mul v15.4S, v15.4S,v28.s[2] -sub v1.4s, v9.4s, v0.4s -mla v20.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v0.4s -sqrdmulh v0.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v17.4s, v16.4s, v20.4s -mla v15.4S, v14.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v14.4s, v18.4s, v15.4s -mla v2.4S, v0.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v27.s[3] -mul v11.4S, v11.4S,v28.s[3] -sub v0.4s, v3.4s, v2.4s -mla v8.4S, v20.4S, v31.s[0] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v10.4S, v25.s[0] -mul v10.4S, v10.4S,v26.s[0] -sub v20.4s, v12.4s, v8.4s -mla v11.4S, v15.4S, v31.s[0] -add v12.4s, v12.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v15.4s, v22.4s, v11.4s -mla v10.4S, v2.4S, v31.s[0] -add v22.4s, v22.4s, v11.4s -sqrdmulh v11.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v2.4s, v13.4s, v10.4s -mla v19.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v10.4s -sqrdmulh v10.4S, v17.4S, v25.s[3] -mul v17.4S, v17.4S,v26.s[3] -sub v8.4s, v21.4s, v19.4s -mla v16.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v19.4s -str q13, [x0, #48] -sqrdmulh v13.4S, v3.4S, v23.s[0] -str q2, [x0, #112] -mul v3.4S, v3.4S,v24.s[0] -ldr q2, [x0, #768] -sub v19.4s, v9.4s, v16.4s -ldr q11, [x0, #832] -mla v17.4S, v10.4S, v31.s[0] -add v9.4s, v9.4s, v16.4s -str q21, [x0, #176] -sqrdmulh v21.4S, v0.4S, v23.s[1] -str q8, [x0, #240] -mul v0.4S, v0.4S,v24.s[1] -ldr q8, [x0, #896] -sub v16.4s, v1.4s, v17.4s -ldr q10, [x0, #960] -mla v3.4S, v13.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -str q9, [x0, #304] -sqrdmulh v9.4S, v22.4S, v23.s[2] -str q19, [x0, #368] -mul v22.4S, v22.4S,v24.s[2] -ldr q19, [x0, #256] -sub v17.4s, v18.4s, v3.4s -ldr q13, [x0, #320] -mla v0.4S, v21.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -str q1, [x0, #432] -sqrdmulh v1.4S, v15.4S, v23.s[3] -str q16, [x0, #496] -mul v15.4S, v15.4S,v24.s[3] -ldr q16, [x0, #384] -sub v3.4s, v14.4s, v0.4s -ldr q21, [x0, #448] -mla v22.4S, v9.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -str q18, [x0, #560] -sqrdmulh v18.4S, v2.4S, v29.s[0] -str q17, [x0, #624] -ldr q17, [x0, #512] -mul v2.4S, v2.4S,v30.s[0] -ldr q0, [x0, #576] -sub v9.4s, v12.4s, v22.4s -mla v15.4S, v1.4S, v31.s[0] -add v12.4s, v12.4s, v22.4s -str q14, [x0, #688] -sqrdmulh v14.4S, v11.4S, v29.s[0] -str q3, [x0, #752] -ldr q3, [x0, #640] -mul v11.4S, v11.4S,v30.s[0] -ldr q22, [x0, #704] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v18.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -str q12, [x0, #816] -sqrdmulh v12.4S, v8.4S, v29.s[0] -str q9, [x0, #880] -mul v8.4S, v8.4S,v30.s[0] -ldr q9, [x0, #0] -sub v15.4s, v19.4s, v2.4s -mla v11.4S, v14.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -str q20, [x0, #944] -sqrdmulh v20.4S, v10.4S, v29.s[0] -str q1, [x0, #1008] -mul v10.4S, v10.4S,v30.s[0] -ldr q1, [x0, #64] -sub v2.4s, v13.4s, v11.4s -mla v8.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v11.4s -sqrdmulh v11.4S, v17.4S, v29.s[0] -ldr q12, [x0, #128] -mul v17.4S, v17.4S,v30.s[0] -sub v14.4s, v16.4s, v8.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v0.4S, v29.s[0] -ldr q20, [x0, #192] -mul v0.4S, v0.4S,v30.s[0] -sub v18.4s, v21.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -sub v11.4s, v9.4s, v17.4s -mla v0.4S, v8.4S, v31.s[0] -add v9.4s, v9.4s, v17.4s -sqrdmulh v17.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v8.4s, v1.4s, v0.4s -mla v3.4S, v10.4S, v31.s[0] -add v1.4s, v1.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v10.4s, v12.4s, v3.4s -mla v22.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v17.4s, v20.4s, v22.4s -mla v16.4S, v0.4S, v31.s[0] -add v20.4s, v20.4s, v22.4s -sqrdmulh v22.4S, v19.4S, v29.s[1] -mul v19.4S, v19.4S,v30.s[1] -sub v0.4s, v12.4s, v16.4s -mla v21.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v3.4s, v20.4s, v21.4s -mla v19.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v22.4s, v9.4s, v19.4s -mla v13.4S, v16.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v18.4S, v29.s[2] -mul v18.4S, v18.4S,v30.s[2] -sub v16.4s, v1.4s, v13.4s -mla v14.4S, v21.4S, v31.s[0] -add v1.4s, v1.4s, v13.4s -sqrdmulh v13.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v21.4s, v10.4s, v14.4s -mla v18.4S, v19.4S, v31.s[0] -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v19.4s, v17.4s, v18.4s -mla v15.4S, v13.4S, v31.s[0] -add v17.4s, v17.4s, v18.4s -sqrdmulh v18.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v13.4s, v11.4s, v15.4s -mla v2.4S, v14.4S, v31.s[0] -add v11.4s, v11.4s, v15.4s -sqrdmulh v15.4S, v20.4S, v27.s[0] -mul v20.4S, v20.4S,v28.s[0] -sub v14.4s, v8.4s, v2.4s -mla v12.4S, v18.4S, v31.s[0] -add v8.4s, v8.4s, v2.4s -sqrdmulh v2.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v18.4s, v9.4s, v12.4s -mla v20.4S, v15.4S, v31.s[0] -add v9.4s, v9.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v27.s[1] -mul v3.4S, v3.4S,v28.s[1] -sub v15.4s, v1.4s, v20.4s -mla v0.4S, v2.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -sub v2.4s, v22.4s, v0.4s -mla v3.4S, v12.4S, v31.s[0] -add v22.4s, v22.4s, v0.4s -sqrdmulh v0.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v12.4s, v16.4s, v3.4s -mla v10.4S, v20.4S, v31.s[0] -add v16.4s, v16.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v11.4s, v10.4s -mla v17.4S, v0.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -sqrdmulh v10.4S, v19.4S, v27.s[3] -mul v19.4S, v19.4S,v28.s[3] -sub v0.4s, v8.4s, v17.4s -mla v21.4S, v3.4S, v31.s[0] -add v8.4s, v8.4s, v17.4s -sqrdmulh v17.4S, v1.4S, v25.s[0] -mul v1.4S, v1.4S,v26.s[0] -sub v3.4s, v13.4s, v21.4s -mla v19.4S, v10.4S, v31.s[0] -add v13.4s, v13.4s, v21.4s -sqrdmulh v21.4S, v15.4S, v25.s[1] -mul v15.4S, v15.4S,v26.s[1] -sub v10.4s, v14.4s, v19.4s -mla v1.4S, v17.4S, v31.s[0] -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v17.4s, v9.4s, v1.4s -mla v15.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -sub v21.4s, v18.4s, v15.4s -mla v16.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -str q9, [x0, #0] -sqrdmulh v9.4S, v8.4S, v23.s[0] -str q17, [x0, #64] -mul v8.4S, v8.4S,v24.s[0] -ldr q17, [x0, #784] -sub v15.4s, v22.4s, v16.4s -ldr q19, [x0, #848] -mla v12.4S, v1.4S, v31.s[0] -add v22.4s, v22.4s, v16.4s -str q18, [x0, #128] -sqrdmulh v18.4S, v0.4S, v23.s[1] -str q21, [x0, #192] -mul v0.4S, v0.4S,v24.s[1] -ldr q21, [x0, #912] -sub v16.4s, v2.4s, v12.4s -ldr q1, [x0, #976] -mla v8.4S, v9.4S, v31.s[0] -add v2.4s, v2.4s, v12.4s -str q22, [x0, #256] -sqrdmulh v22.4S, v14.4S, v23.s[2] -str q15, [x0, #320] -mul v14.4S, v14.4S,v24.s[2] -ldr q15, [x0, #272] -sub v12.4s, v11.4s, v8.4s -ldr q9, [x0, #336] -mla v0.4S, v18.4S, v31.s[0] -add v11.4s, v11.4s, v8.4s -str q2, [x0, #384] -sqrdmulh v2.4S, v10.4S, v23.s[3] -str q16, [x0, #448] -mul v10.4S, v10.4S,v24.s[3] -ldr q16, [x0, #400] -sub v8.4s, v20.4s, v0.4s -ldr q18, [x0, #464] -mla v14.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v0.4s -str q11, [x0, #512] -sqrdmulh v11.4S, v17.4S, v29.s[0] -str q12, [x0, #576] -ldr q12, [x0, #528] -mul v17.4S, v17.4S,v30.s[0] -ldr q0, [x0, #592] -sub v22.4s, v13.4s, v14.4s -mla v10.4S, v2.4S, v31.s[0] -add v13.4s, v13.4s, v14.4s -str q20, [x0, #640] -sqrdmulh v20.4S, v19.4S, v29.s[0] -str q8, [x0, #704] -ldr q8, [x0, #656] -mul v19.4S, v19.4S,v30.s[0] -ldr q14, [x0, #720] -sub v2.4s, v3.4s, v10.4s -mla v17.4S, v11.4S, v31.s[0] -add v3.4s, v3.4s, v10.4s -str q13, [x0, #768] -sqrdmulh v13.4S, v21.4S, v29.s[0] -str q22, [x0, #832] -mul v21.4S, v21.4S,v30.s[0] -ldr q22, [x0, #16] -sub v10.4s, v15.4s, v17.4s -mla v19.4S, v20.4S, v31.s[0] -add v15.4s, v15.4s, v17.4s -str q3, [x0, #896] -sqrdmulh v3.4S, v1.4S, v29.s[0] -str q2, [x0, #960] -mul v1.4S, v1.4S,v30.s[0] -ldr q2, [x0, #80] -sub v17.4s, v9.4s, v19.4s -mla v21.4S, v13.4S, v31.s[0] -add v9.4s, v9.4s, v19.4s -sqrdmulh v19.4S, v12.4S, v29.s[0] -ldr q13, [x0, #144] -mul v12.4S, v12.4S,v30.s[0] -sub v20.4s, v16.4s, v21.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v0.4S, v29.s[0] -ldr q3, [x0, #208] -mul v0.4S, v0.4S,v30.s[0] -sub v11.4s, v18.4s, v1.4s -mla v12.4S, v19.4S, v31.s[0] -add v18.4s, v18.4s, v1.4s -sqrdmulh v1.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v19.4s, v22.4s, v12.4s -mla v0.4S, v21.4S, v31.s[0] -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v21.4s, v2.4s, v0.4s -mla v8.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[1] -mul v16.4S, v16.4S,v30.s[1] -sub v1.4s, v13.4s, v8.4s -mla v14.4S, v12.4S, v31.s[0] -add v13.4s, v13.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v12.4s, v3.4s, v14.4s -mla v16.4S, v0.4S, v31.s[0] -add v3.4s, v3.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -sub v0.4s, v13.4s, v16.4s -mla v18.4S, v8.4S, v31.s[0] -add v13.4s, v13.4s, v16.4s -sqrdmulh v16.4S, v9.4S, v29.s[1] -mul v9.4S, v9.4S,v30.s[1] -sub v8.4s, v3.4s, v18.4s -mla v15.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v20.4S, v29.s[2] -mul v20.4S, v20.4S,v30.s[2] -sub v14.4s, v22.4s, v15.4s -mla v9.4S, v16.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -sqrdmulh v15.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -sub v16.4s, v2.4s, v9.4s -mla v20.4S, v18.4S, v31.s[0] -add v2.4s, v2.4s, v9.4s -sqrdmulh v9.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v18.4s, v1.4s, v20.4s -mla v11.4S, v15.4S, v31.s[0] -add v1.4s, v1.4s, v20.4s -sqrdmulh v20.4S, v17.4S, v29.s[2] -mul v17.4S, v17.4S,v30.s[2] -sub v15.4s, v12.4s, v11.4s -mla v10.4S, v9.4S, v31.s[0] -add v12.4s, v12.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v27.s[0] -mul v13.4S, v13.4S,v28.s[0] -sub v9.4s, v19.4s, v10.4s -mla v17.4S, v20.4S, v31.s[0] -add v19.4s, v19.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v27.s[0] -mul v3.4S, v3.4S,v28.s[0] -sub v20.4s, v21.4s, v17.4s -mla v13.4S, v11.4S, v31.s[0] -add v21.4s, v21.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v27.s[1] -mul v0.4S, v0.4S,v28.s[1] -sub v11.4s, v22.4s, v13.4s -mla v3.4S, v10.4S, v31.s[0] -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -sub v10.4s, v2.4s, v3.4s -mla v0.4S, v17.4S, v31.s[0] -add v2.4s, v2.4s, v3.4s -sqrdmulh v3.4S, v1.4S, v27.s[2] -mul v1.4S, v1.4S,v28.s[2] -sub v17.4s, v14.4s, v0.4s -mla v8.4S, v13.4S, v31.s[0] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[2] -mul v12.4S, v12.4S,v28.s[2] -sub v13.4s, v16.4s, v8.4s -mla v1.4S, v3.4S, v31.s[0] -add v16.4s, v16.4s, v8.4s -sqrdmulh v8.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v3.4s, v19.4s, v1.4s -mla v12.4S, v0.4S, v31.s[0] -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v0.4s, v21.4s, v12.4s -mla v18.4S, v8.4S, v31.s[0] -add v21.4s, v21.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v9.4s, v18.4s -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v18.4s -sqrdmulh v18.4S, v10.4S, v25.s[1] -mul v10.4S, v10.4S,v26.s[1] -sub v1.4s, v20.4s, v15.4s -mla v2.4S, v12.4S, v31.s[0] -add v20.4s, v20.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v25.s[2] -mul v16.4S, v16.4S,v26.s[2] -sub v12.4s, v22.4s, v2.4s -mla v10.4S, v18.4S, v31.s[0] -add v22.4s, v22.4s, v2.4s -sqrdmulh v2.4S, v13.4S, v25.s[3] -mul v13.4S, v13.4S,v26.s[3] -sub v18.4s, v11.4s, v10.4s -mla v16.4S, v15.4S, v31.s[0] -add v11.4s, v11.4s, v10.4s -str q22, [x0, #16] -sqrdmulh v22.4S, v21.4S, v23.s[0] -str q12, [x0, #80] -mul v21.4S, v21.4S,v24.s[0] -sub v12.4s, v14.4s, v16.4s -mla v13.4S, v2.4S, v31.s[0] -add v14.4s, v14.4s, v16.4s -str q11, [x0, #144] -sqrdmulh v11.4S, v0.4S, v23.s[1] -str q18, [x0, #208] -mul v0.4S, v0.4S,v24.s[1] -sub v18.4s, v17.4s, v13.4s -mla v21.4S, v22.4S, v31.s[0] -add v17.4s, v17.4s, v13.4s -str q14, [x0, #272] -sqrdmulh v14.4S, v20.4S, v23.s[2] -str q12, [x0, #336] -mul v20.4S, v20.4S,v24.s[2] -sub v12.4s, v19.4s, v21.4s -mla v0.4S, v11.4S, v31.s[0] -add v19.4s, v19.4s, v21.4s -str q17, [x0, #400] -sqrdmulh v17.4S, v1.4S, v23.s[3] -str q18, [x0, #464] -mul v1.4S, v1.4S,v24.s[3] -sub v18.4s, v3.4s, v0.4s -mla v20.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v0.4s -str q19, [x0, #528] -str q12, [x0, #592] -sub v12.4s, v9.4s, v20.4s -mla v1.4S, v17.4S, v31.s[0] -add v9.4s, v9.4s, v20.4s -str q3, [x0, #656] -str q18, [x0, #720] -sub v18.4s, v8.4s, v1.4s -add v8.4s, v8.4s, v1.4s -str q9, [x0, #784] -str q12, [x0, #848] -str q8, [x0, #912] -str q18, [x0, #976] -ldr q4, [x0, #224] -ldr q5, [x0, #160] -ldr q6, [x0, #32] -ldr q7, [x17, #+128] -ldr q15, [x17, #+144] -sqrdmulh v10.4S, v6.4S, v15.s[0] -mul v6.4S, v6.4S,v7.s[0] -ldr q2, [x0, #48] -sqrdmulh v16.4S, v2.4S, v15.s[0] -mul v2.4S, v2.4S,v7.s[0] -ldr q22, [x17, #+160] -ldr q13, [x17, #+176] -ldr q11, [x0, #96] -sqrdmulh v21.4S, v11.4S, v13.s[0] -mul v11.4S, v11.4S,v22.s[0] -ldr q14, [x0, #112] -sqrdmulh v0.4S, v14.4S, v13.s[0] -mul v14.4S, v14.4S,v22.s[0] -ldr q19, [x17, #+192] -ldr q17, [x17, #+208] -mla v6.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v5.4S, v17.s[0] -ldr q20, [x0, #176] -mla v2.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v20.4S, v17.s[0] -ldr q3, [x17, #+224] -ldr q1, [x17, #+240] -mla v11.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v4.4S, v1.s[0] -ldr q9, [x0, #240] -mla v14.4S, v0.4S, v31.s[0] -sqrdmulh v0.4S, v9.4S, v1.s[0] -ldr q12, [x0, #128] -ldr q8, [x0, #0] -mul v5.4S, v5.4S,v19.s[0] -sub v18.4s, v8.4s, v6.4s -mul v20.4S, v20.4S,v19.s[0] -add v8.4s, v8.4s, v6.4s -ldr q6, [x0, #144] -ldr q30, [x0, #16] -mla v5.4S, v10.4S, v31.s[0] -sub v10.4s, v30.4s, v2.4s -mla v20.4S, v16.4S, v31.s[0] -add v30.4s, v30.4s, v2.4s -ldr q2, [x0, #192] -ldr q16, [x0, #64] -mul v4.4S, v4.4S,v3.s[0] -sub v29.4s, v16.4s, v11.4s -mul v9.4S, v9.4S,v3.s[0] -add v16.4s, v16.4s, v11.4s -ldr q11, [x0, #208] -ldr q28, [x0, #80] -mla v4.4S, v21.4S, v31.s[0] -mla v9.4S, v0.4S, v31.s[0] -sub v0.4s, v28.4s, v14.4s -sqrdmulh v21.4S, v30.4S, v15.s[1] -mul v30.4S, v30.4S,v7.s[1] -add v28.4s, v28.4s, v14.4s -sqrdmulh v14.4S, v10.4S, v15.s[2] -sub v27.4s, v12.4s, v5.4s -mul v10.4S, v10.4S,v7.s[2] -add v12.4s, v12.4s, v5.4s -sqrdmulh v15.4S, v28.4S, v13.s[1] -sub v7.4s, v6.4s, v20.4s -mul v28.4S, v28.4S,v22.s[1] -add v6.4s, v6.4s, v20.4s -sqrdmulh v20.4S, v0.4S, v13.s[2] -sub v5.4s, v2.4s, v4.4s -mul v0.4S, v0.4S,v22.s[2] -add v2.4s, v2.4s, v4.4s -mla v30.4S, v21.4S, v31.s[0] -sub v21.4s, v11.4s, v9.4s -ldr q13, [x0, #480] -sqrdmulh v22.4S, v6.4S, v17.s[1] -add v11.4s, v11.4s, v9.4s -mla v10.4S, v14.4S, v31.s[0] -ldr q14, [x0, #416] -sqrdmulh v9.4S, v7.4S, v17.s[2] -mla v28.4S, v15.4S, v31.s[0] -ldr q15, [x0, #288] -sqrdmulh v4.4S, v11.4S, v1.s[1] -mla v0.4S, v20.4S, v31.s[0] -ldr q20, [x17, #+256] -sqrdmulh v26.4S, v21.4S, v1.s[2] -ldr q25, [x17, #+272] -mul v6.4S, v6.4S,v19.s[1] -sub v24.4s, v8.4s, v30.4s -str q24, [x0, #16] -mul v7.4S, v7.4S,v19.s[2] -add v8.4s, v8.4s, v30.4s -str q8, [x0, #0] -mla v6.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v10.4s -str q22, [x0, #48] -mla v7.4S, v9.4S, v31.s[0] -add v18.4s, v18.4s, v10.4s -str q18, [x0, #32] -mul v11.4S, v11.4S,v3.s[1] -sub v17.4s, v16.4s, v28.4s -str q17, [x0, #80] -mul v21.4S, v21.4S,v3.s[2] -add v16.4s, v16.4s, v28.4s -str q16, [x0, #64] -mla v11.4S, v4.4S, v31.s[0] -sub v4.4s, v29.4s, v0.4s -str q4, [x0, #112] -mla v21.4S, v26.4S, v31.s[0] -add v29.4s, v29.4s, v0.4s -str q29, [x0, #96] -sqrdmulh v1.4S, v15.4S, v25.s[0] -sub v3.4s, v12.4s, v6.4s -mul v15.4S, v15.4S,v20.s[0] -str q3, [x0, #144] -ldr q3, [x0, #304] -sqrdmulh v29.4S, v3.4S, v25.s[0] -add v12.4s, v12.4s, v6.4s -mul v3.4S, v3.4S,v20.s[0] -str q12, [x0, #128] -ldr q12, [x17, #+288] -ldr q6, [x17, #+304] -ldr q0, [x0, #352] -sqrdmulh v26.4S, v0.4S, v6.s[0] -sub v4.4s, v27.4s, v7.4s -mul v0.4S, v0.4S,v12.s[0] -str q4, [x0, #176] -ldr q4, [x0, #368] -sqrdmulh v16.4S, v4.4S, v6.s[0] -add v27.4s, v27.4s, v7.4s -mul v4.4S, v4.4S,v12.s[0] -str q27, [x0, #160] -ldr q27, [x17, #+320] -ldr q7, [x17, #+336] -mla v15.4S, v1.4S, v31.s[0] -sub v1.4s, v2.4s, v11.4s -sqrdmulh v28.4S, v14.4S, v7.s[0] -str q1, [x0, #208] -ldr q1, [x0, #432] -mla v3.4S, v29.4S, v31.s[0] -add v2.4s, v2.4s, v11.4s -sqrdmulh v11.4S, v1.4S, v7.s[0] -str q2, [x0, #192] -ldr q2, [x17, #+352] -ldr q29, [x17, #+368] -mla v0.4S, v26.4S, v31.s[0] -sub v26.4s, v5.4s, v21.4s -sqrdmulh v17.4S, v13.4S, v29.s[0] -str q26, [x0, #240] -ldr q26, [x0, #496] -mla v4.4S, v16.4S, v31.s[0] -add v5.4s, v5.4s, v21.4s -sqrdmulh v21.4S, v26.4S, v29.s[0] -str q5, [x0, #224] -ldr q5, [x0, #384] -ldr q16, [x0, #256] -mul v14.4S, v14.4S,v27.s[0] -sub v19.4s, v16.4s, v15.4s -mul v1.4S, v1.4S,v27.s[0] -add v16.4s, v16.4s, v15.4s -ldr q15, [x0, #400] -ldr q18, [x0, #272] -mla v14.4S, v28.4S, v31.s[0] -sub v28.4s, v18.4s, v3.4s -mla v1.4S, v11.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -ldr q3, [x0, #448] -ldr q11, [x0, #320] -mul v13.4S, v13.4S,v2.s[0] -sub v10.4s, v11.4s, v0.4s -mul v26.4S, v26.4S,v2.s[0] -add v11.4s, v11.4s, v0.4s -ldr q0, [x0, #464] -ldr q9, [x0, #336] -mla v13.4S, v17.4S, v31.s[0] -mla v26.4S, v21.4S, v31.s[0] -sub v21.4s, v9.4s, v4.4s -sqrdmulh v17.4S, v18.4S, v25.s[1] -mul v18.4S, v18.4S,v20.s[1] -add v9.4s, v9.4s, v4.4s -sqrdmulh v4.4S, v28.4S, v25.s[2] -sub v22.4s, v5.4s, v14.4s -mul v28.4S, v28.4S,v20.s[2] -add v5.4s, v5.4s, v14.4s -sqrdmulh v25.4S, v9.4S, v6.s[1] -sub v20.4s, v15.4s, v1.4s -mul v9.4S, v9.4S,v12.s[1] -add v15.4s, v15.4s, v1.4s -sqrdmulh v1.4S, v21.4S, v6.s[2] -sub v14.4s, v3.4s, v13.4s -mul v21.4S, v21.4S,v12.s[2] -add v3.4s, v3.4s, v13.4s -mla v18.4S, v17.4S, v31.s[0] -sub v17.4s, v0.4s, v26.4s -ldr q6, [x0, #736] -sqrdmulh v12.4S, v15.4S, v7.s[1] -add v0.4s, v0.4s, v26.4s -mla v28.4S, v4.4S, v31.s[0] -ldr q4, [x0, #672] -sqrdmulh v26.4S, v20.4S, v7.s[2] -mla v9.4S, v25.4S, v31.s[0] -ldr q25, [x0, #544] -sqrdmulh v13.4S, v0.4S, v29.s[1] -mla v21.4S, v1.4S, v31.s[0] -ldr q1, [x17, #+384] -sqrdmulh v8.4S, v17.4S, v29.s[2] -ldr q30, [x17, #+400] -mul v15.4S, v15.4S,v27.s[1] -sub v24.4s, v16.4s, v18.4s -str q24, [x0, #272] -mul v20.4S, v20.4S,v27.s[2] -add v16.4s, v16.4s, v18.4s -str q16, [x0, #256] -mla v15.4S, v12.4S, v31.s[0] -sub v12.4s, v19.4s, v28.4s -str q12, [x0, #304] -mla v20.4S, v26.4S, v31.s[0] -add v19.4s, v19.4s, v28.4s -str q19, [x0, #288] -mul v0.4S, v0.4S,v2.s[1] -sub v7.4s, v11.4s, v9.4s -str q7, [x0, #336] -mul v17.4S, v17.4S,v2.s[2] -add v11.4s, v11.4s, v9.4s -str q11, [x0, #320] -mla v0.4S, v13.4S, v31.s[0] -sub v13.4s, v10.4s, v21.4s -str q13, [x0, #368] -mla v17.4S, v8.4S, v31.s[0] -add v10.4s, v10.4s, v21.4s -str q10, [x0, #352] -sqrdmulh v29.4S, v25.4S, v30.s[0] -sub v2.4s, v5.4s, v15.4s -mul v25.4S, v25.4S,v1.s[0] -str q2, [x0, #400] -ldr q2, [x0, #560] -sqrdmulh v10.4S, v2.4S, v30.s[0] -add v5.4s, v5.4s, v15.4s -mul v2.4S, v2.4S,v1.s[0] -str q5, [x0, #384] -ldr q5, [x17, #+416] -ldr q15, [x17, #+432] -ldr q21, [x0, #608] -sqrdmulh v8.4S, v21.4S, v15.s[0] -sub v13.4s, v22.4s, v20.4s -mul v21.4S, v21.4S,v5.s[0] -str q13, [x0, #432] -ldr q13, [x0, #624] -sqrdmulh v11.4S, v13.4S, v15.s[0] -add v22.4s, v22.4s, v20.4s -mul v13.4S, v13.4S,v5.s[0] -str q22, [x0, #416] -ldr q22, [x17, #+448] -ldr q20, [x17, #+464] -mla v25.4S, v29.4S, v31.s[0] -sub v29.4s, v3.4s, v0.4s -sqrdmulh v9.4S, v4.4S, v20.s[0] -str q29, [x0, #464] -ldr q29, [x0, #688] -mla v2.4S, v10.4S, v31.s[0] -add v3.4s, v3.4s, v0.4s -sqrdmulh v0.4S, v29.4S, v20.s[0] -str q3, [x0, #448] -ldr q3, [x17, #+480] -ldr q10, [x17, #+496] -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v14.4s, v17.4s -sqrdmulh v7.4S, v6.4S, v10.s[0] -str q8, [x0, #496] -ldr q8, [x0, #752] -mla v13.4S, v11.4S, v31.s[0] -add v14.4s, v14.4s, v17.4s -sqrdmulh v17.4S, v8.4S, v10.s[0] -str q14, [x0, #480] -ldr q14, [x0, #640] -ldr q11, [x0, #512] -mul v4.4S, v4.4S,v22.s[0] -sub v27.4s, v11.4s, v25.4s -mul v29.4S, v29.4S,v22.s[0] -add v11.4s, v11.4s, v25.4s -ldr q25, [x0, #656] -ldr q19, [x0, #528] -mla v4.4S, v9.4S, v31.s[0] -sub v9.4s, v19.4s, v2.4s -mla v29.4S, v0.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -ldr q2, [x0, #704] -ldr q0, [x0, #576] -mul v6.4S, v6.4S,v3.s[0] -sub v28.4s, v0.4s, v21.4s -mul v8.4S, v8.4S,v3.s[0] -add v0.4s, v0.4s, v21.4s -ldr q21, [x0, #720] -ldr q26, [x0, #592] -mla v6.4S, v7.4S, v31.s[0] -mla v8.4S, v17.4S, v31.s[0] -sub v17.4s, v26.4s, v13.4s -sqrdmulh v7.4S, v19.4S, v30.s[1] -mul v19.4S, v19.4S,v1.s[1] -add v26.4s, v26.4s, v13.4s -sqrdmulh v13.4S, v9.4S, v30.s[2] -sub v12.4s, v14.4s, v4.4s -mul v9.4S, v9.4S,v1.s[2] -add v14.4s, v14.4s, v4.4s -sqrdmulh v30.4S, v26.4S, v15.s[1] -sub v1.4s, v25.4s, v29.4s -mul v26.4S, v26.4S,v5.s[1] -add v25.4s, v25.4s, v29.4s -sqrdmulh v29.4S, v17.4S, v15.s[2] -sub v4.4s, v2.4s, v6.4s -mul v17.4S, v17.4S,v5.s[2] -add v2.4s, v2.4s, v6.4s -mla v19.4S, v7.4S, v31.s[0] -sub v7.4s, v21.4s, v8.4s -ldr q15, [x0, #992] -sqrdmulh v5.4S, v25.4S, v20.s[1] -add v21.4s, v21.4s, v8.4s -mla v9.4S, v13.4S, v31.s[0] -ldr q13, [x0, #928] -sqrdmulh v8.4S, v1.4S, v20.s[2] -mla v26.4S, v30.4S, v31.s[0] -ldr q30, [x0, #800] -sqrdmulh v6.4S, v21.4S, v10.s[1] -mla v17.4S, v29.4S, v31.s[0] -ldr q29, [x17, #+512] -sqrdmulh v16.4S, v7.4S, v10.s[2] -ldr q18, [x17, #+528] -mul v25.4S, v25.4S,v22.s[1] -sub v24.4s, v11.4s, v19.4s -str q24, [x0, #528] -mul v1.4S, v1.4S,v22.s[2] -add v11.4s, v11.4s, v19.4s -str q11, [x0, #512] -mla v25.4S, v5.4S, v31.s[0] -sub v5.4s, v27.4s, v9.4s -str q5, [x0, #560] -mla v1.4S, v8.4S, v31.s[0] -add v27.4s, v27.4s, v9.4s -str q27, [x0, #544] -mul v21.4S, v21.4S,v3.s[1] -sub v20.4s, v0.4s, v26.4s -str q20, [x0, #592] -mul v7.4S, v7.4S,v3.s[2] -add v0.4s, v0.4s, v26.4s -str q0, [x0, #576] -mla v21.4S, v6.4S, v31.s[0] -sub v6.4s, v28.4s, v17.4s -str q6, [x0, #624] -mla v7.4S, v16.4S, v31.s[0] -add v28.4s, v28.4s, v17.4s -str q28, [x0, #608] -sqrdmulh v10.4S, v30.4S, v18.s[0] -sub v3.4s, v14.4s, v25.4s -mul v30.4S, v30.4S,v29.s[0] -str q3, [x0, #656] -ldr q3, [x0, #816] -sqrdmulh v28.4S, v3.4S, v18.s[0] -add v14.4s, v14.4s, v25.4s -mul v3.4S, v3.4S,v29.s[0] -str q14, [x0, #640] -ldr q14, [x17, #+544] -ldr q25, [x17, #+560] -ldr q17, [x0, #864] -sqrdmulh v16.4S, v17.4S, v25.s[0] -sub v6.4s, v12.4s, v1.4s -mul v17.4S, v17.4S,v14.s[0] -str q6, [x0, #688] -ldr q6, [x0, #880] -sqrdmulh v0.4S, v6.4S, v25.s[0] -add v12.4s, v12.4s, v1.4s -mul v6.4S, v6.4S,v14.s[0] -str q12, [x0, #672] -ldr q12, [x17, #+576] -ldr q1, [x17, #+592] -mla v30.4S, v10.4S, v31.s[0] -sub v10.4s, v2.4s, v21.4s -sqrdmulh v26.4S, v13.4S, v1.s[0] -str q10, [x0, #720] -ldr q10, [x0, #944] -mla v3.4S, v28.4S, v31.s[0] -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v10.4S, v1.s[0] -str q2, [x0, #704] -ldr q2, [x17, #+608] -ldr q28, [x17, #+624] -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v4.4s, v7.4s -sqrdmulh v20.4S, v15.4S, v28.s[0] -str q16, [x0, #752] -ldr q16, [x0, #1008] -mla v6.4S, v0.4S, v31.s[0] -add v4.4s, v4.4s, v7.4s -sqrdmulh v7.4S, v16.4S, v28.s[0] -str q4, [x0, #736] -ldr q4, [x0, #896] -ldr q0, [x0, #768] -mul v13.4S, v13.4S,v12.s[0] -sub v22.4s, v0.4s, v30.4s -mul v10.4S, v10.4S,v12.s[0] -add v0.4s, v0.4s, v30.4s -ldr q30, [x0, #912] -ldr q27, [x0, #784] -mla v13.4S, v26.4S, v31.s[0] -sub v26.4s, v27.4s, v3.4s -mla v10.4S, v21.4S, v31.s[0] -add v27.4s, v27.4s, v3.4s -ldr q3, [x0, #960] -ldr q21, [x0, #832] -mul v15.4S, v15.4S,v2.s[0] -sub v9.4s, v21.4s, v17.4s -mul v16.4S, v16.4S,v2.s[0] -add v21.4s, v21.4s, v17.4s -ldr q17, [x0, #976] -ldr q8, [x0, #848] -mla v15.4S, v20.4S, v31.s[0] -mla v16.4S, v7.4S, v31.s[0] -sub v7.4s, v8.4s, v6.4s -sqrdmulh v20.4S, v27.4S, v18.s[1] -mul v27.4S, v27.4S,v29.s[1] -add v8.4s, v8.4s, v6.4s -sqrdmulh v6.4S, v26.4S, v18.s[2] -sub v5.4s, v4.4s, v13.4s -mul v26.4S, v26.4S,v29.s[2] -add v4.4s, v4.4s, v13.4s -sqrdmulh v18.4S, v8.4S, v25.s[1] -sub v29.4s, v30.4s, v10.4s -mul v8.4S, v8.4S,v14.s[1] -add v30.4s, v30.4s, v10.4s -sqrdmulh v10.4S, v7.4S, v25.s[2] -sub v13.4s, v3.4s, v15.4s -mul v7.4S, v7.4S,v14.s[2] -add v3.4s, v3.4s, v15.4s -mla v27.4S, v20.4S, v31.s[0] -sub v20.4s, v17.4s, v16.4s -sqrdmulh v25.4S, v30.4S, v1.s[1] -add v17.4s, v17.4s, v16.4s -mla v26.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v29.4S, v1.s[2] -mla v8.4S, v18.4S, v31.s[0] -sqrdmulh v18.4S, v17.4S, v28.s[1] -mla v7.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v20.4S, v28.s[2] -mul v30.4S, v30.4S,v12.s[1] -sub v16.4s, v0.4s, v27.4s -str q16, [x0, #784] -mul v29.4S, v29.4S,v12.s[2] -add v0.4s, v0.4s, v27.4s -str q0, [x0, #768] -mla v30.4S, v25.4S, v31.s[0] -sub v25.4s, v22.4s, v26.4s -str q25, [x0, #816] -mla v29.4S, v6.4S, v31.s[0] -add v22.4s, v22.4s, v26.4s -str q22, [x0, #800] -mul v17.4S, v17.4S,v2.s[1] -sub v1.4s, v21.4s, v8.4s -str q1, [x0, #848] -mul v20.4S, v20.4S,v2.s[2] -add v21.4s, v21.4s, v8.4s -str q21, [x0, #832] -mla v17.4S, v18.4S, v31.s[0] -sub v18.4s, v9.4s, v7.4s -str q18, [x0, #880] -mla v20.4S, v10.4S, v31.s[0] -add v9.4s, v9.4s, v7.4s -str q9, [x0, #864] -sub v28.4s, v4.4s, v30.4s -str q28, [x0, #912] -add v4.4s, v4.4s, v30.4s -str q4, [x0, #896] -sub v4.4s, v5.4s, v29.4s -str q4, [x0, #944] -add v5.4s, v5.4s, v29.4s -str q5, [x0, #928] -sub v5.4s, v3.4s, v17.4s -str q5, [x0, #976] -add v3.4s, v3.4s, v17.4s -str q3, [x0, #960] -sub v3.4s, v13.4s, v20.4s -str q3, [x0, #1008] -add v13.4s, v13.4s, v20.4s -str q13, [x0, #992] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1464 -// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_0.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_0.s deleted file mode 100644 index 982af55..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_0.s +++ /dev/null @@ -1,1494 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_7_z4_0 -.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_0 -ntt_u32_incomplete_neon_asm_var_4_2_7_z4_0: -_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_0: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #928] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #992] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q18, [x0, #800] -sqrdmulh v17.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -ldr q16, [x0, #864] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -mla v22.4S, v21.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -mla v18.4S, v17.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -ldr q3, [x0, #544] -sqrdmulh v17.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -ldr q19, [x0, #608] -sqrdmulh v21.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q2, [x0, #672] -ldr q1, [x0, #416] -sqrdmulh v0.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v15.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -ldr q22, [x0, #736] -ldr q14, [x0, #480] -sqrdmulh v13.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v12.4s, v14.4s, v20.4s -add v14.4s, v14.4s, v20.4s -ldr q20, [x0, #288] -mla v3.4S, v17.4S, v31.s[0] -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v20.4s, v18.4s -mla v2.4S, v0.4S, v31.s[0] -mla v22.4S, v13.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -ldr q18, [x0, #352] -sqrdmulh v13.4S, v1.4S, v29.s[1] -mul v1.4S, v1.4S,v30.s[1] -sub v0.4s, v18.4s, v16.4s -sqrdmulh v17.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -add v18.4s, v18.4s, v16.4s -ldr q16, [x0, #32] -sqrdmulh v11.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v10.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -ldr q3, [x0, #96] -sqrdmulh v9.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v8.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -ldr q19, [x0, #160] -mla v1.4S, v13.4S, v31.s[0] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v19.4s, v2.4s -mla v20.4S, v11.4S, v31.s[0] -mla v18.4S, v9.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -ldr q2, [x0, #224] -sqrdmulh v9.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v11.4s, v2.4s, v22.4s -sqrdmulh v13.4S, v12.4S, v29.s[2] -mul v12.4S, v12.4S,v30.s[2] -add v2.4s, v2.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v7.4s, v19.4s, v1.4s -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v29.s[2] -mul v0.4S, v0.4S,v30.s[2] -sub v6.4s, v2.4s, v14.4s -add v2.4s, v2.4s, v14.4s -mla v15.4S, v9.4S, v31.s[0] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v16.4s, v20.4s -mla v21.4S, v22.4S, v31.s[0] -mla v0.4S, v1.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -sub v1.4s, v3.4s, v18.4s -sqrdmulh v22.4S, v6.4S, v27.s[1] -mul v6.4S, v6.4S,v28.s[1] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v19.4S, v27.s[0] -mul v19.4S, v19.4S,v28.s[0] -sub v9.4s, v17.4s, v15.4s -add v17.4s, v17.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v14.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -mla v7.4S, v20.4S, v31.s[0] -mla v6.4S, v22.4S, v31.s[0] -sub v22.4s, v10.4s, v21.4s -mla v19.4S, v18.4S, v31.s[0] -mla v2.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v15.4s, v8.4s, v0.4s -sqrdmulh v18.4S, v11.4S, v27.s[2] -mul v11.4S, v11.4S,v28.s[2] -add v8.4s, v8.4s, v0.4s -sqrdmulh v0.4S, v9.4S, v27.s[3] -mul v9.4S, v9.4S,v28.s[3] -sub v20.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[3] -mul v14.4S, v14.4S,v28.s[3] -sub v12.4s, v1.4s, v6.4s -add v1.4s, v1.4s, v6.4s -mla v17.4S, v21.4S, v31.s[0] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v16.4s, v19.4s -mla v9.4S, v0.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v16.4s, v16.4s, v19.4s -sqrdmulh v19.4S, v1.4S, v25.s[2] -mul v1.4S, v1.4S,v26.s[2] -sub v7.4s, v3.4s, v2.4s -sqrdmulh v0.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v7.4S, v25.s[1] -mul v7.4S, v7.4S,v26.s[1] -sub v21.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -sqrdmulh v17.4S, v3.4S, v25.s[0] -mul v3.4S, v3.4S,v26.s[0] -sub v6.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -mla v1.4S, v19.4S, v31.s[0] -mla v12.4S, v0.4S, v31.s[0] -sub v0.4s, v22.4s, v9.4s -mla v7.4S, v2.4S, v31.s[0] -mla v3.4S, v17.4S, v31.s[0] -add v22.4s, v22.4s, v9.4s -sqrdmulh v9.4S, v8.4S, v23.s[0] -mul v8.4S, v8.4S,v24.s[0] -sub v17.4s, v15.4s, v14.4s -sqrdmulh v2.4S, v6.4S, v23.s[1] -mul v6.4S, v6.4S,v24.s[1] -add v15.4s, v15.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v23.s[2] -mul v15.4S, v15.4S,v24.s[2] -sub v19.4s, v13.4s, v1.4s -add v13.4s, v13.4s, v1.4s -sqrdmulh v1.4S, v17.4S, v23.s[3] -mul v17.4S, v17.4S,v24.s[3] -sub v11.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -mla v8.4S, v9.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -sub v2.4s, v18.4s, v7.4s -str q13, [x0, #288] -mla v15.4S, v14.4S, v31.s[0] -mla v17.4S, v1.4S, v31.s[0] -add v18.4s, v18.4s, v7.4s -str q19, [x0, #352] -ldr q19, [x0, #944] -sqrdmulh v7.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v1.4s, v16.4s, v3.4s -str q20, [x0, #416] -ldr q20, [x0, #1008] -sqrdmulh v14.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v16.4s, v16.4s, v3.4s -str q11, [x0, #480] -ldr q11, [x0, #816] -sqrdmulh v3.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -sub v13.4s, v10.4s, v8.4s -add v10.4s, v10.4s, v8.4s -ldr q8, [x0, #880] -sqrdmulh v9.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v12.4s, v21.4s, v6.4s -add v21.4s, v21.4s, v6.4s -mla v19.4S, v7.4S, v31.s[0] -mla v20.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v15.4s -str q18, [x0, #160] -mla v11.4S, v3.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -str q2, [x0, #224] -ldr q2, [x0, #560] -sqrdmulh v15.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v9.4s, v0.4s, v17.4s -str q16, [x0, #32] -ldr q16, [x0, #624] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -add v0.4s, v0.4s, v17.4s -str q1, [x0, #96] -ldr q1, [x0, #688] -ldr q17, [x0, #432] -sqrdmulh v18.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -sub v7.4s, v17.4s, v19.4s -add v17.4s, v17.4s, v19.4s -ldr q19, [x0, #752] -ldr q6, [x0, #496] -sqrdmulh v5.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v4.4s, v6.4s, v20.4s -add v6.4s, v6.4s, v20.4s -ldr q20, [x0, #304] -mla v2.4S, v15.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -sub v3.4s, v20.4s, v11.4s -str q10, [x0, #544] -mla v1.4S, v18.4S, v31.s[0] -mla v19.4S, v5.4S, v31.s[0] -add v20.4s, v20.4s, v11.4s -str q13, [x0, #608] -ldr q13, [x0, #368] -sqrdmulh v11.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v5.4s, v13.4s, v8.4s -str q21, [x0, #672] -sqrdmulh v21.4S, v6.4S, v29.s[1] -mul v6.4S, v6.4S,v30.s[1] -add v13.4s, v13.4s, v8.4s -str q12, [x0, #736] -ldr q12, [x0, #48] -sqrdmulh v8.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v18.4s, v12.4s, v2.4s -add v12.4s, v12.4s, v2.4s -ldr q2, [x0, #112] -sqrdmulh v10.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v15.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -ldr q16, [x0, #176] -mla v17.4S, v11.4S, v31.s[0] -mla v6.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v1.4s -str q22, [x0, #800] -mla v20.4S, v8.4S, v31.s[0] -mla v13.4S, v10.4S, v31.s[0] -add v16.4s, v16.4s, v1.4s -str q14, [x0, #864] -ldr q14, [x0, #240] -sqrdmulh v1.4S, v7.4S, v29.s[2] -mul v7.4S, v7.4S,v30.s[2] -sub v10.4s, v14.4s, v19.4s -str q0, [x0, #928] -sqrdmulh v0.4S, v4.4S, v29.s[2] -mul v4.4S, v4.4S,v30.s[2] -add v14.4s, v14.4s, v19.4s -str q9, [x0, #992] -sqrdmulh v9.4S, v3.4S, v29.s[2] -mul v3.4S, v3.4S,v30.s[2] -sub v19.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -sqrdmulh v17.4S, v5.4S, v29.s[2] -mul v5.4S, v5.4S,v30.s[2] -sub v8.4s, v14.4s, v6.4s -add v14.4s, v14.4s, v6.4s -mla v7.4S, v1.4S, v31.s[0] -mla v4.4S, v0.4S, v31.s[0] -sub v0.4s, v12.4s, v20.4s -mla v3.4S, v9.4S, v31.s[0] -mla v5.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -sub v17.4s, v2.4s, v13.4s -sqrdmulh v9.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -add v2.4s, v2.4s, v13.4s -sqrdmulh v13.4S, v16.4S, v27.s[0] -mul v16.4S, v16.4S,v28.s[0] -sub v1.4s, v21.4s, v7.4s -add v21.4s, v21.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v6.4s, v10.4s, v4.4s -add v10.4s, v10.4s, v4.4s -mla v19.4S, v20.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -sub v9.4s, v18.4s, v3.4s -mla v16.4S, v13.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[2] -mul v21.4S, v21.4S,v28.s[2] -sub v7.4s, v15.4s, v5.4s -sqrdmulh v13.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -add v15.4s, v15.4s, v5.4s -sqrdmulh v5.4S, v1.4S, v27.s[3] -mul v1.4S, v1.4S,v28.s[3] -sub v20.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v27.s[3] -mul v6.4S, v6.4S,v28.s[3] -sub v4.4s, v17.4s, v8.4s -add v17.4s, v17.4s, v8.4s -mla v21.4S, v3.4S, v31.s[0] -mla v10.4S, v13.4S, v31.s[0] -sub v13.4s, v12.4s, v16.4s -mla v1.4S, v5.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v25.s[2] -mul v17.4S, v17.4S,v26.s[2] -sub v19.4s, v2.4s, v14.4s -sqrdmulh v5.4S, v4.4S, v25.s[3] -mul v4.4S, v4.4S,v26.s[3] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v3.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v15.4s, v10.4s -add v15.4s, v15.4s, v10.4s -mla v17.4S, v16.4S, v31.s[0] -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v9.4s, v1.4s -mla v19.4S, v14.4S, v31.s[0] -mla v2.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v23.s[0] -mul v15.4S, v15.4S,v24.s[0] -sub v21.4s, v7.4s, v6.4s -sqrdmulh v14.4S, v8.4S, v23.s[1] -mul v8.4S, v8.4S,v24.s[1] -add v7.4s, v7.4s, v6.4s -sqrdmulh v6.4S, v7.4S, v23.s[2] -mul v7.4S, v7.4S,v24.s[2] -sub v16.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v23.s[3] -mul v21.4S, v21.4S,v24.s[3] -sub v10.4s, v20.4s, v4.4s -add v20.4s, v20.4s, v4.4s -mla v15.4S, v1.4S, v31.s[0] -mla v8.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v19.4s -str q0, [x0, #304] -mla v7.4S, v6.4S, v31.s[0] -mla v21.4S, v17.4S, v31.s[0] -add v13.4s, v13.4s, v19.4s -str q16, [x0, #368] -ldr q16, [x0, #896] -sqrdmulh v19.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v17.4s, v12.4s, v2.4s -str q20, [x0, #432] -ldr q20, [x0, #960] -sqrdmulh v6.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v12.4s, v12.4s, v2.4s -str q10, [x0, #496] -ldr q10, [x0, #768] -sqrdmulh v2.4S, v10.4S, v29.s[0] -mul v10.4S, v10.4S,v30.s[0] -sub v0.4s, v18.4s, v15.4s -add v18.4s, v18.4s, v15.4s -ldr q15, [x0, #832] -sqrdmulh v1.4S, v15.4S, v29.s[0] -mul v15.4S, v15.4S,v30.s[0] -sub v4.4s, v3.4s, v8.4s -add v3.4s, v3.4s, v8.4s -mla v16.4S, v19.4S, v31.s[0] -mla v20.4S, v6.4S, v31.s[0] -sub v6.4s, v9.4s, v7.4s -str q13, [x0, #176] -mla v10.4S, v2.4S, v31.s[0] -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v7.4s -str q14, [x0, #240] -ldr q14, [x0, #512] -sqrdmulh v7.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v1.4s, v5.4s, v21.4s -str q12, [x0, #48] -ldr q12, [x0, #576] -sqrdmulh v2.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -add v5.4s, v5.4s, v21.4s -str q17, [x0, #112] -ldr q17, [x0, #640] -ldr q21, [x0, #384] -sqrdmulh v13.4S, v17.4S, v29.s[0] -mul v17.4S, v17.4S,v30.s[0] -sub v19.4s, v21.4s, v16.4s -add v21.4s, v21.4s, v16.4s -ldr q16, [x0, #704] -ldr q8, [x0, #448] -sqrdmulh v22.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v11.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -ldr q20, [x0, #256] -mla v14.4S, v7.4S, v31.s[0] -mla v12.4S, v2.4S, v31.s[0] -sub v2.4s, v20.4s, v10.4s -str q18, [x0, #560] -mla v17.4S, v13.4S, v31.s[0] -mla v16.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -str q0, [x0, #624] -ldr q0, [x0, #320] -sqrdmulh v10.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v22.4s, v0.4s, v15.4s -str q3, [x0, #688] -sqrdmulh v3.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -add v0.4s, v0.4s, v15.4s -str q4, [x0, #752] -ldr q4, [x0, #0] -sqrdmulh v15.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v13.4s, v4.4s, v14.4s -add v4.4s, v4.4s, v14.4s -ldr q14, [x0, #64] -sqrdmulh v18.4S, v0.4S, v29.s[1] -mul v0.4S, v0.4S,v30.s[1] -sub v7.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -ldr q12, [x0, #128] -mla v21.4S, v10.4S, v31.s[0] -mla v8.4S, v3.4S, v31.s[0] -sub v3.4s, v12.4s, v17.4s -str q9, [x0, #816] -mla v20.4S, v15.4S, v31.s[0] -mla v0.4S, v18.4S, v31.s[0] -add v12.4s, v12.4s, v17.4s -str q6, [x0, #880] -ldr q6, [x0, #192] -sqrdmulh v17.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v18.4s, v6.4s, v16.4s -str q5, [x0, #944] -sqrdmulh v5.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -add v6.4s, v6.4s, v16.4s -str q1, [x0, #1008] -sqrdmulh v1.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v16.4s, v12.4s, v21.4s -add v12.4s, v12.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v15.4s, v6.4s, v8.4s -add v6.4s, v6.4s, v8.4s -mla v19.4S, v17.4S, v31.s[0] -mla v11.4S, v5.4S, v31.s[0] -sub v5.4s, v4.4s, v20.4s -mla v2.4S, v1.4S, v31.s[0] -mla v22.4S, v21.4S, v31.s[0] -add v4.4s, v4.4s, v20.4s -sqrdmulh v20.4S, v16.4S, v27.s[1] -mul v16.4S, v16.4S,v28.s[1] -sub v21.4s, v14.4s, v0.4s -sqrdmulh v1.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v17.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v27.s[0] -mul v6.4S, v6.4S,v28.s[0] -sub v8.4s, v18.4s, v11.4s -add v18.4s, v18.4s, v11.4s -mla v16.4S, v20.4S, v31.s[0] -mla v15.4S, v1.4S, v31.s[0] -sub v1.4s, v13.4s, v2.4s -mla v12.4S, v0.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v3.4S, v27.s[2] -mul v3.4S, v3.4S,v28.s[2] -sub v19.4s, v7.4s, v22.4s -sqrdmulh v0.4S, v18.4S, v27.s[2] -mul v18.4S, v18.4S,v28.s[2] -add v7.4s, v7.4s, v22.4s -sqrdmulh v22.4S, v17.4S, v27.s[3] -mul v17.4S, v17.4S,v28.s[3] -sub v20.4s, v5.4s, v16.4s -add v5.4s, v5.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v11.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -mla v3.4S, v2.4S, v31.s[0] -mla v18.4S, v0.4S, v31.s[0] -sub v0.4s, v4.4s, v12.4s -mla v17.4S, v22.4S, v31.s[0] -mla v8.4S, v16.4S, v31.s[0] -add v4.4s, v4.4s, v12.4s -sqrdmulh v12.4S, v21.4S, v25.s[2] -mul v21.4S, v21.4S,v26.s[2] -sub v16.4s, v14.4s, v6.4s -sqrdmulh v22.4S, v11.4S, v25.s[3] -mul v11.4S, v11.4S,v26.s[3] -add v14.4s, v14.4s, v6.4s -sqrdmulh v6.4S, v16.4S, v25.s[1] -mul v16.4S, v16.4S,v26.s[1] -sub v2.4s, v13.4s, v3.4s -add v13.4s, v13.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v25.s[0] -mul v14.4S, v14.4S,v26.s[0] -sub v15.4s, v7.4s, v18.4s -add v7.4s, v7.4s, v18.4s -mla v21.4S, v12.4S, v31.s[0] -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v1.4s, v17.4s -mla v16.4S, v6.4S, v31.s[0] -mla v14.4S, v3.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -sqrdmulh v17.4S, v7.4S, v23.s[0] -mul v7.4S, v7.4S,v24.s[0] -sub v3.4s, v19.4s, v8.4s -sqrdmulh v6.4S, v15.4S, v23.s[1] -mul v15.4S, v15.4S,v24.s[1] -add v19.4s, v19.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v23.s[2] -mul v19.4S, v19.4S,v24.s[2] -sub v12.4s, v5.4s, v21.4s -add v5.4s, v5.4s, v21.4s -sqrdmulh v21.4S, v3.4S, v23.s[3] -mul v3.4S, v3.4S,v24.s[3] -sub v18.4s, v20.4s, v11.4s -add v20.4s, v20.4s, v11.4s -mla v7.4S, v17.4S, v31.s[0] -mla v15.4S, v6.4S, v31.s[0] -sub v6.4s, v0.4s, v16.4s -str q5, [x0, #256] -mla v19.4S, v8.4S, v31.s[0] -mla v3.4S, v21.4S, v31.s[0] -add v0.4s, v0.4s, v16.4s -str q12, [x0, #320] -ldr q12, [x0, #912] -sqrdmulh v16.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -sub v21.4s, v4.4s, v14.4s -str q20, [x0, #384] -ldr q20, [x0, #976] -sqrdmulh v8.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v4.4s, v4.4s, v14.4s -str q18, [x0, #448] -ldr q18, [x0, #784] -sqrdmulh v14.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -sub v5.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -ldr q7, [x0, #848] -sqrdmulh v17.4S, v7.4S, v29.s[0] -mul v7.4S, v7.4S,v30.s[0] -sub v11.4s, v2.4s, v15.4s -add v2.4s, v2.4s, v15.4s -mla v12.4S, v16.4S, v31.s[0] -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v1.4s, v19.4s -str q0, [x0, #128] -mla v18.4S, v14.4S, v31.s[0] -mla v7.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v19.4s -str q6, [x0, #192] -ldr q6, [x0, #528] -sqrdmulh v19.4S, v6.4S, v29.s[0] -mul v6.4S, v6.4S,v30.s[0] -sub v17.4s, v22.4s, v3.4s -str q4, [x0, #0] -ldr q4, [x0, #592] -sqrdmulh v14.4S, v4.4S, v29.s[0] -mul v4.4S, v4.4S,v30.s[0] -add v22.4s, v22.4s, v3.4s -str q21, [x0, #64] -ldr q21, [x0, #656] -ldr q3, [x0, #400] -sqrdmulh v0.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -sub v16.4s, v3.4s, v12.4s -add v3.4s, v3.4s, v12.4s -ldr q12, [x0, #720] -ldr q15, [x0, #464] -sqrdmulh v9.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -sub v10.4s, v15.4s, v20.4s -add v15.4s, v15.4s, v20.4s -ldr q20, [x0, #272] -mla v6.4S, v19.4S, v31.s[0] -mla v4.4S, v14.4S, v31.s[0] -sub v14.4s, v20.4s, v18.4s -str q13, [x0, #512] -mla v21.4S, v0.4S, v31.s[0] -mla v12.4S, v9.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -str q5, [x0, #576] -ldr q5, [x0, #336] -sqrdmulh v18.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v9.4s, v5.4s, v7.4s -str q2, [x0, #640] -sqrdmulh v2.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -add v5.4s, v5.4s, v7.4s -str q11, [x0, #704] -ldr q11, [x0, #16] -sqrdmulh v7.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v0.4s, v11.4s, v6.4s -add v11.4s, v11.4s, v6.4s -ldr q6, [x0, #80] -sqrdmulh v13.4S, v5.4S, v29.s[1] -mul v5.4S, v5.4S,v30.s[1] -sub v19.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -ldr q4, [x0, #144] -mla v3.4S, v18.4S, v31.s[0] -mla v15.4S, v2.4S, v31.s[0] -sub v2.4s, v4.4s, v21.4s -str q1, [x0, #768] -mla v20.4S, v7.4S, v31.s[0] -mla v5.4S, v13.4S, v31.s[0] -add v4.4s, v4.4s, v21.4s -str q8, [x0, #832] -ldr q8, [x0, #208] -sqrdmulh v21.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -sub v13.4s, v8.4s, v12.4s -str q22, [x0, #896] -sqrdmulh v22.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -add v8.4s, v8.4s, v12.4s -str q17, [x0, #960] -sqrdmulh v17.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v12.4s, v4.4s, v3.4s -add v4.4s, v4.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v7.4s, v8.4s, v15.4s -add v8.4s, v8.4s, v15.4s -mla v16.4S, v21.4S, v31.s[0] -mla v10.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v20.4s -mla v14.4S, v17.4S, v31.s[0] -mla v9.4S, v3.4S, v31.s[0] -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v12.4S, v27.s[1] -mul v12.4S, v12.4S,v28.s[1] -sub v3.4s, v6.4s, v5.4s -sqrdmulh v17.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v4.4S, v27.s[0] -mul v4.4S, v4.4S,v28.s[0] -sub v21.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v27.s[0] -mul v8.4S, v8.4S,v28.s[0] -sub v15.4s, v13.4s, v10.4s -add v13.4s, v13.4s, v10.4s -mla v12.4S, v20.4S, v31.s[0] -mla v7.4S, v17.4S, v31.s[0] -sub v17.4s, v0.4s, v14.4s -mla v4.4S, v5.4S, v31.s[0] -mla v8.4S, v16.4S, v31.s[0] -add v0.4s, v0.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v16.4s, v19.4s, v9.4s -sqrdmulh v5.4S, v13.4S, v27.s[2] -mul v13.4S, v13.4S,v28.s[2] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v10.4s, v3.4s, v7.4s -add v3.4s, v3.4s, v7.4s -mla v2.4S, v14.4S, v31.s[0] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v11.4s, v4.4s -mla v21.4S, v9.4S, v31.s[0] -mla v15.4S, v12.4S, v31.s[0] -add v11.4s, v11.4s, v4.4s -sqrdmulh v4.4S, v3.4S, v25.s[2] -mul v3.4S, v3.4S,v26.s[2] -sub v12.4s, v6.4s, v8.4s -sqrdmulh v9.4S, v10.4S, v25.s[3] -mul v10.4S, v10.4S,v26.s[3] -add v6.4s, v6.4s, v8.4s -sqrdmulh v8.4S, v12.4S, v25.s[1] -mul v12.4S, v12.4S,v26.s[1] -sub v14.4s, v0.4s, v2.4s -add v0.4s, v0.4s, v2.4s -sqrdmulh v2.4S, v6.4S, v25.s[0] -mul v6.4S, v6.4S,v26.s[0] -sub v7.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -mla v3.4S, v4.4S, v31.s[0] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v17.4s, v21.4s -mla v12.4S, v8.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v19.4S, v23.s[0] -mul v19.4S, v19.4S,v24.s[0] -sub v2.4s, v16.4s, v15.4s -sqrdmulh v8.4S, v7.4S, v23.s[1] -mul v7.4S, v7.4S,v24.s[1] -add v16.4s, v16.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v23.s[2] -mul v16.4S, v16.4S,v24.s[2] -sub v4.4s, v22.4s, v3.4s -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v2.4S, v23.s[3] -mul v2.4S, v2.4S,v24.s[3] -sub v13.4s, v20.4s, v10.4s -add v20.4s, v20.4s, v10.4s -mla v19.4S, v21.4S, v31.s[0] -mla v7.4S, v8.4S, v31.s[0] -sub v8.4s, v5.4s, v12.4s -str q22, [x0, #272] -mla v16.4S, v15.4S, v31.s[0] -mla v2.4S, v3.4S, v31.s[0] -add v5.4s, v5.4s, v12.4s -str q4, [x0, #336] -sub v23.4s, v11.4s, v6.4s -str q20, [x0, #400] -add v11.4s, v11.4s, v6.4s -str q13, [x0, #464] -sub v13.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sub v19.4s, v14.4s, v7.4s -add v14.4s, v14.4s, v7.4s -sub v7.4s, v17.4s, v16.4s -str q5, [x0, #144] -add v17.4s, v17.4s, v16.4s -str q8, [x0, #208] -sub v8.4s, v9.4s, v2.4s -str q11, [x0, #16] -add v9.4s, v9.4s, v2.4s -str q23, [x0, #80] -str q0, [x0, #528] -str q13, [x0, #592] -str q14, [x0, #656] -str q19, [x0, #720] -str q17, [x0, #784] -str q7, [x0, #848] -str q9, [x0, #912] -str q8, [x0, #976] -ldr q18, [x17, #+128] -ldr q1, [x17, #+144] -ldr q10, [x17, #+160] -ldr q21, [x17, #+176] -ldr q22, [x17, #+192] -ldr q15, [x17, #+208] -ldr q3, [x17, #+224] -ldr q12, [x17, #+240] -ldr q4, [x0, #32] -ldr q30, [x0, #48] -ldr q29, [x0, #0] -ldr q28, [x0, #16] -sqrdmulh v27.4S, v4.4S, v1.s[0] -mul v4.4S, v4.4S,v18.s[0] -mla v4.4S, v27.4S, v31.s[0] -sub v27.4s, v29.4s, v4.4s -add v29.4s, v29.4s, v4.4s -sqrdmulh v4.4S, v30.4S, v1.s[0] -mul v30.4S, v30.4S,v18.s[0] -mla v30.4S, v4.4S, v31.s[0] -sub v4.4s, v28.4s, v30.4s -add v28.4s, v28.4s, v30.4s -ldr q30, [x17, #+256] -ldr q26, [x17, #+272] -sqrdmulh v25.4S, v28.4S, v1.s[1] -mul v28.4S, v28.4S,v18.s[1] -mla v28.4S, v25.4S, v31.s[0] -sub v25.4s, v29.4s, v28.4s -add v29.4s, v29.4s, v28.4s -sqrdmulh v28.4S, v4.4S, v1.s[2] -mul v4.4S, v4.4S,v18.s[2] -mla v4.4S, v28.4S, v31.s[0] -sub v28.4s, v27.4s, v4.4s -add v27.4s, v27.4s, v4.4s -str q29, [x0, #0] -str q25, [x0, #16] -str q27, [x0, #32] -str q28, [x0, #48] -ldr q28, [x0, #96] -ldr q27, [x0, #112] -ldr q25, [x0, #64] -ldr q29, [x0, #80] -sqrdmulh v4.4S, v28.4S, v21.s[0] -mul v28.4S, v28.4S,v10.s[0] -mla v28.4S, v4.4S, v31.s[0] -sub v4.4s, v25.4s, v28.4s -add v25.4s, v25.4s, v28.4s -sqrdmulh v28.4S, v27.4S, v21.s[0] -mul v27.4S, v27.4S,v10.s[0] -mla v27.4S, v28.4S, v31.s[0] -sub v28.4s, v29.4s, v27.4s -add v29.4s, v29.4s, v27.4s -ldr q27, [x17, #+288] -ldr q24, [x17, #+304] -sqrdmulh v20.4S, v29.4S, v21.s[1] -mul v29.4S, v29.4S,v10.s[1] -mla v29.4S, v20.4S, v31.s[0] -sub v20.4s, v25.4s, v29.4s -add v25.4s, v25.4s, v29.4s -sqrdmulh v29.4S, v28.4S, v21.s[2] -mul v28.4S, v28.4S,v10.s[2] -mla v28.4S, v29.4S, v31.s[0] -sub v29.4s, v4.4s, v28.4s -add v4.4s, v4.4s, v28.4s -str q25, [x0, #64] -str q20, [x0, #80] -str q4, [x0, #96] -str q29, [x0, #112] -ldr q29, [x0, #160] -ldr q4, [x0, #176] -ldr q20, [x0, #128] -ldr q25, [x0, #144] -sqrdmulh v28.4S, v29.4S, v15.s[0] -mul v29.4S, v29.4S,v22.s[0] -mla v29.4S, v28.4S, v31.s[0] -sub v28.4s, v20.4s, v29.4s -add v20.4s, v20.4s, v29.4s -sqrdmulh v29.4S, v4.4S, v15.s[0] -mul v4.4S, v4.4S,v22.s[0] -mla v4.4S, v29.4S, v31.s[0] -sub v29.4s, v25.4s, v4.4s -add v25.4s, v25.4s, v4.4s -ldr q4, [x17, #+320] -ldr q6, [x17, #+336] -sqrdmulh v5.4S, v25.4S, v15.s[1] -mul v25.4S, v25.4S,v22.s[1] -mla v25.4S, v5.4S, v31.s[0] -sub v5.4s, v20.4s, v25.4s -add v20.4s, v20.4s, v25.4s -sqrdmulh v25.4S, v29.4S, v15.s[2] -mul v29.4S, v29.4S,v22.s[2] -mla v29.4S, v25.4S, v31.s[0] -sub v25.4s, v28.4s, v29.4s -add v28.4s, v28.4s, v29.4s -str q20, [x0, #128] -str q5, [x0, #144] -str q28, [x0, #160] -str q25, [x0, #176] -ldr q25, [x0, #224] -ldr q28, [x0, #240] -ldr q5, [x0, #192] -ldr q20, [x0, #208] -sqrdmulh v29.4S, v25.4S, v12.s[0] -mul v25.4S, v25.4S,v3.s[0] -mla v25.4S, v29.4S, v31.s[0] -sub v29.4s, v5.4s, v25.4s -add v5.4s, v5.4s, v25.4s -sqrdmulh v25.4S, v28.4S, v12.s[0] -mul v28.4S, v28.4S,v3.s[0] -mla v28.4S, v25.4S, v31.s[0] -sub v25.4s, v20.4s, v28.4s -add v20.4s, v20.4s, v28.4s -ldr q28, [x17, #+352] -ldr q16, [x17, #+368] -sqrdmulh v11.4S, v20.4S, v12.s[1] -mul v20.4S, v20.4S,v3.s[1] -mla v20.4S, v11.4S, v31.s[0] -sub v11.4s, v5.4s, v20.4s -add v5.4s, v5.4s, v20.4s -sqrdmulh v20.4S, v25.4S, v12.s[2] -mul v25.4S, v25.4S,v3.s[2] -mla v25.4S, v20.4S, v31.s[0] -sub v20.4s, v29.4s, v25.4s -add v29.4s, v29.4s, v25.4s -str q5, [x0, #192] -str q11, [x0, #208] -str q29, [x0, #224] -str q20, [x0, #240] -ldr q20, [x0, #288] -ldr q29, [x0, #304] -ldr q11, [x0, #256] -ldr q5, [x0, #272] -sqrdmulh v25.4S, v20.4S, v26.s[0] -mul v20.4S, v20.4S,v30.s[0] -mla v20.4S, v25.4S, v31.s[0] -sub v25.4s, v11.4s, v20.4s -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v29.4S, v26.s[0] -mul v29.4S, v29.4S,v30.s[0] -mla v29.4S, v20.4S, v31.s[0] -sub v20.4s, v5.4s, v29.4s -add v5.4s, v5.4s, v29.4s -ldr q29, [x17, #+384] -ldr q2, [x17, #+400] -sqrdmulh v23.4S, v5.4S, v26.s[1] -mul v5.4S, v5.4S,v30.s[1] -mla v5.4S, v23.4S, v31.s[0] -sub v23.4s, v11.4s, v5.4s -add v11.4s, v11.4s, v5.4s -sqrdmulh v5.4S, v20.4S, v26.s[2] -mul v20.4S, v20.4S,v30.s[2] -mla v20.4S, v5.4S, v31.s[0] -sub v5.4s, v25.4s, v20.4s -add v25.4s, v25.4s, v20.4s -str q11, [x0, #256] -str q23, [x0, #272] -str q25, [x0, #288] -str q5, [x0, #304] -ldr q1, [x0, #352] -ldr q18, [x0, #368] -ldr q5, [x0, #320] -ldr q25, [x0, #336] -sqrdmulh v23.4S, v1.4S, v24.s[0] -mul v1.4S, v1.4S,v27.s[0] -mla v1.4S, v23.4S, v31.s[0] -sub v23.4s, v5.4s, v1.4s -add v5.4s, v5.4s, v1.4s -sqrdmulh v1.4S, v18.4S, v24.s[0] -mul v18.4S, v18.4S,v27.s[0] -mla v18.4S, v1.4S, v31.s[0] -sub v1.4s, v25.4s, v18.4s -add v25.4s, v25.4s, v18.4s -ldr q18, [x17, #+416] -ldr q11, [x17, #+432] -sqrdmulh v20.4S, v25.4S, v24.s[1] -mul v25.4S, v25.4S,v27.s[1] -mla v25.4S, v20.4S, v31.s[0] -sub v20.4s, v5.4s, v25.4s -add v5.4s, v5.4s, v25.4s -sqrdmulh v25.4S, v1.4S, v24.s[2] -mul v1.4S, v1.4S,v27.s[2] -mla v1.4S, v25.4S, v31.s[0] -sub v25.4s, v23.4s, v1.4s -add v23.4s, v23.4s, v1.4s -str q5, [x0, #320] -str q20, [x0, #336] -str q23, [x0, #352] -str q25, [x0, #368] -ldr q21, [x0, #416] -ldr q10, [x0, #432] -ldr q25, [x0, #384] -ldr q23, [x0, #400] -sqrdmulh v20.4S, v21.4S, v6.s[0] -mul v21.4S, v21.4S,v4.s[0] -mla v21.4S, v20.4S, v31.s[0] -sub v20.4s, v25.4s, v21.4s -add v25.4s, v25.4s, v21.4s -sqrdmulh v21.4S, v10.4S, v6.s[0] -mul v10.4S, v10.4S,v4.s[0] -mla v10.4S, v21.4S, v31.s[0] -sub v21.4s, v23.4s, v10.4s -add v23.4s, v23.4s, v10.4s -ldr q10, [x17, #+448] -ldr q5, [x17, #+464] -sqrdmulh v1.4S, v23.4S, v6.s[1] -mul v23.4S, v23.4S,v4.s[1] -mla v23.4S, v1.4S, v31.s[0] -sub v1.4s, v25.4s, v23.4s -add v25.4s, v25.4s, v23.4s -sqrdmulh v23.4S, v21.4S, v6.s[2] -mul v21.4S, v21.4S,v4.s[2] -mla v21.4S, v23.4S, v31.s[0] -sub v23.4s, v20.4s, v21.4s -add v20.4s, v20.4s, v21.4s -str q25, [x0, #384] -str q1, [x0, #400] -str q20, [x0, #416] -str q23, [x0, #432] -ldr q15, [x0, #480] -ldr q22, [x0, #496] -ldr q23, [x0, #448] -ldr q20, [x0, #464] -sqrdmulh v1.4S, v15.4S, v16.s[0] -mul v15.4S, v15.4S,v28.s[0] -mla v15.4S, v1.4S, v31.s[0] -sub v1.4s, v23.4s, v15.4s -add v23.4s, v23.4s, v15.4s -sqrdmulh v15.4S, v22.4S, v16.s[0] -mul v22.4S, v22.4S,v28.s[0] -mla v22.4S, v15.4S, v31.s[0] -sub v15.4s, v20.4s, v22.4s -add v20.4s, v20.4s, v22.4s -ldr q22, [x17, #+480] -ldr q25, [x17, #+496] -sqrdmulh v21.4S, v20.4S, v16.s[1] -mul v20.4S, v20.4S,v28.s[1] -mla v20.4S, v21.4S, v31.s[0] -sub v21.4s, v23.4s, v20.4s -add v23.4s, v23.4s, v20.4s -sqrdmulh v20.4S, v15.4S, v16.s[2] -mul v15.4S, v15.4S,v28.s[2] -mla v15.4S, v20.4S, v31.s[0] -sub v20.4s, v1.4s, v15.4s -add v1.4s, v1.4s, v15.4s -str q23, [x0, #448] -str q21, [x0, #464] -str q1, [x0, #480] -str q20, [x0, #496] -ldr q12, [x0, #544] -ldr q3, [x0, #560] -ldr q20, [x0, #512] -ldr q1, [x0, #528] -sqrdmulh v21.4S, v12.4S, v2.s[0] -mul v12.4S, v12.4S,v29.s[0] -mla v12.4S, v21.4S, v31.s[0] -sub v21.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -sqrdmulh v12.4S, v3.4S, v2.s[0] -mul v3.4S, v3.4S,v29.s[0] -mla v3.4S, v12.4S, v31.s[0] -sub v12.4s, v1.4s, v3.4s -add v1.4s, v1.4s, v3.4s -ldr q3, [x17, #+512] -ldr q23, [x17, #+528] -sqrdmulh v15.4S, v1.4S, v2.s[1] -mul v1.4S, v1.4S,v29.s[1] -mla v1.4S, v15.4S, v31.s[0] -sub v15.4s, v20.4s, v1.4s -add v20.4s, v20.4s, v1.4s -sqrdmulh v1.4S, v12.4S, v2.s[2] -mul v12.4S, v12.4S,v29.s[2] -mla v12.4S, v1.4S, v31.s[0] -sub v1.4s, v21.4s, v12.4s -add v21.4s, v21.4s, v12.4s -str q20, [x0, #512] -str q15, [x0, #528] -str q21, [x0, #544] -str q1, [x0, #560] -ldr q26, [x0, #608] -ldr q30, [x0, #624] -ldr q1, [x0, #576] -ldr q21, [x0, #592] -sqrdmulh v15.4S, v26.4S, v11.s[0] -mul v26.4S, v26.4S,v18.s[0] -mla v26.4S, v15.4S, v31.s[0] -sub v15.4s, v1.4s, v26.4s -add v1.4s, v1.4s, v26.4s -sqrdmulh v26.4S, v30.4S, v11.s[0] -mul v30.4S, v30.4S,v18.s[0] -mla v30.4S, v26.4S, v31.s[0] -sub v26.4s, v21.4s, v30.4s -add v21.4s, v21.4s, v30.4s -ldr q30, [x17, #+544] -ldr q20, [x17, #+560] -sqrdmulh v12.4S, v21.4S, v11.s[1] -mul v21.4S, v21.4S,v18.s[1] -mla v21.4S, v12.4S, v31.s[0] -sub v12.4s, v1.4s, v21.4s -add v1.4s, v1.4s, v21.4s -sqrdmulh v21.4S, v26.4S, v11.s[2] -mul v26.4S, v26.4S,v18.s[2] -mla v26.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v26.4s -add v15.4s, v15.4s, v26.4s -str q1, [x0, #576] -str q12, [x0, #592] -str q15, [x0, #608] -str q21, [x0, #624] -ldr q24, [x0, #672] -ldr q27, [x0, #688] -ldr q21, [x0, #640] -ldr q15, [x0, #656] -sqrdmulh v12.4S, v24.4S, v5.s[0] -mul v24.4S, v24.4S,v10.s[0] -mla v24.4S, v12.4S, v31.s[0] -sub v12.4s, v21.4s, v24.4s -add v21.4s, v21.4s, v24.4s -sqrdmulh v24.4S, v27.4S, v5.s[0] -mul v27.4S, v27.4S,v10.s[0] -mla v27.4S, v24.4S, v31.s[0] -sub v24.4s, v15.4s, v27.4s -add v15.4s, v15.4s, v27.4s -ldr q27, [x17, #+576] -ldr q1, [x17, #+592] -sqrdmulh v26.4S, v15.4S, v5.s[1] -mul v15.4S, v15.4S,v10.s[1] -mla v15.4S, v26.4S, v31.s[0] -sub v26.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -sqrdmulh v15.4S, v24.4S, v5.s[2] -mul v24.4S, v24.4S,v10.s[2] -mla v24.4S, v15.4S, v31.s[0] -sub v15.4s, v12.4s, v24.4s -add v12.4s, v12.4s, v24.4s -str q21, [x0, #640] -str q26, [x0, #656] -str q12, [x0, #672] -str q15, [x0, #688] -ldr q6, [x0, #736] -ldr q4, [x0, #752] -ldr q15, [x0, #704] -ldr q12, [x0, #720] -sqrdmulh v26.4S, v6.4S, v25.s[0] -mul v6.4S, v6.4S,v22.s[0] -mla v6.4S, v26.4S, v31.s[0] -sub v26.4s, v15.4s, v6.4s -add v15.4s, v15.4s, v6.4s -sqrdmulh v6.4S, v4.4S, v25.s[0] -mul v4.4S, v4.4S,v22.s[0] -mla v4.4S, v6.4S, v31.s[0] -sub v6.4s, v12.4s, v4.4s -add v12.4s, v12.4s, v4.4s -ldr q4, [x17, #+608] -ldr q21, [x17, #+624] -sqrdmulh v24.4S, v12.4S, v25.s[1] -mul v12.4S, v12.4S,v22.s[1] -mla v12.4S, v24.4S, v31.s[0] -sub v24.4s, v15.4s, v12.4s -add v15.4s, v15.4s, v12.4s -sqrdmulh v12.4S, v6.4S, v25.s[2] -mul v6.4S, v6.4S,v22.s[2] -mla v6.4S, v12.4S, v31.s[0] -sub v12.4s, v26.4s, v6.4s -add v26.4s, v26.4s, v6.4s -str q15, [x0, #704] -str q24, [x0, #720] -str q26, [x0, #736] -str q12, [x0, #752] -ldr q16, [x0, #800] -ldr q28, [x0, #816] -ldr q12, [x0, #768] -ldr q26, [x0, #784] -sqrdmulh v24.4S, v16.4S, v23.s[0] -mul v16.4S, v16.4S,v3.s[0] -mla v16.4S, v24.4S, v31.s[0] -sub v24.4s, v12.4s, v16.4s -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v28.4S, v23.s[0] -mul v28.4S, v28.4S,v3.s[0] -mla v28.4S, v16.4S, v31.s[0] -sub v16.4s, v26.4s, v28.4s -add v26.4s, v26.4s, v28.4s -sqrdmulh v28.4S, v26.4S, v23.s[1] -mul v26.4S, v26.4S,v3.s[1] -mla v26.4S, v28.4S, v31.s[0] -sub v28.4s, v12.4s, v26.4s -add v12.4s, v12.4s, v26.4s -sqrdmulh v26.4S, v16.4S, v23.s[2] -mul v16.4S, v16.4S,v3.s[2] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v24.4s, v16.4s -add v24.4s, v24.4s, v16.4s -str q12, [x0, #768] -str q28, [x0, #784] -str q24, [x0, #800] -str q26, [x0, #816] -ldr q2, [x0, #864] -ldr q29, [x0, #880] -ldr q26, [x0, #832] -ldr q24, [x0, #848] -sqrdmulh v28.4S, v2.4S, v20.s[0] -mul v2.4S, v2.4S,v30.s[0] -mla v2.4S, v28.4S, v31.s[0] -sub v28.4s, v26.4s, v2.4s -add v26.4s, v26.4s, v2.4s -sqrdmulh v2.4S, v29.4S, v20.s[0] -mul v29.4S, v29.4S,v30.s[0] -mla v29.4S, v2.4S, v31.s[0] -sub v2.4s, v24.4s, v29.4s -add v24.4s, v24.4s, v29.4s -sqrdmulh v29.4S, v24.4S, v20.s[1] -mul v24.4S, v24.4S,v30.s[1] -mla v24.4S, v29.4S, v31.s[0] -sub v29.4s, v26.4s, v24.4s -add v26.4s, v26.4s, v24.4s -sqrdmulh v24.4S, v2.4S, v20.s[2] -mul v2.4S, v2.4S,v30.s[2] -mla v2.4S, v24.4S, v31.s[0] -sub v24.4s, v28.4s, v2.4s -add v28.4s, v28.4s, v2.4s -str q26, [x0, #832] -str q29, [x0, #848] -str q28, [x0, #864] -str q24, [x0, #880] -ldr q11, [x0, #928] -ldr q18, [x0, #944] -ldr q24, [x0, #896] -ldr q28, [x0, #912] -sqrdmulh v29.4S, v11.4S, v1.s[0] -mul v11.4S, v11.4S,v27.s[0] -mla v11.4S, v29.4S, v31.s[0] -sub v29.4s, v24.4s, v11.4s -add v24.4s, v24.4s, v11.4s -sqrdmulh v11.4S, v18.4S, v1.s[0] -mul v18.4S, v18.4S,v27.s[0] -mla v18.4S, v11.4S, v31.s[0] -sub v11.4s, v28.4s, v18.4s -add v28.4s, v28.4s, v18.4s -sqrdmulh v18.4S, v28.4S, v1.s[1] -mul v28.4S, v28.4S,v27.s[1] -mla v28.4S, v18.4S, v31.s[0] -sub v18.4s, v24.4s, v28.4s -add v24.4s, v24.4s, v28.4s -sqrdmulh v28.4S, v11.4S, v1.s[2] -mul v11.4S, v11.4S,v27.s[2] -mla v11.4S, v28.4S, v31.s[0] -sub v28.4s, v29.4s, v11.4s -add v29.4s, v29.4s, v11.4s -str q24, [x0, #896] -str q18, [x0, #912] -str q29, [x0, #928] -str q28, [x0, #944] -ldr q5, [x0, #992] -ldr q10, [x0, #1008] -ldr q28, [x0, #960] -ldr q29, [x0, #976] -sqrdmulh v18.4S, v5.4S, v21.s[0] -mul v5.4S, v5.4S,v4.s[0] -mla v5.4S, v18.4S, v31.s[0] -sub v18.4s, v28.4s, v5.4s -add v28.4s, v28.4s, v5.4s -sqrdmulh v5.4S, v10.4S, v21.s[0] -mul v10.4S, v10.4S,v4.s[0] -mla v10.4S, v5.4S, v31.s[0] -sub v5.4s, v29.4s, v10.4s -add v29.4s, v29.4s, v10.4s -sqrdmulh v10.4S, v29.4S, v21.s[1] -mul v29.4S, v29.4S,v4.s[1] -mla v29.4S, v10.4S, v31.s[0] -sub v10.4s, v28.4s, v29.4s -add v28.4s, v28.4s, v29.4s -sqrdmulh v29.4S, v5.4S, v21.s[2] -mul v5.4S, v5.4S,v4.s[2] -mla v5.4S, v29.4S, v31.s[0] -sub v29.4s, v18.4s, v5.4s -add v18.4s, v18.4s, v5.4s -str q28, [x0, #960] -str q10, [x0, #976] -str q18, [x0, #992] -str q29, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1464 -// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_1.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_1.s deleted file mode 100644 index ab592c3..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_1.s +++ /dev/null @@ -1,1494 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_7_z4_1 -.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_1 -ntt_u32_incomplete_neon_asm_var_4_2_7_z4_1: -_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_1: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #928] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #992] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q18, [x0, #800] -sqrdmulh v17.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -ldr q16, [x0, #864] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -mla v22.4S, v21.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -mla v18.4S, v17.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -ldr q3, [x0, #544] -sqrdmulh v17.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -ldr q19, [x0, #608] -sqrdmulh v21.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q2, [x0, #672] -ldr q1, [x0, #416] -sqrdmulh v0.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v15.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -ldr q22, [x0, #736] -ldr q14, [x0, #480] -sqrdmulh v13.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v12.4s, v14.4s, v20.4s -add v14.4s, v14.4s, v20.4s -ldr q20, [x0, #288] -mla v3.4S, v17.4S, v31.s[0] -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v20.4s, v18.4s -mla v2.4S, v0.4S, v31.s[0] -mla v22.4S, v13.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -ldr q18, [x0, #352] -sqrdmulh v13.4S, v1.4S, v29.s[1] -mul v1.4S, v1.4S,v30.s[1] -sub v0.4s, v18.4s, v16.4s -sqrdmulh v17.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -add v18.4s, v18.4s, v16.4s -ldr q16, [x0, #32] -sqrdmulh v11.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v10.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -ldr q3, [x0, #96] -sqrdmulh v9.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v8.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -ldr q19, [x0, #160] -mla v1.4S, v13.4S, v31.s[0] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v19.4s, v2.4s -mla v20.4S, v11.4S, v31.s[0] -mla v18.4S, v9.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -ldr q2, [x0, #224] -sqrdmulh v9.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v11.4s, v2.4s, v22.4s -sqrdmulh v13.4S, v12.4S, v29.s[2] -mul v12.4S, v12.4S,v30.s[2] -add v2.4s, v2.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v7.4s, v19.4s, v1.4s -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v29.s[2] -mul v0.4S, v0.4S,v30.s[2] -sub v6.4s, v2.4s, v14.4s -add v2.4s, v2.4s, v14.4s -mla v15.4S, v9.4S, v31.s[0] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v16.4s, v20.4s -mla v21.4S, v22.4S, v31.s[0] -mla v0.4S, v1.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -sub v1.4s, v3.4s, v18.4s -sqrdmulh v22.4S, v6.4S, v27.s[1] -mul v6.4S, v6.4S,v28.s[1] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v19.4S, v27.s[0] -mul v19.4S, v19.4S,v28.s[0] -sub v9.4s, v17.4s, v15.4s -add v17.4s, v17.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v14.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -mla v7.4S, v20.4S, v31.s[0] -mla v6.4S, v22.4S, v31.s[0] -sub v22.4s, v10.4s, v21.4s -mla v19.4S, v18.4S, v31.s[0] -mla v2.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v15.4s, v8.4s, v0.4s -sqrdmulh v18.4S, v11.4S, v27.s[2] -mul v11.4S, v11.4S,v28.s[2] -add v8.4s, v8.4s, v0.4s -sqrdmulh v0.4S, v9.4S, v27.s[3] -mul v9.4S, v9.4S,v28.s[3] -sub v20.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[3] -mul v14.4S, v14.4S,v28.s[3] -sub v12.4s, v1.4s, v6.4s -add v1.4s, v1.4s, v6.4s -mla v17.4S, v21.4S, v31.s[0] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v16.4s, v19.4s -mla v9.4S, v0.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v16.4s, v16.4s, v19.4s -sqrdmulh v19.4S, v1.4S, v25.s[2] -mul v1.4S, v1.4S,v26.s[2] -sub v7.4s, v3.4s, v2.4s -sqrdmulh v0.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v7.4S, v25.s[1] -mul v7.4S, v7.4S,v26.s[1] -sub v21.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -sqrdmulh v17.4S, v3.4S, v25.s[0] -mul v3.4S, v3.4S,v26.s[0] -sub v6.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -mla v1.4S, v19.4S, v31.s[0] -mla v12.4S, v0.4S, v31.s[0] -sub v0.4s, v22.4s, v9.4s -mla v7.4S, v2.4S, v31.s[0] -mla v3.4S, v17.4S, v31.s[0] -add v22.4s, v22.4s, v9.4s -sqrdmulh v9.4S, v8.4S, v23.s[0] -mul v8.4S, v8.4S,v24.s[0] -sub v17.4s, v15.4s, v14.4s -sqrdmulh v2.4S, v6.4S, v23.s[1] -mul v6.4S, v6.4S,v24.s[1] -add v15.4s, v15.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v23.s[2] -mul v15.4S, v15.4S,v24.s[2] -sub v19.4s, v13.4s, v1.4s -add v13.4s, v13.4s, v1.4s -sqrdmulh v1.4S, v17.4S, v23.s[3] -mul v17.4S, v17.4S,v24.s[3] -sub v11.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -mla v8.4S, v9.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -sub v2.4s, v18.4s, v7.4s -str q13, [x0, #288] -mla v15.4S, v14.4S, v31.s[0] -mla v17.4S, v1.4S, v31.s[0] -add v18.4s, v18.4s, v7.4s -str q19, [x0, #352] -ldr q19, [x0, #944] -sqrdmulh v7.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v1.4s, v16.4s, v3.4s -str q20, [x0, #416] -ldr q20, [x0, #1008] -sqrdmulh v14.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v16.4s, v16.4s, v3.4s -str q11, [x0, #480] -ldr q11, [x0, #816] -sqrdmulh v3.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -sub v13.4s, v10.4s, v8.4s -add v10.4s, v10.4s, v8.4s -ldr q8, [x0, #880] -sqrdmulh v9.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v12.4s, v21.4s, v6.4s -add v21.4s, v21.4s, v6.4s -mla v19.4S, v7.4S, v31.s[0] -mla v20.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v15.4s -str q18, [x0, #160] -mla v11.4S, v3.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -str q2, [x0, #224] -ldr q2, [x0, #560] -sqrdmulh v15.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v9.4s, v0.4s, v17.4s -str q16, [x0, #32] -ldr q16, [x0, #624] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -add v0.4s, v0.4s, v17.4s -str q1, [x0, #96] -ldr q1, [x0, #688] -ldr q17, [x0, #432] -sqrdmulh v18.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -sub v7.4s, v17.4s, v19.4s -add v17.4s, v17.4s, v19.4s -ldr q19, [x0, #752] -ldr q6, [x0, #496] -sqrdmulh v5.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v4.4s, v6.4s, v20.4s -add v6.4s, v6.4s, v20.4s -ldr q20, [x0, #304] -mla v2.4S, v15.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -sub v3.4s, v20.4s, v11.4s -str q10, [x0, #544] -mla v1.4S, v18.4S, v31.s[0] -mla v19.4S, v5.4S, v31.s[0] -add v20.4s, v20.4s, v11.4s -str q13, [x0, #608] -ldr q13, [x0, #368] -sqrdmulh v11.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v5.4s, v13.4s, v8.4s -str q21, [x0, #672] -sqrdmulh v21.4S, v6.4S, v29.s[1] -mul v6.4S, v6.4S,v30.s[1] -add v13.4s, v13.4s, v8.4s -str q12, [x0, #736] -ldr q12, [x0, #48] -sqrdmulh v8.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v18.4s, v12.4s, v2.4s -add v12.4s, v12.4s, v2.4s -ldr q2, [x0, #112] -sqrdmulh v10.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v15.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -ldr q16, [x0, #176] -mla v17.4S, v11.4S, v31.s[0] -mla v6.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v1.4s -str q22, [x0, #800] -mla v20.4S, v8.4S, v31.s[0] -mla v13.4S, v10.4S, v31.s[0] -add v16.4s, v16.4s, v1.4s -str q14, [x0, #864] -ldr q14, [x0, #240] -sqrdmulh v1.4S, v7.4S, v29.s[2] -mul v7.4S, v7.4S,v30.s[2] -sub v10.4s, v14.4s, v19.4s -str q0, [x0, #928] -sqrdmulh v0.4S, v4.4S, v29.s[2] -mul v4.4S, v4.4S,v30.s[2] -add v14.4s, v14.4s, v19.4s -str q9, [x0, #992] -sqrdmulh v9.4S, v3.4S, v29.s[2] -mul v3.4S, v3.4S,v30.s[2] -sub v19.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -sqrdmulh v17.4S, v5.4S, v29.s[2] -mul v5.4S, v5.4S,v30.s[2] -sub v8.4s, v14.4s, v6.4s -add v14.4s, v14.4s, v6.4s -mla v7.4S, v1.4S, v31.s[0] -mla v4.4S, v0.4S, v31.s[0] -sub v0.4s, v12.4s, v20.4s -mla v3.4S, v9.4S, v31.s[0] -mla v5.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -sub v17.4s, v2.4s, v13.4s -sqrdmulh v9.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -add v2.4s, v2.4s, v13.4s -sqrdmulh v13.4S, v16.4S, v27.s[0] -mul v16.4S, v16.4S,v28.s[0] -sub v1.4s, v21.4s, v7.4s -add v21.4s, v21.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v6.4s, v10.4s, v4.4s -add v10.4s, v10.4s, v4.4s -mla v19.4S, v20.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -sub v9.4s, v18.4s, v3.4s -mla v16.4S, v13.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[2] -mul v21.4S, v21.4S,v28.s[2] -sub v7.4s, v15.4s, v5.4s -sqrdmulh v13.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -add v15.4s, v15.4s, v5.4s -sqrdmulh v5.4S, v1.4S, v27.s[3] -mul v1.4S, v1.4S,v28.s[3] -sub v20.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v27.s[3] -mul v6.4S, v6.4S,v28.s[3] -sub v4.4s, v17.4s, v8.4s -add v17.4s, v17.4s, v8.4s -mla v21.4S, v3.4S, v31.s[0] -mla v10.4S, v13.4S, v31.s[0] -sub v13.4s, v12.4s, v16.4s -mla v1.4S, v5.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v25.s[2] -mul v17.4S, v17.4S,v26.s[2] -sub v19.4s, v2.4s, v14.4s -sqrdmulh v5.4S, v4.4S, v25.s[3] -mul v4.4S, v4.4S,v26.s[3] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v3.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v15.4s, v10.4s -add v15.4s, v15.4s, v10.4s -mla v17.4S, v16.4S, v31.s[0] -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v9.4s, v1.4s -mla v19.4S, v14.4S, v31.s[0] -mla v2.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v23.s[0] -mul v15.4S, v15.4S,v24.s[0] -sub v21.4s, v7.4s, v6.4s -sqrdmulh v14.4S, v8.4S, v23.s[1] -mul v8.4S, v8.4S,v24.s[1] -add v7.4s, v7.4s, v6.4s -sqrdmulh v6.4S, v7.4S, v23.s[2] -mul v7.4S, v7.4S,v24.s[2] -sub v16.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v23.s[3] -mul v21.4S, v21.4S,v24.s[3] -sub v10.4s, v20.4s, v4.4s -add v20.4s, v20.4s, v4.4s -mla v15.4S, v1.4S, v31.s[0] -mla v8.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v19.4s -str q0, [x0, #304] -mla v7.4S, v6.4S, v31.s[0] -mla v21.4S, v17.4S, v31.s[0] -add v13.4s, v13.4s, v19.4s -str q16, [x0, #368] -ldr q16, [x0, #896] -sqrdmulh v19.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v17.4s, v12.4s, v2.4s -str q20, [x0, #432] -ldr q20, [x0, #960] -sqrdmulh v6.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v12.4s, v12.4s, v2.4s -str q10, [x0, #496] -ldr q10, [x0, #768] -sqrdmulh v2.4S, v10.4S, v29.s[0] -mul v10.4S, v10.4S,v30.s[0] -sub v0.4s, v18.4s, v15.4s -add v18.4s, v18.4s, v15.4s -ldr q15, [x0, #832] -sqrdmulh v1.4S, v15.4S, v29.s[0] -mul v15.4S, v15.4S,v30.s[0] -sub v4.4s, v3.4s, v8.4s -add v3.4s, v3.4s, v8.4s -mla v16.4S, v19.4S, v31.s[0] -mla v20.4S, v6.4S, v31.s[0] -sub v6.4s, v9.4s, v7.4s -str q13, [x0, #176] -mla v10.4S, v2.4S, v31.s[0] -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v7.4s -str q14, [x0, #240] -ldr q14, [x0, #512] -sqrdmulh v7.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v1.4s, v5.4s, v21.4s -str q12, [x0, #48] -ldr q12, [x0, #576] -sqrdmulh v2.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -add v5.4s, v5.4s, v21.4s -str q17, [x0, #112] -ldr q17, [x0, #640] -ldr q21, [x0, #384] -sqrdmulh v13.4S, v17.4S, v29.s[0] -mul v17.4S, v17.4S,v30.s[0] -sub v19.4s, v21.4s, v16.4s -add v21.4s, v21.4s, v16.4s -ldr q16, [x0, #704] -ldr q8, [x0, #448] -sqrdmulh v22.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v11.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -ldr q20, [x0, #256] -mla v14.4S, v7.4S, v31.s[0] -mla v12.4S, v2.4S, v31.s[0] -sub v2.4s, v20.4s, v10.4s -str q18, [x0, #560] -mla v17.4S, v13.4S, v31.s[0] -mla v16.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -str q0, [x0, #624] -ldr q0, [x0, #320] -sqrdmulh v10.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v22.4s, v0.4s, v15.4s -str q3, [x0, #688] -sqrdmulh v3.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -add v0.4s, v0.4s, v15.4s -str q4, [x0, #752] -ldr q4, [x0, #0] -sqrdmulh v15.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v13.4s, v4.4s, v14.4s -add v4.4s, v4.4s, v14.4s -ldr q14, [x0, #64] -sqrdmulh v18.4S, v0.4S, v29.s[1] -mul v0.4S, v0.4S,v30.s[1] -sub v7.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -ldr q12, [x0, #128] -mla v21.4S, v10.4S, v31.s[0] -mla v8.4S, v3.4S, v31.s[0] -sub v3.4s, v12.4s, v17.4s -str q9, [x0, #816] -mla v20.4S, v15.4S, v31.s[0] -mla v0.4S, v18.4S, v31.s[0] -add v12.4s, v12.4s, v17.4s -str q6, [x0, #880] -ldr q6, [x0, #192] -sqrdmulh v17.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v18.4s, v6.4s, v16.4s -str q5, [x0, #944] -sqrdmulh v5.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -add v6.4s, v6.4s, v16.4s -str q1, [x0, #1008] -sqrdmulh v1.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v16.4s, v12.4s, v21.4s -add v12.4s, v12.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v15.4s, v6.4s, v8.4s -add v6.4s, v6.4s, v8.4s -mla v19.4S, v17.4S, v31.s[0] -mla v11.4S, v5.4S, v31.s[0] -sub v5.4s, v4.4s, v20.4s -mla v2.4S, v1.4S, v31.s[0] -mla v22.4S, v21.4S, v31.s[0] -add v4.4s, v4.4s, v20.4s -sqrdmulh v20.4S, v16.4S, v27.s[1] -mul v16.4S, v16.4S,v28.s[1] -sub v21.4s, v14.4s, v0.4s -sqrdmulh v1.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v17.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v27.s[0] -mul v6.4S, v6.4S,v28.s[0] -sub v8.4s, v18.4s, v11.4s -add v18.4s, v18.4s, v11.4s -mla v16.4S, v20.4S, v31.s[0] -mla v15.4S, v1.4S, v31.s[0] -sub v1.4s, v13.4s, v2.4s -mla v12.4S, v0.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v3.4S, v27.s[2] -mul v3.4S, v3.4S,v28.s[2] -sub v19.4s, v7.4s, v22.4s -sqrdmulh v0.4S, v18.4S, v27.s[2] -mul v18.4S, v18.4S,v28.s[2] -add v7.4s, v7.4s, v22.4s -sqrdmulh v22.4S, v17.4S, v27.s[3] -mul v17.4S, v17.4S,v28.s[3] -sub v20.4s, v5.4s, v16.4s -add v5.4s, v5.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v11.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -mla v3.4S, v2.4S, v31.s[0] -mla v18.4S, v0.4S, v31.s[0] -sub v0.4s, v4.4s, v12.4s -mla v17.4S, v22.4S, v31.s[0] -mla v8.4S, v16.4S, v31.s[0] -add v4.4s, v4.4s, v12.4s -sqrdmulh v12.4S, v21.4S, v25.s[2] -mul v21.4S, v21.4S,v26.s[2] -sub v16.4s, v14.4s, v6.4s -sqrdmulh v22.4S, v11.4S, v25.s[3] -mul v11.4S, v11.4S,v26.s[3] -add v14.4s, v14.4s, v6.4s -sqrdmulh v6.4S, v16.4S, v25.s[1] -mul v16.4S, v16.4S,v26.s[1] -sub v2.4s, v13.4s, v3.4s -add v13.4s, v13.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v25.s[0] -mul v14.4S, v14.4S,v26.s[0] -sub v15.4s, v7.4s, v18.4s -add v7.4s, v7.4s, v18.4s -mla v21.4S, v12.4S, v31.s[0] -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v1.4s, v17.4s -mla v16.4S, v6.4S, v31.s[0] -mla v14.4S, v3.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -sqrdmulh v17.4S, v7.4S, v23.s[0] -mul v7.4S, v7.4S,v24.s[0] -sub v3.4s, v19.4s, v8.4s -sqrdmulh v6.4S, v15.4S, v23.s[1] -mul v15.4S, v15.4S,v24.s[1] -add v19.4s, v19.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v23.s[2] -mul v19.4S, v19.4S,v24.s[2] -sub v12.4s, v5.4s, v21.4s -add v5.4s, v5.4s, v21.4s -sqrdmulh v21.4S, v3.4S, v23.s[3] -mul v3.4S, v3.4S,v24.s[3] -sub v18.4s, v20.4s, v11.4s -add v20.4s, v20.4s, v11.4s -mla v7.4S, v17.4S, v31.s[0] -mla v15.4S, v6.4S, v31.s[0] -sub v6.4s, v0.4s, v16.4s -str q5, [x0, #256] -mla v19.4S, v8.4S, v31.s[0] -mla v3.4S, v21.4S, v31.s[0] -add v0.4s, v0.4s, v16.4s -str q12, [x0, #320] -ldr q12, [x0, #912] -sqrdmulh v16.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -sub v21.4s, v4.4s, v14.4s -str q20, [x0, #384] -ldr q20, [x0, #976] -sqrdmulh v8.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v4.4s, v4.4s, v14.4s -str q18, [x0, #448] -ldr q18, [x0, #784] -sqrdmulh v14.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -sub v5.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -ldr q7, [x0, #848] -sqrdmulh v17.4S, v7.4S, v29.s[0] -mul v7.4S, v7.4S,v30.s[0] -sub v11.4s, v2.4s, v15.4s -add v2.4s, v2.4s, v15.4s -mla v12.4S, v16.4S, v31.s[0] -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v1.4s, v19.4s -str q0, [x0, #128] -mla v18.4S, v14.4S, v31.s[0] -mla v7.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v19.4s -str q6, [x0, #192] -ldr q6, [x0, #528] -sqrdmulh v19.4S, v6.4S, v29.s[0] -mul v6.4S, v6.4S,v30.s[0] -sub v17.4s, v22.4s, v3.4s -str q4, [x0, #0] -ldr q4, [x0, #592] -sqrdmulh v14.4S, v4.4S, v29.s[0] -mul v4.4S, v4.4S,v30.s[0] -add v22.4s, v22.4s, v3.4s -str q21, [x0, #64] -ldr q21, [x0, #656] -ldr q3, [x0, #400] -sqrdmulh v0.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -sub v16.4s, v3.4s, v12.4s -add v3.4s, v3.4s, v12.4s -ldr q12, [x0, #720] -ldr q15, [x0, #464] -sqrdmulh v9.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -sub v10.4s, v15.4s, v20.4s -add v15.4s, v15.4s, v20.4s -ldr q20, [x0, #272] -mla v6.4S, v19.4S, v31.s[0] -mla v4.4S, v14.4S, v31.s[0] -sub v14.4s, v20.4s, v18.4s -str q13, [x0, #512] -mla v21.4S, v0.4S, v31.s[0] -mla v12.4S, v9.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -str q5, [x0, #576] -ldr q5, [x0, #336] -sqrdmulh v18.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v9.4s, v5.4s, v7.4s -str q2, [x0, #640] -sqrdmulh v2.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -add v5.4s, v5.4s, v7.4s -str q11, [x0, #704] -ldr q11, [x0, #16] -sqrdmulh v7.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v0.4s, v11.4s, v6.4s -add v11.4s, v11.4s, v6.4s -ldr q6, [x0, #80] -sqrdmulh v13.4S, v5.4S, v29.s[1] -mul v5.4S, v5.4S,v30.s[1] -sub v19.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -ldr q4, [x0, #144] -mla v3.4S, v18.4S, v31.s[0] -mla v15.4S, v2.4S, v31.s[0] -sub v2.4s, v4.4s, v21.4s -str q1, [x0, #768] -mla v20.4S, v7.4S, v31.s[0] -mla v5.4S, v13.4S, v31.s[0] -add v4.4s, v4.4s, v21.4s -str q8, [x0, #832] -ldr q8, [x0, #208] -sqrdmulh v21.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -sub v13.4s, v8.4s, v12.4s -str q22, [x0, #896] -sqrdmulh v22.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -add v8.4s, v8.4s, v12.4s -str q17, [x0, #960] -sqrdmulh v17.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v12.4s, v4.4s, v3.4s -add v4.4s, v4.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v7.4s, v8.4s, v15.4s -add v8.4s, v8.4s, v15.4s -mla v16.4S, v21.4S, v31.s[0] -mla v10.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v20.4s -mla v14.4S, v17.4S, v31.s[0] -mla v9.4S, v3.4S, v31.s[0] -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v12.4S, v27.s[1] -mul v12.4S, v12.4S,v28.s[1] -sub v3.4s, v6.4s, v5.4s -sqrdmulh v17.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v4.4S, v27.s[0] -mul v4.4S, v4.4S,v28.s[0] -sub v21.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v27.s[0] -mul v8.4S, v8.4S,v28.s[0] -sub v15.4s, v13.4s, v10.4s -add v13.4s, v13.4s, v10.4s -mla v12.4S, v20.4S, v31.s[0] -mla v7.4S, v17.4S, v31.s[0] -sub v17.4s, v0.4s, v14.4s -mla v4.4S, v5.4S, v31.s[0] -mla v8.4S, v16.4S, v31.s[0] -add v0.4s, v0.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v16.4s, v19.4s, v9.4s -sqrdmulh v5.4S, v13.4S, v27.s[2] -mul v13.4S, v13.4S,v28.s[2] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v10.4s, v3.4s, v7.4s -add v3.4s, v3.4s, v7.4s -mla v2.4S, v14.4S, v31.s[0] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v11.4s, v4.4s -mla v21.4S, v9.4S, v31.s[0] -mla v15.4S, v12.4S, v31.s[0] -add v11.4s, v11.4s, v4.4s -sqrdmulh v4.4S, v3.4S, v25.s[2] -mul v3.4S, v3.4S,v26.s[2] -sub v12.4s, v6.4s, v8.4s -sqrdmulh v9.4S, v10.4S, v25.s[3] -mul v10.4S, v10.4S,v26.s[3] -add v6.4s, v6.4s, v8.4s -sqrdmulh v8.4S, v12.4S, v25.s[1] -mul v12.4S, v12.4S,v26.s[1] -sub v14.4s, v0.4s, v2.4s -add v0.4s, v0.4s, v2.4s -sqrdmulh v2.4S, v6.4S, v25.s[0] -mul v6.4S, v6.4S,v26.s[0] -sub v7.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -mla v3.4S, v4.4S, v31.s[0] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v17.4s, v21.4s -mla v12.4S, v8.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v19.4S, v23.s[0] -mul v19.4S, v19.4S,v24.s[0] -sub v2.4s, v16.4s, v15.4s -sqrdmulh v8.4S, v7.4S, v23.s[1] -mul v7.4S, v7.4S,v24.s[1] -add v16.4s, v16.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v23.s[2] -mul v16.4S, v16.4S,v24.s[2] -sub v4.4s, v22.4s, v3.4s -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v2.4S, v23.s[3] -mul v2.4S, v2.4S,v24.s[3] -sub v13.4s, v20.4s, v10.4s -add v20.4s, v20.4s, v10.4s -mla v19.4S, v21.4S, v31.s[0] -mla v7.4S, v8.4S, v31.s[0] -sub v8.4s, v5.4s, v12.4s -str q22, [x0, #272] -mla v16.4S, v15.4S, v31.s[0] -mla v2.4S, v3.4S, v31.s[0] -add v5.4s, v5.4s, v12.4s -str q4, [x0, #336] -sub v23.4s, v11.4s, v6.4s -str q20, [x0, #400] -add v11.4s, v11.4s, v6.4s -str q13, [x0, #464] -sub v13.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sub v19.4s, v14.4s, v7.4s -add v14.4s, v14.4s, v7.4s -sub v7.4s, v17.4s, v16.4s -str q5, [x0, #144] -add v17.4s, v17.4s, v16.4s -str q8, [x0, #208] -sub v8.4s, v9.4s, v2.4s -str q11, [x0, #16] -add v9.4s, v9.4s, v2.4s -str q23, [x0, #80] -str q0, [x0, #528] -str q13, [x0, #592] -str q14, [x0, #656] -str q19, [x0, #720] -str q17, [x0, #784] -str q7, [x0, #848] -str q9, [x0, #912] -str q8, [x0, #976] -ldr q18, [x0, #32] -ldr q1, [x0, #48] -ldr q10, [x0, #0] -ldr q21, [x0, #16] -ldr q22, [x0, #96] -ldr q15, [x0, #112] -ldr q3, [x0, #64] -ldr q12, [x0, #80] -ldr q4, [x0, #160] -ldr q30, [x0, #176] -ldr q29, [x0, #128] -ldr q28, [x0, #144] -ldr q27, [x0, #224] -ldr q26, [x0, #240] -ldr q25, [x0, #192] -ldr q24, [x0, #208] -ldr q20, [x17, #+128] -ldr q6, [x17, #+144] -ldr q5, [x17, #+160] -ldr q16, [x17, #+176] -ldr q11, [x17, #+192] -ldr q2, [x17, #+208] -ldr q23, [x17, #+224] -ldr q0, [x17, #+240] -sqrdmulh v13.4S, v18.4S, v6.s[0] -mul v18.4S, v18.4S,v20.s[0] -sqrdmulh v14.4S, v1.4S, v6.s[0] -mul v1.4S, v1.4S,v20.s[0] -mla v18.4S, v13.4S, v31.s[0] -sqrdmulh v13.4S, v22.4S, v16.s[0] -mul v22.4S, v22.4S,v5.s[0] -mla v1.4S, v14.4S, v31.s[0] -sub v14.4s, v10.4s, v18.4s -add v10.4s, v10.4s, v18.4s -sqrdmulh v18.4S, v15.4S, v16.s[0] -mul v15.4S, v15.4S,v5.s[0] -mla v22.4S, v13.4S, v31.s[0] -sub v13.4s, v21.4s, v1.4s -add v21.4s, v21.4s, v1.4s -sqrdmulh v1.4S, v21.4S, v6.s[1] -mul v21.4S, v21.4S,v20.s[1] -mla v15.4S, v18.4S, v31.s[0] -sub v18.4s, v3.4s, v22.4s -add v3.4s, v3.4s, v22.4s -sqrdmulh v22.4S, v13.4S, v6.s[2] -mul v13.4S, v13.4S,v20.s[2] -mla v21.4S, v1.4S, v31.s[0] -sub v1.4s, v12.4s, v15.4s -add v12.4s, v12.4s, v15.4s -sqrdmulh v15.4S, v12.4S, v16.s[1] -mul v12.4S, v12.4S,v5.s[1] -mla v13.4S, v22.4S, v31.s[0] -sub v22.4s, v10.4s, v21.4s -add v10.4s, v10.4s, v21.4s -sqrdmulh v6.4S, v1.4S, v16.s[2] -mul v1.4S, v1.4S,v5.s[2] -mla v12.4S, v15.4S, v31.s[0] -sub v15.4s, v14.4s, v13.4s -add v14.4s, v14.4s, v13.4s -sqrdmulh v13.4S, v4.4S, v2.s[0] -mul v4.4S, v4.4S,v11.s[0] -mla v1.4S, v6.4S, v31.s[0] -sub v6.4s, v3.4s, v12.4s -add v3.4s, v3.4s, v12.4s -sqrdmulh v16.4S, v30.4S, v2.s[0] -mul v30.4S, v30.4S,v11.s[0] -mla v4.4S, v13.4S, v31.s[0] -sub v13.4s, v18.4s, v1.4s -add v18.4s, v18.4s, v1.4s -sqrdmulh v1.4S, v27.4S, v0.s[0] -mul v27.4S, v27.4S,v23.s[0] -mla v30.4S, v16.4S, v31.s[0] -sub v16.4s, v29.4s, v4.4s -add v29.4s, v29.4s, v4.4s -sqrdmulh v4.4S, v26.4S, v0.s[0] -mul v26.4S, v26.4S,v23.s[0] -mla v27.4S, v1.4S, v31.s[0] -sub v1.4s, v28.4s, v30.4s -add v28.4s, v28.4s, v30.4s -sqrdmulh v30.4S, v28.4S, v2.s[1] -mul v28.4S, v28.4S,v11.s[1] -mla v26.4S, v4.4S, v31.s[0] -sub v4.4s, v25.4s, v27.4s -add v25.4s, v25.4s, v27.4s -sqrdmulh v27.4S, v1.4S, v2.s[2] -mul v1.4S, v1.4S,v11.s[2] -mla v28.4S, v30.4S, v31.s[0] -sub v30.4s, v24.4s, v26.4s -add v24.4s, v24.4s, v26.4s -sqrdmulh v26.4S, v24.4S, v0.s[1] -mul v24.4S, v24.4S,v23.s[1] -mla v1.4S, v27.4S, v31.s[0] -sub v27.4s, v29.4s, v28.4s -add v29.4s, v29.4s, v28.4s -sqrdmulh v2.4S, v30.4S, v0.s[2] -mul v30.4S, v30.4S,v23.s[2] -mla v24.4S, v26.4S, v31.s[0] -sub v26.4s, v16.4s, v1.4s -add v16.4s, v16.4s, v1.4s -mla v30.4S, v2.4S, v31.s[0] -sub v2.4s, v25.4s, v24.4s -add v25.4s, v25.4s, v24.4s -sub v0.4s, v4.4s, v30.4s -add v4.4s, v4.4s, v30.4s -str q10, [x0, #0] -str q22, [x0, #16] -str q14, [x0, #32] -str q15, [x0, #48] -str q3, [x0, #64] -str q6, [x0, #80] -str q18, [x0, #96] -str q13, [x0, #112] -str q29, [x0, #128] -str q27, [x0, #144] -str q16, [x0, #160] -str q26, [x0, #176] -str q25, [x0, #192] -str q2, [x0, #208] -str q4, [x0, #224] -str q0, [x0, #240] -ldr q0, [x0, #288] -ldr q4, [x0, #304] -ldr q2, [x0, #256] -ldr q25, [x0, #272] -ldr q26, [x0, #352] -ldr q16, [x0, #368] -ldr q27, [x0, #320] -ldr q29, [x0, #336] -ldr q13, [x0, #416] -ldr q18, [x0, #432] -ldr q6, [x0, #384] -ldr q3, [x0, #400] -ldr q15, [x0, #480] -ldr q14, [x0, #496] -ldr q22, [x0, #448] -ldr q10, [x0, #464] -ldr q30, [x17, #+256] -ldr q23, [x17, #+272] -ldr q24, [x17, #+288] -ldr q1, [x17, #+304] -ldr q11, [x17, #+320] -ldr q28, [x17, #+336] -ldr q5, [x17, #+352] -ldr q12, [x17, #+368] -sqrdmulh v20.4S, v0.4S, v23.s[0] -mul v0.4S, v0.4S,v30.s[0] -sqrdmulh v21.4S, v4.4S, v23.s[0] -mul v4.4S, v4.4S,v30.s[0] -mla v0.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v26.4S, v1.s[0] -mul v26.4S, v26.4S,v24.s[0] -mla v4.4S, v21.4S, v31.s[0] -sub v21.4s, v2.4s, v0.4s -add v2.4s, v2.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v1.s[0] -mul v16.4S, v16.4S,v24.s[0] -mla v26.4S, v20.4S, v31.s[0] -sub v20.4s, v25.4s, v4.4s -add v25.4s, v25.4s, v4.4s -sqrdmulh v4.4S, v25.4S, v23.s[1] -mul v25.4S, v25.4S,v30.s[1] -mla v16.4S, v0.4S, v31.s[0] -sub v0.4s, v27.4s, v26.4s -add v27.4s, v27.4s, v26.4s -sqrdmulh v26.4S, v20.4S, v23.s[2] -mul v20.4S, v20.4S,v30.s[2] -mla v25.4S, v4.4S, v31.s[0] -sub v4.4s, v29.4s, v16.4s -add v29.4s, v29.4s, v16.4s -sqrdmulh v16.4S, v29.4S, v1.s[1] -mul v29.4S, v29.4S,v24.s[1] -mla v20.4S, v26.4S, v31.s[0] -sub v26.4s, v2.4s, v25.4s -add v2.4s, v2.4s, v25.4s -sqrdmulh v23.4S, v4.4S, v1.s[2] -mul v4.4S, v4.4S,v24.s[2] -mla v29.4S, v16.4S, v31.s[0] -sub v16.4s, v21.4s, v20.4s -add v21.4s, v21.4s, v20.4s -sqrdmulh v20.4S, v13.4S, v28.s[0] -mul v13.4S, v13.4S,v11.s[0] -mla v4.4S, v23.4S, v31.s[0] -sub v23.4s, v27.4s, v29.4s -add v27.4s, v27.4s, v29.4s -sqrdmulh v1.4S, v18.4S, v28.s[0] -mul v18.4S, v18.4S,v11.s[0] -mla v13.4S, v20.4S, v31.s[0] -sub v20.4s, v0.4s, v4.4s -add v0.4s, v0.4s, v4.4s -sqrdmulh v4.4S, v15.4S, v12.s[0] -mul v15.4S, v15.4S,v5.s[0] -mla v18.4S, v1.4S, v31.s[0] -sub v1.4s, v6.4s, v13.4s -add v6.4s, v6.4s, v13.4s -sqrdmulh v13.4S, v14.4S, v12.s[0] -mul v14.4S, v14.4S,v5.s[0] -mla v15.4S, v4.4S, v31.s[0] -sub v4.4s, v3.4s, v18.4s -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v3.4S, v28.s[1] -mul v3.4S, v3.4S,v11.s[1] -mla v14.4S, v13.4S, v31.s[0] -sub v13.4s, v22.4s, v15.4s -add v22.4s, v22.4s, v15.4s -sqrdmulh v15.4S, v4.4S, v28.s[2] -mul v4.4S, v4.4S,v11.s[2] -mla v3.4S, v18.4S, v31.s[0] -sub v18.4s, v10.4s, v14.4s -add v10.4s, v10.4s, v14.4s -sqrdmulh v14.4S, v10.4S, v12.s[1] -mul v10.4S, v10.4S,v5.s[1] -mla v4.4S, v15.4S, v31.s[0] -sub v15.4s, v6.4s, v3.4s -add v6.4s, v6.4s, v3.4s -sqrdmulh v28.4S, v18.4S, v12.s[2] -mul v18.4S, v18.4S,v5.s[2] -mla v10.4S, v14.4S, v31.s[0] -sub v14.4s, v1.4s, v4.4s -add v1.4s, v1.4s, v4.4s -mla v18.4S, v28.4S, v31.s[0] -sub v28.4s, v22.4s, v10.4s -add v22.4s, v22.4s, v10.4s -sub v12.4s, v13.4s, v18.4s -add v13.4s, v13.4s, v18.4s -str q2, [x0, #256] -str q26, [x0, #272] -str q21, [x0, #288] -str q16, [x0, #304] -str q27, [x0, #320] -str q23, [x0, #336] -str q0, [x0, #352] -str q20, [x0, #368] -str q6, [x0, #384] -str q15, [x0, #400] -str q1, [x0, #416] -str q14, [x0, #432] -str q22, [x0, #448] -str q28, [x0, #464] -str q13, [x0, #480] -str q12, [x0, #496] -ldr q12, [x0, #544] -ldr q13, [x0, #560] -ldr q28, [x0, #512] -ldr q22, [x0, #528] -ldr q14, [x0, #608] -ldr q1, [x0, #624] -ldr q15, [x0, #576] -ldr q6, [x0, #592] -ldr q20, [x0, #672] -ldr q0, [x0, #688] -ldr q23, [x0, #640] -ldr q27, [x0, #656] -ldr q16, [x0, #736] -ldr q21, [x0, #752] -ldr q26, [x0, #704] -ldr q2, [x0, #720] -ldr q18, [x17, #+384] -ldr q5, [x17, #+400] -ldr q10, [x17, #+416] -ldr q4, [x17, #+432] -ldr q11, [x17, #+448] -ldr q3, [x17, #+464] -ldr q24, [x17, #+480] -ldr q29, [x17, #+496] -sqrdmulh v30.4S, v12.4S, v5.s[0] -mul v12.4S, v12.4S,v18.s[0] -sqrdmulh v25.4S, v13.4S, v5.s[0] -mul v13.4S, v13.4S,v18.s[0] -mla v12.4S, v30.4S, v31.s[0] -sqrdmulh v30.4S, v14.4S, v4.s[0] -mul v14.4S, v14.4S,v10.s[0] -mla v13.4S, v25.4S, v31.s[0] -sub v25.4s, v28.4s, v12.4s -add v28.4s, v28.4s, v12.4s -sqrdmulh v12.4S, v1.4S, v4.s[0] -mul v1.4S, v1.4S,v10.s[0] -mla v14.4S, v30.4S, v31.s[0] -sub v30.4s, v22.4s, v13.4s -add v22.4s, v22.4s, v13.4s -sqrdmulh v13.4S, v22.4S, v5.s[1] -mul v22.4S, v22.4S,v18.s[1] -mla v1.4S, v12.4S, v31.s[0] -sub v12.4s, v15.4s, v14.4s -add v15.4s, v15.4s, v14.4s -sqrdmulh v14.4S, v30.4S, v5.s[2] -mul v30.4S, v30.4S,v18.s[2] -mla v22.4S, v13.4S, v31.s[0] -sub v13.4s, v6.4s, v1.4s -add v6.4s, v6.4s, v1.4s -sqrdmulh v1.4S, v6.4S, v4.s[1] -mul v6.4S, v6.4S,v10.s[1] -mla v30.4S, v14.4S, v31.s[0] -sub v14.4s, v28.4s, v22.4s -add v28.4s, v28.4s, v22.4s -sqrdmulh v5.4S, v13.4S, v4.s[2] -mul v13.4S, v13.4S,v10.s[2] -mla v6.4S, v1.4S, v31.s[0] -sub v1.4s, v25.4s, v30.4s -add v25.4s, v25.4s, v30.4s -sqrdmulh v30.4S, v20.4S, v3.s[0] -mul v20.4S, v20.4S,v11.s[0] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v15.4s, v6.4s -add v15.4s, v15.4s, v6.4s -sqrdmulh v4.4S, v0.4S, v3.s[0] -mul v0.4S, v0.4S,v11.s[0] -mla v20.4S, v30.4S, v31.s[0] -sub v30.4s, v12.4s, v13.4s -add v12.4s, v12.4s, v13.4s -sqrdmulh v13.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v24.s[0] -mla v0.4S, v4.4S, v31.s[0] -sub v4.4s, v23.4s, v20.4s -add v23.4s, v23.4s, v20.4s -sqrdmulh v20.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v24.s[0] -mla v16.4S, v13.4S, v31.s[0] -sub v13.4s, v27.4s, v0.4s -add v27.4s, v27.4s, v0.4s -sqrdmulh v0.4S, v27.4S, v3.s[1] -mul v27.4S, v27.4S,v11.s[1] -mla v21.4S, v20.4S, v31.s[0] -sub v20.4s, v26.4s, v16.4s -add v26.4s, v26.4s, v16.4s -sqrdmulh v16.4S, v13.4S, v3.s[2] -mul v13.4S, v13.4S,v11.s[2] -mla v27.4S, v0.4S, v31.s[0] -sub v0.4s, v2.4s, v21.4s -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v2.4S, v29.s[1] -mul v2.4S, v2.4S,v24.s[1] -mla v13.4S, v16.4S, v31.s[0] -sub v16.4s, v23.4s, v27.4s -add v23.4s, v23.4s, v27.4s -sqrdmulh v3.4S, v0.4S, v29.s[2] -mul v0.4S, v0.4S,v24.s[2] -mla v2.4S, v21.4S, v31.s[0] -sub v21.4s, v4.4s, v13.4s -add v4.4s, v4.4s, v13.4s -mla v0.4S, v3.4S, v31.s[0] -sub v3.4s, v26.4s, v2.4s -add v26.4s, v26.4s, v2.4s -sub v29.4s, v20.4s, v0.4s -add v20.4s, v20.4s, v0.4s -str q28, [x0, #512] -str q14, [x0, #528] -str q25, [x0, #544] -str q1, [x0, #560] -str q15, [x0, #576] -str q5, [x0, #592] -str q12, [x0, #608] -str q30, [x0, #624] -str q23, [x0, #640] -str q16, [x0, #656] -str q4, [x0, #672] -str q21, [x0, #688] -str q26, [x0, #704] -str q3, [x0, #720] -str q20, [x0, #736] -str q29, [x0, #752] -ldr q29, [x0, #800] -ldr q20, [x0, #816] -ldr q3, [x0, #768] -ldr q26, [x0, #784] -ldr q21, [x0, #864] -ldr q4, [x0, #880] -ldr q16, [x0, #832] -ldr q23, [x0, #848] -ldr q30, [x0, #928] -ldr q12, [x0, #944] -ldr q5, [x0, #896] -ldr q15, [x0, #912] -ldr q1, [x0, #992] -ldr q25, [x0, #1008] -ldr q14, [x0, #960] -ldr q28, [x0, #976] -ldr q0, [x17, #+512] -ldr q24, [x17, #+528] -ldr q2, [x17, #+544] -ldr q13, [x17, #+560] -ldr q11, [x17, #+576] -ldr q27, [x17, #+592] -ldr q10, [x17, #+608] -ldr q6, [x17, #+624] -sqrdmulh v18.4S, v29.4S, v24.s[0] -mul v29.4S, v29.4S,v0.s[0] -sqrdmulh v22.4S, v20.4S, v24.s[0] -mul v20.4S, v20.4S,v0.s[0] -mla v29.4S, v18.4S, v31.s[0] -sqrdmulh v18.4S, v21.4S, v13.s[0] -mul v21.4S, v21.4S,v2.s[0] -mla v20.4S, v22.4S, v31.s[0] -sub v22.4s, v3.4s, v29.4s -add v3.4s, v3.4s, v29.4s -sqrdmulh v29.4S, v4.4S, v13.s[0] -mul v4.4S, v4.4S,v2.s[0] -mla v21.4S, v18.4S, v31.s[0] -sub v18.4s, v26.4s, v20.4s -add v26.4s, v26.4s, v20.4s -sqrdmulh v20.4S, v26.4S, v24.s[1] -mul v26.4S, v26.4S,v0.s[1] -mla v4.4S, v29.4S, v31.s[0] -sub v29.4s, v16.4s, v21.4s -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v18.4S, v24.s[2] -mul v18.4S, v18.4S,v0.s[2] -mla v26.4S, v20.4S, v31.s[0] -sub v20.4s, v23.4s, v4.4s -add v23.4s, v23.4s, v4.4s -sqrdmulh v4.4S, v23.4S, v13.s[1] -mul v23.4S, v23.4S,v2.s[1] -mla v18.4S, v21.4S, v31.s[0] -sub v21.4s, v3.4s, v26.4s -add v3.4s, v3.4s, v26.4s -sqrdmulh v24.4S, v20.4S, v13.s[2] -mul v20.4S, v20.4S,v2.s[2] -mla v23.4S, v4.4S, v31.s[0] -sub v4.4s, v22.4s, v18.4s -add v22.4s, v22.4s, v18.4s -sqrdmulh v18.4S, v30.4S, v27.s[0] -mul v30.4S, v30.4S,v11.s[0] -mla v20.4S, v24.4S, v31.s[0] -sub v24.4s, v16.4s, v23.4s -add v16.4s, v16.4s, v23.4s -sqrdmulh v13.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v11.s[0] -mla v30.4S, v18.4S, v31.s[0] -sub v18.4s, v29.4s, v20.4s -add v29.4s, v29.4s, v20.4s -sqrdmulh v20.4S, v1.4S, v6.s[0] -mul v1.4S, v1.4S,v10.s[0] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v5.4s, v30.4s -add v5.4s, v5.4s, v30.4s -sqrdmulh v30.4S, v25.4S, v6.s[0] -mul v25.4S, v25.4S,v10.s[0] -mla v1.4S, v20.4S, v31.s[0] -sub v20.4s, v15.4s, v12.4s -add v15.4s, v15.4s, v12.4s -sqrdmulh v12.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v11.s[1] -mla v25.4S, v30.4S, v31.s[0] -sub v30.4s, v14.4s, v1.4s -add v14.4s, v14.4s, v1.4s -sqrdmulh v1.4S, v20.4S, v27.s[2] -mul v20.4S, v20.4S,v11.s[2] -mla v15.4S, v12.4S, v31.s[0] -sub v12.4s, v28.4s, v25.4s -add v28.4s, v28.4s, v25.4s -sqrdmulh v25.4S, v28.4S, v6.s[1] -mul v28.4S, v28.4S,v10.s[1] -mla v20.4S, v1.4S, v31.s[0] -sub v1.4s, v5.4s, v15.4s -add v5.4s, v5.4s, v15.4s -sqrdmulh v27.4S, v12.4S, v6.s[2] -mul v12.4S, v12.4S,v10.s[2] -mla v28.4S, v25.4S, v31.s[0] -sub v25.4s, v13.4s, v20.4s -add v13.4s, v13.4s, v20.4s -mla v12.4S, v27.4S, v31.s[0] -sub v27.4s, v14.4s, v28.4s -add v14.4s, v14.4s, v28.4s -sub v6.4s, v30.4s, v12.4s -add v30.4s, v30.4s, v12.4s -str q3, [x0, #768] -str q21, [x0, #784] -str q22, [x0, #800] -str q4, [x0, #816] -str q16, [x0, #832] -str q24, [x0, #848] -str q29, [x0, #864] -str q18, [x0, #880] -str q5, [x0, #896] -str q1, [x0, #912] -str q13, [x0, #928] -str q25, [x0, #944] -str q14, [x0, #960] -str q27, [x0, #976] -str q30, [x0, #992] -str q6, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1464 -// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_10.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_10.s deleted file mode 100644 index 807b044..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_10.s +++ /dev/null @@ -1,1494 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_7_z4_10 -.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_10 -ntt_u32_incomplete_neon_asm_var_4_2_7_z4_10: -_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_10: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #928] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #992] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q18, [x0, #800] -sqrdmulh v17.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -ldr q16, [x0, #864] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -mla v22.4S, v21.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -mla v18.4S, v17.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -ldr q3, [x0, #544] -sqrdmulh v17.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -ldr q19, [x0, #608] -sqrdmulh v21.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q2, [x0, #672] -ldr q1, [x0, #416] -sqrdmulh v0.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v15.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -ldr q22, [x0, #736] -ldr q14, [x0, #480] -sqrdmulh v13.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v12.4s, v14.4s, v20.4s -add v14.4s, v14.4s, v20.4s -ldr q20, [x0, #288] -mla v3.4S, v17.4S, v31.s[0] -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v20.4s, v18.4s -mla v2.4S, v0.4S, v31.s[0] -mla v22.4S, v13.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -ldr q18, [x0, #352] -sqrdmulh v13.4S, v1.4S, v29.s[1] -mul v1.4S, v1.4S,v30.s[1] -sub v0.4s, v18.4s, v16.4s -sqrdmulh v17.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -add v18.4s, v18.4s, v16.4s -ldr q16, [x0, #32] -sqrdmulh v11.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v10.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -ldr q3, [x0, #96] -sqrdmulh v9.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v8.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -ldr q19, [x0, #160] -mla v1.4S, v13.4S, v31.s[0] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v19.4s, v2.4s -mla v20.4S, v11.4S, v31.s[0] -mla v18.4S, v9.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -ldr q2, [x0, #224] -sqrdmulh v9.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v11.4s, v2.4s, v22.4s -sqrdmulh v13.4S, v12.4S, v29.s[2] -mul v12.4S, v12.4S,v30.s[2] -add v2.4s, v2.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v7.4s, v19.4s, v1.4s -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v29.s[2] -mul v0.4S, v0.4S,v30.s[2] -sub v6.4s, v2.4s, v14.4s -add v2.4s, v2.4s, v14.4s -mla v15.4S, v9.4S, v31.s[0] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v16.4s, v20.4s -mla v21.4S, v22.4S, v31.s[0] -mla v0.4S, v1.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -sub v1.4s, v3.4s, v18.4s -sqrdmulh v22.4S, v6.4S, v27.s[1] -mul v6.4S, v6.4S,v28.s[1] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v19.4S, v27.s[0] -mul v19.4S, v19.4S,v28.s[0] -sub v9.4s, v17.4s, v15.4s -add v17.4s, v17.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v14.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -mla v7.4S, v20.4S, v31.s[0] -mla v6.4S, v22.4S, v31.s[0] -sub v22.4s, v10.4s, v21.4s -mla v19.4S, v18.4S, v31.s[0] -mla v2.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v15.4s, v8.4s, v0.4s -sqrdmulh v18.4S, v11.4S, v27.s[2] -mul v11.4S, v11.4S,v28.s[2] -add v8.4s, v8.4s, v0.4s -sqrdmulh v0.4S, v9.4S, v27.s[3] -mul v9.4S, v9.4S,v28.s[3] -sub v20.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[3] -mul v14.4S, v14.4S,v28.s[3] -sub v12.4s, v1.4s, v6.4s -add v1.4s, v1.4s, v6.4s -mla v17.4S, v21.4S, v31.s[0] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v16.4s, v19.4s -mla v9.4S, v0.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v16.4s, v16.4s, v19.4s -sqrdmulh v19.4S, v1.4S, v25.s[2] -mul v1.4S, v1.4S,v26.s[2] -sub v7.4s, v3.4s, v2.4s -sqrdmulh v0.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v7.4S, v25.s[1] -mul v7.4S, v7.4S,v26.s[1] -sub v21.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -sqrdmulh v17.4S, v3.4S, v25.s[0] -mul v3.4S, v3.4S,v26.s[0] -sub v6.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -mla v1.4S, v19.4S, v31.s[0] -mla v12.4S, v0.4S, v31.s[0] -sub v0.4s, v22.4s, v9.4s -mla v7.4S, v2.4S, v31.s[0] -mla v3.4S, v17.4S, v31.s[0] -add v22.4s, v22.4s, v9.4s -sqrdmulh v9.4S, v8.4S, v23.s[0] -mul v8.4S, v8.4S,v24.s[0] -sub v17.4s, v15.4s, v14.4s -sqrdmulh v2.4S, v6.4S, v23.s[1] -mul v6.4S, v6.4S,v24.s[1] -add v15.4s, v15.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v23.s[2] -mul v15.4S, v15.4S,v24.s[2] -sub v19.4s, v13.4s, v1.4s -add v13.4s, v13.4s, v1.4s -sqrdmulh v1.4S, v17.4S, v23.s[3] -mul v17.4S, v17.4S,v24.s[3] -sub v11.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -mla v8.4S, v9.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -sub v2.4s, v18.4s, v7.4s -str q13, [x0, #288] -mla v15.4S, v14.4S, v31.s[0] -mla v17.4S, v1.4S, v31.s[0] -add v18.4s, v18.4s, v7.4s -str q19, [x0, #352] -ldr q19, [x0, #944] -sqrdmulh v7.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v1.4s, v16.4s, v3.4s -str q20, [x0, #416] -ldr q20, [x0, #1008] -sqrdmulh v14.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v16.4s, v16.4s, v3.4s -str q11, [x0, #480] -ldr q11, [x0, #816] -sqrdmulh v3.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -sub v13.4s, v10.4s, v8.4s -add v10.4s, v10.4s, v8.4s -ldr q8, [x0, #880] -sqrdmulh v9.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v12.4s, v21.4s, v6.4s -add v21.4s, v21.4s, v6.4s -mla v19.4S, v7.4S, v31.s[0] -mla v20.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v15.4s -str q18, [x0, #160] -mla v11.4S, v3.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -str q2, [x0, #224] -ldr q2, [x0, #560] -sqrdmulh v15.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v9.4s, v0.4s, v17.4s -str q16, [x0, #32] -ldr q16, [x0, #624] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -add v0.4s, v0.4s, v17.4s -str q1, [x0, #96] -ldr q1, [x0, #688] -ldr q17, [x0, #432] -sqrdmulh v18.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -sub v7.4s, v17.4s, v19.4s -add v17.4s, v17.4s, v19.4s -ldr q19, [x0, #752] -ldr q6, [x0, #496] -sqrdmulh v5.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v4.4s, v6.4s, v20.4s -add v6.4s, v6.4s, v20.4s -ldr q20, [x0, #304] -mla v2.4S, v15.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -sub v3.4s, v20.4s, v11.4s -str q10, [x0, #544] -mla v1.4S, v18.4S, v31.s[0] -mla v19.4S, v5.4S, v31.s[0] -add v20.4s, v20.4s, v11.4s -str q13, [x0, #608] -ldr q13, [x0, #368] -sqrdmulh v11.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v5.4s, v13.4s, v8.4s -str q21, [x0, #672] -sqrdmulh v21.4S, v6.4S, v29.s[1] -mul v6.4S, v6.4S,v30.s[1] -add v13.4s, v13.4s, v8.4s -str q12, [x0, #736] -ldr q12, [x0, #48] -sqrdmulh v8.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v18.4s, v12.4s, v2.4s -add v12.4s, v12.4s, v2.4s -ldr q2, [x0, #112] -sqrdmulh v10.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v15.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -ldr q16, [x0, #176] -mla v17.4S, v11.4S, v31.s[0] -mla v6.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v1.4s -str q22, [x0, #800] -mla v20.4S, v8.4S, v31.s[0] -mla v13.4S, v10.4S, v31.s[0] -add v16.4s, v16.4s, v1.4s -str q14, [x0, #864] -ldr q14, [x0, #240] -sqrdmulh v1.4S, v7.4S, v29.s[2] -mul v7.4S, v7.4S,v30.s[2] -sub v10.4s, v14.4s, v19.4s -str q0, [x0, #928] -sqrdmulh v0.4S, v4.4S, v29.s[2] -mul v4.4S, v4.4S,v30.s[2] -add v14.4s, v14.4s, v19.4s -str q9, [x0, #992] -sqrdmulh v9.4S, v3.4S, v29.s[2] -mul v3.4S, v3.4S,v30.s[2] -sub v19.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -sqrdmulh v17.4S, v5.4S, v29.s[2] -mul v5.4S, v5.4S,v30.s[2] -sub v8.4s, v14.4s, v6.4s -add v14.4s, v14.4s, v6.4s -mla v7.4S, v1.4S, v31.s[0] -mla v4.4S, v0.4S, v31.s[0] -sub v0.4s, v12.4s, v20.4s -mla v3.4S, v9.4S, v31.s[0] -mla v5.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -sub v17.4s, v2.4s, v13.4s -sqrdmulh v9.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -add v2.4s, v2.4s, v13.4s -sqrdmulh v13.4S, v16.4S, v27.s[0] -mul v16.4S, v16.4S,v28.s[0] -sub v1.4s, v21.4s, v7.4s -add v21.4s, v21.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v6.4s, v10.4s, v4.4s -add v10.4s, v10.4s, v4.4s -mla v19.4S, v20.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -sub v9.4s, v18.4s, v3.4s -mla v16.4S, v13.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[2] -mul v21.4S, v21.4S,v28.s[2] -sub v7.4s, v15.4s, v5.4s -sqrdmulh v13.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -add v15.4s, v15.4s, v5.4s -sqrdmulh v5.4S, v1.4S, v27.s[3] -mul v1.4S, v1.4S,v28.s[3] -sub v20.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v27.s[3] -mul v6.4S, v6.4S,v28.s[3] -sub v4.4s, v17.4s, v8.4s -add v17.4s, v17.4s, v8.4s -mla v21.4S, v3.4S, v31.s[0] -mla v10.4S, v13.4S, v31.s[0] -sub v13.4s, v12.4s, v16.4s -mla v1.4S, v5.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v25.s[2] -mul v17.4S, v17.4S,v26.s[2] -sub v19.4s, v2.4s, v14.4s -sqrdmulh v5.4S, v4.4S, v25.s[3] -mul v4.4S, v4.4S,v26.s[3] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v3.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v15.4s, v10.4s -add v15.4s, v15.4s, v10.4s -mla v17.4S, v16.4S, v31.s[0] -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v9.4s, v1.4s -mla v19.4S, v14.4S, v31.s[0] -mla v2.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v23.s[0] -mul v15.4S, v15.4S,v24.s[0] -sub v21.4s, v7.4s, v6.4s -sqrdmulh v14.4S, v8.4S, v23.s[1] -mul v8.4S, v8.4S,v24.s[1] -add v7.4s, v7.4s, v6.4s -sqrdmulh v6.4S, v7.4S, v23.s[2] -mul v7.4S, v7.4S,v24.s[2] -sub v16.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v23.s[3] -mul v21.4S, v21.4S,v24.s[3] -sub v10.4s, v20.4s, v4.4s -add v20.4s, v20.4s, v4.4s -mla v15.4S, v1.4S, v31.s[0] -mla v8.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v19.4s -str q0, [x0, #304] -mla v7.4S, v6.4S, v31.s[0] -mla v21.4S, v17.4S, v31.s[0] -add v13.4s, v13.4s, v19.4s -str q16, [x0, #368] -ldr q16, [x0, #896] -sqrdmulh v19.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v17.4s, v12.4s, v2.4s -str q20, [x0, #432] -ldr q20, [x0, #960] -sqrdmulh v6.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v12.4s, v12.4s, v2.4s -str q10, [x0, #496] -ldr q10, [x0, #768] -sqrdmulh v2.4S, v10.4S, v29.s[0] -mul v10.4S, v10.4S,v30.s[0] -sub v0.4s, v18.4s, v15.4s -add v18.4s, v18.4s, v15.4s -ldr q15, [x0, #832] -sqrdmulh v1.4S, v15.4S, v29.s[0] -mul v15.4S, v15.4S,v30.s[0] -sub v4.4s, v3.4s, v8.4s -add v3.4s, v3.4s, v8.4s -mla v16.4S, v19.4S, v31.s[0] -mla v20.4S, v6.4S, v31.s[0] -sub v6.4s, v9.4s, v7.4s -str q13, [x0, #176] -mla v10.4S, v2.4S, v31.s[0] -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v7.4s -str q14, [x0, #240] -ldr q14, [x0, #512] -sqrdmulh v7.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v1.4s, v5.4s, v21.4s -str q12, [x0, #48] -ldr q12, [x0, #576] -sqrdmulh v2.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -add v5.4s, v5.4s, v21.4s -str q17, [x0, #112] -ldr q17, [x0, #640] -ldr q21, [x0, #384] -sqrdmulh v13.4S, v17.4S, v29.s[0] -mul v17.4S, v17.4S,v30.s[0] -sub v19.4s, v21.4s, v16.4s -add v21.4s, v21.4s, v16.4s -ldr q16, [x0, #704] -ldr q8, [x0, #448] -sqrdmulh v22.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v11.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -ldr q20, [x0, #256] -mla v14.4S, v7.4S, v31.s[0] -mla v12.4S, v2.4S, v31.s[0] -sub v2.4s, v20.4s, v10.4s -str q18, [x0, #560] -mla v17.4S, v13.4S, v31.s[0] -mla v16.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -str q0, [x0, #624] -ldr q0, [x0, #320] -sqrdmulh v10.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v22.4s, v0.4s, v15.4s -str q3, [x0, #688] -sqrdmulh v3.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -add v0.4s, v0.4s, v15.4s -str q4, [x0, #752] -ldr q4, [x0, #0] -sqrdmulh v15.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v13.4s, v4.4s, v14.4s -add v4.4s, v4.4s, v14.4s -ldr q14, [x0, #64] -sqrdmulh v18.4S, v0.4S, v29.s[1] -mul v0.4S, v0.4S,v30.s[1] -sub v7.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -ldr q12, [x0, #128] -mla v21.4S, v10.4S, v31.s[0] -mla v8.4S, v3.4S, v31.s[0] -sub v3.4s, v12.4s, v17.4s -str q9, [x0, #816] -mla v20.4S, v15.4S, v31.s[0] -mla v0.4S, v18.4S, v31.s[0] -add v12.4s, v12.4s, v17.4s -str q6, [x0, #880] -ldr q6, [x0, #192] -sqrdmulh v17.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v18.4s, v6.4s, v16.4s -str q5, [x0, #944] -sqrdmulh v5.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -add v6.4s, v6.4s, v16.4s -str q1, [x0, #1008] -sqrdmulh v1.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v16.4s, v12.4s, v21.4s -add v12.4s, v12.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v15.4s, v6.4s, v8.4s -add v6.4s, v6.4s, v8.4s -mla v19.4S, v17.4S, v31.s[0] -mla v11.4S, v5.4S, v31.s[0] -sub v5.4s, v4.4s, v20.4s -mla v2.4S, v1.4S, v31.s[0] -mla v22.4S, v21.4S, v31.s[0] -add v4.4s, v4.4s, v20.4s -sqrdmulh v20.4S, v16.4S, v27.s[1] -mul v16.4S, v16.4S,v28.s[1] -sub v21.4s, v14.4s, v0.4s -sqrdmulh v1.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v17.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v27.s[0] -mul v6.4S, v6.4S,v28.s[0] -sub v8.4s, v18.4s, v11.4s -add v18.4s, v18.4s, v11.4s -mla v16.4S, v20.4S, v31.s[0] -mla v15.4S, v1.4S, v31.s[0] -sub v1.4s, v13.4s, v2.4s -mla v12.4S, v0.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v3.4S, v27.s[2] -mul v3.4S, v3.4S,v28.s[2] -sub v19.4s, v7.4s, v22.4s -sqrdmulh v0.4S, v18.4S, v27.s[2] -mul v18.4S, v18.4S,v28.s[2] -add v7.4s, v7.4s, v22.4s -sqrdmulh v22.4S, v17.4S, v27.s[3] -mul v17.4S, v17.4S,v28.s[3] -sub v20.4s, v5.4s, v16.4s -add v5.4s, v5.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v11.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -mla v3.4S, v2.4S, v31.s[0] -mla v18.4S, v0.4S, v31.s[0] -sub v0.4s, v4.4s, v12.4s -mla v17.4S, v22.4S, v31.s[0] -mla v8.4S, v16.4S, v31.s[0] -add v4.4s, v4.4s, v12.4s -sqrdmulh v12.4S, v21.4S, v25.s[2] -mul v21.4S, v21.4S,v26.s[2] -sub v16.4s, v14.4s, v6.4s -sqrdmulh v22.4S, v11.4S, v25.s[3] -mul v11.4S, v11.4S,v26.s[3] -add v14.4s, v14.4s, v6.4s -sqrdmulh v6.4S, v16.4S, v25.s[1] -mul v16.4S, v16.4S,v26.s[1] -sub v2.4s, v13.4s, v3.4s -add v13.4s, v13.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v25.s[0] -mul v14.4S, v14.4S,v26.s[0] -sub v15.4s, v7.4s, v18.4s -add v7.4s, v7.4s, v18.4s -mla v21.4S, v12.4S, v31.s[0] -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v1.4s, v17.4s -mla v16.4S, v6.4S, v31.s[0] -mla v14.4S, v3.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -sqrdmulh v17.4S, v7.4S, v23.s[0] -mul v7.4S, v7.4S,v24.s[0] -sub v3.4s, v19.4s, v8.4s -sqrdmulh v6.4S, v15.4S, v23.s[1] -mul v15.4S, v15.4S,v24.s[1] -add v19.4s, v19.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v23.s[2] -mul v19.4S, v19.4S,v24.s[2] -sub v12.4s, v5.4s, v21.4s -add v5.4s, v5.4s, v21.4s -sqrdmulh v21.4S, v3.4S, v23.s[3] -mul v3.4S, v3.4S,v24.s[3] -sub v18.4s, v20.4s, v11.4s -add v20.4s, v20.4s, v11.4s -mla v7.4S, v17.4S, v31.s[0] -mla v15.4S, v6.4S, v31.s[0] -sub v6.4s, v0.4s, v16.4s -str q5, [x0, #256] -mla v19.4S, v8.4S, v31.s[0] -mla v3.4S, v21.4S, v31.s[0] -add v0.4s, v0.4s, v16.4s -str q12, [x0, #320] -ldr q12, [x0, #912] -sqrdmulh v16.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -sub v21.4s, v4.4s, v14.4s -str q20, [x0, #384] -ldr q20, [x0, #976] -sqrdmulh v8.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v4.4s, v4.4s, v14.4s -str q18, [x0, #448] -ldr q18, [x0, #784] -sqrdmulh v14.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -sub v5.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -ldr q7, [x0, #848] -sqrdmulh v17.4S, v7.4S, v29.s[0] -mul v7.4S, v7.4S,v30.s[0] -sub v11.4s, v2.4s, v15.4s -add v2.4s, v2.4s, v15.4s -mla v12.4S, v16.4S, v31.s[0] -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v1.4s, v19.4s -str q0, [x0, #128] -mla v18.4S, v14.4S, v31.s[0] -mla v7.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v19.4s -str q6, [x0, #192] -ldr q6, [x0, #528] -sqrdmulh v19.4S, v6.4S, v29.s[0] -mul v6.4S, v6.4S,v30.s[0] -sub v17.4s, v22.4s, v3.4s -str q4, [x0, #0] -ldr q4, [x0, #592] -sqrdmulh v14.4S, v4.4S, v29.s[0] -mul v4.4S, v4.4S,v30.s[0] -add v22.4s, v22.4s, v3.4s -str q21, [x0, #64] -ldr q21, [x0, #656] -ldr q3, [x0, #400] -sqrdmulh v0.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -sub v16.4s, v3.4s, v12.4s -add v3.4s, v3.4s, v12.4s -ldr q12, [x0, #720] -ldr q15, [x0, #464] -sqrdmulh v9.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -sub v10.4s, v15.4s, v20.4s -add v15.4s, v15.4s, v20.4s -ldr q20, [x0, #272] -mla v6.4S, v19.4S, v31.s[0] -mla v4.4S, v14.4S, v31.s[0] -sub v14.4s, v20.4s, v18.4s -str q13, [x0, #512] -mla v21.4S, v0.4S, v31.s[0] -mla v12.4S, v9.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -str q5, [x0, #576] -ldr q5, [x0, #336] -sqrdmulh v18.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v9.4s, v5.4s, v7.4s -str q2, [x0, #640] -sqrdmulh v2.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -add v5.4s, v5.4s, v7.4s -str q11, [x0, #704] -ldr q11, [x0, #16] -sqrdmulh v7.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v0.4s, v11.4s, v6.4s -add v11.4s, v11.4s, v6.4s -ldr q6, [x0, #80] -sqrdmulh v13.4S, v5.4S, v29.s[1] -mul v5.4S, v5.4S,v30.s[1] -sub v19.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -ldr q4, [x0, #144] -mla v3.4S, v18.4S, v31.s[0] -mla v15.4S, v2.4S, v31.s[0] -sub v2.4s, v4.4s, v21.4s -str q1, [x0, #768] -mla v20.4S, v7.4S, v31.s[0] -mla v5.4S, v13.4S, v31.s[0] -add v4.4s, v4.4s, v21.4s -str q8, [x0, #832] -ldr q8, [x0, #208] -sqrdmulh v21.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -sub v13.4s, v8.4s, v12.4s -str q22, [x0, #896] -sqrdmulh v22.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -add v8.4s, v8.4s, v12.4s -str q17, [x0, #960] -sqrdmulh v17.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v12.4s, v4.4s, v3.4s -add v4.4s, v4.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v7.4s, v8.4s, v15.4s -add v8.4s, v8.4s, v15.4s -mla v16.4S, v21.4S, v31.s[0] -mla v10.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v20.4s -mla v14.4S, v17.4S, v31.s[0] -mla v9.4S, v3.4S, v31.s[0] -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v12.4S, v27.s[1] -mul v12.4S, v12.4S,v28.s[1] -sub v3.4s, v6.4s, v5.4s -sqrdmulh v17.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v4.4S, v27.s[0] -mul v4.4S, v4.4S,v28.s[0] -sub v21.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v27.s[0] -mul v8.4S, v8.4S,v28.s[0] -sub v15.4s, v13.4s, v10.4s -add v13.4s, v13.4s, v10.4s -mla v12.4S, v20.4S, v31.s[0] -mla v7.4S, v17.4S, v31.s[0] -sub v17.4s, v0.4s, v14.4s -mla v4.4S, v5.4S, v31.s[0] -mla v8.4S, v16.4S, v31.s[0] -add v0.4s, v0.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v16.4s, v19.4s, v9.4s -sqrdmulh v5.4S, v13.4S, v27.s[2] -mul v13.4S, v13.4S,v28.s[2] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v10.4s, v3.4s, v7.4s -add v3.4s, v3.4s, v7.4s -mla v2.4S, v14.4S, v31.s[0] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v11.4s, v4.4s -mla v21.4S, v9.4S, v31.s[0] -mla v15.4S, v12.4S, v31.s[0] -add v11.4s, v11.4s, v4.4s -sqrdmulh v4.4S, v3.4S, v25.s[2] -mul v3.4S, v3.4S,v26.s[2] -sub v12.4s, v6.4s, v8.4s -sqrdmulh v9.4S, v10.4S, v25.s[3] -mul v10.4S, v10.4S,v26.s[3] -add v6.4s, v6.4s, v8.4s -sqrdmulh v8.4S, v12.4S, v25.s[1] -mul v12.4S, v12.4S,v26.s[1] -sub v14.4s, v0.4s, v2.4s -add v0.4s, v0.4s, v2.4s -sqrdmulh v2.4S, v6.4S, v25.s[0] -mul v6.4S, v6.4S,v26.s[0] -sub v7.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -mla v3.4S, v4.4S, v31.s[0] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v17.4s, v21.4s -mla v12.4S, v8.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v19.4S, v23.s[0] -mul v19.4S, v19.4S,v24.s[0] -sub v2.4s, v16.4s, v15.4s -sqrdmulh v8.4S, v7.4S, v23.s[1] -mul v7.4S, v7.4S,v24.s[1] -add v16.4s, v16.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v23.s[2] -mul v16.4S, v16.4S,v24.s[2] -sub v4.4s, v22.4s, v3.4s -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v2.4S, v23.s[3] -mul v2.4S, v2.4S,v24.s[3] -sub v13.4s, v20.4s, v10.4s -add v20.4s, v20.4s, v10.4s -mla v19.4S, v21.4S, v31.s[0] -mla v7.4S, v8.4S, v31.s[0] -sub v8.4s, v5.4s, v12.4s -str q22, [x0, #272] -mla v16.4S, v15.4S, v31.s[0] -mla v2.4S, v3.4S, v31.s[0] -add v5.4s, v5.4s, v12.4s -str q4, [x0, #336] -sub v23.4s, v11.4s, v6.4s -str q20, [x0, #400] -add v11.4s, v11.4s, v6.4s -str q13, [x0, #464] -sub v13.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sub v19.4s, v14.4s, v7.4s -add v14.4s, v14.4s, v7.4s -sub v7.4s, v17.4s, v16.4s -str q5, [x0, #144] -add v17.4s, v17.4s, v16.4s -str q8, [x0, #208] -sub v8.4s, v9.4s, v2.4s -str q11, [x0, #16] -add v9.4s, v9.4s, v2.4s -str q23, [x0, #80] -str q0, [x0, #528] -str q13, [x0, #592] -str q14, [x0, #656] -str q19, [x0, #720] -str q17, [x0, #784] -str q7, [x0, #848] -str q9, [x0, #912] -str q8, [x0, #976] -ldr q18, [x0, #224] -ldr q1, [x0, #160] -ldr q10, [x0, #32] -ldr q21, [x17, #+128] -ldr q22, [x17, #+144] -sqrdmulh v15.4S, v10.4S, v22.s[0] -mul v10.4S, v10.4S,v21.s[0] -ldr q3, [x0, #48] -ldr q12, [x17, #+160] -sqrdmulh v4.4S, v3.4S, v22.s[0] -mul v3.4S, v3.4S,v21.s[0] -ldr q30, [x17, #+176] -ldr q29, [x0, #96] -sqrdmulh v28.4S, v29.4S, v30.s[0] -mul v29.4S, v29.4S,v12.s[0] -ldr q27, [x0, #112] -sqrdmulh v26.4S, v27.4S, v30.s[0] -mul v27.4S, v27.4S,v12.s[0] -ldr q25, [x17, #+192] -ldr q24, [x17, #+208] -mla v10.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v1.4S, v24.s[0] -ldr q20, [x0, #176] -mla v3.4S, v4.4S, v31.s[0] -sqrdmulh v4.4S, v20.4S, v24.s[0] -ldr q6, [x17, #+224] -ldr q5, [x17, #+240] -mla v29.4S, v28.4S, v31.s[0] -sqrdmulh v28.4S, v18.4S, v5.s[0] -ldr q16, [x0, #240] -mla v27.4S, v26.4S, v31.s[0] -sqrdmulh v26.4S, v16.4S, v5.s[0] -ldr q11, [x0, #0] -ldr q2, [x0, #128] -mul v1.4S, v1.4S,v25.s[0] -mul v20.4S, v20.4S,v25.s[0] -ldr q23, [x0, #16] -ldr q0, [x0, #144] -mla v1.4S, v15.4S, v31.s[0] -mla v20.4S, v4.4S, v31.s[0] -sub v4.4s, v11.4s, v10.4s -ldr q15, [x0, #64] -add v11.4s, v11.4s, v10.4s -ldr q10, [x0, #192] -mul v18.4S, v18.4S,v6.s[0] -mul v16.4S, v16.4S,v6.s[0] -sub v13.4s, v23.4s, v3.4s -ldr q14, [x0, #80] -add v23.4s, v23.4s, v3.4s -ldr q3, [x0, #208] -mla v18.4S, v28.4S, v31.s[0] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v15.4s, v29.4s -add v15.4s, v15.4s, v29.4s -sqrdmulh v29.4S, v23.4S, v22.s[1] -mul v23.4S, v23.4S,v21.s[1] -sub v28.4s, v14.4s, v27.4s -add v14.4s, v14.4s, v27.4s -sqrdmulh v27.4S, v13.4S, v22.s[2] -mul v13.4S, v13.4S,v21.s[2] -sub v19.4s, v2.4s, v1.4s -add v2.4s, v2.4s, v1.4s -sqrdmulh v22.4S, v14.4S, v30.s[1] -mul v14.4S, v14.4S,v12.s[1] -sub v1.4s, v0.4s, v20.4s -add v0.4s, v0.4s, v20.4s -sqrdmulh v20.4S, v28.4S, v30.s[2] -mul v28.4S, v28.4S,v12.s[2] -sub v21.4s, v10.4s, v18.4s -add v10.4s, v10.4s, v18.4s -mla v23.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v0.4S, v24.s[1] -sub v30.4s, v3.4s, v16.4s -ldr q18, [x0, #480] -add v3.4s, v3.4s, v16.4s -mla v13.4S, v27.4S, v31.s[0] -sqrdmulh v27.4S, v1.4S, v24.s[2] -sub v16.4s, v11.4s, v23.4s -ldr q12, [x0, #416] -str q16, [x0, #16] -mla v14.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v3.4S, v5.s[1] -add v11.4s, v11.4s, v23.4s -ldr q23, [x0, #288] -str q11, [x0, #0] -mla v28.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v30.4S, v5.s[2] -sub v11.4s, v4.4s, v13.4s -ldr q16, [x17, #+256] -str q11, [x0, #48] -mul v0.4S, v0.4S,v25.s[1] -mul v1.4S, v1.4S,v25.s[2] -add v4.4s, v4.4s, v13.4s -str q4, [x0, #32] -ldr q4, [x17, #+272] -mla v0.4S, v29.4S, v31.s[0] -mla v1.4S, v27.4S, v31.s[0] -sub v27.4s, v15.4s, v14.4s -str q27, [x0, #80] -mul v3.4S, v3.4S,v6.s[1] -mul v30.4S, v30.4S,v6.s[2] -add v15.4s, v15.4s, v14.4s -str q15, [x0, #64] -mla v3.4S, v22.4S, v31.s[0] -mla v30.4S, v20.4S, v31.s[0] -sub v20.4s, v26.4s, v28.4s -str q20, [x0, #112] -sqrdmulh v5.4S, v23.4S, v4.s[0] -mul v23.4S, v23.4S,v16.s[0] -add v26.4s, v26.4s, v28.4s -ldr q28, [x0, #304] -str q26, [x0, #96] -ldr q26, [x17, #+288] -sqrdmulh v20.4S, v28.4S, v4.s[0] -mul v28.4S, v28.4S,v16.s[0] -sub v6.4s, v2.4s, v0.4s -ldr q22, [x17, #+304] -str q6, [x0, #144] -ldr q6, [x0, #352] -sqrdmulh v15.4S, v6.4S, v22.s[0] -mul v6.4S, v6.4S,v26.s[0] -add v2.4s, v2.4s, v0.4s -str q2, [x0, #128] -ldr q2, [x0, #368] -sqrdmulh v0.4S, v2.4S, v22.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v14.4s, v19.4s, v1.4s -ldr q24, [x17, #+320] -str q14, [x0, #176] -ldr q14, [x17, #+336] -mla v23.4S, v5.4S, v31.s[0] -sqrdmulh v5.4S, v12.4S, v14.s[0] -add v19.4s, v19.4s, v1.4s -ldr q1, [x0, #432] -str q19, [x0, #160] -mla v28.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v1.4S, v14.s[0] -sub v19.4s, v10.4s, v3.4s -ldr q27, [x17, #+352] -str q19, [x0, #208] -ldr q19, [x17, #+368] -mla v6.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v18.4S, v19.s[0] -add v10.4s, v10.4s, v3.4s -str q10, [x0, #192] -ldr q10, [x0, #496] -mla v2.4S, v0.4S, v31.s[0] -sqrdmulh v0.4S, v10.4S, v19.s[0] -sub v3.4s, v21.4s, v30.4s -ldr q25, [x0, #256] -str q3, [x0, #240] -ldr q3, [x0, #384] -mul v12.4S, v12.4S,v24.s[0] -mul v1.4S, v1.4S,v24.s[0] -add v21.4s, v21.4s, v30.4s -ldr q30, [x0, #272] -str q21, [x0, #224] -ldr q21, [x0, #400] -mla v12.4S, v5.4S, v31.s[0] -mla v1.4S, v20.4S, v31.s[0] -sub v20.4s, v25.4s, v23.4s -ldr q5, [x0, #320] -add v25.4s, v25.4s, v23.4s -ldr q23, [x0, #448] -mul v18.4S, v18.4S,v27.s[0] -mul v10.4S, v10.4S,v27.s[0] -sub v29.4s, v30.4s, v28.4s -ldr q13, [x0, #336] -add v30.4s, v30.4s, v28.4s -ldr q28, [x0, #464] -mla v18.4S, v15.4S, v31.s[0] -mla v10.4S, v0.4S, v31.s[0] -sub v0.4s, v5.4s, v6.4s -add v5.4s, v5.4s, v6.4s -sqrdmulh v6.4S, v30.4S, v4.s[1] -mul v30.4S, v30.4S,v16.s[1] -sub v15.4s, v13.4s, v2.4s -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v29.4S, v4.s[2] -mul v29.4S, v29.4S,v16.s[2] -sub v11.4s, v3.4s, v12.4s -add v3.4s, v3.4s, v12.4s -sqrdmulh v4.4S, v13.4S, v22.s[1] -mul v13.4S, v13.4S,v26.s[1] -sub v12.4s, v21.4s, v1.4s -add v21.4s, v21.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v22.s[2] -mul v15.4S, v15.4S,v26.s[2] -sub v16.4s, v23.4s, v18.4s -add v23.4s, v23.4s, v18.4s -mla v30.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v21.4S, v14.s[1] -sub v22.4s, v28.4s, v10.4s -ldr q18, [x0, #736] -add v28.4s, v28.4s, v10.4s -mla v29.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v12.4S, v14.s[2] -sub v10.4s, v25.4s, v30.4s -ldr q26, [x0, #672] -str q10, [x0, #272] -mla v13.4S, v4.4S, v31.s[0] -sqrdmulh v4.4S, v28.4S, v19.s[1] -add v25.4s, v25.4s, v30.4s -ldr q30, [x0, #544] -str q25, [x0, #256] -mla v15.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v22.4S, v19.s[2] -sub v25.4s, v20.4s, v29.4s -ldr q10, [x17, #+384] -str q25, [x0, #304] -mul v21.4S, v21.4S,v24.s[1] -mul v12.4S, v12.4S,v24.s[2] -add v20.4s, v20.4s, v29.4s -str q20, [x0, #288] -ldr q20, [x17, #+400] -mla v21.4S, v6.4S, v31.s[0] -mla v12.4S, v2.4S, v31.s[0] -sub v2.4s, v5.4s, v13.4s -str q2, [x0, #336] -mul v28.4S, v28.4S,v27.s[1] -mul v22.4S, v22.4S,v27.s[2] -add v5.4s, v5.4s, v13.4s -str q5, [x0, #320] -mla v28.4S, v4.4S, v31.s[0] -mla v22.4S, v1.4S, v31.s[0] -sub v1.4s, v0.4s, v15.4s -str q1, [x0, #368] -sqrdmulh v19.4S, v30.4S, v20.s[0] -mul v30.4S, v30.4S,v10.s[0] -add v0.4s, v0.4s, v15.4s -ldr q15, [x0, #560] -str q0, [x0, #352] -ldr q0, [x17, #+416] -sqrdmulh v1.4S, v15.4S, v20.s[0] -mul v15.4S, v15.4S,v10.s[0] -sub v27.4s, v3.4s, v21.4s -ldr q4, [x17, #+432] -str q27, [x0, #400] -ldr q27, [x0, #608] -sqrdmulh v5.4S, v27.4S, v4.s[0] -mul v27.4S, v27.4S,v0.s[0] -add v3.4s, v3.4s, v21.4s -str q3, [x0, #384] -ldr q3, [x0, #624] -sqrdmulh v21.4S, v3.4S, v4.s[0] -mul v3.4S, v3.4S,v0.s[0] -sub v13.4s, v11.4s, v12.4s -ldr q14, [x17, #+448] -str q13, [x0, #432] -ldr q13, [x17, #+464] -mla v30.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v26.4S, v13.s[0] -add v11.4s, v11.4s, v12.4s -ldr q12, [x0, #688] -str q11, [x0, #416] -mla v15.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v12.4S, v13.s[0] -sub v11.4s, v23.4s, v28.4s -ldr q2, [x17, #+480] -str q11, [x0, #464] -ldr q11, [x17, #+496] -mla v27.4S, v5.4S, v31.s[0] -sqrdmulh v5.4S, v18.4S, v11.s[0] -add v23.4s, v23.4s, v28.4s -str q23, [x0, #448] -ldr q23, [x0, #752] -mla v3.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v23.4S, v11.s[0] -sub v28.4s, v16.4s, v22.4s -ldr q24, [x0, #512] -str q28, [x0, #496] -ldr q28, [x0, #640] -mul v26.4S, v26.4S,v14.s[0] -mul v12.4S, v12.4S,v14.s[0] -add v16.4s, v16.4s, v22.4s -ldr q22, [x0, #528] -str q16, [x0, #480] -ldr q16, [x0, #656] -mla v26.4S, v19.4S, v31.s[0] -mla v12.4S, v1.4S, v31.s[0] -sub v1.4s, v24.4s, v30.4s -ldr q19, [x0, #576] -add v24.4s, v24.4s, v30.4s -ldr q30, [x0, #704] -mul v18.4S, v18.4S,v2.s[0] -mul v23.4S, v23.4S,v2.s[0] -sub v6.4s, v22.4s, v15.4s -ldr q29, [x0, #592] -add v22.4s, v22.4s, v15.4s -ldr q15, [x0, #720] -mla v18.4S, v5.4S, v31.s[0] -mla v23.4S, v21.4S, v31.s[0] -sub v21.4s, v19.4s, v27.4s -add v19.4s, v19.4s, v27.4s -sqrdmulh v27.4S, v22.4S, v20.s[1] -mul v22.4S, v22.4S,v10.s[1] -sub v5.4s, v29.4s, v3.4s -add v29.4s, v29.4s, v3.4s -sqrdmulh v3.4S, v6.4S, v20.s[2] -mul v6.4S, v6.4S,v10.s[2] -sub v25.4s, v28.4s, v26.4s -add v28.4s, v28.4s, v26.4s -sqrdmulh v20.4S, v29.4S, v4.s[1] -mul v29.4S, v29.4S,v0.s[1] -sub v26.4s, v16.4s, v12.4s -add v16.4s, v16.4s, v12.4s -sqrdmulh v12.4S, v5.4S, v4.s[2] -mul v5.4S, v5.4S,v0.s[2] -sub v10.4s, v30.4s, v18.4s -add v30.4s, v30.4s, v18.4s -mla v22.4S, v27.4S, v31.s[0] -sqrdmulh v27.4S, v16.4S, v13.s[1] -sub v4.4s, v15.4s, v23.4s -ldr q18, [x0, #992] -add v15.4s, v15.4s, v23.4s -mla v6.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v26.4S, v13.s[2] -sub v23.4s, v24.4s, v22.4s -ldr q0, [x0, #928] -str q23, [x0, #528] -mla v29.4S, v20.4S, v31.s[0] -sqrdmulh v20.4S, v15.4S, v11.s[1] -add v24.4s, v24.4s, v22.4s -ldr q22, [x0, #800] -str q24, [x0, #512] -mla v5.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v4.4S, v11.s[2] -sub v24.4s, v1.4s, v6.4s -ldr q23, [x17, #+512] -str q24, [x0, #560] -mul v16.4S, v16.4S,v14.s[1] -mul v26.4S, v26.4S,v14.s[2] -add v1.4s, v1.4s, v6.4s -str q1, [x0, #544] -ldr q1, [x17, #+528] -mla v16.4S, v27.4S, v31.s[0] -mla v26.4S, v3.4S, v31.s[0] -sub v3.4s, v19.4s, v29.4s -str q3, [x0, #592] -mul v15.4S, v15.4S,v2.s[1] -mul v4.4S, v4.4S,v2.s[2] -add v19.4s, v19.4s, v29.4s -str q19, [x0, #576] -mla v15.4S, v20.4S, v31.s[0] -mla v4.4S, v12.4S, v31.s[0] -sub v12.4s, v21.4s, v5.4s -str q12, [x0, #624] -sqrdmulh v11.4S, v22.4S, v1.s[0] -mul v22.4S, v22.4S,v23.s[0] -add v21.4s, v21.4s, v5.4s -ldr q5, [x0, #816] -str q21, [x0, #608] -ldr q21, [x17, #+544] -sqrdmulh v12.4S, v5.4S, v1.s[0] -mul v5.4S, v5.4S,v23.s[0] -sub v2.4s, v28.4s, v16.4s -ldr q20, [x17, #+560] -str q2, [x0, #656] -ldr q2, [x0, #864] -sqrdmulh v19.4S, v2.4S, v20.s[0] -mul v2.4S, v2.4S,v21.s[0] -add v28.4s, v28.4s, v16.4s -str q28, [x0, #640] -ldr q28, [x0, #880] -sqrdmulh v16.4S, v28.4S, v20.s[0] -mul v28.4S, v28.4S,v21.s[0] -sub v29.4s, v25.4s, v26.4s -ldr q13, [x17, #+576] -str q29, [x0, #688] -ldr q29, [x17, #+592] -mla v22.4S, v11.4S, v31.s[0] -sqrdmulh v11.4S, v0.4S, v29.s[0] -add v25.4s, v25.4s, v26.4s -ldr q26, [x0, #944] -str q25, [x0, #672] -mla v5.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v26.4S, v29.s[0] -sub v25.4s, v30.4s, v15.4s -ldr q3, [x17, #+608] -str q25, [x0, #720] -ldr q25, [x17, #+624] -mla v2.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v18.4S, v25.s[0] -add v30.4s, v30.4s, v15.4s -str q30, [x0, #704] -ldr q30, [x0, #1008] -mla v28.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v30.4S, v25.s[0] -sub v15.4s, v10.4s, v4.4s -ldr q14, [x0, #768] -str q15, [x0, #752] -ldr q15, [x0, #896] -mul v0.4S, v0.4S,v13.s[0] -mul v26.4S, v26.4S,v13.s[0] -add v10.4s, v10.4s, v4.4s -ldr q4, [x0, #784] -str q10, [x0, #736] -ldr q10, [x0, #912] -mla v0.4S, v11.4S, v31.s[0] -mla v26.4S, v12.4S, v31.s[0] -sub v12.4s, v14.4s, v22.4s -ldr q11, [x0, #832] -add v14.4s, v14.4s, v22.4s -ldr q22, [x0, #960] -mul v18.4S, v18.4S,v3.s[0] -mul v30.4S, v30.4S,v3.s[0] -sub v27.4s, v4.4s, v5.4s -ldr q6, [x0, #848] -add v4.4s, v4.4s, v5.4s -ldr q5, [x0, #976] -mla v18.4S, v19.4S, v31.s[0] -mla v30.4S, v16.4S, v31.s[0] -sub v16.4s, v11.4s, v2.4s -add v11.4s, v11.4s, v2.4s -sqrdmulh v2.4S, v4.4S, v1.s[1] -mul v4.4S, v4.4S,v23.s[1] -sub v19.4s, v6.4s, v28.4s -add v6.4s, v6.4s, v28.4s -sqrdmulh v28.4S, v27.4S, v1.s[2] -mul v27.4S, v27.4S,v23.s[2] -sub v24.4s, v15.4s, v0.4s -add v15.4s, v15.4s, v0.4s -sqrdmulh v1.4S, v6.4S, v20.s[1] -mul v6.4S, v6.4S,v21.s[1] -sub v0.4s, v10.4s, v26.4s -add v10.4s, v10.4s, v26.4s -sqrdmulh v26.4S, v19.4S, v20.s[2] -mul v19.4S, v19.4S,v21.s[2] -sub v23.4s, v22.4s, v18.4s -add v22.4s, v22.4s, v18.4s -mla v4.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v10.4S, v29.s[1] -sub v20.4s, v5.4s, v30.4s -add v5.4s, v5.4s, v30.4s -mla v27.4S, v28.4S, v31.s[0] -sqrdmulh v28.4S, v0.4S, v29.s[2] -sub v30.4s, v14.4s, v4.4s -str q30, [x0, #784] -mla v6.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v5.4S, v25.s[1] -add v14.4s, v14.4s, v4.4s -str q14, [x0, #768] -mla v19.4S, v26.4S, v31.s[0] -sqrdmulh v26.4S, v20.4S, v25.s[2] -sub v14.4s, v12.4s, v27.4s -str q14, [x0, #816] -mul v10.4S, v10.4S,v13.s[1] -mul v0.4S, v0.4S,v13.s[2] -add v12.4s, v12.4s, v27.4s -str q12, [x0, #800] -mla v10.4S, v2.4S, v31.s[0] -mla v0.4S, v28.4S, v31.s[0] -sub v28.4s, v11.4s, v6.4s -str q28, [x0, #848] -mul v5.4S, v5.4S,v3.s[1] -mul v20.4S, v20.4S,v3.s[2] -add v11.4s, v11.4s, v6.4s -str q11, [x0, #832] -mla v5.4S, v1.4S, v31.s[0] -mla v20.4S, v26.4S, v31.s[0] -sub v26.4s, v16.4s, v19.4s -str q26, [x0, #880] -add v16.4s, v16.4s, v19.4s -str q16, [x0, #864] -sub v16.4s, v15.4s, v10.4s -str q16, [x0, #912] -add v15.4s, v15.4s, v10.4s -str q15, [x0, #896] -sub v15.4s, v24.4s, v0.4s -str q15, [x0, #944] -add v24.4s, v24.4s, v0.4s -str q24, [x0, #928] -sub v24.4s, v22.4s, v5.4s -str q24, [x0, #976] -add v22.4s, v22.4s, v5.4s -str q22, [x0, #960] -sub v22.4s, v23.4s, v20.4s -str q22, [x0, #1008] -add v23.4s, v23.4s, v20.4s -str q23, [x0, #992] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1464 -// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_2.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_2.s deleted file mode 100644 index b48ef69..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_2.s +++ /dev/null @@ -1,1494 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_7_z4_2 -.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_2 -ntt_u32_incomplete_neon_asm_var_4_2_7_z4_2: -_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_2: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #928] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #992] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q18, [x0, #800] -sqrdmulh v17.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -ldr q16, [x0, #864] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -mla v22.4S, v21.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -mla v18.4S, v17.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -ldr q3, [x0, #544] -sqrdmulh v17.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -ldr q19, [x0, #608] -sqrdmulh v21.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q2, [x0, #672] -ldr q1, [x0, #416] -sqrdmulh v0.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v15.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -ldr q22, [x0, #736] -ldr q14, [x0, #480] -sqrdmulh v13.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v12.4s, v14.4s, v20.4s -add v14.4s, v14.4s, v20.4s -ldr q20, [x0, #288] -mla v3.4S, v17.4S, v31.s[0] -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v20.4s, v18.4s -mla v2.4S, v0.4S, v31.s[0] -mla v22.4S, v13.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -ldr q18, [x0, #352] -sqrdmulh v13.4S, v1.4S, v29.s[1] -mul v1.4S, v1.4S,v30.s[1] -sub v0.4s, v18.4s, v16.4s -sqrdmulh v17.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -add v18.4s, v18.4s, v16.4s -ldr q16, [x0, #32] -sqrdmulh v11.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v10.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -ldr q3, [x0, #96] -sqrdmulh v9.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v8.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -ldr q19, [x0, #160] -mla v1.4S, v13.4S, v31.s[0] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v19.4s, v2.4s -mla v20.4S, v11.4S, v31.s[0] -mla v18.4S, v9.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -ldr q2, [x0, #224] -sqrdmulh v9.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v11.4s, v2.4s, v22.4s -sqrdmulh v13.4S, v12.4S, v29.s[2] -mul v12.4S, v12.4S,v30.s[2] -add v2.4s, v2.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v7.4s, v19.4s, v1.4s -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v29.s[2] -mul v0.4S, v0.4S,v30.s[2] -sub v6.4s, v2.4s, v14.4s -add v2.4s, v2.4s, v14.4s -mla v15.4S, v9.4S, v31.s[0] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v16.4s, v20.4s -mla v21.4S, v22.4S, v31.s[0] -mla v0.4S, v1.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -sub v1.4s, v3.4s, v18.4s -sqrdmulh v22.4S, v6.4S, v27.s[1] -mul v6.4S, v6.4S,v28.s[1] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v19.4S, v27.s[0] -mul v19.4S, v19.4S,v28.s[0] -sub v9.4s, v17.4s, v15.4s -add v17.4s, v17.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v14.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -mla v7.4S, v20.4S, v31.s[0] -mla v6.4S, v22.4S, v31.s[0] -sub v22.4s, v10.4s, v21.4s -mla v19.4S, v18.4S, v31.s[0] -mla v2.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v15.4s, v8.4s, v0.4s -sqrdmulh v18.4S, v11.4S, v27.s[2] -mul v11.4S, v11.4S,v28.s[2] -add v8.4s, v8.4s, v0.4s -sqrdmulh v0.4S, v9.4S, v27.s[3] -mul v9.4S, v9.4S,v28.s[3] -sub v20.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[3] -mul v14.4S, v14.4S,v28.s[3] -sub v12.4s, v1.4s, v6.4s -add v1.4s, v1.4s, v6.4s -mla v17.4S, v21.4S, v31.s[0] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v16.4s, v19.4s -mla v9.4S, v0.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v16.4s, v16.4s, v19.4s -sqrdmulh v19.4S, v1.4S, v25.s[2] -mul v1.4S, v1.4S,v26.s[2] -sub v7.4s, v3.4s, v2.4s -sqrdmulh v0.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v7.4S, v25.s[1] -mul v7.4S, v7.4S,v26.s[1] -sub v21.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -sqrdmulh v17.4S, v3.4S, v25.s[0] -mul v3.4S, v3.4S,v26.s[0] -sub v6.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -mla v1.4S, v19.4S, v31.s[0] -mla v12.4S, v0.4S, v31.s[0] -sub v0.4s, v22.4s, v9.4s -mla v7.4S, v2.4S, v31.s[0] -mla v3.4S, v17.4S, v31.s[0] -add v22.4s, v22.4s, v9.4s -sqrdmulh v9.4S, v8.4S, v23.s[0] -mul v8.4S, v8.4S,v24.s[0] -sub v17.4s, v15.4s, v14.4s -sqrdmulh v2.4S, v6.4S, v23.s[1] -mul v6.4S, v6.4S,v24.s[1] -add v15.4s, v15.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v23.s[2] -mul v15.4S, v15.4S,v24.s[2] -sub v19.4s, v13.4s, v1.4s -add v13.4s, v13.4s, v1.4s -sqrdmulh v1.4S, v17.4S, v23.s[3] -mul v17.4S, v17.4S,v24.s[3] -sub v11.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -mla v8.4S, v9.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -sub v2.4s, v18.4s, v7.4s -str q13, [x0, #288] -mla v15.4S, v14.4S, v31.s[0] -mla v17.4S, v1.4S, v31.s[0] -add v18.4s, v18.4s, v7.4s -str q19, [x0, #352] -ldr q19, [x0, #944] -sqrdmulh v7.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v1.4s, v16.4s, v3.4s -str q20, [x0, #416] -ldr q20, [x0, #1008] -sqrdmulh v14.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v16.4s, v16.4s, v3.4s -str q11, [x0, #480] -ldr q11, [x0, #816] -sqrdmulh v3.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -sub v13.4s, v10.4s, v8.4s -add v10.4s, v10.4s, v8.4s -ldr q8, [x0, #880] -sqrdmulh v9.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v12.4s, v21.4s, v6.4s -add v21.4s, v21.4s, v6.4s -mla v19.4S, v7.4S, v31.s[0] -mla v20.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v15.4s -str q18, [x0, #160] -mla v11.4S, v3.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -str q2, [x0, #224] -ldr q2, [x0, #560] -sqrdmulh v15.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v9.4s, v0.4s, v17.4s -str q16, [x0, #32] -ldr q16, [x0, #624] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -add v0.4s, v0.4s, v17.4s -str q1, [x0, #96] -ldr q1, [x0, #688] -ldr q17, [x0, #432] -sqrdmulh v18.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -sub v7.4s, v17.4s, v19.4s -add v17.4s, v17.4s, v19.4s -ldr q19, [x0, #752] -ldr q6, [x0, #496] -sqrdmulh v5.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v4.4s, v6.4s, v20.4s -add v6.4s, v6.4s, v20.4s -ldr q20, [x0, #304] -mla v2.4S, v15.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -sub v3.4s, v20.4s, v11.4s -str q10, [x0, #544] -mla v1.4S, v18.4S, v31.s[0] -mla v19.4S, v5.4S, v31.s[0] -add v20.4s, v20.4s, v11.4s -str q13, [x0, #608] -ldr q13, [x0, #368] -sqrdmulh v11.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v5.4s, v13.4s, v8.4s -str q21, [x0, #672] -sqrdmulh v21.4S, v6.4S, v29.s[1] -mul v6.4S, v6.4S,v30.s[1] -add v13.4s, v13.4s, v8.4s -str q12, [x0, #736] -ldr q12, [x0, #48] -sqrdmulh v8.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v18.4s, v12.4s, v2.4s -add v12.4s, v12.4s, v2.4s -ldr q2, [x0, #112] -sqrdmulh v10.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v15.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -ldr q16, [x0, #176] -mla v17.4S, v11.4S, v31.s[0] -mla v6.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v1.4s -str q22, [x0, #800] -mla v20.4S, v8.4S, v31.s[0] -mla v13.4S, v10.4S, v31.s[0] -add v16.4s, v16.4s, v1.4s -str q14, [x0, #864] -ldr q14, [x0, #240] -sqrdmulh v1.4S, v7.4S, v29.s[2] -mul v7.4S, v7.4S,v30.s[2] -sub v10.4s, v14.4s, v19.4s -str q0, [x0, #928] -sqrdmulh v0.4S, v4.4S, v29.s[2] -mul v4.4S, v4.4S,v30.s[2] -add v14.4s, v14.4s, v19.4s -str q9, [x0, #992] -sqrdmulh v9.4S, v3.4S, v29.s[2] -mul v3.4S, v3.4S,v30.s[2] -sub v19.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -sqrdmulh v17.4S, v5.4S, v29.s[2] -mul v5.4S, v5.4S,v30.s[2] -sub v8.4s, v14.4s, v6.4s -add v14.4s, v14.4s, v6.4s -mla v7.4S, v1.4S, v31.s[0] -mla v4.4S, v0.4S, v31.s[0] -sub v0.4s, v12.4s, v20.4s -mla v3.4S, v9.4S, v31.s[0] -mla v5.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -sub v17.4s, v2.4s, v13.4s -sqrdmulh v9.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -add v2.4s, v2.4s, v13.4s -sqrdmulh v13.4S, v16.4S, v27.s[0] -mul v16.4S, v16.4S,v28.s[0] -sub v1.4s, v21.4s, v7.4s -add v21.4s, v21.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v6.4s, v10.4s, v4.4s -add v10.4s, v10.4s, v4.4s -mla v19.4S, v20.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -sub v9.4s, v18.4s, v3.4s -mla v16.4S, v13.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[2] -mul v21.4S, v21.4S,v28.s[2] -sub v7.4s, v15.4s, v5.4s -sqrdmulh v13.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -add v15.4s, v15.4s, v5.4s -sqrdmulh v5.4S, v1.4S, v27.s[3] -mul v1.4S, v1.4S,v28.s[3] -sub v20.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v27.s[3] -mul v6.4S, v6.4S,v28.s[3] -sub v4.4s, v17.4s, v8.4s -add v17.4s, v17.4s, v8.4s -mla v21.4S, v3.4S, v31.s[0] -mla v10.4S, v13.4S, v31.s[0] -sub v13.4s, v12.4s, v16.4s -mla v1.4S, v5.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v25.s[2] -mul v17.4S, v17.4S,v26.s[2] -sub v19.4s, v2.4s, v14.4s -sqrdmulh v5.4S, v4.4S, v25.s[3] -mul v4.4S, v4.4S,v26.s[3] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v3.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v15.4s, v10.4s -add v15.4s, v15.4s, v10.4s -mla v17.4S, v16.4S, v31.s[0] -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v9.4s, v1.4s -mla v19.4S, v14.4S, v31.s[0] -mla v2.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v23.s[0] -mul v15.4S, v15.4S,v24.s[0] -sub v21.4s, v7.4s, v6.4s -sqrdmulh v14.4S, v8.4S, v23.s[1] -mul v8.4S, v8.4S,v24.s[1] -add v7.4s, v7.4s, v6.4s -sqrdmulh v6.4S, v7.4S, v23.s[2] -mul v7.4S, v7.4S,v24.s[2] -sub v16.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v23.s[3] -mul v21.4S, v21.4S,v24.s[3] -sub v10.4s, v20.4s, v4.4s -add v20.4s, v20.4s, v4.4s -mla v15.4S, v1.4S, v31.s[0] -mla v8.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v19.4s -str q0, [x0, #304] -mla v7.4S, v6.4S, v31.s[0] -mla v21.4S, v17.4S, v31.s[0] -add v13.4s, v13.4s, v19.4s -str q16, [x0, #368] -ldr q16, [x0, #896] -sqrdmulh v19.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v17.4s, v12.4s, v2.4s -str q20, [x0, #432] -ldr q20, [x0, #960] -sqrdmulh v6.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v12.4s, v12.4s, v2.4s -str q10, [x0, #496] -ldr q10, [x0, #768] -sqrdmulh v2.4S, v10.4S, v29.s[0] -mul v10.4S, v10.4S,v30.s[0] -sub v0.4s, v18.4s, v15.4s -add v18.4s, v18.4s, v15.4s -ldr q15, [x0, #832] -sqrdmulh v1.4S, v15.4S, v29.s[0] -mul v15.4S, v15.4S,v30.s[0] -sub v4.4s, v3.4s, v8.4s -add v3.4s, v3.4s, v8.4s -mla v16.4S, v19.4S, v31.s[0] -mla v20.4S, v6.4S, v31.s[0] -sub v6.4s, v9.4s, v7.4s -str q13, [x0, #176] -mla v10.4S, v2.4S, v31.s[0] -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v7.4s -str q14, [x0, #240] -ldr q14, [x0, #512] -sqrdmulh v7.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v1.4s, v5.4s, v21.4s -str q12, [x0, #48] -ldr q12, [x0, #576] -sqrdmulh v2.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -add v5.4s, v5.4s, v21.4s -str q17, [x0, #112] -ldr q17, [x0, #640] -ldr q21, [x0, #384] -sqrdmulh v13.4S, v17.4S, v29.s[0] -mul v17.4S, v17.4S,v30.s[0] -sub v19.4s, v21.4s, v16.4s -add v21.4s, v21.4s, v16.4s -ldr q16, [x0, #704] -ldr q8, [x0, #448] -sqrdmulh v22.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v11.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -ldr q20, [x0, #256] -mla v14.4S, v7.4S, v31.s[0] -mla v12.4S, v2.4S, v31.s[0] -sub v2.4s, v20.4s, v10.4s -str q18, [x0, #560] -mla v17.4S, v13.4S, v31.s[0] -mla v16.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -str q0, [x0, #624] -ldr q0, [x0, #320] -sqrdmulh v10.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v22.4s, v0.4s, v15.4s -str q3, [x0, #688] -sqrdmulh v3.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -add v0.4s, v0.4s, v15.4s -str q4, [x0, #752] -ldr q4, [x0, #0] -sqrdmulh v15.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v13.4s, v4.4s, v14.4s -add v4.4s, v4.4s, v14.4s -ldr q14, [x0, #64] -sqrdmulh v18.4S, v0.4S, v29.s[1] -mul v0.4S, v0.4S,v30.s[1] -sub v7.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -ldr q12, [x0, #128] -mla v21.4S, v10.4S, v31.s[0] -mla v8.4S, v3.4S, v31.s[0] -sub v3.4s, v12.4s, v17.4s -str q9, [x0, #816] -mla v20.4S, v15.4S, v31.s[0] -mla v0.4S, v18.4S, v31.s[0] -add v12.4s, v12.4s, v17.4s -str q6, [x0, #880] -ldr q6, [x0, #192] -sqrdmulh v17.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v18.4s, v6.4s, v16.4s -str q5, [x0, #944] -sqrdmulh v5.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -add v6.4s, v6.4s, v16.4s -str q1, [x0, #1008] -sqrdmulh v1.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v16.4s, v12.4s, v21.4s -add v12.4s, v12.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v15.4s, v6.4s, v8.4s -add v6.4s, v6.4s, v8.4s -mla v19.4S, v17.4S, v31.s[0] -mla v11.4S, v5.4S, v31.s[0] -sub v5.4s, v4.4s, v20.4s -mla v2.4S, v1.4S, v31.s[0] -mla v22.4S, v21.4S, v31.s[0] -add v4.4s, v4.4s, v20.4s -sqrdmulh v20.4S, v16.4S, v27.s[1] -mul v16.4S, v16.4S,v28.s[1] -sub v21.4s, v14.4s, v0.4s -sqrdmulh v1.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v17.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v27.s[0] -mul v6.4S, v6.4S,v28.s[0] -sub v8.4s, v18.4s, v11.4s -add v18.4s, v18.4s, v11.4s -mla v16.4S, v20.4S, v31.s[0] -mla v15.4S, v1.4S, v31.s[0] -sub v1.4s, v13.4s, v2.4s -mla v12.4S, v0.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v3.4S, v27.s[2] -mul v3.4S, v3.4S,v28.s[2] -sub v19.4s, v7.4s, v22.4s -sqrdmulh v0.4S, v18.4S, v27.s[2] -mul v18.4S, v18.4S,v28.s[2] -add v7.4s, v7.4s, v22.4s -sqrdmulh v22.4S, v17.4S, v27.s[3] -mul v17.4S, v17.4S,v28.s[3] -sub v20.4s, v5.4s, v16.4s -add v5.4s, v5.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v11.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -mla v3.4S, v2.4S, v31.s[0] -mla v18.4S, v0.4S, v31.s[0] -sub v0.4s, v4.4s, v12.4s -mla v17.4S, v22.4S, v31.s[0] -mla v8.4S, v16.4S, v31.s[0] -add v4.4s, v4.4s, v12.4s -sqrdmulh v12.4S, v21.4S, v25.s[2] -mul v21.4S, v21.4S,v26.s[2] -sub v16.4s, v14.4s, v6.4s -sqrdmulh v22.4S, v11.4S, v25.s[3] -mul v11.4S, v11.4S,v26.s[3] -add v14.4s, v14.4s, v6.4s -sqrdmulh v6.4S, v16.4S, v25.s[1] -mul v16.4S, v16.4S,v26.s[1] -sub v2.4s, v13.4s, v3.4s -add v13.4s, v13.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v25.s[0] -mul v14.4S, v14.4S,v26.s[0] -sub v15.4s, v7.4s, v18.4s -add v7.4s, v7.4s, v18.4s -mla v21.4S, v12.4S, v31.s[0] -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v1.4s, v17.4s -mla v16.4S, v6.4S, v31.s[0] -mla v14.4S, v3.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -sqrdmulh v17.4S, v7.4S, v23.s[0] -mul v7.4S, v7.4S,v24.s[0] -sub v3.4s, v19.4s, v8.4s -sqrdmulh v6.4S, v15.4S, v23.s[1] -mul v15.4S, v15.4S,v24.s[1] -add v19.4s, v19.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v23.s[2] -mul v19.4S, v19.4S,v24.s[2] -sub v12.4s, v5.4s, v21.4s -add v5.4s, v5.4s, v21.4s -sqrdmulh v21.4S, v3.4S, v23.s[3] -mul v3.4S, v3.4S,v24.s[3] -sub v18.4s, v20.4s, v11.4s -add v20.4s, v20.4s, v11.4s -mla v7.4S, v17.4S, v31.s[0] -mla v15.4S, v6.4S, v31.s[0] -sub v6.4s, v0.4s, v16.4s -str q5, [x0, #256] -mla v19.4S, v8.4S, v31.s[0] -mla v3.4S, v21.4S, v31.s[0] -add v0.4s, v0.4s, v16.4s -str q12, [x0, #320] -ldr q12, [x0, #912] -sqrdmulh v16.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -sub v21.4s, v4.4s, v14.4s -str q20, [x0, #384] -ldr q20, [x0, #976] -sqrdmulh v8.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v4.4s, v4.4s, v14.4s -str q18, [x0, #448] -ldr q18, [x0, #784] -sqrdmulh v14.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -sub v5.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -ldr q7, [x0, #848] -sqrdmulh v17.4S, v7.4S, v29.s[0] -mul v7.4S, v7.4S,v30.s[0] -sub v11.4s, v2.4s, v15.4s -add v2.4s, v2.4s, v15.4s -mla v12.4S, v16.4S, v31.s[0] -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v1.4s, v19.4s -str q0, [x0, #128] -mla v18.4S, v14.4S, v31.s[0] -mla v7.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v19.4s -str q6, [x0, #192] -ldr q6, [x0, #528] -sqrdmulh v19.4S, v6.4S, v29.s[0] -mul v6.4S, v6.4S,v30.s[0] -sub v17.4s, v22.4s, v3.4s -str q4, [x0, #0] -ldr q4, [x0, #592] -sqrdmulh v14.4S, v4.4S, v29.s[0] -mul v4.4S, v4.4S,v30.s[0] -add v22.4s, v22.4s, v3.4s -str q21, [x0, #64] -ldr q21, [x0, #656] -ldr q3, [x0, #400] -sqrdmulh v0.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -sub v16.4s, v3.4s, v12.4s -add v3.4s, v3.4s, v12.4s -ldr q12, [x0, #720] -ldr q15, [x0, #464] -sqrdmulh v9.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -sub v10.4s, v15.4s, v20.4s -add v15.4s, v15.4s, v20.4s -ldr q20, [x0, #272] -mla v6.4S, v19.4S, v31.s[0] -mla v4.4S, v14.4S, v31.s[0] -sub v14.4s, v20.4s, v18.4s -str q13, [x0, #512] -mla v21.4S, v0.4S, v31.s[0] -mla v12.4S, v9.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -str q5, [x0, #576] -ldr q5, [x0, #336] -sqrdmulh v18.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v9.4s, v5.4s, v7.4s -str q2, [x0, #640] -sqrdmulh v2.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -add v5.4s, v5.4s, v7.4s -str q11, [x0, #704] -ldr q11, [x0, #16] -sqrdmulh v7.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v0.4s, v11.4s, v6.4s -add v11.4s, v11.4s, v6.4s -ldr q6, [x0, #80] -sqrdmulh v13.4S, v5.4S, v29.s[1] -mul v5.4S, v5.4S,v30.s[1] -sub v19.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -ldr q4, [x0, #144] -mla v3.4S, v18.4S, v31.s[0] -mla v15.4S, v2.4S, v31.s[0] -sub v2.4s, v4.4s, v21.4s -str q1, [x0, #768] -mla v20.4S, v7.4S, v31.s[0] -mla v5.4S, v13.4S, v31.s[0] -add v4.4s, v4.4s, v21.4s -str q8, [x0, #832] -ldr q8, [x0, #208] -sqrdmulh v21.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -sub v13.4s, v8.4s, v12.4s -str q22, [x0, #896] -sqrdmulh v22.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -add v8.4s, v8.4s, v12.4s -str q17, [x0, #960] -sqrdmulh v17.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v12.4s, v4.4s, v3.4s -add v4.4s, v4.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v7.4s, v8.4s, v15.4s -add v8.4s, v8.4s, v15.4s -mla v16.4S, v21.4S, v31.s[0] -mla v10.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v20.4s -mla v14.4S, v17.4S, v31.s[0] -mla v9.4S, v3.4S, v31.s[0] -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v12.4S, v27.s[1] -mul v12.4S, v12.4S,v28.s[1] -sub v3.4s, v6.4s, v5.4s -sqrdmulh v17.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v4.4S, v27.s[0] -mul v4.4S, v4.4S,v28.s[0] -sub v21.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v27.s[0] -mul v8.4S, v8.4S,v28.s[0] -sub v15.4s, v13.4s, v10.4s -add v13.4s, v13.4s, v10.4s -mla v12.4S, v20.4S, v31.s[0] -mla v7.4S, v17.4S, v31.s[0] -sub v17.4s, v0.4s, v14.4s -mla v4.4S, v5.4S, v31.s[0] -mla v8.4S, v16.4S, v31.s[0] -add v0.4s, v0.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v16.4s, v19.4s, v9.4s -sqrdmulh v5.4S, v13.4S, v27.s[2] -mul v13.4S, v13.4S,v28.s[2] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v10.4s, v3.4s, v7.4s -add v3.4s, v3.4s, v7.4s -mla v2.4S, v14.4S, v31.s[0] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v11.4s, v4.4s -mla v21.4S, v9.4S, v31.s[0] -mla v15.4S, v12.4S, v31.s[0] -add v11.4s, v11.4s, v4.4s -sqrdmulh v4.4S, v3.4S, v25.s[2] -mul v3.4S, v3.4S,v26.s[2] -sub v12.4s, v6.4s, v8.4s -sqrdmulh v9.4S, v10.4S, v25.s[3] -mul v10.4S, v10.4S,v26.s[3] -add v6.4s, v6.4s, v8.4s -sqrdmulh v8.4S, v12.4S, v25.s[1] -mul v12.4S, v12.4S,v26.s[1] -sub v14.4s, v0.4s, v2.4s -add v0.4s, v0.4s, v2.4s -sqrdmulh v2.4S, v6.4S, v25.s[0] -mul v6.4S, v6.4S,v26.s[0] -sub v7.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -mla v3.4S, v4.4S, v31.s[0] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v17.4s, v21.4s -mla v12.4S, v8.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v19.4S, v23.s[0] -mul v19.4S, v19.4S,v24.s[0] -sub v2.4s, v16.4s, v15.4s -sqrdmulh v8.4S, v7.4S, v23.s[1] -mul v7.4S, v7.4S,v24.s[1] -add v16.4s, v16.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v23.s[2] -mul v16.4S, v16.4S,v24.s[2] -sub v4.4s, v22.4s, v3.4s -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v2.4S, v23.s[3] -mul v2.4S, v2.4S,v24.s[3] -sub v13.4s, v20.4s, v10.4s -add v20.4s, v20.4s, v10.4s -mla v19.4S, v21.4S, v31.s[0] -mla v7.4S, v8.4S, v31.s[0] -sub v8.4s, v5.4s, v12.4s -str q22, [x0, #272] -mla v16.4S, v15.4S, v31.s[0] -mla v2.4S, v3.4S, v31.s[0] -add v5.4s, v5.4s, v12.4s -str q4, [x0, #336] -sub v23.4s, v11.4s, v6.4s -str q20, [x0, #400] -add v11.4s, v11.4s, v6.4s -str q13, [x0, #464] -sub v13.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sub v19.4s, v14.4s, v7.4s -add v14.4s, v14.4s, v7.4s -sub v7.4s, v17.4s, v16.4s -str q5, [x0, #144] -add v17.4s, v17.4s, v16.4s -str q8, [x0, #208] -sub v8.4s, v9.4s, v2.4s -str q11, [x0, #16] -add v9.4s, v9.4s, v2.4s -str q23, [x0, #80] -str q0, [x0, #528] -str q13, [x0, #592] -str q14, [x0, #656] -str q19, [x0, #720] -str q17, [x0, #784] -str q7, [x0, #848] -str q9, [x0, #912] -str q8, [x0, #976] -ldr q18, [x17, #+128] -ldr q1, [x17, #+144] -ldr q10, [x17, #+160] -ldr q21, [x17, #+176] -ldr q22, [x17, #+192] -ldr q15, [x17, #+208] -ldr q3, [x17, #+224] -ldr q12, [x17, #+240] -ldr q4, [x0, #32] -ldr q30, [x0, #48] -ldr q29, [x0, #0] -ldr q28, [x0, #96] -ldr q27, [x0, #112] -ldr q26, [x0, #64] -ldr q25, [x0, #160] -ldr q24, [x0, #176] -ldr q20, [x0, #128] -ldr q6, [x0, #224] -ldr q5, [x0, #240] -ldr q16, [x0, #192] -sqrdmulh v11.4S, v4.4S, v1.s[0] -mul v4.4S, v4.4S,v18.s[0] -mla v4.4S, v11.4S, v31.s[0] -sub v11.4s, v29.4s, v4.4s -add v29.4s, v29.4s, v4.4s -ldr q4, [x0, #16] -sqrdmulh v2.4S, v28.4S, v21.s[0] -mul v28.4S, v28.4S,v10.s[0] -mla v28.4S, v2.4S, v31.s[0] -sub v2.4s, v26.4s, v28.4s -add v26.4s, v26.4s, v28.4s -ldr q28, [x0, #80] -sqrdmulh v23.4S, v25.4S, v15.s[0] -mul v25.4S, v25.4S,v22.s[0] -mla v25.4S, v23.4S, v31.s[0] -sub v23.4s, v20.4s, v25.4s -add v20.4s, v20.4s, v25.4s -ldr q25, [x0, #144] -sqrdmulh v0.4S, v6.4S, v12.s[0] -mul v6.4S, v6.4S,v3.s[0] -mla v6.4S, v0.4S, v31.s[0] -sub v0.4s, v16.4s, v6.4s -add v16.4s, v16.4s, v6.4s -ldr q6, [x0, #208] -sqrdmulh v13.4S, v30.4S, v1.s[0] -mul v30.4S, v30.4S,v18.s[0] -mla v30.4S, v13.4S, v31.s[0] -sub v13.4s, v4.4s, v30.4s -add v4.4s, v4.4s, v30.4s -sqrdmulh v30.4S, v27.4S, v21.s[0] -mul v27.4S, v27.4S,v10.s[0] -mla v27.4S, v30.4S, v31.s[0] -sub v30.4s, v28.4s, v27.4s -add v28.4s, v28.4s, v27.4s -sqrdmulh v27.4S, v24.4S, v15.s[0] -mul v24.4S, v24.4S,v22.s[0] -mla v24.4S, v27.4S, v31.s[0] -sub v27.4s, v25.4s, v24.4s -add v25.4s, v25.4s, v24.4s -sqrdmulh v24.4S, v5.4S, v12.s[0] -mul v5.4S, v5.4S,v3.s[0] -mla v5.4S, v24.4S, v31.s[0] -sub v24.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v4.4S, v1.s[1] -mul v4.4S, v4.4S,v18.s[1] -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v29.4s, v4.4s -add v29.4s, v29.4s, v4.4s -sqrdmulh v4.4S, v28.4S, v21.s[1] -mul v28.4S, v28.4S,v10.s[1] -mla v28.4S, v4.4S, v31.s[0] -sub v4.4s, v26.4s, v28.4s -add v26.4s, v26.4s, v28.4s -str q29, [x0, #0] -str q5, [x0, #16] -sqrdmulh v5.4S, v25.4S, v15.s[1] -mul v25.4S, v25.4S,v22.s[1] -mla v25.4S, v5.4S, v31.s[0] -sub v5.4s, v20.4s, v25.4s -add v20.4s, v20.4s, v25.4s -str q26, [x0, #64] -str q4, [x0, #80] -sqrdmulh v4.4S, v6.4S, v12.s[1] -mul v6.4S, v6.4S,v3.s[1] -mla v6.4S, v4.4S, v31.s[0] -sub v4.4s, v16.4s, v6.4s -add v16.4s, v16.4s, v6.4s -str q20, [x0, #128] -str q5, [x0, #144] -sqrdmulh v5.4S, v13.4S, v1.s[2] -mul v13.4S, v13.4S,v18.s[2] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -str q16, [x0, #192] -str q4, [x0, #208] -ldr q1, [x17, #+256] -ldr q18, [x17, #+272] -sqrdmulh v4.4S, v30.4S, v21.s[2] -mul v30.4S, v30.4S,v10.s[2] -mla v30.4S, v4.4S, v31.s[0] -sub v4.4s, v2.4s, v30.4s -add v2.4s, v2.4s, v30.4s -ldr q21, [x17, #+288] -ldr q10, [x17, #+304] -sqrdmulh v30.4S, v27.4S, v15.s[2] -mul v27.4S, v27.4S,v22.s[2] -mla v27.4S, v30.4S, v31.s[0] -sub v30.4s, v23.4s, v27.4s -add v23.4s, v23.4s, v27.4s -ldr q15, [x17, #+320] -ldr q22, [x17, #+336] -sqrdmulh v27.4S, v24.4S, v12.s[2] -mul v24.4S, v24.4S,v3.s[2] -mla v24.4S, v27.4S, v31.s[0] -sub v27.4s, v0.4s, v24.4s -add v0.4s, v0.4s, v24.4s -ldr q12, [x17, #+352] -ldr q3, [x17, #+368] -str q11, [x0, #32] -str q5, [x0, #48] -str q2, [x0, #96] -str q4, [x0, #112] -str q23, [x0, #160] -str q30, [x0, #176] -str q0, [x0, #224] -str q27, [x0, #240] -ldr q27, [x0, #288] -ldr q0, [x0, #304] -ldr q30, [x0, #256] -ldr q23, [x0, #352] -ldr q4, [x0, #368] -ldr q2, [x0, #320] -ldr q5, [x0, #416] -ldr q11, [x0, #432] -ldr q24, [x0, #384] -ldr q16, [x0, #480] -ldr q13, [x0, #496] -ldr q20, [x0, #448] -sqrdmulh v6.4S, v27.4S, v18.s[0] -mul v27.4S, v27.4S,v1.s[0] -mla v27.4S, v6.4S, v31.s[0] -sub v6.4s, v30.4s, v27.4s -add v30.4s, v30.4s, v27.4s -ldr q27, [x0, #272] -sqrdmulh v26.4S, v23.4S, v10.s[0] -mul v23.4S, v23.4S,v21.s[0] -mla v23.4S, v26.4S, v31.s[0] -sub v26.4s, v2.4s, v23.4s -add v2.4s, v2.4s, v23.4s -ldr q23, [x0, #336] -sqrdmulh v25.4S, v5.4S, v22.s[0] -mul v5.4S, v5.4S,v15.s[0] -mla v5.4S, v25.4S, v31.s[0] -sub v25.4s, v24.4s, v5.4s -add v24.4s, v24.4s, v5.4s -ldr q5, [x0, #400] -sqrdmulh v29.4S, v16.4S, v3.s[0] -mul v16.4S, v16.4S,v12.s[0] -mla v16.4S, v29.4S, v31.s[0] -sub v29.4s, v20.4s, v16.4s -add v20.4s, v20.4s, v16.4s -ldr q16, [x0, #464] -sqrdmulh v28.4S, v0.4S, v18.s[0] -mul v0.4S, v0.4S,v1.s[0] -mla v0.4S, v28.4S, v31.s[0] -sub v28.4s, v27.4s, v0.4s -add v27.4s, v27.4s, v0.4s -sqrdmulh v0.4S, v4.4S, v10.s[0] -mul v4.4S, v4.4S,v21.s[0] -mla v4.4S, v0.4S, v31.s[0] -sub v0.4s, v23.4s, v4.4s -add v23.4s, v23.4s, v4.4s -sqrdmulh v4.4S, v11.4S, v22.s[0] -mul v11.4S, v11.4S,v15.s[0] -mla v11.4S, v4.4S, v31.s[0] -sub v4.4s, v5.4s, v11.4s -add v5.4s, v5.4s, v11.4s -sqrdmulh v11.4S, v13.4S, v3.s[0] -mul v13.4S, v13.4S,v12.s[0] -mla v13.4S, v11.4S, v31.s[0] -sub v11.4s, v16.4s, v13.4s -add v16.4s, v16.4s, v13.4s -sqrdmulh v13.4S, v27.4S, v18.s[1] -mul v27.4S, v27.4S,v1.s[1] -mla v27.4S, v13.4S, v31.s[0] -sub v13.4s, v30.4s, v27.4s -add v30.4s, v30.4s, v27.4s -sqrdmulh v27.4S, v23.4S, v10.s[1] -mul v23.4S, v23.4S,v21.s[1] -mla v23.4S, v27.4S, v31.s[0] -sub v27.4s, v2.4s, v23.4s -add v2.4s, v2.4s, v23.4s -str q30, [x0, #256] -str q13, [x0, #272] -sqrdmulh v13.4S, v5.4S, v22.s[1] -mul v5.4S, v5.4S,v15.s[1] -mla v5.4S, v13.4S, v31.s[0] -sub v13.4s, v24.4s, v5.4s -add v24.4s, v24.4s, v5.4s -str q2, [x0, #320] -str q27, [x0, #336] -sqrdmulh v27.4S, v16.4S, v3.s[1] -mul v16.4S, v16.4S,v12.s[1] -mla v16.4S, v27.4S, v31.s[0] -sub v27.4s, v20.4s, v16.4s -add v20.4s, v20.4s, v16.4s -str q24, [x0, #384] -str q13, [x0, #400] -sqrdmulh v13.4S, v28.4S, v18.s[2] -mul v28.4S, v28.4S,v1.s[2] -mla v28.4S, v13.4S, v31.s[0] -sub v13.4s, v6.4s, v28.4s -add v6.4s, v6.4s, v28.4s -str q20, [x0, #448] -str q27, [x0, #464] -ldr q18, [x17, #+384] -ldr q1, [x17, #+400] -sqrdmulh v27.4S, v0.4S, v10.s[2] -mul v0.4S, v0.4S,v21.s[2] -mla v0.4S, v27.4S, v31.s[0] -sub v27.4s, v26.4s, v0.4s -add v26.4s, v26.4s, v0.4s -ldr q10, [x17, #+416] -ldr q21, [x17, #+432] -sqrdmulh v0.4S, v4.4S, v22.s[2] -mul v4.4S, v4.4S,v15.s[2] -mla v4.4S, v0.4S, v31.s[0] -sub v0.4s, v25.4s, v4.4s -add v25.4s, v25.4s, v4.4s -ldr q22, [x17, #+448] -ldr q15, [x17, #+464] -sqrdmulh v4.4S, v11.4S, v3.s[2] -mul v11.4S, v11.4S,v12.s[2] -mla v11.4S, v4.4S, v31.s[0] -sub v4.4s, v29.4s, v11.4s -add v29.4s, v29.4s, v11.4s -ldr q3, [x17, #+480] -ldr q12, [x17, #+496] -str q6, [x0, #288] -str q13, [x0, #304] -str q26, [x0, #352] -str q27, [x0, #368] -str q25, [x0, #416] -str q0, [x0, #432] -str q29, [x0, #480] -str q4, [x0, #496] -ldr q4, [x0, #544] -ldr q29, [x0, #560] -ldr q0, [x0, #512] -ldr q25, [x0, #608] -ldr q27, [x0, #624] -ldr q26, [x0, #576] -ldr q13, [x0, #672] -ldr q6, [x0, #688] -ldr q11, [x0, #640] -ldr q20, [x0, #736] -ldr q28, [x0, #752] -ldr q24, [x0, #704] -sqrdmulh v16.4S, v4.4S, v1.s[0] -mul v4.4S, v4.4S,v18.s[0] -mla v4.4S, v16.4S, v31.s[0] -sub v16.4s, v0.4s, v4.4s -add v0.4s, v0.4s, v4.4s -ldr q4, [x0, #528] -sqrdmulh v2.4S, v25.4S, v21.s[0] -mul v25.4S, v25.4S,v10.s[0] -mla v25.4S, v2.4S, v31.s[0] -sub v2.4s, v26.4s, v25.4s -add v26.4s, v26.4s, v25.4s -ldr q25, [x0, #592] -sqrdmulh v5.4S, v13.4S, v15.s[0] -mul v13.4S, v13.4S,v22.s[0] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -ldr q13, [x0, #656] -sqrdmulh v30.4S, v20.4S, v12.s[0] -mul v20.4S, v20.4S,v3.s[0] -mla v20.4S, v30.4S, v31.s[0] -sub v30.4s, v24.4s, v20.4s -add v24.4s, v24.4s, v20.4s -ldr q20, [x0, #720] -sqrdmulh v23.4S, v29.4S, v1.s[0] -mul v29.4S, v29.4S,v18.s[0] -mla v29.4S, v23.4S, v31.s[0] -sub v23.4s, v4.4s, v29.4s -add v4.4s, v4.4s, v29.4s -sqrdmulh v29.4S, v27.4S, v21.s[0] -mul v27.4S, v27.4S,v10.s[0] -mla v27.4S, v29.4S, v31.s[0] -sub v29.4s, v25.4s, v27.4s -add v25.4s, v25.4s, v27.4s -sqrdmulh v27.4S, v6.4S, v15.s[0] -mul v6.4S, v6.4S,v22.s[0] -mla v6.4S, v27.4S, v31.s[0] -sub v27.4s, v13.4s, v6.4s -add v13.4s, v13.4s, v6.4s -sqrdmulh v6.4S, v28.4S, v12.s[0] -mul v28.4S, v28.4S,v3.s[0] -mla v28.4S, v6.4S, v31.s[0] -sub v6.4s, v20.4s, v28.4s -add v20.4s, v20.4s, v28.4s -sqrdmulh v28.4S, v4.4S, v1.s[1] -mul v4.4S, v4.4S,v18.s[1] -mla v4.4S, v28.4S, v31.s[0] -sub v28.4s, v0.4s, v4.4s -add v0.4s, v0.4s, v4.4s -sqrdmulh v4.4S, v25.4S, v21.s[1] -mul v25.4S, v25.4S,v10.s[1] -mla v25.4S, v4.4S, v31.s[0] -sub v4.4s, v26.4s, v25.4s -add v26.4s, v26.4s, v25.4s -str q0, [x0, #512] -str q28, [x0, #528] -sqrdmulh v28.4S, v13.4S, v15.s[1] -mul v13.4S, v13.4S,v22.s[1] -mla v13.4S, v28.4S, v31.s[0] -sub v28.4s, v11.4s, v13.4s -add v11.4s, v11.4s, v13.4s -str q26, [x0, #576] -str q4, [x0, #592] -sqrdmulh v4.4S, v20.4S, v12.s[1] -mul v20.4S, v20.4S,v3.s[1] -mla v20.4S, v4.4S, v31.s[0] -sub v4.4s, v24.4s, v20.4s -add v24.4s, v24.4s, v20.4s -str q11, [x0, #640] -str q28, [x0, #656] -sqrdmulh v28.4S, v23.4S, v1.s[2] -mul v23.4S, v23.4S,v18.s[2] -mla v23.4S, v28.4S, v31.s[0] -sub v28.4s, v16.4s, v23.4s -add v16.4s, v16.4s, v23.4s -str q24, [x0, #704] -str q4, [x0, #720] -ldr q1, [x17, #+512] -ldr q18, [x17, #+528] -sqrdmulh v4.4S, v29.4S, v21.s[2] -mul v29.4S, v29.4S,v10.s[2] -mla v29.4S, v4.4S, v31.s[0] -sub v4.4s, v2.4s, v29.4s -add v2.4s, v2.4s, v29.4s -ldr q21, [x17, #+544] -ldr q10, [x17, #+560] -sqrdmulh v29.4S, v27.4S, v15.s[2] -mul v27.4S, v27.4S,v22.s[2] -mla v27.4S, v29.4S, v31.s[0] -sub v29.4s, v5.4s, v27.4s -add v5.4s, v5.4s, v27.4s -ldr q15, [x17, #+576] -ldr q22, [x17, #+592] -sqrdmulh v27.4S, v6.4S, v12.s[2] -mul v6.4S, v6.4S,v3.s[2] -mla v6.4S, v27.4S, v31.s[0] -sub v27.4s, v30.4s, v6.4s -add v30.4s, v30.4s, v6.4s -ldr q12, [x17, #+608] -ldr q3, [x17, #+624] -str q16, [x0, #544] -str q28, [x0, #560] -str q2, [x0, #608] -str q4, [x0, #624] -str q5, [x0, #672] -str q29, [x0, #688] -str q30, [x0, #736] -str q27, [x0, #752] -ldr q27, [x0, #800] -ldr q30, [x0, #816] -ldr q29, [x0, #768] -ldr q5, [x0, #864] -ldr q4, [x0, #880] -ldr q2, [x0, #832] -ldr q28, [x0, #928] -ldr q16, [x0, #944] -ldr q6, [x0, #896] -ldr q24, [x0, #992] -ldr q23, [x0, #1008] -ldr q11, [x0, #960] -sqrdmulh v20.4S, v27.4S, v18.s[0] -mul v27.4S, v27.4S,v1.s[0] -mla v27.4S, v20.4S, v31.s[0] -sub v20.4s, v29.4s, v27.4s -add v29.4s, v29.4s, v27.4s -ldr q27, [x0, #784] -sqrdmulh v26.4S, v5.4S, v10.s[0] -mul v5.4S, v5.4S,v21.s[0] -mla v5.4S, v26.4S, v31.s[0] -sub v26.4s, v2.4s, v5.4s -add v2.4s, v2.4s, v5.4s -ldr q5, [x0, #848] -sqrdmulh v13.4S, v28.4S, v22.s[0] -mul v28.4S, v28.4S,v15.s[0] -mla v28.4S, v13.4S, v31.s[0] -sub v13.4s, v6.4s, v28.4s -add v6.4s, v6.4s, v28.4s -ldr q28, [x0, #912] -sqrdmulh v0.4S, v24.4S, v3.s[0] -mul v24.4S, v24.4S,v12.s[0] -mla v24.4S, v0.4S, v31.s[0] -sub v0.4s, v11.4s, v24.4s -add v11.4s, v11.4s, v24.4s -ldr q24, [x0, #976] -sqrdmulh v25.4S, v30.4S, v18.s[0] -mul v30.4S, v30.4S,v1.s[0] -mla v30.4S, v25.4S, v31.s[0] -sub v25.4s, v27.4s, v30.4s -add v27.4s, v27.4s, v30.4s -sqrdmulh v30.4S, v4.4S, v10.s[0] -mul v4.4S, v4.4S,v21.s[0] -mla v4.4S, v30.4S, v31.s[0] -sub v30.4s, v5.4s, v4.4s -add v5.4s, v5.4s, v4.4s -sqrdmulh v4.4S, v16.4S, v22.s[0] -mul v16.4S, v16.4S,v15.s[0] -mla v16.4S, v4.4S, v31.s[0] -sub v4.4s, v28.4s, v16.4s -add v28.4s, v28.4s, v16.4s -sqrdmulh v16.4S, v23.4S, v3.s[0] -mul v23.4S, v23.4S,v12.s[0] -mla v23.4S, v16.4S, v31.s[0] -sub v16.4s, v24.4s, v23.4s -add v24.4s, v24.4s, v23.4s -sqrdmulh v23.4S, v27.4S, v18.s[1] -mul v27.4S, v27.4S,v1.s[1] -mla v27.4S, v23.4S, v31.s[0] -sub v23.4s, v29.4s, v27.4s -add v29.4s, v29.4s, v27.4s -sqrdmulh v27.4S, v5.4S, v10.s[1] -mul v5.4S, v5.4S,v21.s[1] -mla v5.4S, v27.4S, v31.s[0] -sub v27.4s, v2.4s, v5.4s -add v2.4s, v2.4s, v5.4s -str q29, [x0, #768] -str q23, [x0, #784] -sqrdmulh v23.4S, v28.4S, v22.s[1] -mul v28.4S, v28.4S,v15.s[1] -mla v28.4S, v23.4S, v31.s[0] -sub v23.4s, v6.4s, v28.4s -add v6.4s, v6.4s, v28.4s -str q2, [x0, #832] -str q27, [x0, #848] -sqrdmulh v27.4S, v24.4S, v3.s[1] -mul v24.4S, v24.4S,v12.s[1] -mla v24.4S, v27.4S, v31.s[0] -sub v27.4s, v11.4s, v24.4s -add v11.4s, v11.4s, v24.4s -str q6, [x0, #896] -str q23, [x0, #912] -sqrdmulh v23.4S, v25.4S, v18.s[2] -mul v25.4S, v25.4S,v1.s[2] -mla v25.4S, v23.4S, v31.s[0] -sub v23.4s, v20.4s, v25.4s -add v20.4s, v20.4s, v25.4s -str q11, [x0, #960] -str q27, [x0, #976] -sqrdmulh v18.4S, v30.4S, v10.s[2] -mul v30.4S, v30.4S,v21.s[2] -mla v30.4S, v18.4S, v31.s[0] -sub v18.4s, v26.4s, v30.4s -add v26.4s, v26.4s, v30.4s -sqrdmulh v10.4S, v4.4S, v22.s[2] -mul v4.4S, v4.4S,v15.s[2] -mla v4.4S, v10.4S, v31.s[0] -sub v10.4s, v13.4s, v4.4s -add v13.4s, v13.4s, v4.4s -sqrdmulh v22.4S, v16.4S, v3.s[2] -mul v16.4S, v16.4S,v12.s[2] -mla v16.4S, v22.4S, v31.s[0] -sub v22.4s, v0.4s, v16.4s -add v0.4s, v0.4s, v16.4s -str q20, [x0, #800] -str q23, [x0, #816] -str q26, [x0, #864] -str q18, [x0, #880] -str q13, [x0, #928] -str q10, [x0, #944] -str q0, [x0, #992] -str q22, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1464 -// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_3.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_3.s deleted file mode 100644 index 11b1cd1..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_3.s +++ /dev/null @@ -1,1494 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_7_z4_3 -.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_3 -ntt_u32_incomplete_neon_asm_var_4_2_7_z4_3: -_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_3: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #928] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #992] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q18, [x0, #800] -sqrdmulh v17.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -ldr q16, [x0, #864] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -mla v22.4S, v21.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -mla v18.4S, v17.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -ldr q3, [x0, #544] -sqrdmulh v17.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -ldr q19, [x0, #608] -sqrdmulh v21.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q2, [x0, #672] -ldr q1, [x0, #416] -sqrdmulh v0.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v15.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -ldr q22, [x0, #736] -ldr q14, [x0, #480] -sqrdmulh v13.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v12.4s, v14.4s, v20.4s -add v14.4s, v14.4s, v20.4s -ldr q20, [x0, #288] -mla v3.4S, v17.4S, v31.s[0] -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v20.4s, v18.4s -mla v2.4S, v0.4S, v31.s[0] -mla v22.4S, v13.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -ldr q18, [x0, #352] -sqrdmulh v13.4S, v1.4S, v29.s[1] -mul v1.4S, v1.4S,v30.s[1] -sub v0.4s, v18.4s, v16.4s -sqrdmulh v17.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -add v18.4s, v18.4s, v16.4s -ldr q16, [x0, #32] -sqrdmulh v11.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v10.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -ldr q3, [x0, #96] -sqrdmulh v9.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v8.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -ldr q19, [x0, #160] -mla v1.4S, v13.4S, v31.s[0] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v19.4s, v2.4s -mla v20.4S, v11.4S, v31.s[0] -mla v18.4S, v9.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -ldr q2, [x0, #224] -sqrdmulh v9.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v11.4s, v2.4s, v22.4s -sqrdmulh v13.4S, v12.4S, v29.s[2] -mul v12.4S, v12.4S,v30.s[2] -add v2.4s, v2.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v7.4s, v19.4s, v1.4s -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v29.s[2] -mul v0.4S, v0.4S,v30.s[2] -sub v6.4s, v2.4s, v14.4s -add v2.4s, v2.4s, v14.4s -mla v15.4S, v9.4S, v31.s[0] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v16.4s, v20.4s -mla v21.4S, v22.4S, v31.s[0] -mla v0.4S, v1.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -sub v1.4s, v3.4s, v18.4s -sqrdmulh v22.4S, v6.4S, v27.s[1] -mul v6.4S, v6.4S,v28.s[1] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v19.4S, v27.s[0] -mul v19.4S, v19.4S,v28.s[0] -sub v9.4s, v17.4s, v15.4s -add v17.4s, v17.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v14.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -mla v7.4S, v20.4S, v31.s[0] -mla v6.4S, v22.4S, v31.s[0] -sub v22.4s, v10.4s, v21.4s -mla v19.4S, v18.4S, v31.s[0] -mla v2.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v15.4s, v8.4s, v0.4s -sqrdmulh v18.4S, v11.4S, v27.s[2] -mul v11.4S, v11.4S,v28.s[2] -add v8.4s, v8.4s, v0.4s -sqrdmulh v0.4S, v9.4S, v27.s[3] -mul v9.4S, v9.4S,v28.s[3] -sub v20.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[3] -mul v14.4S, v14.4S,v28.s[3] -sub v12.4s, v1.4s, v6.4s -add v1.4s, v1.4s, v6.4s -mla v17.4S, v21.4S, v31.s[0] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v16.4s, v19.4s -mla v9.4S, v0.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v16.4s, v16.4s, v19.4s -sqrdmulh v19.4S, v1.4S, v25.s[2] -mul v1.4S, v1.4S,v26.s[2] -sub v7.4s, v3.4s, v2.4s -sqrdmulh v0.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v7.4S, v25.s[1] -mul v7.4S, v7.4S,v26.s[1] -sub v21.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -sqrdmulh v17.4S, v3.4S, v25.s[0] -mul v3.4S, v3.4S,v26.s[0] -sub v6.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -mla v1.4S, v19.4S, v31.s[0] -mla v12.4S, v0.4S, v31.s[0] -sub v0.4s, v22.4s, v9.4s -mla v7.4S, v2.4S, v31.s[0] -mla v3.4S, v17.4S, v31.s[0] -add v22.4s, v22.4s, v9.4s -sqrdmulh v9.4S, v8.4S, v23.s[0] -mul v8.4S, v8.4S,v24.s[0] -sub v17.4s, v15.4s, v14.4s -sqrdmulh v2.4S, v6.4S, v23.s[1] -mul v6.4S, v6.4S,v24.s[1] -add v15.4s, v15.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v23.s[2] -mul v15.4S, v15.4S,v24.s[2] -sub v19.4s, v13.4s, v1.4s -add v13.4s, v13.4s, v1.4s -sqrdmulh v1.4S, v17.4S, v23.s[3] -mul v17.4S, v17.4S,v24.s[3] -sub v11.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -mla v8.4S, v9.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -sub v2.4s, v18.4s, v7.4s -str q13, [x0, #288] -mla v15.4S, v14.4S, v31.s[0] -mla v17.4S, v1.4S, v31.s[0] -add v18.4s, v18.4s, v7.4s -str q19, [x0, #352] -ldr q19, [x0, #944] -sqrdmulh v7.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v1.4s, v16.4s, v3.4s -str q20, [x0, #416] -ldr q20, [x0, #1008] -sqrdmulh v14.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v16.4s, v16.4s, v3.4s -str q11, [x0, #480] -ldr q11, [x0, #816] -sqrdmulh v3.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -sub v13.4s, v10.4s, v8.4s -add v10.4s, v10.4s, v8.4s -ldr q8, [x0, #880] -sqrdmulh v9.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v12.4s, v21.4s, v6.4s -add v21.4s, v21.4s, v6.4s -mla v19.4S, v7.4S, v31.s[0] -mla v20.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v15.4s -str q18, [x0, #160] -mla v11.4S, v3.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -str q2, [x0, #224] -ldr q2, [x0, #560] -sqrdmulh v15.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v9.4s, v0.4s, v17.4s -str q16, [x0, #32] -ldr q16, [x0, #624] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -add v0.4s, v0.4s, v17.4s -str q1, [x0, #96] -ldr q1, [x0, #688] -ldr q17, [x0, #432] -sqrdmulh v18.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -sub v7.4s, v17.4s, v19.4s -add v17.4s, v17.4s, v19.4s -ldr q19, [x0, #752] -ldr q6, [x0, #496] -sqrdmulh v5.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v4.4s, v6.4s, v20.4s -add v6.4s, v6.4s, v20.4s -ldr q20, [x0, #304] -mla v2.4S, v15.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -sub v3.4s, v20.4s, v11.4s -str q10, [x0, #544] -mla v1.4S, v18.4S, v31.s[0] -mla v19.4S, v5.4S, v31.s[0] -add v20.4s, v20.4s, v11.4s -str q13, [x0, #608] -ldr q13, [x0, #368] -sqrdmulh v11.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v5.4s, v13.4s, v8.4s -str q21, [x0, #672] -sqrdmulh v21.4S, v6.4S, v29.s[1] -mul v6.4S, v6.4S,v30.s[1] -add v13.4s, v13.4s, v8.4s -str q12, [x0, #736] -ldr q12, [x0, #48] -sqrdmulh v8.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v18.4s, v12.4s, v2.4s -add v12.4s, v12.4s, v2.4s -ldr q2, [x0, #112] -sqrdmulh v10.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v15.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -ldr q16, [x0, #176] -mla v17.4S, v11.4S, v31.s[0] -mla v6.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v1.4s -str q22, [x0, #800] -mla v20.4S, v8.4S, v31.s[0] -mla v13.4S, v10.4S, v31.s[0] -add v16.4s, v16.4s, v1.4s -str q14, [x0, #864] -ldr q14, [x0, #240] -sqrdmulh v1.4S, v7.4S, v29.s[2] -mul v7.4S, v7.4S,v30.s[2] -sub v10.4s, v14.4s, v19.4s -str q0, [x0, #928] -sqrdmulh v0.4S, v4.4S, v29.s[2] -mul v4.4S, v4.4S,v30.s[2] -add v14.4s, v14.4s, v19.4s -str q9, [x0, #992] -sqrdmulh v9.4S, v3.4S, v29.s[2] -mul v3.4S, v3.4S,v30.s[2] -sub v19.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -sqrdmulh v17.4S, v5.4S, v29.s[2] -mul v5.4S, v5.4S,v30.s[2] -sub v8.4s, v14.4s, v6.4s -add v14.4s, v14.4s, v6.4s -mla v7.4S, v1.4S, v31.s[0] -mla v4.4S, v0.4S, v31.s[0] -sub v0.4s, v12.4s, v20.4s -mla v3.4S, v9.4S, v31.s[0] -mla v5.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -sub v17.4s, v2.4s, v13.4s -sqrdmulh v9.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -add v2.4s, v2.4s, v13.4s -sqrdmulh v13.4S, v16.4S, v27.s[0] -mul v16.4S, v16.4S,v28.s[0] -sub v1.4s, v21.4s, v7.4s -add v21.4s, v21.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v6.4s, v10.4s, v4.4s -add v10.4s, v10.4s, v4.4s -mla v19.4S, v20.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -sub v9.4s, v18.4s, v3.4s -mla v16.4S, v13.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[2] -mul v21.4S, v21.4S,v28.s[2] -sub v7.4s, v15.4s, v5.4s -sqrdmulh v13.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -add v15.4s, v15.4s, v5.4s -sqrdmulh v5.4S, v1.4S, v27.s[3] -mul v1.4S, v1.4S,v28.s[3] -sub v20.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v27.s[3] -mul v6.4S, v6.4S,v28.s[3] -sub v4.4s, v17.4s, v8.4s -add v17.4s, v17.4s, v8.4s -mla v21.4S, v3.4S, v31.s[0] -mla v10.4S, v13.4S, v31.s[0] -sub v13.4s, v12.4s, v16.4s -mla v1.4S, v5.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v25.s[2] -mul v17.4S, v17.4S,v26.s[2] -sub v19.4s, v2.4s, v14.4s -sqrdmulh v5.4S, v4.4S, v25.s[3] -mul v4.4S, v4.4S,v26.s[3] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v3.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v15.4s, v10.4s -add v15.4s, v15.4s, v10.4s -mla v17.4S, v16.4S, v31.s[0] -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v9.4s, v1.4s -mla v19.4S, v14.4S, v31.s[0] -mla v2.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v23.s[0] -mul v15.4S, v15.4S,v24.s[0] -sub v21.4s, v7.4s, v6.4s -sqrdmulh v14.4S, v8.4S, v23.s[1] -mul v8.4S, v8.4S,v24.s[1] -add v7.4s, v7.4s, v6.4s -sqrdmulh v6.4S, v7.4S, v23.s[2] -mul v7.4S, v7.4S,v24.s[2] -sub v16.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v23.s[3] -mul v21.4S, v21.4S,v24.s[3] -sub v10.4s, v20.4s, v4.4s -add v20.4s, v20.4s, v4.4s -mla v15.4S, v1.4S, v31.s[0] -mla v8.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v19.4s -str q0, [x0, #304] -mla v7.4S, v6.4S, v31.s[0] -mla v21.4S, v17.4S, v31.s[0] -add v13.4s, v13.4s, v19.4s -str q16, [x0, #368] -ldr q16, [x0, #896] -sqrdmulh v19.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v17.4s, v12.4s, v2.4s -str q20, [x0, #432] -ldr q20, [x0, #960] -sqrdmulh v6.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v12.4s, v12.4s, v2.4s -str q10, [x0, #496] -ldr q10, [x0, #768] -sqrdmulh v2.4S, v10.4S, v29.s[0] -mul v10.4S, v10.4S,v30.s[0] -sub v0.4s, v18.4s, v15.4s -add v18.4s, v18.4s, v15.4s -ldr q15, [x0, #832] -sqrdmulh v1.4S, v15.4S, v29.s[0] -mul v15.4S, v15.4S,v30.s[0] -sub v4.4s, v3.4s, v8.4s -add v3.4s, v3.4s, v8.4s -mla v16.4S, v19.4S, v31.s[0] -mla v20.4S, v6.4S, v31.s[0] -sub v6.4s, v9.4s, v7.4s -str q13, [x0, #176] -mla v10.4S, v2.4S, v31.s[0] -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v7.4s -str q14, [x0, #240] -ldr q14, [x0, #512] -sqrdmulh v7.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v1.4s, v5.4s, v21.4s -str q12, [x0, #48] -ldr q12, [x0, #576] -sqrdmulh v2.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -add v5.4s, v5.4s, v21.4s -str q17, [x0, #112] -ldr q17, [x0, #640] -ldr q21, [x0, #384] -sqrdmulh v13.4S, v17.4S, v29.s[0] -mul v17.4S, v17.4S,v30.s[0] -sub v19.4s, v21.4s, v16.4s -add v21.4s, v21.4s, v16.4s -ldr q16, [x0, #704] -ldr q8, [x0, #448] -sqrdmulh v22.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v11.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -ldr q20, [x0, #256] -mla v14.4S, v7.4S, v31.s[0] -mla v12.4S, v2.4S, v31.s[0] -sub v2.4s, v20.4s, v10.4s -str q18, [x0, #560] -mla v17.4S, v13.4S, v31.s[0] -mla v16.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -str q0, [x0, #624] -ldr q0, [x0, #320] -sqrdmulh v10.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v22.4s, v0.4s, v15.4s -str q3, [x0, #688] -sqrdmulh v3.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -add v0.4s, v0.4s, v15.4s -str q4, [x0, #752] -ldr q4, [x0, #0] -sqrdmulh v15.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v13.4s, v4.4s, v14.4s -add v4.4s, v4.4s, v14.4s -ldr q14, [x0, #64] -sqrdmulh v18.4S, v0.4S, v29.s[1] -mul v0.4S, v0.4S,v30.s[1] -sub v7.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -ldr q12, [x0, #128] -mla v21.4S, v10.4S, v31.s[0] -mla v8.4S, v3.4S, v31.s[0] -sub v3.4s, v12.4s, v17.4s -str q9, [x0, #816] -mla v20.4S, v15.4S, v31.s[0] -mla v0.4S, v18.4S, v31.s[0] -add v12.4s, v12.4s, v17.4s -str q6, [x0, #880] -ldr q6, [x0, #192] -sqrdmulh v17.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v18.4s, v6.4s, v16.4s -str q5, [x0, #944] -sqrdmulh v5.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -add v6.4s, v6.4s, v16.4s -str q1, [x0, #1008] -sqrdmulh v1.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v16.4s, v12.4s, v21.4s -add v12.4s, v12.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v15.4s, v6.4s, v8.4s -add v6.4s, v6.4s, v8.4s -mla v19.4S, v17.4S, v31.s[0] -mla v11.4S, v5.4S, v31.s[0] -sub v5.4s, v4.4s, v20.4s -mla v2.4S, v1.4S, v31.s[0] -mla v22.4S, v21.4S, v31.s[0] -add v4.4s, v4.4s, v20.4s -sqrdmulh v20.4S, v16.4S, v27.s[1] -mul v16.4S, v16.4S,v28.s[1] -sub v21.4s, v14.4s, v0.4s -sqrdmulh v1.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v17.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v27.s[0] -mul v6.4S, v6.4S,v28.s[0] -sub v8.4s, v18.4s, v11.4s -add v18.4s, v18.4s, v11.4s -mla v16.4S, v20.4S, v31.s[0] -mla v15.4S, v1.4S, v31.s[0] -sub v1.4s, v13.4s, v2.4s -mla v12.4S, v0.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v3.4S, v27.s[2] -mul v3.4S, v3.4S,v28.s[2] -sub v19.4s, v7.4s, v22.4s -sqrdmulh v0.4S, v18.4S, v27.s[2] -mul v18.4S, v18.4S,v28.s[2] -add v7.4s, v7.4s, v22.4s -sqrdmulh v22.4S, v17.4S, v27.s[3] -mul v17.4S, v17.4S,v28.s[3] -sub v20.4s, v5.4s, v16.4s -add v5.4s, v5.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v11.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -mla v3.4S, v2.4S, v31.s[0] -mla v18.4S, v0.4S, v31.s[0] -sub v0.4s, v4.4s, v12.4s -mla v17.4S, v22.4S, v31.s[0] -mla v8.4S, v16.4S, v31.s[0] -add v4.4s, v4.4s, v12.4s -sqrdmulh v12.4S, v21.4S, v25.s[2] -mul v21.4S, v21.4S,v26.s[2] -sub v16.4s, v14.4s, v6.4s -sqrdmulh v22.4S, v11.4S, v25.s[3] -mul v11.4S, v11.4S,v26.s[3] -add v14.4s, v14.4s, v6.4s -sqrdmulh v6.4S, v16.4S, v25.s[1] -mul v16.4S, v16.4S,v26.s[1] -sub v2.4s, v13.4s, v3.4s -add v13.4s, v13.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v25.s[0] -mul v14.4S, v14.4S,v26.s[0] -sub v15.4s, v7.4s, v18.4s -add v7.4s, v7.4s, v18.4s -mla v21.4S, v12.4S, v31.s[0] -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v1.4s, v17.4s -mla v16.4S, v6.4S, v31.s[0] -mla v14.4S, v3.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -sqrdmulh v17.4S, v7.4S, v23.s[0] -mul v7.4S, v7.4S,v24.s[0] -sub v3.4s, v19.4s, v8.4s -sqrdmulh v6.4S, v15.4S, v23.s[1] -mul v15.4S, v15.4S,v24.s[1] -add v19.4s, v19.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v23.s[2] -mul v19.4S, v19.4S,v24.s[2] -sub v12.4s, v5.4s, v21.4s -add v5.4s, v5.4s, v21.4s -sqrdmulh v21.4S, v3.4S, v23.s[3] -mul v3.4S, v3.4S,v24.s[3] -sub v18.4s, v20.4s, v11.4s -add v20.4s, v20.4s, v11.4s -mla v7.4S, v17.4S, v31.s[0] -mla v15.4S, v6.4S, v31.s[0] -sub v6.4s, v0.4s, v16.4s -str q5, [x0, #256] -mla v19.4S, v8.4S, v31.s[0] -mla v3.4S, v21.4S, v31.s[0] -add v0.4s, v0.4s, v16.4s -str q12, [x0, #320] -ldr q12, [x0, #912] -sqrdmulh v16.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -sub v21.4s, v4.4s, v14.4s -str q20, [x0, #384] -ldr q20, [x0, #976] -sqrdmulh v8.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v4.4s, v4.4s, v14.4s -str q18, [x0, #448] -ldr q18, [x0, #784] -sqrdmulh v14.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -sub v5.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -ldr q7, [x0, #848] -sqrdmulh v17.4S, v7.4S, v29.s[0] -mul v7.4S, v7.4S,v30.s[0] -sub v11.4s, v2.4s, v15.4s -add v2.4s, v2.4s, v15.4s -mla v12.4S, v16.4S, v31.s[0] -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v1.4s, v19.4s -str q0, [x0, #128] -mla v18.4S, v14.4S, v31.s[0] -mla v7.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v19.4s -str q6, [x0, #192] -ldr q6, [x0, #528] -sqrdmulh v19.4S, v6.4S, v29.s[0] -mul v6.4S, v6.4S,v30.s[0] -sub v17.4s, v22.4s, v3.4s -str q4, [x0, #0] -ldr q4, [x0, #592] -sqrdmulh v14.4S, v4.4S, v29.s[0] -mul v4.4S, v4.4S,v30.s[0] -add v22.4s, v22.4s, v3.4s -str q21, [x0, #64] -ldr q21, [x0, #656] -ldr q3, [x0, #400] -sqrdmulh v0.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -sub v16.4s, v3.4s, v12.4s -add v3.4s, v3.4s, v12.4s -ldr q12, [x0, #720] -ldr q15, [x0, #464] -sqrdmulh v9.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -sub v10.4s, v15.4s, v20.4s -add v15.4s, v15.4s, v20.4s -ldr q20, [x0, #272] -mla v6.4S, v19.4S, v31.s[0] -mla v4.4S, v14.4S, v31.s[0] -sub v14.4s, v20.4s, v18.4s -str q13, [x0, #512] -mla v21.4S, v0.4S, v31.s[0] -mla v12.4S, v9.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -str q5, [x0, #576] -ldr q5, [x0, #336] -sqrdmulh v18.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v9.4s, v5.4s, v7.4s -str q2, [x0, #640] -sqrdmulh v2.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -add v5.4s, v5.4s, v7.4s -str q11, [x0, #704] -ldr q11, [x0, #16] -sqrdmulh v7.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v0.4s, v11.4s, v6.4s -add v11.4s, v11.4s, v6.4s -ldr q6, [x0, #80] -sqrdmulh v13.4S, v5.4S, v29.s[1] -mul v5.4S, v5.4S,v30.s[1] -sub v19.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -ldr q4, [x0, #144] -mla v3.4S, v18.4S, v31.s[0] -mla v15.4S, v2.4S, v31.s[0] -sub v2.4s, v4.4s, v21.4s -str q1, [x0, #768] -mla v20.4S, v7.4S, v31.s[0] -mla v5.4S, v13.4S, v31.s[0] -add v4.4s, v4.4s, v21.4s -str q8, [x0, #832] -ldr q8, [x0, #208] -sqrdmulh v21.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -sub v13.4s, v8.4s, v12.4s -str q22, [x0, #896] -sqrdmulh v22.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -add v8.4s, v8.4s, v12.4s -str q17, [x0, #960] -sqrdmulh v17.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v12.4s, v4.4s, v3.4s -add v4.4s, v4.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v7.4s, v8.4s, v15.4s -add v8.4s, v8.4s, v15.4s -mla v16.4S, v21.4S, v31.s[0] -mla v10.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v20.4s -mla v14.4S, v17.4S, v31.s[0] -mla v9.4S, v3.4S, v31.s[0] -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v12.4S, v27.s[1] -mul v12.4S, v12.4S,v28.s[1] -sub v3.4s, v6.4s, v5.4s -sqrdmulh v17.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v4.4S, v27.s[0] -mul v4.4S, v4.4S,v28.s[0] -sub v21.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v27.s[0] -mul v8.4S, v8.4S,v28.s[0] -sub v15.4s, v13.4s, v10.4s -add v13.4s, v13.4s, v10.4s -mla v12.4S, v20.4S, v31.s[0] -mla v7.4S, v17.4S, v31.s[0] -sub v17.4s, v0.4s, v14.4s -mla v4.4S, v5.4S, v31.s[0] -mla v8.4S, v16.4S, v31.s[0] -add v0.4s, v0.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v16.4s, v19.4s, v9.4s -sqrdmulh v5.4S, v13.4S, v27.s[2] -mul v13.4S, v13.4S,v28.s[2] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v10.4s, v3.4s, v7.4s -add v3.4s, v3.4s, v7.4s -mla v2.4S, v14.4S, v31.s[0] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v11.4s, v4.4s -mla v21.4S, v9.4S, v31.s[0] -mla v15.4S, v12.4S, v31.s[0] -add v11.4s, v11.4s, v4.4s -sqrdmulh v4.4S, v3.4S, v25.s[2] -mul v3.4S, v3.4S,v26.s[2] -sub v12.4s, v6.4s, v8.4s -sqrdmulh v9.4S, v10.4S, v25.s[3] -mul v10.4S, v10.4S,v26.s[3] -add v6.4s, v6.4s, v8.4s -sqrdmulh v8.4S, v12.4S, v25.s[1] -mul v12.4S, v12.4S,v26.s[1] -sub v14.4s, v0.4s, v2.4s -add v0.4s, v0.4s, v2.4s -sqrdmulh v2.4S, v6.4S, v25.s[0] -mul v6.4S, v6.4S,v26.s[0] -sub v7.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -mla v3.4S, v4.4S, v31.s[0] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v17.4s, v21.4s -mla v12.4S, v8.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v19.4S, v23.s[0] -mul v19.4S, v19.4S,v24.s[0] -sub v2.4s, v16.4s, v15.4s -sqrdmulh v8.4S, v7.4S, v23.s[1] -mul v7.4S, v7.4S,v24.s[1] -add v16.4s, v16.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v23.s[2] -mul v16.4S, v16.4S,v24.s[2] -sub v4.4s, v22.4s, v3.4s -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v2.4S, v23.s[3] -mul v2.4S, v2.4S,v24.s[3] -sub v13.4s, v20.4s, v10.4s -add v20.4s, v20.4s, v10.4s -mla v19.4S, v21.4S, v31.s[0] -mla v7.4S, v8.4S, v31.s[0] -sub v8.4s, v5.4s, v12.4s -str q22, [x0, #272] -mla v16.4S, v15.4S, v31.s[0] -mla v2.4S, v3.4S, v31.s[0] -add v5.4s, v5.4s, v12.4s -str q4, [x0, #336] -sub v23.4s, v11.4s, v6.4s -str q20, [x0, #400] -add v11.4s, v11.4s, v6.4s -str q13, [x0, #464] -sub v13.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sub v19.4s, v14.4s, v7.4s -add v14.4s, v14.4s, v7.4s -sub v7.4s, v17.4s, v16.4s -str q5, [x0, #144] -add v17.4s, v17.4s, v16.4s -str q8, [x0, #208] -sub v8.4s, v9.4s, v2.4s -str q11, [x0, #16] -add v9.4s, v9.4s, v2.4s -str q23, [x0, #80] -str q0, [x0, #528] -str q13, [x0, #592] -str q14, [x0, #656] -str q19, [x0, #720] -str q17, [x0, #784] -str q7, [x0, #848] -str q9, [x0, #912] -str q8, [x0, #976] -ldr q18, [x17, #+128] -ldr q1, [x17, #+144] -ldr q10, [x17, #+160] -ldr q21, [x17, #+176] -ldr q22, [x17, #+192] -ldr q15, [x17, #+208] -ldr q3, [x17, #+224] -ldr q12, [x17, #+240] -ldr q4, [x0, #32] -ldr q30, [x0, #48] -ldr q29, [x0, #0] -ldr q28, [x0, #96] -ldr q27, [x0, #112] -ldr q26, [x0, #64] -ldr q25, [x0, #160] -ldr q24, [x0, #176] -ldr q20, [x0, #128] -ldr q6, [x0, #224] -ldr q5, [x0, #240] -ldr q16, [x0, #192] -sqrdmulh v11.4S, v4.4S, v1.s[0] -sqrdmulh v2.4S, v28.4S, v21.s[0] -sqrdmulh v23.4S, v25.4S, v15.s[0] -sqrdmulh v0.4S, v6.4S, v12.s[0] -mul v4.4S, v4.4S,v18.s[0] -mul v28.4S, v28.4S,v10.s[0] -mul v25.4S, v25.4S,v22.s[0] -mul v6.4S, v6.4S,v3.s[0] -mla v4.4S, v11.4S, v31.s[0] -mla v28.4S, v2.4S, v31.s[0] -mla v25.4S, v23.4S, v31.s[0] -mla v6.4S, v0.4S, v31.s[0] -sub v0.4s, v29.4s, v4.4s -sub v23.4s, v26.4s, v28.4s -sub v2.4s, v20.4s, v25.4s -sub v11.4s, v16.4s, v6.4s -add v29.4s, v29.4s, v4.4s -add v26.4s, v26.4s, v28.4s -add v20.4s, v20.4s, v25.4s -add v16.4s, v16.4s, v6.4s -ldr q6, [x0, #16] -ldr q25, [x0, #80] -ldr q28, [x0, #144] -ldr q4, [x0, #208] -sqrdmulh v13.4S, v30.4S, v1.s[0] -sqrdmulh v14.4S, v27.4S, v21.s[0] -sqrdmulh v19.4S, v24.4S, v15.s[0] -sqrdmulh v17.4S, v5.4S, v12.s[0] -mul v30.4S, v30.4S,v18.s[0] -mul v27.4S, v27.4S,v10.s[0] -mul v24.4S, v24.4S,v22.s[0] -mul v5.4S, v5.4S,v3.s[0] -mla v30.4S, v13.4S, v31.s[0] -mla v27.4S, v14.4S, v31.s[0] -mla v24.4S, v19.4S, v31.s[0] -mla v5.4S, v17.4S, v31.s[0] -sub v17.4s, v6.4s, v30.4s -sub v19.4s, v25.4s, v27.4s -sub v14.4s, v28.4s, v24.4s -sub v13.4s, v4.4s, v5.4s -add v6.4s, v6.4s, v30.4s -add v25.4s, v25.4s, v27.4s -add v28.4s, v28.4s, v24.4s -add v4.4s, v4.4s, v5.4s -sqrdmulh v5.4S, v6.4S, v1.s[1] -sqrdmulh v24.4S, v25.4S, v21.s[1] -sqrdmulh v27.4S, v28.4S, v15.s[1] -sqrdmulh v30.4S, v4.4S, v12.s[1] -mul v6.4S, v6.4S,v18.s[1] -mul v25.4S, v25.4S,v10.s[1] -mul v28.4S, v28.4S,v22.s[1] -mul v4.4S, v4.4S,v3.s[1] -mla v6.4S, v5.4S, v31.s[0] -mla v25.4S, v24.4S, v31.s[0] -mla v28.4S, v27.4S, v31.s[0] -mla v4.4S, v30.4S, v31.s[0] -sub v30.4s, v29.4s, v6.4s -sub v27.4s, v26.4s, v25.4s -sub v24.4s, v20.4s, v28.4s -sub v5.4s, v16.4s, v4.4s -add v29.4s, v29.4s, v6.4s -add v26.4s, v26.4s, v25.4s -add v20.4s, v20.4s, v28.4s -add v16.4s, v16.4s, v4.4s -sqrdmulh v4.4S, v17.4S, v1.s[2] -sqrdmulh v28.4S, v19.4S, v21.s[2] -sqrdmulh v25.4S, v14.4S, v15.s[2] -sqrdmulh v6.4S, v13.4S, v12.s[2] -str q29, [x0, #0] -str q30, [x0, #16] -mul v17.4S, v17.4S,v18.s[2] -mul v19.4S, v19.4S,v10.s[2] -mul v14.4S, v14.4S,v22.s[2] -mul v13.4S, v13.4S,v3.s[2] -str q26, [x0, #64] -str q27, [x0, #80] -ldr q12, [x17, #+256] -ldr q3, [x17, #+272] -ldr q15, [x17, #+288] -ldr q22, [x17, #+304] -mla v17.4S, v4.4S, v31.s[0] -mla v19.4S, v28.4S, v31.s[0] -mla v14.4S, v25.4S, v31.s[0] -mla v13.4S, v6.4S, v31.s[0] -str q20, [x0, #128] -str q24, [x0, #144] -ldr q24, [x17, #+320] -ldr q20, [x17, #+336] -sub v6.4s, v0.4s, v17.4s -sub v25.4s, v23.4s, v19.4s -sub v28.4s, v2.4s, v14.4s -sub v4.4s, v11.4s, v13.4s -str q16, [x0, #192] -str q5, [x0, #208] -ldr q5, [x17, #+352] -ldr q16, [x17, #+368] -add v0.4s, v0.4s, v17.4s -add v23.4s, v23.4s, v19.4s -add v2.4s, v2.4s, v14.4s -add v11.4s, v11.4s, v13.4s -str q0, [x0, #32] -str q23, [x0, #96] -str q2, [x0, #160] -str q11, [x0, #224] -ldr q11, [x0, #288] -ldr q2, [x0, #304] -ldr q23, [x0, #256] -ldr q0, [x0, #352] -ldr q13, [x0, #368] -ldr q14, [x0, #320] -ldr q19, [x0, #416] -ldr q17, [x0, #432] -ldr q21, [x0, #384] -ldr q10, [x0, #480] -ldr q1, [x0, #496] -ldr q18, [x0, #448] -sqrdmulh v27.4S, v11.4S, v3.s[0] -sqrdmulh v26.4S, v0.4S, v22.s[0] -sqrdmulh v30.4S, v19.4S, v20.s[0] -sqrdmulh v29.4S, v10.4S, v16.s[0] -str q6, [x0, #48] -mul v11.4S, v11.4S,v12.s[0] -mul v0.4S, v0.4S,v15.s[0] -mul v19.4S, v19.4S,v24.s[0] -mul v10.4S, v10.4S,v5.s[0] -str q25, [x0, #112] -mla v11.4S, v27.4S, v31.s[0] -mla v0.4S, v26.4S, v31.s[0] -mla v19.4S, v30.4S, v31.s[0] -mla v10.4S, v29.4S, v31.s[0] -str q28, [x0, #176] -sub v28.4s, v23.4s, v11.4s -sub v29.4s, v14.4s, v0.4s -sub v30.4s, v21.4s, v19.4s -sub v26.4s, v18.4s, v10.4s -str q4, [x0, #240] -add v23.4s, v23.4s, v11.4s -add v14.4s, v14.4s, v0.4s -add v21.4s, v21.4s, v19.4s -add v18.4s, v18.4s, v10.4s -ldr q10, [x0, #272] -ldr q19, [x0, #336] -ldr q0, [x0, #400] -ldr q11, [x0, #464] -sqrdmulh v4.4S, v2.4S, v3.s[0] -sqrdmulh v27.4S, v13.4S, v22.s[0] -sqrdmulh v25.4S, v17.4S, v20.s[0] -sqrdmulh v6.4S, v1.4S, v16.s[0] -mul v2.4S, v2.4S,v12.s[0] -mul v13.4S, v13.4S,v15.s[0] -mul v17.4S, v17.4S,v24.s[0] -mul v1.4S, v1.4S,v5.s[0] -mla v2.4S, v4.4S, v31.s[0] -mla v13.4S, v27.4S, v31.s[0] -mla v17.4S, v25.4S, v31.s[0] -mla v1.4S, v6.4S, v31.s[0] -sub v6.4s, v10.4s, v2.4s -sub v25.4s, v19.4s, v13.4s -sub v27.4s, v0.4s, v17.4s -sub v4.4s, v11.4s, v1.4s -add v10.4s, v10.4s, v2.4s -add v19.4s, v19.4s, v13.4s -add v0.4s, v0.4s, v17.4s -add v11.4s, v11.4s, v1.4s -sqrdmulh v1.4S, v10.4S, v3.s[1] -sqrdmulh v17.4S, v19.4S, v22.s[1] -sqrdmulh v13.4S, v0.4S, v20.s[1] -sqrdmulh v2.4S, v11.4S, v16.s[1] -mul v10.4S, v10.4S,v12.s[1] -mul v19.4S, v19.4S,v15.s[1] -mul v0.4S, v0.4S,v24.s[1] -mul v11.4S, v11.4S,v5.s[1] -mla v10.4S, v1.4S, v31.s[0] -mla v19.4S, v17.4S, v31.s[0] -mla v0.4S, v13.4S, v31.s[0] -mla v11.4S, v2.4S, v31.s[0] -sub v2.4s, v23.4s, v10.4s -sub v13.4s, v14.4s, v19.4s -sub v17.4s, v21.4s, v0.4s -sub v1.4s, v18.4s, v11.4s -add v23.4s, v23.4s, v10.4s -add v14.4s, v14.4s, v19.4s -add v21.4s, v21.4s, v0.4s -add v18.4s, v18.4s, v11.4s -sqrdmulh v11.4S, v6.4S, v3.s[2] -sqrdmulh v0.4S, v25.4S, v22.s[2] -sqrdmulh v19.4S, v27.4S, v20.s[2] -sqrdmulh v10.4S, v4.4S, v16.s[2] -str q23, [x0, #256] -str q2, [x0, #272] -mul v6.4S, v6.4S,v12.s[2] -mul v25.4S, v25.4S,v15.s[2] -mul v27.4S, v27.4S,v24.s[2] -mul v4.4S, v4.4S,v5.s[2] -str q14, [x0, #320] -str q13, [x0, #336] -ldr q16, [x17, #+384] -ldr q5, [x17, #+400] -ldr q20, [x17, #+416] -ldr q24, [x17, #+432] -mla v6.4S, v11.4S, v31.s[0] -mla v25.4S, v0.4S, v31.s[0] -mla v27.4S, v19.4S, v31.s[0] -mla v4.4S, v10.4S, v31.s[0] -str q21, [x0, #384] -str q17, [x0, #400] -ldr q17, [x17, #+448] -ldr q21, [x17, #+464] -sub v10.4s, v28.4s, v6.4s -sub v19.4s, v29.4s, v25.4s -sub v0.4s, v30.4s, v27.4s -sub v11.4s, v26.4s, v4.4s -str q18, [x0, #448] -str q1, [x0, #464] -ldr q1, [x17, #+480] -ldr q18, [x17, #+496] -add v28.4s, v28.4s, v6.4s -add v29.4s, v29.4s, v25.4s -add v30.4s, v30.4s, v27.4s -add v26.4s, v26.4s, v4.4s -str q28, [x0, #288] -str q29, [x0, #352] -str q30, [x0, #416] -str q26, [x0, #480] -ldr q26, [x0, #544] -ldr q30, [x0, #560] -ldr q29, [x0, #512] -ldr q28, [x0, #608] -ldr q4, [x0, #624] -ldr q27, [x0, #576] -ldr q25, [x0, #672] -ldr q6, [x0, #688] -ldr q22, [x0, #640] -ldr q15, [x0, #736] -ldr q3, [x0, #752] -ldr q12, [x0, #704] -sqrdmulh v13.4S, v26.4S, v5.s[0] -sqrdmulh v14.4S, v28.4S, v24.s[0] -sqrdmulh v2.4S, v25.4S, v21.s[0] -sqrdmulh v23.4S, v15.4S, v18.s[0] -str q10, [x0, #304] -mul v26.4S, v26.4S,v16.s[0] -mul v28.4S, v28.4S,v20.s[0] -mul v25.4S, v25.4S,v17.s[0] -mul v15.4S, v15.4S,v1.s[0] -str q19, [x0, #368] -mla v26.4S, v13.4S, v31.s[0] -mla v28.4S, v14.4S, v31.s[0] -mla v25.4S, v2.4S, v31.s[0] -mla v15.4S, v23.4S, v31.s[0] -str q0, [x0, #432] -sub v0.4s, v29.4s, v26.4s -sub v23.4s, v27.4s, v28.4s -sub v2.4s, v22.4s, v25.4s -sub v14.4s, v12.4s, v15.4s -str q11, [x0, #496] -add v29.4s, v29.4s, v26.4s -add v27.4s, v27.4s, v28.4s -add v22.4s, v22.4s, v25.4s -add v12.4s, v12.4s, v15.4s -ldr q15, [x0, #528] -ldr q25, [x0, #592] -ldr q28, [x0, #656] -ldr q26, [x0, #720] -sqrdmulh v11.4S, v30.4S, v5.s[0] -sqrdmulh v13.4S, v4.4S, v24.s[0] -sqrdmulh v19.4S, v6.4S, v21.s[0] -sqrdmulh v10.4S, v3.4S, v18.s[0] -mul v30.4S, v30.4S,v16.s[0] -mul v4.4S, v4.4S,v20.s[0] -mul v6.4S, v6.4S,v17.s[0] -mul v3.4S, v3.4S,v1.s[0] -mla v30.4S, v11.4S, v31.s[0] -mla v4.4S, v13.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -mla v3.4S, v10.4S, v31.s[0] -sub v10.4s, v15.4s, v30.4s -sub v19.4s, v25.4s, v4.4s -sub v13.4s, v28.4s, v6.4s -sub v11.4s, v26.4s, v3.4s -add v15.4s, v15.4s, v30.4s -add v25.4s, v25.4s, v4.4s -add v28.4s, v28.4s, v6.4s -add v26.4s, v26.4s, v3.4s -sqrdmulh v3.4S, v15.4S, v5.s[1] -sqrdmulh v6.4S, v25.4S, v24.s[1] -sqrdmulh v4.4S, v28.4S, v21.s[1] -sqrdmulh v30.4S, v26.4S, v18.s[1] -mul v15.4S, v15.4S,v16.s[1] -mul v25.4S, v25.4S,v20.s[1] -mul v28.4S, v28.4S,v17.s[1] -mul v26.4S, v26.4S,v1.s[1] -mla v15.4S, v3.4S, v31.s[0] -mla v25.4S, v6.4S, v31.s[0] -mla v28.4S, v4.4S, v31.s[0] -mla v26.4S, v30.4S, v31.s[0] -sub v30.4s, v29.4s, v15.4s -sub v4.4s, v27.4s, v25.4s -sub v6.4s, v22.4s, v28.4s -sub v3.4s, v12.4s, v26.4s -add v29.4s, v29.4s, v15.4s -add v27.4s, v27.4s, v25.4s -add v22.4s, v22.4s, v28.4s -add v12.4s, v12.4s, v26.4s -sqrdmulh v26.4S, v10.4S, v5.s[2] -sqrdmulh v28.4S, v19.4S, v24.s[2] -sqrdmulh v25.4S, v13.4S, v21.s[2] -sqrdmulh v15.4S, v11.4S, v18.s[2] -str q29, [x0, #512] -str q30, [x0, #528] -mul v10.4S, v10.4S,v16.s[2] -mul v19.4S, v19.4S,v20.s[2] -mul v13.4S, v13.4S,v17.s[2] -mul v11.4S, v11.4S,v1.s[2] -str q27, [x0, #576] -str q4, [x0, #592] -ldr q18, [x17, #+512] -ldr q1, [x17, #+528] -ldr q21, [x17, #+544] -ldr q17, [x17, #+560] -mla v10.4S, v26.4S, v31.s[0] -mla v19.4S, v28.4S, v31.s[0] -mla v13.4S, v25.4S, v31.s[0] -mla v11.4S, v15.4S, v31.s[0] -str q22, [x0, #640] -str q6, [x0, #656] -ldr q6, [x17, #+576] -ldr q22, [x17, #+592] -sub v15.4s, v0.4s, v10.4s -sub v25.4s, v23.4s, v19.4s -sub v28.4s, v2.4s, v13.4s -sub v26.4s, v14.4s, v11.4s -str q12, [x0, #704] -str q3, [x0, #720] -ldr q3, [x17, #+608] -ldr q12, [x17, #+624] -add v0.4s, v0.4s, v10.4s -add v23.4s, v23.4s, v19.4s -add v2.4s, v2.4s, v13.4s -add v14.4s, v14.4s, v11.4s -str q0, [x0, #544] -str q23, [x0, #608] -str q2, [x0, #672] -str q14, [x0, #736] -ldr q14, [x0, #800] -ldr q2, [x0, #816] -ldr q23, [x0, #768] -ldr q0, [x0, #864] -ldr q11, [x0, #880] -ldr q13, [x0, #832] -ldr q19, [x0, #928] -ldr q10, [x0, #944] -ldr q24, [x0, #896] -ldr q20, [x0, #992] -ldr q5, [x0, #1008] -ldr q16, [x0, #960] -sqrdmulh v4.4S, v14.4S, v1.s[0] -sqrdmulh v27.4S, v0.4S, v17.s[0] -sqrdmulh v30.4S, v19.4S, v22.s[0] -sqrdmulh v29.4S, v20.4S, v12.s[0] -str q15, [x0, #560] -mul v14.4S, v14.4S,v18.s[0] -mul v0.4S, v0.4S,v21.s[0] -mul v19.4S, v19.4S,v6.s[0] -mul v20.4S, v20.4S,v3.s[0] -str q25, [x0, #624] -mla v14.4S, v4.4S, v31.s[0] -mla v0.4S, v27.4S, v31.s[0] -mla v19.4S, v30.4S, v31.s[0] -mla v20.4S, v29.4S, v31.s[0] -str q28, [x0, #688] -sub v28.4s, v23.4s, v14.4s -sub v29.4s, v13.4s, v0.4s -sub v30.4s, v24.4s, v19.4s -sub v27.4s, v16.4s, v20.4s -str q26, [x0, #752] -add v23.4s, v23.4s, v14.4s -add v13.4s, v13.4s, v0.4s -add v24.4s, v24.4s, v19.4s -add v16.4s, v16.4s, v20.4s -ldr q20, [x0, #784] -ldr q19, [x0, #848] -ldr q0, [x0, #912] -ldr q14, [x0, #976] -sqrdmulh v26.4S, v2.4S, v1.s[0] -sqrdmulh v4.4S, v11.4S, v17.s[0] -sqrdmulh v25.4S, v10.4S, v22.s[0] -sqrdmulh v15.4S, v5.4S, v12.s[0] -mul v2.4S, v2.4S,v18.s[0] -mul v11.4S, v11.4S,v21.s[0] -mul v10.4S, v10.4S,v6.s[0] -mul v5.4S, v5.4S,v3.s[0] -mla v2.4S, v26.4S, v31.s[0] -mla v11.4S, v4.4S, v31.s[0] -mla v10.4S, v25.4S, v31.s[0] -mla v5.4S, v15.4S, v31.s[0] -sub v15.4s, v20.4s, v2.4s -sub v25.4s, v19.4s, v11.4s -sub v4.4s, v0.4s, v10.4s -sub v26.4s, v14.4s, v5.4s -add v20.4s, v20.4s, v2.4s -add v19.4s, v19.4s, v11.4s -add v0.4s, v0.4s, v10.4s -add v14.4s, v14.4s, v5.4s -sqrdmulh v5.4S, v20.4S, v1.s[1] -sqrdmulh v10.4S, v19.4S, v17.s[1] -sqrdmulh v11.4S, v0.4S, v22.s[1] -sqrdmulh v2.4S, v14.4S, v12.s[1] -mul v20.4S, v20.4S,v18.s[1] -mul v19.4S, v19.4S,v21.s[1] -mul v0.4S, v0.4S,v6.s[1] -mul v14.4S, v14.4S,v3.s[1] -mla v20.4S, v5.4S, v31.s[0] -mla v19.4S, v10.4S, v31.s[0] -mla v0.4S, v11.4S, v31.s[0] -mla v14.4S, v2.4S, v31.s[0] -sub v2.4s, v23.4s, v20.4s -sub v11.4s, v13.4s, v19.4s -sub v10.4s, v24.4s, v0.4s -sub v5.4s, v16.4s, v14.4s -add v23.4s, v23.4s, v20.4s -add v13.4s, v13.4s, v19.4s -add v24.4s, v24.4s, v0.4s -add v16.4s, v16.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v1.s[2] -sqrdmulh v0.4S, v25.4S, v17.s[2] -sqrdmulh v19.4S, v4.4S, v22.s[2] -sqrdmulh v20.4S, v26.4S, v12.s[2] -str q23, [x0, #768] -str q2, [x0, #784] -mul v15.4S, v15.4S,v18.s[2] -mul v25.4S, v25.4S,v21.s[2] -mul v4.4S, v4.4S,v6.s[2] -mul v26.4S, v26.4S,v3.s[2] -str q13, [x0, #832] -str q11, [x0, #848] -mla v15.4S, v14.4S, v31.s[0] -mla v25.4S, v0.4S, v31.s[0] -mla v4.4S, v19.4S, v31.s[0] -mla v26.4S, v20.4S, v31.s[0] -str q24, [x0, #896] -str q10, [x0, #912] -sub v10.4s, v28.4s, v15.4s -sub v24.4s, v29.4s, v25.4s -sub v20.4s, v30.4s, v4.4s -sub v19.4s, v27.4s, v26.4s -str q16, [x0, #960] -str q5, [x0, #976] -add v28.4s, v28.4s, v15.4s -add v29.4s, v29.4s, v25.4s -add v30.4s, v30.4s, v4.4s -add v27.4s, v27.4s, v26.4s -str q28, [x0, #800] -str q29, [x0, #864] -str q30, [x0, #928] -str q27, [x0, #992] -str q10, [x0, #816] -str q24, [x0, #880] -str q20, [x0, #944] -str q19, [x0, #1008] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1464 -// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_4.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_4.s deleted file mode 100644 index dbd61ed..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_4.s +++ /dev/null @@ -1,1494 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_7_z4_4 -.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_4 -ntt_u32_incomplete_neon_asm_var_4_2_7_z4_4: -_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_4: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #928] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #992] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q18, [x0, #800] -sqrdmulh v17.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -ldr q16, [x0, #864] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -mla v22.4S, v21.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -mla v18.4S, v17.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -ldr q3, [x0, #544] -sqrdmulh v17.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -ldr q19, [x0, #608] -sqrdmulh v21.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q2, [x0, #672] -ldr q1, [x0, #416] -sqrdmulh v0.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v15.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -ldr q22, [x0, #736] -ldr q14, [x0, #480] -sqrdmulh v13.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v12.4s, v14.4s, v20.4s -add v14.4s, v14.4s, v20.4s -ldr q20, [x0, #288] -mla v3.4S, v17.4S, v31.s[0] -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v20.4s, v18.4s -mla v2.4S, v0.4S, v31.s[0] -mla v22.4S, v13.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -ldr q18, [x0, #352] -sqrdmulh v13.4S, v1.4S, v29.s[1] -mul v1.4S, v1.4S,v30.s[1] -sub v0.4s, v18.4s, v16.4s -sqrdmulh v17.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -add v18.4s, v18.4s, v16.4s -ldr q16, [x0, #32] -sqrdmulh v11.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v10.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -ldr q3, [x0, #96] -sqrdmulh v9.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v8.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -ldr q19, [x0, #160] -mla v1.4S, v13.4S, v31.s[0] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v19.4s, v2.4s -mla v20.4S, v11.4S, v31.s[0] -mla v18.4S, v9.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -ldr q2, [x0, #224] -sqrdmulh v9.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v11.4s, v2.4s, v22.4s -sqrdmulh v13.4S, v12.4S, v29.s[2] -mul v12.4S, v12.4S,v30.s[2] -add v2.4s, v2.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v7.4s, v19.4s, v1.4s -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v29.s[2] -mul v0.4S, v0.4S,v30.s[2] -sub v6.4s, v2.4s, v14.4s -add v2.4s, v2.4s, v14.4s -mla v15.4S, v9.4S, v31.s[0] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v16.4s, v20.4s -mla v21.4S, v22.4S, v31.s[0] -mla v0.4S, v1.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -sub v1.4s, v3.4s, v18.4s -sqrdmulh v22.4S, v6.4S, v27.s[1] -mul v6.4S, v6.4S,v28.s[1] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v19.4S, v27.s[0] -mul v19.4S, v19.4S,v28.s[0] -sub v9.4s, v17.4s, v15.4s -add v17.4s, v17.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v14.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -mla v7.4S, v20.4S, v31.s[0] -mla v6.4S, v22.4S, v31.s[0] -sub v22.4s, v10.4s, v21.4s -mla v19.4S, v18.4S, v31.s[0] -mla v2.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v15.4s, v8.4s, v0.4s -sqrdmulh v18.4S, v11.4S, v27.s[2] -mul v11.4S, v11.4S,v28.s[2] -add v8.4s, v8.4s, v0.4s -sqrdmulh v0.4S, v9.4S, v27.s[3] -mul v9.4S, v9.4S,v28.s[3] -sub v20.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[3] -mul v14.4S, v14.4S,v28.s[3] -sub v12.4s, v1.4s, v6.4s -add v1.4s, v1.4s, v6.4s -mla v17.4S, v21.4S, v31.s[0] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v16.4s, v19.4s -mla v9.4S, v0.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v16.4s, v16.4s, v19.4s -sqrdmulh v19.4S, v1.4S, v25.s[2] -mul v1.4S, v1.4S,v26.s[2] -sub v7.4s, v3.4s, v2.4s -sqrdmulh v0.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v7.4S, v25.s[1] -mul v7.4S, v7.4S,v26.s[1] -sub v21.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -sqrdmulh v17.4S, v3.4S, v25.s[0] -mul v3.4S, v3.4S,v26.s[0] -sub v6.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -mla v1.4S, v19.4S, v31.s[0] -mla v12.4S, v0.4S, v31.s[0] -sub v0.4s, v22.4s, v9.4s -mla v7.4S, v2.4S, v31.s[0] -mla v3.4S, v17.4S, v31.s[0] -add v22.4s, v22.4s, v9.4s -sqrdmulh v9.4S, v8.4S, v23.s[0] -mul v8.4S, v8.4S,v24.s[0] -sub v17.4s, v15.4s, v14.4s -sqrdmulh v2.4S, v6.4S, v23.s[1] -mul v6.4S, v6.4S,v24.s[1] -add v15.4s, v15.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v23.s[2] -mul v15.4S, v15.4S,v24.s[2] -sub v19.4s, v13.4s, v1.4s -add v13.4s, v13.4s, v1.4s -sqrdmulh v1.4S, v17.4S, v23.s[3] -mul v17.4S, v17.4S,v24.s[3] -sub v11.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -mla v8.4S, v9.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -sub v2.4s, v18.4s, v7.4s -str q13, [x0, #288] -mla v15.4S, v14.4S, v31.s[0] -mla v17.4S, v1.4S, v31.s[0] -add v18.4s, v18.4s, v7.4s -str q19, [x0, #352] -ldr q19, [x0, #944] -sqrdmulh v7.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v1.4s, v16.4s, v3.4s -str q20, [x0, #416] -ldr q20, [x0, #1008] -sqrdmulh v14.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v16.4s, v16.4s, v3.4s -str q11, [x0, #480] -ldr q11, [x0, #816] -sqrdmulh v3.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -sub v13.4s, v10.4s, v8.4s -add v10.4s, v10.4s, v8.4s -ldr q8, [x0, #880] -sqrdmulh v9.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v12.4s, v21.4s, v6.4s -add v21.4s, v21.4s, v6.4s -mla v19.4S, v7.4S, v31.s[0] -mla v20.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v15.4s -str q18, [x0, #160] -mla v11.4S, v3.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -str q2, [x0, #224] -ldr q2, [x0, #560] -sqrdmulh v15.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v9.4s, v0.4s, v17.4s -str q16, [x0, #32] -ldr q16, [x0, #624] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -add v0.4s, v0.4s, v17.4s -str q1, [x0, #96] -ldr q1, [x0, #688] -ldr q17, [x0, #432] -sqrdmulh v18.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -sub v7.4s, v17.4s, v19.4s -add v17.4s, v17.4s, v19.4s -ldr q19, [x0, #752] -ldr q6, [x0, #496] -sqrdmulh v5.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v4.4s, v6.4s, v20.4s -add v6.4s, v6.4s, v20.4s -ldr q20, [x0, #304] -mla v2.4S, v15.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -sub v3.4s, v20.4s, v11.4s -str q10, [x0, #544] -mla v1.4S, v18.4S, v31.s[0] -mla v19.4S, v5.4S, v31.s[0] -add v20.4s, v20.4s, v11.4s -str q13, [x0, #608] -ldr q13, [x0, #368] -sqrdmulh v11.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v5.4s, v13.4s, v8.4s -str q21, [x0, #672] -sqrdmulh v21.4S, v6.4S, v29.s[1] -mul v6.4S, v6.4S,v30.s[1] -add v13.4s, v13.4s, v8.4s -str q12, [x0, #736] -ldr q12, [x0, #48] -sqrdmulh v8.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v18.4s, v12.4s, v2.4s -add v12.4s, v12.4s, v2.4s -ldr q2, [x0, #112] -sqrdmulh v10.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v15.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -ldr q16, [x0, #176] -mla v17.4S, v11.4S, v31.s[0] -mla v6.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v1.4s -str q22, [x0, #800] -mla v20.4S, v8.4S, v31.s[0] -mla v13.4S, v10.4S, v31.s[0] -add v16.4s, v16.4s, v1.4s -str q14, [x0, #864] -ldr q14, [x0, #240] -sqrdmulh v1.4S, v7.4S, v29.s[2] -mul v7.4S, v7.4S,v30.s[2] -sub v10.4s, v14.4s, v19.4s -str q0, [x0, #928] -sqrdmulh v0.4S, v4.4S, v29.s[2] -mul v4.4S, v4.4S,v30.s[2] -add v14.4s, v14.4s, v19.4s -str q9, [x0, #992] -sqrdmulh v9.4S, v3.4S, v29.s[2] -mul v3.4S, v3.4S,v30.s[2] -sub v19.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -sqrdmulh v17.4S, v5.4S, v29.s[2] -mul v5.4S, v5.4S,v30.s[2] -sub v8.4s, v14.4s, v6.4s -add v14.4s, v14.4s, v6.4s -mla v7.4S, v1.4S, v31.s[0] -mla v4.4S, v0.4S, v31.s[0] -sub v0.4s, v12.4s, v20.4s -mla v3.4S, v9.4S, v31.s[0] -mla v5.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -sub v17.4s, v2.4s, v13.4s -sqrdmulh v9.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -add v2.4s, v2.4s, v13.4s -sqrdmulh v13.4S, v16.4S, v27.s[0] -mul v16.4S, v16.4S,v28.s[0] -sub v1.4s, v21.4s, v7.4s -add v21.4s, v21.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v6.4s, v10.4s, v4.4s -add v10.4s, v10.4s, v4.4s -mla v19.4S, v20.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -sub v9.4s, v18.4s, v3.4s -mla v16.4S, v13.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[2] -mul v21.4S, v21.4S,v28.s[2] -sub v7.4s, v15.4s, v5.4s -sqrdmulh v13.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -add v15.4s, v15.4s, v5.4s -sqrdmulh v5.4S, v1.4S, v27.s[3] -mul v1.4S, v1.4S,v28.s[3] -sub v20.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v27.s[3] -mul v6.4S, v6.4S,v28.s[3] -sub v4.4s, v17.4s, v8.4s -add v17.4s, v17.4s, v8.4s -mla v21.4S, v3.4S, v31.s[0] -mla v10.4S, v13.4S, v31.s[0] -sub v13.4s, v12.4s, v16.4s -mla v1.4S, v5.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v25.s[2] -mul v17.4S, v17.4S,v26.s[2] -sub v19.4s, v2.4s, v14.4s -sqrdmulh v5.4S, v4.4S, v25.s[3] -mul v4.4S, v4.4S,v26.s[3] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v3.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v15.4s, v10.4s -add v15.4s, v15.4s, v10.4s -mla v17.4S, v16.4S, v31.s[0] -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v9.4s, v1.4s -mla v19.4S, v14.4S, v31.s[0] -mla v2.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v23.s[0] -mul v15.4S, v15.4S,v24.s[0] -sub v21.4s, v7.4s, v6.4s -sqrdmulh v14.4S, v8.4S, v23.s[1] -mul v8.4S, v8.4S,v24.s[1] -add v7.4s, v7.4s, v6.4s -sqrdmulh v6.4S, v7.4S, v23.s[2] -mul v7.4S, v7.4S,v24.s[2] -sub v16.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v23.s[3] -mul v21.4S, v21.4S,v24.s[3] -sub v10.4s, v20.4s, v4.4s -add v20.4s, v20.4s, v4.4s -mla v15.4S, v1.4S, v31.s[0] -mla v8.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v19.4s -str q0, [x0, #304] -mla v7.4S, v6.4S, v31.s[0] -mla v21.4S, v17.4S, v31.s[0] -add v13.4s, v13.4s, v19.4s -str q16, [x0, #368] -ldr q16, [x0, #896] -sqrdmulh v19.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v17.4s, v12.4s, v2.4s -str q20, [x0, #432] -ldr q20, [x0, #960] -sqrdmulh v6.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v12.4s, v12.4s, v2.4s -str q10, [x0, #496] -ldr q10, [x0, #768] -sqrdmulh v2.4S, v10.4S, v29.s[0] -mul v10.4S, v10.4S,v30.s[0] -sub v0.4s, v18.4s, v15.4s -add v18.4s, v18.4s, v15.4s -ldr q15, [x0, #832] -sqrdmulh v1.4S, v15.4S, v29.s[0] -mul v15.4S, v15.4S,v30.s[0] -sub v4.4s, v3.4s, v8.4s -add v3.4s, v3.4s, v8.4s -mla v16.4S, v19.4S, v31.s[0] -mla v20.4S, v6.4S, v31.s[0] -sub v6.4s, v9.4s, v7.4s -str q13, [x0, #176] -mla v10.4S, v2.4S, v31.s[0] -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v7.4s -str q14, [x0, #240] -ldr q14, [x0, #512] -sqrdmulh v7.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v1.4s, v5.4s, v21.4s -str q12, [x0, #48] -ldr q12, [x0, #576] -sqrdmulh v2.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -add v5.4s, v5.4s, v21.4s -str q17, [x0, #112] -ldr q17, [x0, #640] -ldr q21, [x0, #384] -sqrdmulh v13.4S, v17.4S, v29.s[0] -mul v17.4S, v17.4S,v30.s[0] -sub v19.4s, v21.4s, v16.4s -add v21.4s, v21.4s, v16.4s -ldr q16, [x0, #704] -ldr q8, [x0, #448] -sqrdmulh v22.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v11.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -ldr q20, [x0, #256] -mla v14.4S, v7.4S, v31.s[0] -mla v12.4S, v2.4S, v31.s[0] -sub v2.4s, v20.4s, v10.4s -str q18, [x0, #560] -mla v17.4S, v13.4S, v31.s[0] -mla v16.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -str q0, [x0, #624] -ldr q0, [x0, #320] -sqrdmulh v10.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v22.4s, v0.4s, v15.4s -str q3, [x0, #688] -sqrdmulh v3.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -add v0.4s, v0.4s, v15.4s -str q4, [x0, #752] -ldr q4, [x0, #0] -sqrdmulh v15.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v13.4s, v4.4s, v14.4s -add v4.4s, v4.4s, v14.4s -ldr q14, [x0, #64] -sqrdmulh v18.4S, v0.4S, v29.s[1] -mul v0.4S, v0.4S,v30.s[1] -sub v7.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -ldr q12, [x0, #128] -mla v21.4S, v10.4S, v31.s[0] -mla v8.4S, v3.4S, v31.s[0] -sub v3.4s, v12.4s, v17.4s -str q9, [x0, #816] -mla v20.4S, v15.4S, v31.s[0] -mla v0.4S, v18.4S, v31.s[0] -add v12.4s, v12.4s, v17.4s -str q6, [x0, #880] -ldr q6, [x0, #192] -sqrdmulh v17.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v18.4s, v6.4s, v16.4s -str q5, [x0, #944] -sqrdmulh v5.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -add v6.4s, v6.4s, v16.4s -str q1, [x0, #1008] -sqrdmulh v1.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v16.4s, v12.4s, v21.4s -add v12.4s, v12.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v15.4s, v6.4s, v8.4s -add v6.4s, v6.4s, v8.4s -mla v19.4S, v17.4S, v31.s[0] -mla v11.4S, v5.4S, v31.s[0] -sub v5.4s, v4.4s, v20.4s -mla v2.4S, v1.4S, v31.s[0] -mla v22.4S, v21.4S, v31.s[0] -add v4.4s, v4.4s, v20.4s -sqrdmulh v20.4S, v16.4S, v27.s[1] -mul v16.4S, v16.4S,v28.s[1] -sub v21.4s, v14.4s, v0.4s -sqrdmulh v1.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v17.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v27.s[0] -mul v6.4S, v6.4S,v28.s[0] -sub v8.4s, v18.4s, v11.4s -add v18.4s, v18.4s, v11.4s -mla v16.4S, v20.4S, v31.s[0] -mla v15.4S, v1.4S, v31.s[0] -sub v1.4s, v13.4s, v2.4s -mla v12.4S, v0.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v3.4S, v27.s[2] -mul v3.4S, v3.4S,v28.s[2] -sub v19.4s, v7.4s, v22.4s -sqrdmulh v0.4S, v18.4S, v27.s[2] -mul v18.4S, v18.4S,v28.s[2] -add v7.4s, v7.4s, v22.4s -sqrdmulh v22.4S, v17.4S, v27.s[3] -mul v17.4S, v17.4S,v28.s[3] -sub v20.4s, v5.4s, v16.4s -add v5.4s, v5.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v11.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -mla v3.4S, v2.4S, v31.s[0] -mla v18.4S, v0.4S, v31.s[0] -sub v0.4s, v4.4s, v12.4s -mla v17.4S, v22.4S, v31.s[0] -mla v8.4S, v16.4S, v31.s[0] -add v4.4s, v4.4s, v12.4s -sqrdmulh v12.4S, v21.4S, v25.s[2] -mul v21.4S, v21.4S,v26.s[2] -sub v16.4s, v14.4s, v6.4s -sqrdmulh v22.4S, v11.4S, v25.s[3] -mul v11.4S, v11.4S,v26.s[3] -add v14.4s, v14.4s, v6.4s -sqrdmulh v6.4S, v16.4S, v25.s[1] -mul v16.4S, v16.4S,v26.s[1] -sub v2.4s, v13.4s, v3.4s -add v13.4s, v13.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v25.s[0] -mul v14.4S, v14.4S,v26.s[0] -sub v15.4s, v7.4s, v18.4s -add v7.4s, v7.4s, v18.4s -mla v21.4S, v12.4S, v31.s[0] -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v1.4s, v17.4s -mla v16.4S, v6.4S, v31.s[0] -mla v14.4S, v3.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -sqrdmulh v17.4S, v7.4S, v23.s[0] -mul v7.4S, v7.4S,v24.s[0] -sub v3.4s, v19.4s, v8.4s -sqrdmulh v6.4S, v15.4S, v23.s[1] -mul v15.4S, v15.4S,v24.s[1] -add v19.4s, v19.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v23.s[2] -mul v19.4S, v19.4S,v24.s[2] -sub v12.4s, v5.4s, v21.4s -add v5.4s, v5.4s, v21.4s -sqrdmulh v21.4S, v3.4S, v23.s[3] -mul v3.4S, v3.4S,v24.s[3] -sub v18.4s, v20.4s, v11.4s -add v20.4s, v20.4s, v11.4s -mla v7.4S, v17.4S, v31.s[0] -mla v15.4S, v6.4S, v31.s[0] -sub v6.4s, v0.4s, v16.4s -str q5, [x0, #256] -mla v19.4S, v8.4S, v31.s[0] -mla v3.4S, v21.4S, v31.s[0] -add v0.4s, v0.4s, v16.4s -str q12, [x0, #320] -ldr q12, [x0, #912] -sqrdmulh v16.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -sub v21.4s, v4.4s, v14.4s -str q20, [x0, #384] -ldr q20, [x0, #976] -sqrdmulh v8.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v4.4s, v4.4s, v14.4s -str q18, [x0, #448] -ldr q18, [x0, #784] -sqrdmulh v14.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -sub v5.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -ldr q7, [x0, #848] -sqrdmulh v17.4S, v7.4S, v29.s[0] -mul v7.4S, v7.4S,v30.s[0] -sub v11.4s, v2.4s, v15.4s -add v2.4s, v2.4s, v15.4s -mla v12.4S, v16.4S, v31.s[0] -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v1.4s, v19.4s -str q0, [x0, #128] -mla v18.4S, v14.4S, v31.s[0] -mla v7.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v19.4s -str q6, [x0, #192] -ldr q6, [x0, #528] -sqrdmulh v19.4S, v6.4S, v29.s[0] -mul v6.4S, v6.4S,v30.s[0] -sub v17.4s, v22.4s, v3.4s -str q4, [x0, #0] -ldr q4, [x0, #592] -sqrdmulh v14.4S, v4.4S, v29.s[0] -mul v4.4S, v4.4S,v30.s[0] -add v22.4s, v22.4s, v3.4s -str q21, [x0, #64] -ldr q21, [x0, #656] -ldr q3, [x0, #400] -sqrdmulh v0.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -sub v16.4s, v3.4s, v12.4s -add v3.4s, v3.4s, v12.4s -ldr q12, [x0, #720] -ldr q15, [x0, #464] -sqrdmulh v9.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -sub v10.4s, v15.4s, v20.4s -add v15.4s, v15.4s, v20.4s -ldr q20, [x0, #272] -mla v6.4S, v19.4S, v31.s[0] -mla v4.4S, v14.4S, v31.s[0] -sub v14.4s, v20.4s, v18.4s -str q13, [x0, #512] -mla v21.4S, v0.4S, v31.s[0] -mla v12.4S, v9.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -str q5, [x0, #576] -ldr q5, [x0, #336] -sqrdmulh v18.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v9.4s, v5.4s, v7.4s -str q2, [x0, #640] -sqrdmulh v2.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -add v5.4s, v5.4s, v7.4s -str q11, [x0, #704] -ldr q11, [x0, #16] -sqrdmulh v7.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v0.4s, v11.4s, v6.4s -add v11.4s, v11.4s, v6.4s -ldr q6, [x0, #80] -sqrdmulh v13.4S, v5.4S, v29.s[1] -mul v5.4S, v5.4S,v30.s[1] -sub v19.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -ldr q4, [x0, #144] -mla v3.4S, v18.4S, v31.s[0] -mla v15.4S, v2.4S, v31.s[0] -sub v2.4s, v4.4s, v21.4s -str q1, [x0, #768] -mla v20.4S, v7.4S, v31.s[0] -mla v5.4S, v13.4S, v31.s[0] -add v4.4s, v4.4s, v21.4s -str q8, [x0, #832] -ldr q8, [x0, #208] -sqrdmulh v21.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -sub v13.4s, v8.4s, v12.4s -str q22, [x0, #896] -sqrdmulh v22.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -add v8.4s, v8.4s, v12.4s -str q17, [x0, #960] -sqrdmulh v17.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v12.4s, v4.4s, v3.4s -add v4.4s, v4.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v7.4s, v8.4s, v15.4s -add v8.4s, v8.4s, v15.4s -mla v16.4S, v21.4S, v31.s[0] -mla v10.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v20.4s -mla v14.4S, v17.4S, v31.s[0] -mla v9.4S, v3.4S, v31.s[0] -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v12.4S, v27.s[1] -mul v12.4S, v12.4S,v28.s[1] -sub v3.4s, v6.4s, v5.4s -sqrdmulh v17.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v4.4S, v27.s[0] -mul v4.4S, v4.4S,v28.s[0] -sub v21.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v27.s[0] -mul v8.4S, v8.4S,v28.s[0] -sub v15.4s, v13.4s, v10.4s -add v13.4s, v13.4s, v10.4s -mla v12.4S, v20.4S, v31.s[0] -mla v7.4S, v17.4S, v31.s[0] -sub v17.4s, v0.4s, v14.4s -mla v4.4S, v5.4S, v31.s[0] -mla v8.4S, v16.4S, v31.s[0] -add v0.4s, v0.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v16.4s, v19.4s, v9.4s -sqrdmulh v5.4S, v13.4S, v27.s[2] -mul v13.4S, v13.4S,v28.s[2] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v10.4s, v3.4s, v7.4s -add v3.4s, v3.4s, v7.4s -mla v2.4S, v14.4S, v31.s[0] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v11.4s, v4.4s -mla v21.4S, v9.4S, v31.s[0] -mla v15.4S, v12.4S, v31.s[0] -add v11.4s, v11.4s, v4.4s -sqrdmulh v4.4S, v3.4S, v25.s[2] -mul v3.4S, v3.4S,v26.s[2] -sub v12.4s, v6.4s, v8.4s -sqrdmulh v9.4S, v10.4S, v25.s[3] -mul v10.4S, v10.4S,v26.s[3] -add v6.4s, v6.4s, v8.4s -sqrdmulh v8.4S, v12.4S, v25.s[1] -mul v12.4S, v12.4S,v26.s[1] -sub v14.4s, v0.4s, v2.4s -add v0.4s, v0.4s, v2.4s -sqrdmulh v2.4S, v6.4S, v25.s[0] -mul v6.4S, v6.4S,v26.s[0] -sub v7.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -mla v3.4S, v4.4S, v31.s[0] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v17.4s, v21.4s -mla v12.4S, v8.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v19.4S, v23.s[0] -mul v19.4S, v19.4S,v24.s[0] -sub v2.4s, v16.4s, v15.4s -sqrdmulh v8.4S, v7.4S, v23.s[1] -mul v7.4S, v7.4S,v24.s[1] -add v16.4s, v16.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v23.s[2] -mul v16.4S, v16.4S,v24.s[2] -sub v4.4s, v22.4s, v3.4s -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v2.4S, v23.s[3] -mul v2.4S, v2.4S,v24.s[3] -sub v13.4s, v20.4s, v10.4s -add v20.4s, v20.4s, v10.4s -mla v19.4S, v21.4S, v31.s[0] -mla v7.4S, v8.4S, v31.s[0] -sub v8.4s, v5.4s, v12.4s -str q22, [x0, #272] -mla v16.4S, v15.4S, v31.s[0] -mla v2.4S, v3.4S, v31.s[0] -add v5.4s, v5.4s, v12.4s -str q4, [x0, #336] -sub v23.4s, v11.4s, v6.4s -str q20, [x0, #400] -add v11.4s, v11.4s, v6.4s -str q13, [x0, #464] -sub v13.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sub v19.4s, v14.4s, v7.4s -add v14.4s, v14.4s, v7.4s -sub v7.4s, v17.4s, v16.4s -str q5, [x0, #144] -add v17.4s, v17.4s, v16.4s -str q8, [x0, #208] -sub v8.4s, v9.4s, v2.4s -str q11, [x0, #16] -add v9.4s, v9.4s, v2.4s -str q23, [x0, #80] -str q0, [x0, #528] -str q13, [x0, #592] -str q14, [x0, #656] -str q19, [x0, #720] -str q17, [x0, #784] -str q7, [x0, #848] -str q9, [x0, #912] -str q8, [x0, #976] -ldr q18, [x17, #+128] -ldr q1, [x17, #+144] -ldr q10, [x0, #32] -sqrdmulh v21.4S, v10.4S, v1.s[0] -mul v10.4S, v10.4S,v18.s[0] -ldr q22, [x0, #48] -sqrdmulh v15.4S, v22.4S, v1.s[0] -mul v22.4S, v22.4S,v18.s[0] -ldr q3, [x17, #+160] -ldr q12, [x17, #+176] -ldr q4, [x0, #96] -sqrdmulh v30.4S, v4.4S, v12.s[0] -mul v4.4S, v4.4S,v3.s[0] -ldr q29, [x0, #112] -sqrdmulh v28.4S, v29.4S, v12.s[0] -mul v29.4S, v29.4S,v3.s[0] -ldr q27, [x0, #160] -ldr q26, [x17, #+192] -ldr q25, [x17, #+208] -mla v10.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v27.4S, v25.s[0] -ldr q24, [x0, #176] -mla v22.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v24.4S, v25.s[0] -ldr q20, [x0, #224] -ldr q6, [x17, #+224] -ldr q5, [x17, #+240] -mla v4.4S, v30.4S, v31.s[0] -sqrdmulh v30.4S, v20.4S, v5.s[0] -ldr q16, [x0, #240] -mla v29.4S, v28.4S, v31.s[0] -sqrdmulh v28.4S, v16.4S, v5.s[0] -ldr q11, [x0, #128] -ldr q2, [x0, #0] -mul v27.4S, v27.4S,v26.s[0] -sub v23.4s, v2.4s, v10.4s -mul v24.4S, v24.4S,v26.s[0] -add v2.4s, v2.4s, v10.4s -ldr q10, [x0, #144] -ldr q0, [x0, #16] -mla v27.4S, v21.4S, v31.s[0] -sub v21.4s, v0.4s, v22.4s -mla v24.4S, v15.4S, v31.s[0] -add v0.4s, v0.4s, v22.4s -ldr q22, [x0, #192] -ldr q15, [x0, #64] -mul v20.4S, v20.4S,v6.s[0] -sub v13.4s, v15.4s, v4.4s -mul v16.4S, v16.4S,v6.s[0] -add v15.4s, v15.4s, v4.4s -ldr q4, [x0, #208] -ldr q14, [x0, #80] -mla v20.4S, v30.4S, v31.s[0] -sub v30.4s, v14.4s, v29.4s -mla v16.4S, v28.4S, v31.s[0] -add v14.4s, v14.4s, v29.4s -sqrdmulh v29.4S, v0.4S, v1.s[1] -mul v0.4S, v0.4S,v18.s[1] -sqrdmulh v28.4S, v21.4S, v1.s[2] -sub v19.4s, v11.4s, v27.4s -mul v21.4S, v21.4S,v18.s[2] -add v11.4s, v11.4s, v27.4s -sqrdmulh v1.4S, v14.4S, v12.s[1] -sub v18.4s, v10.4s, v24.4s -mul v14.4S, v14.4S,v3.s[1] -add v10.4s, v10.4s, v24.4s -sqrdmulh v24.4S, v30.4S, v12.s[2] -sub v27.4s, v22.4s, v20.4s -mul v30.4S, v30.4S,v3.s[2] -add v22.4s, v22.4s, v20.4s -mla v0.4S, v29.4S, v31.s[0] -sub v29.4s, v4.4s, v16.4s -sqrdmulh v12.4S, v10.4S, v25.s[1] -add v4.4s, v4.4s, v16.4s -mla v21.4S, v28.4S, v31.s[0] -sqrdmulh v28.4S, v18.4S, v25.s[2] -mla v14.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v4.4S, v5.s[1] -mla v30.4S, v24.4S, v31.s[0] -sqrdmulh v24.4S, v29.4S, v5.s[2] -mul v10.4S, v10.4S,v26.s[1] -sub v16.4s, v2.4s, v0.4s -mul v18.4S, v18.4S,v26.s[2] -add v2.4s, v2.4s, v0.4s -str q16, [x0, #16] -str q2, [x0, #0] -mla v10.4S, v12.4S, v31.s[0] -sub v12.4s, v23.4s, v21.4s -mla v18.4S, v28.4S, v31.s[0] -add v23.4s, v23.4s, v21.4s -str q12, [x0, #48] -str q23, [x0, #32] -mul v4.4S, v4.4S,v6.s[1] -sub v25.4s, v15.4s, v14.4s -mul v29.4S, v29.4S,v6.s[2] -add v15.4s, v15.4s, v14.4s -str q25, [x0, #80] -str q15, [x0, #64] -mla v4.4S, v1.4S, v31.s[0] -sub v1.4s, v13.4s, v30.4s -mla v29.4S, v24.4S, v31.s[0] -add v13.4s, v13.4s, v30.4s -str q1, [x0, #112] -str q13, [x0, #96] -ldr q5, [x17, #+256] -ldr q6, [x17, #+272] -ldr q13, [x0, #288] -sqrdmulh v1.4S, v13.4S, v6.s[0] -sub v30.4s, v11.4s, v10.4s -str q30, [x0, #144] -mul v13.4S, v13.4S,v5.s[0] -add v11.4s, v11.4s, v10.4s -str q11, [x0, #128] -ldr q11, [x0, #304] -sqrdmulh v10.4S, v11.4S, v6.s[0] -sub v30.4s, v19.4s, v18.4s -mul v11.4S, v11.4S,v5.s[0] -add v19.4s, v19.4s, v18.4s -str q30, [x0, #176] -str q19, [x0, #160] -ldr q19, [x17, #+288] -ldr q30, [x17, #+304] -ldr q18, [x0, #352] -sqrdmulh v24.4S, v18.4S, v30.s[0] -sub v15.4s, v22.4s, v4.4s -mul v18.4S, v18.4S,v19.s[0] -add v22.4s, v22.4s, v4.4s -str q15, [x0, #208] -str q22, [x0, #192] -ldr q22, [x0, #368] -sqrdmulh v15.4S, v22.4S, v30.s[0] -sub v4.4s, v27.4s, v29.4s -mul v22.4S, v22.4S,v19.s[0] -add v27.4s, v27.4s, v29.4s -str q4, [x0, #240] -str q27, [x0, #224] -ldr q27, [x0, #416] -ldr q4, [x17, #+320] -ldr q29, [x17, #+336] -mla v13.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v27.4S, v29.s[0] -ldr q25, [x0, #432] -mla v11.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v25.4S, v29.s[0] -ldr q14, [x0, #480] -ldr q26, [x17, #+352] -ldr q23, [x17, #+368] -mla v18.4S, v24.4S, v31.s[0] -sqrdmulh v24.4S, v14.4S, v23.s[0] -ldr q12, [x0, #496] -mla v22.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v12.4S, v23.s[0] -ldr q21, [x0, #384] -ldr q28, [x0, #256] -mul v27.4S, v27.4S,v4.s[0] -sub v2.4s, v28.4s, v13.4s -mul v25.4S, v25.4S,v4.s[0] -add v28.4s, v28.4s, v13.4s -ldr q13, [x0, #400] -ldr q16, [x0, #272] -mla v27.4S, v1.4S, v31.s[0] -sub v1.4s, v16.4s, v11.4s -mla v25.4S, v10.4S, v31.s[0] -add v16.4s, v16.4s, v11.4s -ldr q11, [x0, #448] -ldr q10, [x0, #320] -mul v14.4S, v14.4S,v26.s[0] -sub v0.4s, v10.4s, v18.4s -mul v12.4S, v12.4S,v26.s[0] -add v10.4s, v10.4s, v18.4s -ldr q18, [x0, #464] -ldr q3, [x0, #336] -mla v14.4S, v24.4S, v31.s[0] -sub v24.4s, v3.4s, v22.4s -mla v12.4S, v15.4S, v31.s[0] -add v3.4s, v3.4s, v22.4s -sqrdmulh v22.4S, v16.4S, v6.s[1] -mul v16.4S, v16.4S,v5.s[1] -sqrdmulh v15.4S, v1.4S, v6.s[2] -sub v20.4s, v21.4s, v27.4s -mul v1.4S, v1.4S,v5.s[2] -add v21.4s, v21.4s, v27.4s -sqrdmulh v6.4S, v3.4S, v30.s[1] -sub v5.4s, v13.4s, v25.4s -mul v3.4S, v3.4S,v19.s[1] -add v13.4s, v13.4s, v25.4s -sqrdmulh v25.4S, v24.4S, v30.s[2] -sub v27.4s, v11.4s, v14.4s -mul v24.4S, v24.4S,v19.s[2] -add v11.4s, v11.4s, v14.4s -mla v16.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v12.4s -sqrdmulh v30.4S, v13.4S, v29.s[1] -add v18.4s, v18.4s, v12.4s -mla v1.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v5.4S, v29.s[2] -mla v3.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v18.4S, v23.s[1] -mla v24.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v22.4S, v23.s[2] -mul v13.4S, v13.4S,v4.s[1] -sub v12.4s, v28.4s, v16.4s -mul v5.4S, v5.4S,v4.s[2] -add v28.4s, v28.4s, v16.4s -str q12, [x0, #272] -str q28, [x0, #256] -mla v13.4S, v30.4S, v31.s[0] -sub v30.4s, v2.4s, v1.4s -mla v5.4S, v15.4S, v31.s[0] -add v2.4s, v2.4s, v1.4s -str q30, [x0, #304] -str q2, [x0, #288] -mul v18.4S, v18.4S,v26.s[1] -sub v29.4s, v10.4s, v3.4s -mul v22.4S, v22.4S,v26.s[2] -add v10.4s, v10.4s, v3.4s -str q29, [x0, #336] -str q10, [x0, #320] -mla v18.4S, v6.4S, v31.s[0] -sub v6.4s, v0.4s, v24.4s -mla v22.4S, v25.4S, v31.s[0] -add v0.4s, v0.4s, v24.4s -str q6, [x0, #368] -str q0, [x0, #352] -ldr q23, [x17, #+384] -ldr q26, [x17, #+400] -ldr q0, [x0, #544] -sqrdmulh v6.4S, v0.4S, v26.s[0] -sub v24.4s, v21.4s, v13.4s -str q24, [x0, #400] -mul v0.4S, v0.4S,v23.s[0] -add v21.4s, v21.4s, v13.4s -str q21, [x0, #384] -ldr q21, [x0, #560] -sqrdmulh v13.4S, v21.4S, v26.s[0] -sub v24.4s, v20.4s, v5.4s -mul v21.4S, v21.4S,v23.s[0] -add v20.4s, v20.4s, v5.4s -str q24, [x0, #432] -str q20, [x0, #416] -ldr q20, [x17, #+416] -ldr q24, [x17, #+432] -ldr q5, [x0, #608] -sqrdmulh v25.4S, v5.4S, v24.s[0] -sub v10.4s, v11.4s, v18.4s -mul v5.4S, v5.4S,v20.s[0] -add v11.4s, v11.4s, v18.4s -str q10, [x0, #464] -str q11, [x0, #448] -ldr q11, [x0, #624] -sqrdmulh v10.4S, v11.4S, v24.s[0] -sub v18.4s, v27.4s, v22.4s -mul v11.4S, v11.4S,v20.s[0] -add v27.4s, v27.4s, v22.4s -str q18, [x0, #496] -str q27, [x0, #480] -ldr q27, [x0, #672] -ldr q18, [x17, #+448] -ldr q22, [x17, #+464] -mla v0.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v27.4S, v22.s[0] -ldr q29, [x0, #688] -mla v21.4S, v13.4S, v31.s[0] -sqrdmulh v13.4S, v29.4S, v22.s[0] -ldr q3, [x0, #736] -ldr q4, [x17, #+480] -ldr q2, [x17, #+496] -mla v5.4S, v25.4S, v31.s[0] -sqrdmulh v25.4S, v3.4S, v2.s[0] -ldr q30, [x0, #752] -mla v11.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v30.4S, v2.s[0] -ldr q1, [x0, #640] -ldr q15, [x0, #512] -mul v27.4S, v27.4S,v18.s[0] -sub v28.4s, v15.4s, v0.4s -mul v29.4S, v29.4S,v18.s[0] -add v15.4s, v15.4s, v0.4s -ldr q0, [x0, #656] -ldr q12, [x0, #528] -mla v27.4S, v6.4S, v31.s[0] -sub v6.4s, v12.4s, v21.4s -mla v29.4S, v13.4S, v31.s[0] -add v12.4s, v12.4s, v21.4s -ldr q21, [x0, #704] -ldr q13, [x0, #576] -mul v3.4S, v3.4S,v4.s[0] -sub v16.4s, v13.4s, v5.4s -mul v30.4S, v30.4S,v4.s[0] -add v13.4s, v13.4s, v5.4s -ldr q5, [x0, #720] -ldr q19, [x0, #592] -mla v3.4S, v25.4S, v31.s[0] -sub v25.4s, v19.4s, v11.4s -mla v30.4S, v10.4S, v31.s[0] -add v19.4s, v19.4s, v11.4s -sqrdmulh v11.4S, v12.4S, v26.s[1] -mul v12.4S, v12.4S,v23.s[1] -sqrdmulh v10.4S, v6.4S, v26.s[2] -sub v14.4s, v1.4s, v27.4s -mul v6.4S, v6.4S,v23.s[2] -add v1.4s, v1.4s, v27.4s -sqrdmulh v26.4S, v19.4S, v24.s[1] -sub v23.4s, v0.4s, v29.4s -mul v19.4S, v19.4S,v20.s[1] -add v0.4s, v0.4s, v29.4s -sqrdmulh v29.4S, v25.4S, v24.s[2] -sub v27.4s, v21.4s, v3.4s -mul v25.4S, v25.4S,v20.s[2] -add v21.4s, v21.4s, v3.4s -mla v12.4S, v11.4S, v31.s[0] -sub v11.4s, v5.4s, v30.4s -sqrdmulh v24.4S, v0.4S, v22.s[1] -add v5.4s, v5.4s, v30.4s -mla v6.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v23.4S, v22.s[2] -mla v19.4S, v26.4S, v31.s[0] -sqrdmulh v26.4S, v5.4S, v2.s[1] -mla v25.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v11.4S, v2.s[2] -mul v0.4S, v0.4S,v18.s[1] -sub v30.4s, v15.4s, v12.4s -mul v23.4S, v23.4S,v18.s[2] -add v15.4s, v15.4s, v12.4s -str q30, [x0, #528] -str q15, [x0, #512] -mla v0.4S, v24.4S, v31.s[0] -sub v24.4s, v28.4s, v6.4s -mla v23.4S, v10.4S, v31.s[0] -add v28.4s, v28.4s, v6.4s -str q24, [x0, #560] -str q28, [x0, #544] -mul v5.4S, v5.4S,v4.s[1] -sub v22.4s, v13.4s, v19.4s -mul v11.4S, v11.4S,v4.s[2] -add v13.4s, v13.4s, v19.4s -str q22, [x0, #592] -str q13, [x0, #576] -mla v5.4S, v26.4S, v31.s[0] -sub v26.4s, v16.4s, v25.4s -mla v11.4S, v29.4S, v31.s[0] -add v16.4s, v16.4s, v25.4s -str q26, [x0, #624] -str q16, [x0, #608] -ldr q2, [x17, #+512] -ldr q4, [x17, #+528] -ldr q16, [x0, #800] -sqrdmulh v26.4S, v16.4S, v4.s[0] -sub v25.4s, v1.4s, v0.4s -str q25, [x0, #656] -mul v16.4S, v16.4S,v2.s[0] -add v1.4s, v1.4s, v0.4s -str q1, [x0, #640] -ldr q1, [x0, #816] -sqrdmulh v0.4S, v1.4S, v4.s[0] -sub v25.4s, v14.4s, v23.4s -mul v1.4S, v1.4S,v2.s[0] -add v14.4s, v14.4s, v23.4s -str q25, [x0, #688] -str q14, [x0, #672] -ldr q14, [x17, #+544] -ldr q25, [x17, #+560] -ldr q23, [x0, #864] -sqrdmulh v29.4S, v23.4S, v25.s[0] -sub v13.4s, v21.4s, v5.4s -mul v23.4S, v23.4S,v14.s[0] -add v21.4s, v21.4s, v5.4s -str q13, [x0, #720] -str q21, [x0, #704] -ldr q21, [x0, #880] -sqrdmulh v13.4S, v21.4S, v25.s[0] -sub v5.4s, v27.4s, v11.4s -mul v21.4S, v21.4S,v14.s[0] -add v27.4s, v27.4s, v11.4s -str q5, [x0, #752] -str q27, [x0, #736] -ldr q27, [x0, #928] -ldr q5, [x17, #+576] -ldr q11, [x17, #+592] -mla v16.4S, v26.4S, v31.s[0] -sqrdmulh v26.4S, v27.4S, v11.s[0] -ldr q22, [x0, #944] -mla v1.4S, v0.4S, v31.s[0] -sqrdmulh v0.4S, v22.4S, v11.s[0] -ldr q19, [x0, #992] -ldr q18, [x17, #+608] -ldr q28, [x17, #+624] -mla v23.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v19.4S, v28.s[0] -ldr q24, [x0, #1008] -mla v21.4S, v13.4S, v31.s[0] -sqrdmulh v13.4S, v24.4S, v28.s[0] -ldr q6, [x0, #896] -ldr q10, [x0, #768] -mul v27.4S, v27.4S,v5.s[0] -sub v15.4s, v10.4s, v16.4s -mul v22.4S, v22.4S,v5.s[0] -add v10.4s, v10.4s, v16.4s -ldr q16, [x0, #912] -ldr q30, [x0, #784] -mla v27.4S, v26.4S, v31.s[0] -sub v26.4s, v30.4s, v1.4s -mla v22.4S, v0.4S, v31.s[0] -add v30.4s, v30.4s, v1.4s -ldr q1, [x0, #960] -ldr q0, [x0, #832] -mul v19.4S, v19.4S,v18.s[0] -sub v12.4s, v0.4s, v23.4s -mul v24.4S, v24.4S,v18.s[0] -add v0.4s, v0.4s, v23.4s -ldr q23, [x0, #976] -ldr q20, [x0, #848] -mla v19.4S, v29.4S, v31.s[0] -sub v29.4s, v20.4s, v21.4s -mla v24.4S, v13.4S, v31.s[0] -add v20.4s, v20.4s, v21.4s -sqrdmulh v21.4S, v30.4S, v4.s[1] -mul v30.4S, v30.4S,v2.s[1] -sqrdmulh v13.4S, v26.4S, v4.s[2] -sub v3.4s, v6.4s, v27.4s -mul v26.4S, v26.4S,v2.s[2] -add v6.4s, v6.4s, v27.4s -sqrdmulh v4.4S, v20.4S, v25.s[1] -sub v2.4s, v16.4s, v22.4s -mul v20.4S, v20.4S,v14.s[1] -add v16.4s, v16.4s, v22.4s -sqrdmulh v22.4S, v29.4S, v25.s[2] -sub v27.4s, v1.4s, v19.4s -mul v29.4S, v29.4S,v14.s[2] -add v1.4s, v1.4s, v19.4s -mla v30.4S, v21.4S, v31.s[0] -sub v21.4s, v23.4s, v24.4s -sqrdmulh v25.4S, v16.4S, v11.s[1] -add v23.4s, v23.4s, v24.4s -mla v26.4S, v13.4S, v31.s[0] -sqrdmulh v13.4S, v2.4S, v11.s[2] -mla v20.4S, v4.4S, v31.s[0] -sqrdmulh v4.4S, v23.4S, v28.s[1] -mla v29.4S, v22.4S, v31.s[0] -sqrdmulh v22.4S, v21.4S, v28.s[2] -mul v16.4S, v16.4S,v5.s[1] -sub v24.4s, v10.4s, v30.4s -mul v2.4S, v2.4S,v5.s[2] -add v10.4s, v10.4s, v30.4s -str q24, [x0, #784] -str q10, [x0, #768] -mla v16.4S, v25.4S, v31.s[0] -sub v25.4s, v15.4s, v26.4s -mla v2.4S, v13.4S, v31.s[0] -add v15.4s, v15.4s, v26.4s -str q25, [x0, #816] -str q15, [x0, #800] -mul v23.4S, v23.4S,v18.s[1] -sub v11.4s, v0.4s, v20.4s -mul v21.4S, v21.4S,v18.s[2] -add v0.4s, v0.4s, v20.4s -str q11, [x0, #848] -str q0, [x0, #832] -mla v23.4S, v4.4S, v31.s[0] -sub v4.4s, v12.4s, v29.4s -mla v21.4S, v22.4S, v31.s[0] -add v12.4s, v12.4s, v29.4s -str q4, [x0, #880] -str q12, [x0, #864] -sub v28.4s, v6.4s, v16.4s -str q28, [x0, #912] -add v6.4s, v6.4s, v16.4s -str q6, [x0, #896] -sub v6.4s, v3.4s, v2.4s -add v3.4s, v3.4s, v2.4s -str q6, [x0, #944] -str q3, [x0, #928] -sub v3.4s, v1.4s, v23.4s -add v1.4s, v1.4s, v23.4s -str q3, [x0, #976] -str q1, [x0, #960] -sub v1.4s, v27.4s, v21.4s -add v27.4s, v27.4s, v21.4s -str q1, [x0, #1008] -str q27, [x0, #992] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1464 -// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_5.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_5.s deleted file mode 100644 index 2182562..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_5.s +++ /dev/null @@ -1,1494 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_7_z4_5 -.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_5 -ntt_u32_incomplete_neon_asm_var_4_2_7_z4_5: -_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_5: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #928] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #992] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q18, [x0, #800] -sqrdmulh v17.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -ldr q16, [x0, #864] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -mla v22.4S, v21.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -mla v18.4S, v17.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -ldr q3, [x0, #544] -sqrdmulh v17.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -ldr q19, [x0, #608] -sqrdmulh v21.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q2, [x0, #672] -ldr q1, [x0, #416] -sqrdmulh v0.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v15.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -ldr q22, [x0, #736] -ldr q14, [x0, #480] -sqrdmulh v13.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v12.4s, v14.4s, v20.4s -add v14.4s, v14.4s, v20.4s -ldr q20, [x0, #288] -mla v3.4S, v17.4S, v31.s[0] -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v20.4s, v18.4s -mla v2.4S, v0.4S, v31.s[0] -mla v22.4S, v13.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -ldr q18, [x0, #352] -sqrdmulh v13.4S, v1.4S, v29.s[1] -mul v1.4S, v1.4S,v30.s[1] -sub v0.4s, v18.4s, v16.4s -sqrdmulh v17.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -add v18.4s, v18.4s, v16.4s -ldr q16, [x0, #32] -sqrdmulh v11.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v10.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -ldr q3, [x0, #96] -sqrdmulh v9.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v8.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -ldr q19, [x0, #160] -mla v1.4S, v13.4S, v31.s[0] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v19.4s, v2.4s -mla v20.4S, v11.4S, v31.s[0] -mla v18.4S, v9.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -ldr q2, [x0, #224] -sqrdmulh v9.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v11.4s, v2.4s, v22.4s -sqrdmulh v13.4S, v12.4S, v29.s[2] -mul v12.4S, v12.4S,v30.s[2] -add v2.4s, v2.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v7.4s, v19.4s, v1.4s -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v29.s[2] -mul v0.4S, v0.4S,v30.s[2] -sub v6.4s, v2.4s, v14.4s -add v2.4s, v2.4s, v14.4s -mla v15.4S, v9.4S, v31.s[0] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v16.4s, v20.4s -mla v21.4S, v22.4S, v31.s[0] -mla v0.4S, v1.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -sub v1.4s, v3.4s, v18.4s -sqrdmulh v22.4S, v6.4S, v27.s[1] -mul v6.4S, v6.4S,v28.s[1] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v19.4S, v27.s[0] -mul v19.4S, v19.4S,v28.s[0] -sub v9.4s, v17.4s, v15.4s -add v17.4s, v17.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v14.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -mla v7.4S, v20.4S, v31.s[0] -mla v6.4S, v22.4S, v31.s[0] -sub v22.4s, v10.4s, v21.4s -mla v19.4S, v18.4S, v31.s[0] -mla v2.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v15.4s, v8.4s, v0.4s -sqrdmulh v18.4S, v11.4S, v27.s[2] -mul v11.4S, v11.4S,v28.s[2] -add v8.4s, v8.4s, v0.4s -sqrdmulh v0.4S, v9.4S, v27.s[3] -mul v9.4S, v9.4S,v28.s[3] -sub v20.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[3] -mul v14.4S, v14.4S,v28.s[3] -sub v12.4s, v1.4s, v6.4s -add v1.4s, v1.4s, v6.4s -mla v17.4S, v21.4S, v31.s[0] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v16.4s, v19.4s -mla v9.4S, v0.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v16.4s, v16.4s, v19.4s -sqrdmulh v19.4S, v1.4S, v25.s[2] -mul v1.4S, v1.4S,v26.s[2] -sub v7.4s, v3.4s, v2.4s -sqrdmulh v0.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v7.4S, v25.s[1] -mul v7.4S, v7.4S,v26.s[1] -sub v21.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -sqrdmulh v17.4S, v3.4S, v25.s[0] -mul v3.4S, v3.4S,v26.s[0] -sub v6.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -mla v1.4S, v19.4S, v31.s[0] -mla v12.4S, v0.4S, v31.s[0] -sub v0.4s, v22.4s, v9.4s -mla v7.4S, v2.4S, v31.s[0] -mla v3.4S, v17.4S, v31.s[0] -add v22.4s, v22.4s, v9.4s -sqrdmulh v9.4S, v8.4S, v23.s[0] -mul v8.4S, v8.4S,v24.s[0] -sub v17.4s, v15.4s, v14.4s -sqrdmulh v2.4S, v6.4S, v23.s[1] -mul v6.4S, v6.4S,v24.s[1] -add v15.4s, v15.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v23.s[2] -mul v15.4S, v15.4S,v24.s[2] -sub v19.4s, v13.4s, v1.4s -add v13.4s, v13.4s, v1.4s -sqrdmulh v1.4S, v17.4S, v23.s[3] -mul v17.4S, v17.4S,v24.s[3] -sub v11.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -mla v8.4S, v9.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -sub v2.4s, v18.4s, v7.4s -str q13, [x0, #288] -mla v15.4S, v14.4S, v31.s[0] -mla v17.4S, v1.4S, v31.s[0] -add v18.4s, v18.4s, v7.4s -str q19, [x0, #352] -ldr q19, [x0, #944] -sqrdmulh v7.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v1.4s, v16.4s, v3.4s -str q20, [x0, #416] -ldr q20, [x0, #1008] -sqrdmulh v14.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v16.4s, v16.4s, v3.4s -str q11, [x0, #480] -ldr q11, [x0, #816] -sqrdmulh v3.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -sub v13.4s, v10.4s, v8.4s -add v10.4s, v10.4s, v8.4s -ldr q8, [x0, #880] -sqrdmulh v9.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v12.4s, v21.4s, v6.4s -add v21.4s, v21.4s, v6.4s -mla v19.4S, v7.4S, v31.s[0] -mla v20.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v15.4s -str q18, [x0, #160] -mla v11.4S, v3.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -str q2, [x0, #224] -ldr q2, [x0, #560] -sqrdmulh v15.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v9.4s, v0.4s, v17.4s -str q16, [x0, #32] -ldr q16, [x0, #624] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -add v0.4s, v0.4s, v17.4s -str q1, [x0, #96] -ldr q1, [x0, #688] -ldr q17, [x0, #432] -sqrdmulh v18.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -sub v7.4s, v17.4s, v19.4s -add v17.4s, v17.4s, v19.4s -ldr q19, [x0, #752] -ldr q6, [x0, #496] -sqrdmulh v5.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v4.4s, v6.4s, v20.4s -add v6.4s, v6.4s, v20.4s -ldr q20, [x0, #304] -mla v2.4S, v15.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -sub v3.4s, v20.4s, v11.4s -str q10, [x0, #544] -mla v1.4S, v18.4S, v31.s[0] -mla v19.4S, v5.4S, v31.s[0] -add v20.4s, v20.4s, v11.4s -str q13, [x0, #608] -ldr q13, [x0, #368] -sqrdmulh v11.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v5.4s, v13.4s, v8.4s -str q21, [x0, #672] -sqrdmulh v21.4S, v6.4S, v29.s[1] -mul v6.4S, v6.4S,v30.s[1] -add v13.4s, v13.4s, v8.4s -str q12, [x0, #736] -ldr q12, [x0, #48] -sqrdmulh v8.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v18.4s, v12.4s, v2.4s -add v12.4s, v12.4s, v2.4s -ldr q2, [x0, #112] -sqrdmulh v10.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v15.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -ldr q16, [x0, #176] -mla v17.4S, v11.4S, v31.s[0] -mla v6.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v1.4s -str q22, [x0, #800] -mla v20.4S, v8.4S, v31.s[0] -mla v13.4S, v10.4S, v31.s[0] -add v16.4s, v16.4s, v1.4s -str q14, [x0, #864] -ldr q14, [x0, #240] -sqrdmulh v1.4S, v7.4S, v29.s[2] -mul v7.4S, v7.4S,v30.s[2] -sub v10.4s, v14.4s, v19.4s -str q0, [x0, #928] -sqrdmulh v0.4S, v4.4S, v29.s[2] -mul v4.4S, v4.4S,v30.s[2] -add v14.4s, v14.4s, v19.4s -str q9, [x0, #992] -sqrdmulh v9.4S, v3.4S, v29.s[2] -mul v3.4S, v3.4S,v30.s[2] -sub v19.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -sqrdmulh v17.4S, v5.4S, v29.s[2] -mul v5.4S, v5.4S,v30.s[2] -sub v8.4s, v14.4s, v6.4s -add v14.4s, v14.4s, v6.4s -mla v7.4S, v1.4S, v31.s[0] -mla v4.4S, v0.4S, v31.s[0] -sub v0.4s, v12.4s, v20.4s -mla v3.4S, v9.4S, v31.s[0] -mla v5.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -sub v17.4s, v2.4s, v13.4s -sqrdmulh v9.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -add v2.4s, v2.4s, v13.4s -sqrdmulh v13.4S, v16.4S, v27.s[0] -mul v16.4S, v16.4S,v28.s[0] -sub v1.4s, v21.4s, v7.4s -add v21.4s, v21.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v6.4s, v10.4s, v4.4s -add v10.4s, v10.4s, v4.4s -mla v19.4S, v20.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -sub v9.4s, v18.4s, v3.4s -mla v16.4S, v13.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[2] -mul v21.4S, v21.4S,v28.s[2] -sub v7.4s, v15.4s, v5.4s -sqrdmulh v13.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -add v15.4s, v15.4s, v5.4s -sqrdmulh v5.4S, v1.4S, v27.s[3] -mul v1.4S, v1.4S,v28.s[3] -sub v20.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v27.s[3] -mul v6.4S, v6.4S,v28.s[3] -sub v4.4s, v17.4s, v8.4s -add v17.4s, v17.4s, v8.4s -mla v21.4S, v3.4S, v31.s[0] -mla v10.4S, v13.4S, v31.s[0] -sub v13.4s, v12.4s, v16.4s -mla v1.4S, v5.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v25.s[2] -mul v17.4S, v17.4S,v26.s[2] -sub v19.4s, v2.4s, v14.4s -sqrdmulh v5.4S, v4.4S, v25.s[3] -mul v4.4S, v4.4S,v26.s[3] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v3.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v15.4s, v10.4s -add v15.4s, v15.4s, v10.4s -mla v17.4S, v16.4S, v31.s[0] -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v9.4s, v1.4s -mla v19.4S, v14.4S, v31.s[0] -mla v2.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v23.s[0] -mul v15.4S, v15.4S,v24.s[0] -sub v21.4s, v7.4s, v6.4s -sqrdmulh v14.4S, v8.4S, v23.s[1] -mul v8.4S, v8.4S,v24.s[1] -add v7.4s, v7.4s, v6.4s -sqrdmulh v6.4S, v7.4S, v23.s[2] -mul v7.4S, v7.4S,v24.s[2] -sub v16.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v23.s[3] -mul v21.4S, v21.4S,v24.s[3] -sub v10.4s, v20.4s, v4.4s -add v20.4s, v20.4s, v4.4s -mla v15.4S, v1.4S, v31.s[0] -mla v8.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v19.4s -str q0, [x0, #304] -mla v7.4S, v6.4S, v31.s[0] -mla v21.4S, v17.4S, v31.s[0] -add v13.4s, v13.4s, v19.4s -str q16, [x0, #368] -ldr q16, [x0, #896] -sqrdmulh v19.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v17.4s, v12.4s, v2.4s -str q20, [x0, #432] -ldr q20, [x0, #960] -sqrdmulh v6.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v12.4s, v12.4s, v2.4s -str q10, [x0, #496] -ldr q10, [x0, #768] -sqrdmulh v2.4S, v10.4S, v29.s[0] -mul v10.4S, v10.4S,v30.s[0] -sub v0.4s, v18.4s, v15.4s -add v18.4s, v18.4s, v15.4s -ldr q15, [x0, #832] -sqrdmulh v1.4S, v15.4S, v29.s[0] -mul v15.4S, v15.4S,v30.s[0] -sub v4.4s, v3.4s, v8.4s -add v3.4s, v3.4s, v8.4s -mla v16.4S, v19.4S, v31.s[0] -mla v20.4S, v6.4S, v31.s[0] -sub v6.4s, v9.4s, v7.4s -str q13, [x0, #176] -mla v10.4S, v2.4S, v31.s[0] -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v7.4s -str q14, [x0, #240] -ldr q14, [x0, #512] -sqrdmulh v7.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v1.4s, v5.4s, v21.4s -str q12, [x0, #48] -ldr q12, [x0, #576] -sqrdmulh v2.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -add v5.4s, v5.4s, v21.4s -str q17, [x0, #112] -ldr q17, [x0, #640] -ldr q21, [x0, #384] -sqrdmulh v13.4S, v17.4S, v29.s[0] -mul v17.4S, v17.4S,v30.s[0] -sub v19.4s, v21.4s, v16.4s -add v21.4s, v21.4s, v16.4s -ldr q16, [x0, #704] -ldr q8, [x0, #448] -sqrdmulh v22.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v11.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -ldr q20, [x0, #256] -mla v14.4S, v7.4S, v31.s[0] -mla v12.4S, v2.4S, v31.s[0] -sub v2.4s, v20.4s, v10.4s -str q18, [x0, #560] -mla v17.4S, v13.4S, v31.s[0] -mla v16.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -str q0, [x0, #624] -ldr q0, [x0, #320] -sqrdmulh v10.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v22.4s, v0.4s, v15.4s -str q3, [x0, #688] -sqrdmulh v3.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -add v0.4s, v0.4s, v15.4s -str q4, [x0, #752] -ldr q4, [x0, #0] -sqrdmulh v15.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v13.4s, v4.4s, v14.4s -add v4.4s, v4.4s, v14.4s -ldr q14, [x0, #64] -sqrdmulh v18.4S, v0.4S, v29.s[1] -mul v0.4S, v0.4S,v30.s[1] -sub v7.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -ldr q12, [x0, #128] -mla v21.4S, v10.4S, v31.s[0] -mla v8.4S, v3.4S, v31.s[0] -sub v3.4s, v12.4s, v17.4s -str q9, [x0, #816] -mla v20.4S, v15.4S, v31.s[0] -mla v0.4S, v18.4S, v31.s[0] -add v12.4s, v12.4s, v17.4s -str q6, [x0, #880] -ldr q6, [x0, #192] -sqrdmulh v17.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v18.4s, v6.4s, v16.4s -str q5, [x0, #944] -sqrdmulh v5.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -add v6.4s, v6.4s, v16.4s -str q1, [x0, #1008] -sqrdmulh v1.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v16.4s, v12.4s, v21.4s -add v12.4s, v12.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v15.4s, v6.4s, v8.4s -add v6.4s, v6.4s, v8.4s -mla v19.4S, v17.4S, v31.s[0] -mla v11.4S, v5.4S, v31.s[0] -sub v5.4s, v4.4s, v20.4s -mla v2.4S, v1.4S, v31.s[0] -mla v22.4S, v21.4S, v31.s[0] -add v4.4s, v4.4s, v20.4s -sqrdmulh v20.4S, v16.4S, v27.s[1] -mul v16.4S, v16.4S,v28.s[1] -sub v21.4s, v14.4s, v0.4s -sqrdmulh v1.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v17.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v27.s[0] -mul v6.4S, v6.4S,v28.s[0] -sub v8.4s, v18.4s, v11.4s -add v18.4s, v18.4s, v11.4s -mla v16.4S, v20.4S, v31.s[0] -mla v15.4S, v1.4S, v31.s[0] -sub v1.4s, v13.4s, v2.4s -mla v12.4S, v0.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v3.4S, v27.s[2] -mul v3.4S, v3.4S,v28.s[2] -sub v19.4s, v7.4s, v22.4s -sqrdmulh v0.4S, v18.4S, v27.s[2] -mul v18.4S, v18.4S,v28.s[2] -add v7.4s, v7.4s, v22.4s -sqrdmulh v22.4S, v17.4S, v27.s[3] -mul v17.4S, v17.4S,v28.s[3] -sub v20.4s, v5.4s, v16.4s -add v5.4s, v5.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v11.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -mla v3.4S, v2.4S, v31.s[0] -mla v18.4S, v0.4S, v31.s[0] -sub v0.4s, v4.4s, v12.4s -mla v17.4S, v22.4S, v31.s[0] -mla v8.4S, v16.4S, v31.s[0] -add v4.4s, v4.4s, v12.4s -sqrdmulh v12.4S, v21.4S, v25.s[2] -mul v21.4S, v21.4S,v26.s[2] -sub v16.4s, v14.4s, v6.4s -sqrdmulh v22.4S, v11.4S, v25.s[3] -mul v11.4S, v11.4S,v26.s[3] -add v14.4s, v14.4s, v6.4s -sqrdmulh v6.4S, v16.4S, v25.s[1] -mul v16.4S, v16.4S,v26.s[1] -sub v2.4s, v13.4s, v3.4s -add v13.4s, v13.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v25.s[0] -mul v14.4S, v14.4S,v26.s[0] -sub v15.4s, v7.4s, v18.4s -add v7.4s, v7.4s, v18.4s -mla v21.4S, v12.4S, v31.s[0] -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v1.4s, v17.4s -mla v16.4S, v6.4S, v31.s[0] -mla v14.4S, v3.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -sqrdmulh v17.4S, v7.4S, v23.s[0] -mul v7.4S, v7.4S,v24.s[0] -sub v3.4s, v19.4s, v8.4s -sqrdmulh v6.4S, v15.4S, v23.s[1] -mul v15.4S, v15.4S,v24.s[1] -add v19.4s, v19.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v23.s[2] -mul v19.4S, v19.4S,v24.s[2] -sub v12.4s, v5.4s, v21.4s -add v5.4s, v5.4s, v21.4s -sqrdmulh v21.4S, v3.4S, v23.s[3] -mul v3.4S, v3.4S,v24.s[3] -sub v18.4s, v20.4s, v11.4s -add v20.4s, v20.4s, v11.4s -mla v7.4S, v17.4S, v31.s[0] -mla v15.4S, v6.4S, v31.s[0] -sub v6.4s, v0.4s, v16.4s -str q5, [x0, #256] -mla v19.4S, v8.4S, v31.s[0] -mla v3.4S, v21.4S, v31.s[0] -add v0.4s, v0.4s, v16.4s -str q12, [x0, #320] -ldr q12, [x0, #912] -sqrdmulh v16.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -sub v21.4s, v4.4s, v14.4s -str q20, [x0, #384] -ldr q20, [x0, #976] -sqrdmulh v8.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v4.4s, v4.4s, v14.4s -str q18, [x0, #448] -ldr q18, [x0, #784] -sqrdmulh v14.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -sub v5.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -ldr q7, [x0, #848] -sqrdmulh v17.4S, v7.4S, v29.s[0] -mul v7.4S, v7.4S,v30.s[0] -sub v11.4s, v2.4s, v15.4s -add v2.4s, v2.4s, v15.4s -mla v12.4S, v16.4S, v31.s[0] -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v1.4s, v19.4s -str q0, [x0, #128] -mla v18.4S, v14.4S, v31.s[0] -mla v7.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v19.4s -str q6, [x0, #192] -ldr q6, [x0, #528] -sqrdmulh v19.4S, v6.4S, v29.s[0] -mul v6.4S, v6.4S,v30.s[0] -sub v17.4s, v22.4s, v3.4s -str q4, [x0, #0] -ldr q4, [x0, #592] -sqrdmulh v14.4S, v4.4S, v29.s[0] -mul v4.4S, v4.4S,v30.s[0] -add v22.4s, v22.4s, v3.4s -str q21, [x0, #64] -ldr q21, [x0, #656] -ldr q3, [x0, #400] -sqrdmulh v0.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -sub v16.4s, v3.4s, v12.4s -add v3.4s, v3.4s, v12.4s -ldr q12, [x0, #720] -ldr q15, [x0, #464] -sqrdmulh v9.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -sub v10.4s, v15.4s, v20.4s -add v15.4s, v15.4s, v20.4s -ldr q20, [x0, #272] -mla v6.4S, v19.4S, v31.s[0] -mla v4.4S, v14.4S, v31.s[0] -sub v14.4s, v20.4s, v18.4s -str q13, [x0, #512] -mla v21.4S, v0.4S, v31.s[0] -mla v12.4S, v9.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -str q5, [x0, #576] -ldr q5, [x0, #336] -sqrdmulh v18.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v9.4s, v5.4s, v7.4s -str q2, [x0, #640] -sqrdmulh v2.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -add v5.4s, v5.4s, v7.4s -str q11, [x0, #704] -ldr q11, [x0, #16] -sqrdmulh v7.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v0.4s, v11.4s, v6.4s -add v11.4s, v11.4s, v6.4s -ldr q6, [x0, #80] -sqrdmulh v13.4S, v5.4S, v29.s[1] -mul v5.4S, v5.4S,v30.s[1] -sub v19.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -ldr q4, [x0, #144] -mla v3.4S, v18.4S, v31.s[0] -mla v15.4S, v2.4S, v31.s[0] -sub v2.4s, v4.4s, v21.4s -str q1, [x0, #768] -mla v20.4S, v7.4S, v31.s[0] -mla v5.4S, v13.4S, v31.s[0] -add v4.4s, v4.4s, v21.4s -str q8, [x0, #832] -ldr q8, [x0, #208] -sqrdmulh v21.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -sub v13.4s, v8.4s, v12.4s -str q22, [x0, #896] -sqrdmulh v22.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -add v8.4s, v8.4s, v12.4s -str q17, [x0, #960] -sqrdmulh v17.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v12.4s, v4.4s, v3.4s -add v4.4s, v4.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v7.4s, v8.4s, v15.4s -add v8.4s, v8.4s, v15.4s -mla v16.4S, v21.4S, v31.s[0] -mla v10.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v20.4s -mla v14.4S, v17.4S, v31.s[0] -mla v9.4S, v3.4S, v31.s[0] -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v12.4S, v27.s[1] -mul v12.4S, v12.4S,v28.s[1] -sub v3.4s, v6.4s, v5.4s -sqrdmulh v17.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v4.4S, v27.s[0] -mul v4.4S, v4.4S,v28.s[0] -sub v21.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v27.s[0] -mul v8.4S, v8.4S,v28.s[0] -sub v15.4s, v13.4s, v10.4s -add v13.4s, v13.4s, v10.4s -mla v12.4S, v20.4S, v31.s[0] -mla v7.4S, v17.4S, v31.s[0] -sub v17.4s, v0.4s, v14.4s -mla v4.4S, v5.4S, v31.s[0] -mla v8.4S, v16.4S, v31.s[0] -add v0.4s, v0.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v16.4s, v19.4s, v9.4s -sqrdmulh v5.4S, v13.4S, v27.s[2] -mul v13.4S, v13.4S,v28.s[2] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v10.4s, v3.4s, v7.4s -add v3.4s, v3.4s, v7.4s -mla v2.4S, v14.4S, v31.s[0] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v11.4s, v4.4s -mla v21.4S, v9.4S, v31.s[0] -mla v15.4S, v12.4S, v31.s[0] -add v11.4s, v11.4s, v4.4s -sqrdmulh v4.4S, v3.4S, v25.s[2] -mul v3.4S, v3.4S,v26.s[2] -sub v12.4s, v6.4s, v8.4s -sqrdmulh v9.4S, v10.4S, v25.s[3] -mul v10.4S, v10.4S,v26.s[3] -add v6.4s, v6.4s, v8.4s -sqrdmulh v8.4S, v12.4S, v25.s[1] -mul v12.4S, v12.4S,v26.s[1] -sub v14.4s, v0.4s, v2.4s -add v0.4s, v0.4s, v2.4s -sqrdmulh v2.4S, v6.4S, v25.s[0] -mul v6.4S, v6.4S,v26.s[0] -sub v7.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -mla v3.4S, v4.4S, v31.s[0] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v17.4s, v21.4s -mla v12.4S, v8.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v19.4S, v23.s[0] -mul v19.4S, v19.4S,v24.s[0] -sub v2.4s, v16.4s, v15.4s -sqrdmulh v8.4S, v7.4S, v23.s[1] -mul v7.4S, v7.4S,v24.s[1] -add v16.4s, v16.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v23.s[2] -mul v16.4S, v16.4S,v24.s[2] -sub v4.4s, v22.4s, v3.4s -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v2.4S, v23.s[3] -mul v2.4S, v2.4S,v24.s[3] -sub v13.4s, v20.4s, v10.4s -add v20.4s, v20.4s, v10.4s -mla v19.4S, v21.4S, v31.s[0] -mla v7.4S, v8.4S, v31.s[0] -sub v8.4s, v5.4s, v12.4s -str q22, [x0, #272] -mla v16.4S, v15.4S, v31.s[0] -mla v2.4S, v3.4S, v31.s[0] -add v5.4s, v5.4s, v12.4s -str q4, [x0, #336] -sub v23.4s, v11.4s, v6.4s -str q20, [x0, #400] -add v11.4s, v11.4s, v6.4s -str q13, [x0, #464] -sub v13.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sub v19.4s, v14.4s, v7.4s -add v14.4s, v14.4s, v7.4s -sub v7.4s, v17.4s, v16.4s -str q5, [x0, #144] -add v17.4s, v17.4s, v16.4s -str q8, [x0, #208] -sub v8.4s, v9.4s, v2.4s -str q11, [x0, #16] -add v9.4s, v9.4s, v2.4s -str q23, [x0, #80] -str q0, [x0, #528] -str q13, [x0, #592] -str q14, [x0, #656] -str q19, [x0, #720] -str q17, [x0, #784] -str q7, [x0, #848] -str q9, [x0, #912] -str q8, [x0, #976] -ldr q18, [x0, #224] -ldr q1, [x0, #160] -ldr q10, [x0, #32] -ldr q21, [x17, #+128] -ldr q22, [x17, #+144] -sqrdmulh v15.4S, v10.4S, v22.s[0] -mul v10.4S, v10.4S,v21.s[0] -ldr q3, [x0, #48] -sqrdmulh v12.4S, v3.4S, v22.s[0] -mul v3.4S, v3.4S,v21.s[0] -ldr q4, [x17, #+160] -ldr q30, [x17, #+176] -ldr q29, [x0, #96] -sqrdmulh v28.4S, v29.4S, v30.s[0] -mul v29.4S, v29.4S,v4.s[0] -ldr q27, [x0, #112] -sqrdmulh v26.4S, v27.4S, v30.s[0] -mul v27.4S, v27.4S,v4.s[0] -ldr q25, [x17, #+192] -ldr q24, [x17, #+208] -mla v10.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v1.4S, v24.s[0] -ldr q20, [x0, #176] -mla v3.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v20.4S, v24.s[0] -ldr q6, [x17, #+224] -ldr q5, [x17, #+240] -mla v29.4S, v28.4S, v31.s[0] -sqrdmulh v28.4S, v18.4S, v5.s[0] -ldr q16, [x0, #240] -mla v27.4S, v26.4S, v31.s[0] -sqrdmulh v26.4S, v16.4S, v5.s[0] -ldr q11, [x0, #128] -ldr q2, [x0, #0] -mul v1.4S, v1.4S,v25.s[0] -sub v23.4s, v2.4s, v10.4s -mul v20.4S, v20.4S,v25.s[0] -add v2.4s, v2.4s, v10.4s -ldr q10, [x0, #144] -ldr q0, [x0, #16] -mla v1.4S, v15.4S, v31.s[0] -sub v15.4s, v0.4s, v3.4s -mla v20.4S, v12.4S, v31.s[0] -add v0.4s, v0.4s, v3.4s -ldr q3, [x0, #192] -ldr q12, [x0, #64] -mul v18.4S, v18.4S,v6.s[0] -sub v13.4s, v12.4s, v29.4s -mul v16.4S, v16.4S,v6.s[0] -add v12.4s, v12.4s, v29.4s -ldr q29, [x0, #208] -ldr q14, [x0, #80] -mla v18.4S, v28.4S, v31.s[0] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v14.4s, v27.4s -sqrdmulh v28.4S, v0.4S, v22.s[1] -mul v0.4S, v0.4S,v21.s[1] -add v14.4s, v14.4s, v27.4s -sqrdmulh v27.4S, v15.4S, v22.s[2] -sub v19.4s, v11.4s, v1.4s -mul v15.4S, v15.4S,v21.s[2] -add v11.4s, v11.4s, v1.4s -sqrdmulh v22.4S, v14.4S, v30.s[1] -sub v21.4s, v10.4s, v20.4s -mul v14.4S, v14.4S,v4.s[1] -add v10.4s, v10.4s, v20.4s -sqrdmulh v20.4S, v26.4S, v30.s[2] -sub v1.4s, v3.4s, v18.4s -mul v26.4S, v26.4S,v4.s[2] -add v3.4s, v3.4s, v18.4s -mla v0.4S, v28.4S, v31.s[0] -sub v28.4s, v29.4s, v16.4s -ldr q30, [x0, #480] -sqrdmulh v4.4S, v10.4S, v24.s[1] -add v29.4s, v29.4s, v16.4s -mla v15.4S, v27.4S, v31.s[0] -ldr q27, [x0, #416] -sqrdmulh v16.4S, v21.4S, v24.s[2] -mla v14.4S, v22.4S, v31.s[0] -ldr q22, [x0, #288] -sqrdmulh v18.4S, v29.4S, v5.s[1] -mla v26.4S, v20.4S, v31.s[0] -ldr q20, [x17, #+256] -sqrdmulh v17.4S, v28.4S, v5.s[2] -ldr q7, [x17, #+272] -mul v10.4S, v10.4S,v25.s[1] -sub v9.4s, v2.4s, v0.4s -str q9, [x0, #16] -mul v21.4S, v21.4S,v25.s[2] -add v2.4s, v2.4s, v0.4s -str q2, [x0, #0] -mla v10.4S, v4.4S, v31.s[0] -sub v4.4s, v23.4s, v15.4s -str q4, [x0, #48] -mla v21.4S, v16.4S, v31.s[0] -add v23.4s, v23.4s, v15.4s -str q23, [x0, #32] -mul v29.4S, v29.4S,v6.s[1] -sub v24.4s, v12.4s, v14.4s -str q24, [x0, #80] -mul v28.4S, v28.4S,v6.s[2] -add v12.4s, v12.4s, v14.4s -str q12, [x0, #64] -mla v29.4S, v18.4S, v31.s[0] -sub v18.4s, v13.4s, v26.4s -str q18, [x0, #112] -mla v28.4S, v17.4S, v31.s[0] -add v13.4s, v13.4s, v26.4s -str q13, [x0, #96] -sqrdmulh v5.4S, v22.4S, v7.s[0] -sub v6.4s, v11.4s, v10.4s -mul v22.4S, v22.4S,v20.s[0] -str q6, [x0, #144] -ldr q6, [x0, #304] -sqrdmulh v13.4S, v6.4S, v7.s[0] -add v11.4s, v11.4s, v10.4s -mul v6.4S, v6.4S,v20.s[0] -str q11, [x0, #128] -ldr q11, [x17, #+288] -ldr q10, [x17, #+304] -ldr q26, [x0, #352] -sqrdmulh v17.4S, v26.4S, v10.s[0] -sub v18.4s, v19.4s, v21.4s -mul v26.4S, v26.4S,v11.s[0] -str q18, [x0, #176] -ldr q18, [x0, #368] -sqrdmulh v12.4S, v18.4S, v10.s[0] -add v19.4s, v19.4s, v21.4s -mul v18.4S, v18.4S,v11.s[0] -str q19, [x0, #160] -ldr q19, [x17, #+320] -ldr q21, [x17, #+336] -mla v22.4S, v5.4S, v31.s[0] -sub v5.4s, v3.4s, v29.4s -sqrdmulh v14.4S, v27.4S, v21.s[0] -str q5, [x0, #208] -ldr q5, [x0, #432] -mla v6.4S, v13.4S, v31.s[0] -add v3.4s, v3.4s, v29.4s -sqrdmulh v29.4S, v5.4S, v21.s[0] -str q3, [x0, #192] -ldr q3, [x17, #+352] -ldr q13, [x17, #+368] -mla v26.4S, v17.4S, v31.s[0] -sub v17.4s, v1.4s, v28.4s -sqrdmulh v24.4S, v30.4S, v13.s[0] -str q17, [x0, #240] -ldr q17, [x0, #496] -mla v18.4S, v12.4S, v31.s[0] -add v1.4s, v1.4s, v28.4s -sqrdmulh v28.4S, v17.4S, v13.s[0] -str q1, [x0, #224] -ldr q1, [x0, #384] -ldr q12, [x0, #256] -mul v27.4S, v27.4S,v19.s[0] -sub v25.4s, v12.4s, v22.4s -mul v5.4S, v5.4S,v19.s[0] -add v12.4s, v12.4s, v22.4s -ldr q22, [x0, #400] -ldr q23, [x0, #272] -mla v27.4S, v14.4S, v31.s[0] -sub v14.4s, v23.4s, v6.4s -mla v5.4S, v29.4S, v31.s[0] -add v23.4s, v23.4s, v6.4s -ldr q6, [x0, #448] -ldr q29, [x0, #320] -mul v30.4S, v30.4S,v3.s[0] -sub v15.4s, v29.4s, v26.4s -mul v17.4S, v17.4S,v3.s[0] -add v29.4s, v29.4s, v26.4s -ldr q26, [x0, #464] -ldr q16, [x0, #336] -mla v30.4S, v24.4S, v31.s[0] -mla v17.4S, v28.4S, v31.s[0] -sub v28.4s, v16.4s, v18.4s -sqrdmulh v24.4S, v23.4S, v7.s[1] -mul v23.4S, v23.4S,v20.s[1] -add v16.4s, v16.4s, v18.4s -sqrdmulh v18.4S, v14.4S, v7.s[2] -sub v4.4s, v1.4s, v27.4s -mul v14.4S, v14.4S,v20.s[2] -add v1.4s, v1.4s, v27.4s -sqrdmulh v7.4S, v16.4S, v10.s[1] -sub v20.4s, v22.4s, v5.4s -mul v16.4S, v16.4S,v11.s[1] -add v22.4s, v22.4s, v5.4s -sqrdmulh v5.4S, v28.4S, v10.s[2] -sub v27.4s, v6.4s, v30.4s -mul v28.4S, v28.4S,v11.s[2] -add v6.4s, v6.4s, v30.4s -mla v23.4S, v24.4S, v31.s[0] -sub v24.4s, v26.4s, v17.4s -ldr q10, [x0, #736] -sqrdmulh v11.4S, v22.4S, v21.s[1] -add v26.4s, v26.4s, v17.4s -mla v14.4S, v18.4S, v31.s[0] -ldr q18, [x0, #672] -sqrdmulh v17.4S, v20.4S, v21.s[2] -mla v16.4S, v7.4S, v31.s[0] -ldr q7, [x0, #544] -sqrdmulh v30.4S, v26.4S, v13.s[1] -mla v28.4S, v5.4S, v31.s[0] -ldr q5, [x17, #+384] -sqrdmulh v2.4S, v24.4S, v13.s[2] -ldr q0, [x17, #+400] -mul v22.4S, v22.4S,v19.s[1] -sub v9.4s, v12.4s, v23.4s -str q9, [x0, #272] -mul v20.4S, v20.4S,v19.s[2] -add v12.4s, v12.4s, v23.4s -str q12, [x0, #256] -mla v22.4S, v11.4S, v31.s[0] -sub v11.4s, v25.4s, v14.4s -str q11, [x0, #304] -mla v20.4S, v17.4S, v31.s[0] -add v25.4s, v25.4s, v14.4s -str q25, [x0, #288] -mul v26.4S, v26.4S,v3.s[1] -sub v21.4s, v29.4s, v16.4s -str q21, [x0, #336] -mul v24.4S, v24.4S,v3.s[2] -add v29.4s, v29.4s, v16.4s -str q29, [x0, #320] -mla v26.4S, v30.4S, v31.s[0] -sub v30.4s, v15.4s, v28.4s -str q30, [x0, #368] -mla v24.4S, v2.4S, v31.s[0] -add v15.4s, v15.4s, v28.4s -str q15, [x0, #352] -sqrdmulh v13.4S, v7.4S, v0.s[0] -sub v3.4s, v1.4s, v22.4s -mul v7.4S, v7.4S,v5.s[0] -str q3, [x0, #400] -ldr q3, [x0, #560] -sqrdmulh v15.4S, v3.4S, v0.s[0] -add v1.4s, v1.4s, v22.4s -mul v3.4S, v3.4S,v5.s[0] -str q1, [x0, #384] -ldr q1, [x17, #+416] -ldr q22, [x17, #+432] -ldr q28, [x0, #608] -sqrdmulh v2.4S, v28.4S, v22.s[0] -sub v30.4s, v4.4s, v20.4s -mul v28.4S, v28.4S,v1.s[0] -str q30, [x0, #432] -ldr q30, [x0, #624] -sqrdmulh v29.4S, v30.4S, v22.s[0] -add v4.4s, v4.4s, v20.4s -mul v30.4S, v30.4S,v1.s[0] -str q4, [x0, #416] -ldr q4, [x17, #+448] -ldr q20, [x17, #+464] -mla v7.4S, v13.4S, v31.s[0] -sub v13.4s, v6.4s, v26.4s -sqrdmulh v16.4S, v18.4S, v20.s[0] -str q13, [x0, #464] -ldr q13, [x0, #688] -mla v3.4S, v15.4S, v31.s[0] -add v6.4s, v6.4s, v26.4s -sqrdmulh v26.4S, v13.4S, v20.s[0] -str q6, [x0, #448] -ldr q6, [x17, #+480] -ldr q15, [x17, #+496] -mla v28.4S, v2.4S, v31.s[0] -sub v2.4s, v27.4s, v24.4s -sqrdmulh v21.4S, v10.4S, v15.s[0] -str q2, [x0, #496] -ldr q2, [x0, #752] -mla v30.4S, v29.4S, v31.s[0] -add v27.4s, v27.4s, v24.4s -sqrdmulh v24.4S, v2.4S, v15.s[0] -str q27, [x0, #480] -ldr q27, [x0, #640] -ldr q29, [x0, #512] -mul v18.4S, v18.4S,v4.s[0] -sub v19.4s, v29.4s, v7.4s -mul v13.4S, v13.4S,v4.s[0] -add v29.4s, v29.4s, v7.4s -ldr q7, [x0, #656] -ldr q25, [x0, #528] -mla v18.4S, v16.4S, v31.s[0] -sub v16.4s, v25.4s, v3.4s -mla v13.4S, v26.4S, v31.s[0] -add v25.4s, v25.4s, v3.4s -ldr q3, [x0, #704] -ldr q26, [x0, #576] -mul v10.4S, v10.4S,v6.s[0] -sub v14.4s, v26.4s, v28.4s -mul v2.4S, v2.4S,v6.s[0] -add v26.4s, v26.4s, v28.4s -ldr q28, [x0, #720] -ldr q17, [x0, #592] -mla v10.4S, v21.4S, v31.s[0] -mla v2.4S, v24.4S, v31.s[0] -sub v24.4s, v17.4s, v30.4s -sqrdmulh v21.4S, v25.4S, v0.s[1] -mul v25.4S, v25.4S,v5.s[1] -add v17.4s, v17.4s, v30.4s -sqrdmulh v30.4S, v16.4S, v0.s[2] -sub v11.4s, v27.4s, v18.4s -mul v16.4S, v16.4S,v5.s[2] -add v27.4s, v27.4s, v18.4s -sqrdmulh v0.4S, v17.4S, v22.s[1] -sub v5.4s, v7.4s, v13.4s -mul v17.4S, v17.4S,v1.s[1] -add v7.4s, v7.4s, v13.4s -sqrdmulh v13.4S, v24.4S, v22.s[2] -sub v18.4s, v3.4s, v10.4s -mul v24.4S, v24.4S,v1.s[2] -add v3.4s, v3.4s, v10.4s -mla v25.4S, v21.4S, v31.s[0] -sub v21.4s, v28.4s, v2.4s -ldr q22, [x0, #992] -sqrdmulh v1.4S, v7.4S, v20.s[1] -add v28.4s, v28.4s, v2.4s -mla v16.4S, v30.4S, v31.s[0] -ldr q30, [x0, #928] -sqrdmulh v2.4S, v5.4S, v20.s[2] -mla v17.4S, v0.4S, v31.s[0] -ldr q0, [x0, #800] -sqrdmulh v10.4S, v28.4S, v15.s[1] -mla v24.4S, v13.4S, v31.s[0] -ldr q13, [x17, #+512] -sqrdmulh v12.4S, v21.4S, v15.s[2] -ldr q23, [x17, #+528] -mul v7.4S, v7.4S,v4.s[1] -sub v9.4s, v29.4s, v25.4s -str q9, [x0, #528] -mul v5.4S, v5.4S,v4.s[2] -add v29.4s, v29.4s, v25.4s -str q29, [x0, #512] -mla v7.4S, v1.4S, v31.s[0] -sub v1.4s, v19.4s, v16.4s -str q1, [x0, #560] -mla v5.4S, v2.4S, v31.s[0] -add v19.4s, v19.4s, v16.4s -str q19, [x0, #544] -mul v28.4S, v28.4S,v6.s[1] -sub v20.4s, v26.4s, v17.4s -str q20, [x0, #592] -mul v21.4S, v21.4S,v6.s[2] -add v26.4s, v26.4s, v17.4s -str q26, [x0, #576] -mla v28.4S, v10.4S, v31.s[0] -sub v10.4s, v14.4s, v24.4s -str q10, [x0, #624] -mla v21.4S, v12.4S, v31.s[0] -add v14.4s, v14.4s, v24.4s -str q14, [x0, #608] -sqrdmulh v15.4S, v0.4S, v23.s[0] -sub v6.4s, v27.4s, v7.4s -mul v0.4S, v0.4S,v13.s[0] -str q6, [x0, #656] -ldr q6, [x0, #816] -sqrdmulh v14.4S, v6.4S, v23.s[0] -add v27.4s, v27.4s, v7.4s -mul v6.4S, v6.4S,v13.s[0] -str q27, [x0, #640] -ldr q27, [x17, #+544] -ldr q7, [x17, #+560] -ldr q24, [x0, #864] -sqrdmulh v12.4S, v24.4S, v7.s[0] -sub v10.4s, v11.4s, v5.4s -mul v24.4S, v24.4S,v27.s[0] -str q10, [x0, #688] -ldr q10, [x0, #880] -sqrdmulh v26.4S, v10.4S, v7.s[0] -add v11.4s, v11.4s, v5.4s -mul v10.4S, v10.4S,v27.s[0] -str q11, [x0, #672] -ldr q11, [x17, #+576] -ldr q5, [x17, #+592] -mla v0.4S, v15.4S, v31.s[0] -sub v15.4s, v3.4s, v28.4s -sqrdmulh v17.4S, v30.4S, v5.s[0] -str q15, [x0, #720] -ldr q15, [x0, #944] -mla v6.4S, v14.4S, v31.s[0] -add v3.4s, v3.4s, v28.4s -sqrdmulh v28.4S, v15.4S, v5.s[0] -str q3, [x0, #704] -ldr q3, [x17, #+608] -ldr q14, [x17, #+624] -mla v24.4S, v12.4S, v31.s[0] -sub v12.4s, v18.4s, v21.4s -sqrdmulh v20.4S, v22.4S, v14.s[0] -str q12, [x0, #752] -ldr q12, [x0, #1008] -mla v10.4S, v26.4S, v31.s[0] -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v12.4S, v14.s[0] -str q18, [x0, #736] -ldr q18, [x0, #896] -ldr q26, [x0, #768] -mul v30.4S, v30.4S,v11.s[0] -sub v4.4s, v26.4s, v0.4s -mul v15.4S, v15.4S,v11.s[0] -add v26.4s, v26.4s, v0.4s -ldr q0, [x0, #912] -ldr q19, [x0, #784] -mla v30.4S, v17.4S, v31.s[0] -sub v17.4s, v19.4s, v6.4s -mla v15.4S, v28.4S, v31.s[0] -add v19.4s, v19.4s, v6.4s -ldr q6, [x0, #960] -ldr q28, [x0, #832] -mul v22.4S, v22.4S,v3.s[0] -sub v16.4s, v28.4s, v24.4s -mul v12.4S, v12.4S,v3.s[0] -add v28.4s, v28.4s, v24.4s -ldr q24, [x0, #976] -ldr q2, [x0, #848] -mla v22.4S, v20.4S, v31.s[0] -mla v12.4S, v21.4S, v31.s[0] -sub v21.4s, v2.4s, v10.4s -sqrdmulh v20.4S, v19.4S, v23.s[1] -mul v19.4S, v19.4S,v13.s[1] -add v2.4s, v2.4s, v10.4s -sqrdmulh v10.4S, v17.4S, v23.s[2] -sub v1.4s, v18.4s, v30.4s -mul v17.4S, v17.4S,v13.s[2] -add v18.4s, v18.4s, v30.4s -sqrdmulh v23.4S, v2.4S, v7.s[1] -sub v13.4s, v0.4s, v15.4s -mul v2.4S, v2.4S,v27.s[1] -add v0.4s, v0.4s, v15.4s -sqrdmulh v15.4S, v21.4S, v7.s[2] -sub v30.4s, v6.4s, v22.4s -mul v21.4S, v21.4S,v27.s[2] -add v6.4s, v6.4s, v22.4s -mla v19.4S, v20.4S, v31.s[0] -sub v20.4s, v24.4s, v12.4s -sqrdmulh v7.4S, v0.4S, v5.s[1] -add v24.4s, v24.4s, v12.4s -mla v17.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v13.4S, v5.s[2] -mla v2.4S, v23.4S, v31.s[0] -sqrdmulh v23.4S, v24.4S, v14.s[1] -mla v21.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v20.4S, v14.s[2] -mul v0.4S, v0.4S,v11.s[1] -sub v12.4s, v26.4s, v19.4s -str q12, [x0, #784] -mul v13.4S, v13.4S,v11.s[2] -add v26.4s, v26.4s, v19.4s -str q26, [x0, #768] -mla v0.4S, v7.4S, v31.s[0] -sub v7.4s, v4.4s, v17.4s -str q7, [x0, #816] -mla v13.4S, v10.4S, v31.s[0] -add v4.4s, v4.4s, v17.4s -str q4, [x0, #800] -mul v24.4S, v24.4S,v3.s[1] -sub v5.4s, v28.4s, v2.4s -str q5, [x0, #848] -mul v20.4S, v20.4S,v3.s[2] -add v28.4s, v28.4s, v2.4s -str q28, [x0, #832] -mla v24.4S, v23.4S, v31.s[0] -sub v23.4s, v16.4s, v21.4s -str q23, [x0, #880] -mla v20.4S, v15.4S, v31.s[0] -add v16.4s, v16.4s, v21.4s -str q16, [x0, #864] -sub v14.4s, v18.4s, v0.4s -str q14, [x0, #912] -add v18.4s, v18.4s, v0.4s -str q18, [x0, #896] -sub v18.4s, v1.4s, v13.4s -str q18, [x0, #944] -add v1.4s, v1.4s, v13.4s -str q1, [x0, #928] -sub v1.4s, v6.4s, v24.4s -str q1, [x0, #976] -add v6.4s, v6.4s, v24.4s -str q6, [x0, #960] -sub v6.4s, v30.4s, v20.4s -str q6, [x0, #1008] -add v30.4s, v30.4s, v20.4s -str q30, [x0, #992] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1464 -// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_6.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_6.s deleted file mode 100644 index d7c7c98..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_6.s +++ /dev/null @@ -1,1494 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_7_z4_6 -.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_6 -ntt_u32_incomplete_neon_asm_var_4_2_7_z4_6: -_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_6: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #928] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #992] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q18, [x0, #800] -sqrdmulh v17.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -ldr q16, [x0, #864] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -mla v22.4S, v21.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -mla v18.4S, v17.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -ldr q3, [x0, #544] -sqrdmulh v17.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -ldr q19, [x0, #608] -sqrdmulh v21.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q2, [x0, #672] -ldr q1, [x0, #416] -sqrdmulh v0.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v15.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -ldr q22, [x0, #736] -ldr q14, [x0, #480] -sqrdmulh v13.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v12.4s, v14.4s, v20.4s -add v14.4s, v14.4s, v20.4s -ldr q20, [x0, #288] -mla v3.4S, v17.4S, v31.s[0] -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v20.4s, v18.4s -mla v2.4S, v0.4S, v31.s[0] -mla v22.4S, v13.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -ldr q18, [x0, #352] -sqrdmulh v13.4S, v1.4S, v29.s[1] -mul v1.4S, v1.4S,v30.s[1] -sub v0.4s, v18.4s, v16.4s -sqrdmulh v17.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -add v18.4s, v18.4s, v16.4s -ldr q16, [x0, #32] -sqrdmulh v11.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v10.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -ldr q3, [x0, #96] -sqrdmulh v9.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v8.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -ldr q19, [x0, #160] -mla v1.4S, v13.4S, v31.s[0] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v19.4s, v2.4s -mla v20.4S, v11.4S, v31.s[0] -mla v18.4S, v9.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -ldr q2, [x0, #224] -sqrdmulh v9.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v11.4s, v2.4s, v22.4s -sqrdmulh v13.4S, v12.4S, v29.s[2] -mul v12.4S, v12.4S,v30.s[2] -add v2.4s, v2.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v7.4s, v19.4s, v1.4s -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v29.s[2] -mul v0.4S, v0.4S,v30.s[2] -sub v6.4s, v2.4s, v14.4s -add v2.4s, v2.4s, v14.4s -mla v15.4S, v9.4S, v31.s[0] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v16.4s, v20.4s -mla v21.4S, v22.4S, v31.s[0] -mla v0.4S, v1.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -sub v1.4s, v3.4s, v18.4s -sqrdmulh v22.4S, v6.4S, v27.s[1] -mul v6.4S, v6.4S,v28.s[1] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v19.4S, v27.s[0] -mul v19.4S, v19.4S,v28.s[0] -sub v9.4s, v17.4s, v15.4s -add v17.4s, v17.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v14.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -mla v7.4S, v20.4S, v31.s[0] -mla v6.4S, v22.4S, v31.s[0] -sub v22.4s, v10.4s, v21.4s -mla v19.4S, v18.4S, v31.s[0] -mla v2.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v15.4s, v8.4s, v0.4s -sqrdmulh v18.4S, v11.4S, v27.s[2] -mul v11.4S, v11.4S,v28.s[2] -add v8.4s, v8.4s, v0.4s -sqrdmulh v0.4S, v9.4S, v27.s[3] -mul v9.4S, v9.4S,v28.s[3] -sub v20.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[3] -mul v14.4S, v14.4S,v28.s[3] -sub v12.4s, v1.4s, v6.4s -add v1.4s, v1.4s, v6.4s -mla v17.4S, v21.4S, v31.s[0] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v16.4s, v19.4s -mla v9.4S, v0.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v16.4s, v16.4s, v19.4s -sqrdmulh v19.4S, v1.4S, v25.s[2] -mul v1.4S, v1.4S,v26.s[2] -sub v7.4s, v3.4s, v2.4s -sqrdmulh v0.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v7.4S, v25.s[1] -mul v7.4S, v7.4S,v26.s[1] -sub v21.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -sqrdmulh v17.4S, v3.4S, v25.s[0] -mul v3.4S, v3.4S,v26.s[0] -sub v6.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -mla v1.4S, v19.4S, v31.s[0] -mla v12.4S, v0.4S, v31.s[0] -sub v0.4s, v22.4s, v9.4s -mla v7.4S, v2.4S, v31.s[0] -mla v3.4S, v17.4S, v31.s[0] -add v22.4s, v22.4s, v9.4s -sqrdmulh v9.4S, v8.4S, v23.s[0] -mul v8.4S, v8.4S,v24.s[0] -sub v17.4s, v15.4s, v14.4s -sqrdmulh v2.4S, v6.4S, v23.s[1] -mul v6.4S, v6.4S,v24.s[1] -add v15.4s, v15.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v23.s[2] -mul v15.4S, v15.4S,v24.s[2] -sub v19.4s, v13.4s, v1.4s -add v13.4s, v13.4s, v1.4s -sqrdmulh v1.4S, v17.4S, v23.s[3] -mul v17.4S, v17.4S,v24.s[3] -sub v11.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -mla v8.4S, v9.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -sub v2.4s, v18.4s, v7.4s -str q13, [x0, #288] -mla v15.4S, v14.4S, v31.s[0] -mla v17.4S, v1.4S, v31.s[0] -add v18.4s, v18.4s, v7.4s -str q19, [x0, #352] -ldr q19, [x0, #944] -sqrdmulh v7.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v1.4s, v16.4s, v3.4s -str q20, [x0, #416] -ldr q20, [x0, #1008] -sqrdmulh v14.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v16.4s, v16.4s, v3.4s -str q11, [x0, #480] -ldr q11, [x0, #816] -sqrdmulh v3.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -sub v13.4s, v10.4s, v8.4s -add v10.4s, v10.4s, v8.4s -ldr q8, [x0, #880] -sqrdmulh v9.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v12.4s, v21.4s, v6.4s -add v21.4s, v21.4s, v6.4s -mla v19.4S, v7.4S, v31.s[0] -mla v20.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v15.4s -str q18, [x0, #160] -mla v11.4S, v3.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -str q2, [x0, #224] -ldr q2, [x0, #560] -sqrdmulh v15.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v9.4s, v0.4s, v17.4s -str q16, [x0, #32] -ldr q16, [x0, #624] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -add v0.4s, v0.4s, v17.4s -str q1, [x0, #96] -ldr q1, [x0, #688] -ldr q17, [x0, #432] -sqrdmulh v18.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -sub v7.4s, v17.4s, v19.4s -add v17.4s, v17.4s, v19.4s -ldr q19, [x0, #752] -ldr q6, [x0, #496] -sqrdmulh v5.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v4.4s, v6.4s, v20.4s -add v6.4s, v6.4s, v20.4s -ldr q20, [x0, #304] -mla v2.4S, v15.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -sub v3.4s, v20.4s, v11.4s -str q10, [x0, #544] -mla v1.4S, v18.4S, v31.s[0] -mla v19.4S, v5.4S, v31.s[0] -add v20.4s, v20.4s, v11.4s -str q13, [x0, #608] -ldr q13, [x0, #368] -sqrdmulh v11.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v5.4s, v13.4s, v8.4s -str q21, [x0, #672] -sqrdmulh v21.4S, v6.4S, v29.s[1] -mul v6.4S, v6.4S,v30.s[1] -add v13.4s, v13.4s, v8.4s -str q12, [x0, #736] -ldr q12, [x0, #48] -sqrdmulh v8.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v18.4s, v12.4s, v2.4s -add v12.4s, v12.4s, v2.4s -ldr q2, [x0, #112] -sqrdmulh v10.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v15.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -ldr q16, [x0, #176] -mla v17.4S, v11.4S, v31.s[0] -mla v6.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v1.4s -str q22, [x0, #800] -mla v20.4S, v8.4S, v31.s[0] -mla v13.4S, v10.4S, v31.s[0] -add v16.4s, v16.4s, v1.4s -str q14, [x0, #864] -ldr q14, [x0, #240] -sqrdmulh v1.4S, v7.4S, v29.s[2] -mul v7.4S, v7.4S,v30.s[2] -sub v10.4s, v14.4s, v19.4s -str q0, [x0, #928] -sqrdmulh v0.4S, v4.4S, v29.s[2] -mul v4.4S, v4.4S,v30.s[2] -add v14.4s, v14.4s, v19.4s -str q9, [x0, #992] -sqrdmulh v9.4S, v3.4S, v29.s[2] -mul v3.4S, v3.4S,v30.s[2] -sub v19.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -sqrdmulh v17.4S, v5.4S, v29.s[2] -mul v5.4S, v5.4S,v30.s[2] -sub v8.4s, v14.4s, v6.4s -add v14.4s, v14.4s, v6.4s -mla v7.4S, v1.4S, v31.s[0] -mla v4.4S, v0.4S, v31.s[0] -sub v0.4s, v12.4s, v20.4s -mla v3.4S, v9.4S, v31.s[0] -mla v5.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -sub v17.4s, v2.4s, v13.4s -sqrdmulh v9.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -add v2.4s, v2.4s, v13.4s -sqrdmulh v13.4S, v16.4S, v27.s[0] -mul v16.4S, v16.4S,v28.s[0] -sub v1.4s, v21.4s, v7.4s -add v21.4s, v21.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v6.4s, v10.4s, v4.4s -add v10.4s, v10.4s, v4.4s -mla v19.4S, v20.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -sub v9.4s, v18.4s, v3.4s -mla v16.4S, v13.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[2] -mul v21.4S, v21.4S,v28.s[2] -sub v7.4s, v15.4s, v5.4s -sqrdmulh v13.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -add v15.4s, v15.4s, v5.4s -sqrdmulh v5.4S, v1.4S, v27.s[3] -mul v1.4S, v1.4S,v28.s[3] -sub v20.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v27.s[3] -mul v6.4S, v6.4S,v28.s[3] -sub v4.4s, v17.4s, v8.4s -add v17.4s, v17.4s, v8.4s -mla v21.4S, v3.4S, v31.s[0] -mla v10.4S, v13.4S, v31.s[0] -sub v13.4s, v12.4s, v16.4s -mla v1.4S, v5.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v25.s[2] -mul v17.4S, v17.4S,v26.s[2] -sub v19.4s, v2.4s, v14.4s -sqrdmulh v5.4S, v4.4S, v25.s[3] -mul v4.4S, v4.4S,v26.s[3] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v3.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v15.4s, v10.4s -add v15.4s, v15.4s, v10.4s -mla v17.4S, v16.4S, v31.s[0] -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v9.4s, v1.4s -mla v19.4S, v14.4S, v31.s[0] -mla v2.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v23.s[0] -mul v15.4S, v15.4S,v24.s[0] -sub v21.4s, v7.4s, v6.4s -sqrdmulh v14.4S, v8.4S, v23.s[1] -mul v8.4S, v8.4S,v24.s[1] -add v7.4s, v7.4s, v6.4s -sqrdmulh v6.4S, v7.4S, v23.s[2] -mul v7.4S, v7.4S,v24.s[2] -sub v16.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v23.s[3] -mul v21.4S, v21.4S,v24.s[3] -sub v10.4s, v20.4s, v4.4s -add v20.4s, v20.4s, v4.4s -mla v15.4S, v1.4S, v31.s[0] -mla v8.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v19.4s -str q0, [x0, #304] -mla v7.4S, v6.4S, v31.s[0] -mla v21.4S, v17.4S, v31.s[0] -add v13.4s, v13.4s, v19.4s -str q16, [x0, #368] -ldr q16, [x0, #896] -sqrdmulh v19.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v17.4s, v12.4s, v2.4s -str q20, [x0, #432] -ldr q20, [x0, #960] -sqrdmulh v6.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v12.4s, v12.4s, v2.4s -str q10, [x0, #496] -ldr q10, [x0, #768] -sqrdmulh v2.4S, v10.4S, v29.s[0] -mul v10.4S, v10.4S,v30.s[0] -sub v0.4s, v18.4s, v15.4s -add v18.4s, v18.4s, v15.4s -ldr q15, [x0, #832] -sqrdmulh v1.4S, v15.4S, v29.s[0] -mul v15.4S, v15.4S,v30.s[0] -sub v4.4s, v3.4s, v8.4s -add v3.4s, v3.4s, v8.4s -mla v16.4S, v19.4S, v31.s[0] -mla v20.4S, v6.4S, v31.s[0] -sub v6.4s, v9.4s, v7.4s -str q13, [x0, #176] -mla v10.4S, v2.4S, v31.s[0] -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v7.4s -str q14, [x0, #240] -ldr q14, [x0, #512] -sqrdmulh v7.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v1.4s, v5.4s, v21.4s -str q12, [x0, #48] -ldr q12, [x0, #576] -sqrdmulh v2.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -add v5.4s, v5.4s, v21.4s -str q17, [x0, #112] -ldr q17, [x0, #640] -ldr q21, [x0, #384] -sqrdmulh v13.4S, v17.4S, v29.s[0] -mul v17.4S, v17.4S,v30.s[0] -sub v19.4s, v21.4s, v16.4s -add v21.4s, v21.4s, v16.4s -ldr q16, [x0, #704] -ldr q8, [x0, #448] -sqrdmulh v22.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v11.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -ldr q20, [x0, #256] -mla v14.4S, v7.4S, v31.s[0] -mla v12.4S, v2.4S, v31.s[0] -sub v2.4s, v20.4s, v10.4s -str q18, [x0, #560] -mla v17.4S, v13.4S, v31.s[0] -mla v16.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -str q0, [x0, #624] -ldr q0, [x0, #320] -sqrdmulh v10.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v22.4s, v0.4s, v15.4s -str q3, [x0, #688] -sqrdmulh v3.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -add v0.4s, v0.4s, v15.4s -str q4, [x0, #752] -ldr q4, [x0, #0] -sqrdmulh v15.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v13.4s, v4.4s, v14.4s -add v4.4s, v4.4s, v14.4s -ldr q14, [x0, #64] -sqrdmulh v18.4S, v0.4S, v29.s[1] -mul v0.4S, v0.4S,v30.s[1] -sub v7.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -ldr q12, [x0, #128] -mla v21.4S, v10.4S, v31.s[0] -mla v8.4S, v3.4S, v31.s[0] -sub v3.4s, v12.4s, v17.4s -str q9, [x0, #816] -mla v20.4S, v15.4S, v31.s[0] -mla v0.4S, v18.4S, v31.s[0] -add v12.4s, v12.4s, v17.4s -str q6, [x0, #880] -ldr q6, [x0, #192] -sqrdmulh v17.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v18.4s, v6.4s, v16.4s -str q5, [x0, #944] -sqrdmulh v5.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -add v6.4s, v6.4s, v16.4s -str q1, [x0, #1008] -sqrdmulh v1.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v16.4s, v12.4s, v21.4s -add v12.4s, v12.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v15.4s, v6.4s, v8.4s -add v6.4s, v6.4s, v8.4s -mla v19.4S, v17.4S, v31.s[0] -mla v11.4S, v5.4S, v31.s[0] -sub v5.4s, v4.4s, v20.4s -mla v2.4S, v1.4S, v31.s[0] -mla v22.4S, v21.4S, v31.s[0] -add v4.4s, v4.4s, v20.4s -sqrdmulh v20.4S, v16.4S, v27.s[1] -mul v16.4S, v16.4S,v28.s[1] -sub v21.4s, v14.4s, v0.4s -sqrdmulh v1.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v17.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v27.s[0] -mul v6.4S, v6.4S,v28.s[0] -sub v8.4s, v18.4s, v11.4s -add v18.4s, v18.4s, v11.4s -mla v16.4S, v20.4S, v31.s[0] -mla v15.4S, v1.4S, v31.s[0] -sub v1.4s, v13.4s, v2.4s -mla v12.4S, v0.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v3.4S, v27.s[2] -mul v3.4S, v3.4S,v28.s[2] -sub v19.4s, v7.4s, v22.4s -sqrdmulh v0.4S, v18.4S, v27.s[2] -mul v18.4S, v18.4S,v28.s[2] -add v7.4s, v7.4s, v22.4s -sqrdmulh v22.4S, v17.4S, v27.s[3] -mul v17.4S, v17.4S,v28.s[3] -sub v20.4s, v5.4s, v16.4s -add v5.4s, v5.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v11.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -mla v3.4S, v2.4S, v31.s[0] -mla v18.4S, v0.4S, v31.s[0] -sub v0.4s, v4.4s, v12.4s -mla v17.4S, v22.4S, v31.s[0] -mla v8.4S, v16.4S, v31.s[0] -add v4.4s, v4.4s, v12.4s -sqrdmulh v12.4S, v21.4S, v25.s[2] -mul v21.4S, v21.4S,v26.s[2] -sub v16.4s, v14.4s, v6.4s -sqrdmulh v22.4S, v11.4S, v25.s[3] -mul v11.4S, v11.4S,v26.s[3] -add v14.4s, v14.4s, v6.4s -sqrdmulh v6.4S, v16.4S, v25.s[1] -mul v16.4S, v16.4S,v26.s[1] -sub v2.4s, v13.4s, v3.4s -add v13.4s, v13.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v25.s[0] -mul v14.4S, v14.4S,v26.s[0] -sub v15.4s, v7.4s, v18.4s -add v7.4s, v7.4s, v18.4s -mla v21.4S, v12.4S, v31.s[0] -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v1.4s, v17.4s -mla v16.4S, v6.4S, v31.s[0] -mla v14.4S, v3.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -sqrdmulh v17.4S, v7.4S, v23.s[0] -mul v7.4S, v7.4S,v24.s[0] -sub v3.4s, v19.4s, v8.4s -sqrdmulh v6.4S, v15.4S, v23.s[1] -mul v15.4S, v15.4S,v24.s[1] -add v19.4s, v19.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v23.s[2] -mul v19.4S, v19.4S,v24.s[2] -sub v12.4s, v5.4s, v21.4s -add v5.4s, v5.4s, v21.4s -sqrdmulh v21.4S, v3.4S, v23.s[3] -mul v3.4S, v3.4S,v24.s[3] -sub v18.4s, v20.4s, v11.4s -add v20.4s, v20.4s, v11.4s -mla v7.4S, v17.4S, v31.s[0] -mla v15.4S, v6.4S, v31.s[0] -sub v6.4s, v0.4s, v16.4s -str q5, [x0, #256] -mla v19.4S, v8.4S, v31.s[0] -mla v3.4S, v21.4S, v31.s[0] -add v0.4s, v0.4s, v16.4s -str q12, [x0, #320] -ldr q12, [x0, #912] -sqrdmulh v16.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -sub v21.4s, v4.4s, v14.4s -str q20, [x0, #384] -ldr q20, [x0, #976] -sqrdmulh v8.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v4.4s, v4.4s, v14.4s -str q18, [x0, #448] -ldr q18, [x0, #784] -sqrdmulh v14.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -sub v5.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -ldr q7, [x0, #848] -sqrdmulh v17.4S, v7.4S, v29.s[0] -mul v7.4S, v7.4S,v30.s[0] -sub v11.4s, v2.4s, v15.4s -add v2.4s, v2.4s, v15.4s -mla v12.4S, v16.4S, v31.s[0] -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v1.4s, v19.4s -str q0, [x0, #128] -mla v18.4S, v14.4S, v31.s[0] -mla v7.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v19.4s -str q6, [x0, #192] -ldr q6, [x0, #528] -sqrdmulh v19.4S, v6.4S, v29.s[0] -mul v6.4S, v6.4S,v30.s[0] -sub v17.4s, v22.4s, v3.4s -str q4, [x0, #0] -ldr q4, [x0, #592] -sqrdmulh v14.4S, v4.4S, v29.s[0] -mul v4.4S, v4.4S,v30.s[0] -add v22.4s, v22.4s, v3.4s -str q21, [x0, #64] -ldr q21, [x0, #656] -ldr q3, [x0, #400] -sqrdmulh v0.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -sub v16.4s, v3.4s, v12.4s -add v3.4s, v3.4s, v12.4s -ldr q12, [x0, #720] -ldr q15, [x0, #464] -sqrdmulh v9.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -sub v10.4s, v15.4s, v20.4s -add v15.4s, v15.4s, v20.4s -ldr q20, [x0, #272] -mla v6.4S, v19.4S, v31.s[0] -mla v4.4S, v14.4S, v31.s[0] -sub v14.4s, v20.4s, v18.4s -str q13, [x0, #512] -mla v21.4S, v0.4S, v31.s[0] -mla v12.4S, v9.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -str q5, [x0, #576] -ldr q5, [x0, #336] -sqrdmulh v18.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v9.4s, v5.4s, v7.4s -str q2, [x0, #640] -sqrdmulh v2.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -add v5.4s, v5.4s, v7.4s -str q11, [x0, #704] -ldr q11, [x0, #16] -sqrdmulh v7.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v0.4s, v11.4s, v6.4s -add v11.4s, v11.4s, v6.4s -ldr q6, [x0, #80] -sqrdmulh v13.4S, v5.4S, v29.s[1] -mul v5.4S, v5.4S,v30.s[1] -sub v19.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -ldr q4, [x0, #144] -mla v3.4S, v18.4S, v31.s[0] -mla v15.4S, v2.4S, v31.s[0] -sub v2.4s, v4.4s, v21.4s -str q1, [x0, #768] -mla v20.4S, v7.4S, v31.s[0] -mla v5.4S, v13.4S, v31.s[0] -add v4.4s, v4.4s, v21.4s -str q8, [x0, #832] -ldr q8, [x0, #208] -sqrdmulh v21.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -sub v13.4s, v8.4s, v12.4s -str q22, [x0, #896] -sqrdmulh v22.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -add v8.4s, v8.4s, v12.4s -str q17, [x0, #960] -sqrdmulh v17.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v12.4s, v4.4s, v3.4s -add v4.4s, v4.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v7.4s, v8.4s, v15.4s -add v8.4s, v8.4s, v15.4s -mla v16.4S, v21.4S, v31.s[0] -mla v10.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v20.4s -mla v14.4S, v17.4S, v31.s[0] -mla v9.4S, v3.4S, v31.s[0] -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v12.4S, v27.s[1] -mul v12.4S, v12.4S,v28.s[1] -sub v3.4s, v6.4s, v5.4s -sqrdmulh v17.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v4.4S, v27.s[0] -mul v4.4S, v4.4S,v28.s[0] -sub v21.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v27.s[0] -mul v8.4S, v8.4S,v28.s[0] -sub v15.4s, v13.4s, v10.4s -add v13.4s, v13.4s, v10.4s -mla v12.4S, v20.4S, v31.s[0] -mla v7.4S, v17.4S, v31.s[0] -sub v17.4s, v0.4s, v14.4s -mla v4.4S, v5.4S, v31.s[0] -mla v8.4S, v16.4S, v31.s[0] -add v0.4s, v0.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v16.4s, v19.4s, v9.4s -sqrdmulh v5.4S, v13.4S, v27.s[2] -mul v13.4S, v13.4S,v28.s[2] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v10.4s, v3.4s, v7.4s -add v3.4s, v3.4s, v7.4s -mla v2.4S, v14.4S, v31.s[0] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v11.4s, v4.4s -mla v21.4S, v9.4S, v31.s[0] -mla v15.4S, v12.4S, v31.s[0] -add v11.4s, v11.4s, v4.4s -sqrdmulh v4.4S, v3.4S, v25.s[2] -mul v3.4S, v3.4S,v26.s[2] -sub v12.4s, v6.4s, v8.4s -sqrdmulh v9.4S, v10.4S, v25.s[3] -mul v10.4S, v10.4S,v26.s[3] -add v6.4s, v6.4s, v8.4s -sqrdmulh v8.4S, v12.4S, v25.s[1] -mul v12.4S, v12.4S,v26.s[1] -sub v14.4s, v0.4s, v2.4s -add v0.4s, v0.4s, v2.4s -sqrdmulh v2.4S, v6.4S, v25.s[0] -mul v6.4S, v6.4S,v26.s[0] -sub v7.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -mla v3.4S, v4.4S, v31.s[0] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v17.4s, v21.4s -mla v12.4S, v8.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v19.4S, v23.s[0] -mul v19.4S, v19.4S,v24.s[0] -sub v2.4s, v16.4s, v15.4s -sqrdmulh v8.4S, v7.4S, v23.s[1] -mul v7.4S, v7.4S,v24.s[1] -add v16.4s, v16.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v23.s[2] -mul v16.4S, v16.4S,v24.s[2] -sub v4.4s, v22.4s, v3.4s -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v2.4S, v23.s[3] -mul v2.4S, v2.4S,v24.s[3] -sub v13.4s, v20.4s, v10.4s -add v20.4s, v20.4s, v10.4s -mla v19.4S, v21.4S, v31.s[0] -mla v7.4S, v8.4S, v31.s[0] -sub v8.4s, v5.4s, v12.4s -str q22, [x0, #272] -mla v16.4S, v15.4S, v31.s[0] -mla v2.4S, v3.4S, v31.s[0] -add v5.4s, v5.4s, v12.4s -str q4, [x0, #336] -sub v23.4s, v11.4s, v6.4s -str q20, [x0, #400] -add v11.4s, v11.4s, v6.4s -str q13, [x0, #464] -sub v13.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sub v19.4s, v14.4s, v7.4s -add v14.4s, v14.4s, v7.4s -sub v7.4s, v17.4s, v16.4s -str q5, [x0, #144] -add v17.4s, v17.4s, v16.4s -str q8, [x0, #208] -sub v8.4s, v9.4s, v2.4s -str q11, [x0, #16] -add v9.4s, v9.4s, v2.4s -str q23, [x0, #80] -str q0, [x0, #528] -str q13, [x0, #592] -str q14, [x0, #656] -str q19, [x0, #720] -str q17, [x0, #784] -str q7, [x0, #848] -str q9, [x0, #912] -str q8, [x0, #976] -ldr q18, [x0, #224] -ldr q1, [x0, #160] -ldr q10, [x0, #32] -ldr q21, [x17, #+128] -ldr q22, [x17, #+144] -sqrdmulh v15.4S, v10.4S, v22.s[0] -mul v10.4S, v10.4S,v21.s[0] -ldr q3, [x0, #48] -sqrdmulh v12.4S, v3.4S, v22.s[0] -mul v3.4S, v3.4S,v21.s[0] -ldr q4, [x17, #+160] -ldr q30, [x17, #+176] -ldr q29, [x0, #96] -sqrdmulh v28.4S, v29.4S, v30.s[0] -mul v29.4S, v29.4S,v4.s[0] -ldr q27, [x0, #112] -sqrdmulh v26.4S, v27.4S, v30.s[0] -mul v27.4S, v27.4S,v4.s[0] -ldr q25, [x17, #+192] -ldr q24, [x17, #+208] -mla v10.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v1.4S, v24.s[0] -ldr q20, [x0, #176] -mla v3.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v20.4S, v24.s[0] -ldr q6, [x17, #+224] -ldr q5, [x17, #+240] -mla v29.4S, v28.4S, v31.s[0] -sqrdmulh v28.4S, v18.4S, v5.s[0] -ldr q16, [x0, #240] -mla v27.4S, v26.4S, v31.s[0] -sqrdmulh v26.4S, v16.4S, v5.s[0] -ldr q11, [x0, #0] -ldr q2, [x0, #128] -mul v1.4S, v1.4S,v25.s[0] -sub v23.4s, v11.4s, v10.4s -ldr q0, [x0, #16] -mul v20.4S, v20.4S,v25.s[0] -add v11.4s, v11.4s, v10.4s -ldr q10, [x0, #144] -mla v1.4S, v15.4S, v31.s[0] -sub v15.4s, v0.4s, v3.4s -ldr q13, [x0, #64] -mla v20.4S, v12.4S, v31.s[0] -add v0.4s, v0.4s, v3.4s -ldr q3, [x0, #192] -mul v18.4S, v18.4S,v6.s[0] -sub v12.4s, v13.4s, v29.4s -ldr q14, [x0, #80] -mul v16.4S, v16.4S,v6.s[0] -add v13.4s, v13.4s, v29.4s -ldr q29, [x0, #208] -mla v18.4S, v28.4S, v31.s[0] -sub v28.4s, v14.4s, v27.4s -mla v16.4S, v26.4S, v31.s[0] -add v14.4s, v14.4s, v27.4s -sqrdmulh v27.4S, v0.4S, v22.s[1] -mul v0.4S, v0.4S,v21.s[1] -sqrdmulh v26.4S, v15.4S, v22.s[2] -sub v19.4s, v2.4s, v1.4s -mul v15.4S, v15.4S,v21.s[2] -add v2.4s, v2.4s, v1.4s -sqrdmulh v22.4S, v14.4S, v30.s[1] -sub v21.4s, v10.4s, v20.4s -mul v14.4S, v14.4S,v4.s[1] -add v10.4s, v10.4s, v20.4s -sqrdmulh v20.4S, v28.4S, v30.s[2] -sub v1.4s, v3.4s, v18.4s -mul v28.4S, v28.4S,v4.s[2] -add v3.4s, v3.4s, v18.4s -mla v0.4S, v27.4S, v31.s[0] -sub v27.4s, v29.4s, v16.4s -ldr q30, [x0, #480] -sqrdmulh v4.4S, v10.4S, v24.s[1] -add v29.4s, v29.4s, v16.4s -mla v15.4S, v26.4S, v31.s[0] -ldr q26, [x0, #416] -sqrdmulh v16.4S, v21.4S, v24.s[2] -sub v18.4s, v11.4s, v0.4s -mla v14.4S, v22.4S, v31.s[0] -ldr q22, [x0, #288] -sqrdmulh v17.4S, v29.4S, v5.s[1] -add v11.4s, v11.4s, v0.4s -str q18, [x0, #16] -mla v28.4S, v20.4S, v31.s[0] -ldr q20, [x17, #+256] -ldr q18, [x17, #+272] -sqrdmulh v0.4S, v27.4S, v5.s[2] -sub v7.4s, v23.4s, v15.4s -str q11, [x0, #0] -mul v10.4S, v10.4S,v25.s[1] -add v23.4s, v23.4s, v15.4s -mul v21.4S, v21.4S,v25.s[2] -str q7, [x0, #48] -mla v10.4S, v4.4S, v31.s[0] -sub v4.4s, v13.4s, v14.4s -mla v21.4S, v16.4S, v31.s[0] -str q23, [x0, #32] -mul v29.4S, v29.4S,v6.s[1] -str q4, [x0, #80] -mul v27.4S, v27.4S,v6.s[2] -add v13.4s, v13.4s, v14.4s -str q13, [x0, #64] -mla v29.4S, v17.4S, v31.s[0] -sub v17.4s, v12.4s, v28.4s -str q17, [x0, #112] -mla v27.4S, v0.4S, v31.s[0] -add v12.4s, v12.4s, v28.4s -str q12, [x0, #96] -sqrdmulh v5.4S, v22.4S, v18.s[0] -sub v6.4s, v2.4s, v10.4s -mul v22.4S, v22.4S,v20.s[0] -str q6, [x0, #144] -ldr q6, [x0, #304] -sqrdmulh v12.4S, v6.4S, v18.s[0] -add v2.4s, v2.4s, v10.4s -mul v6.4S, v6.4S,v20.s[0] -str q2, [x0, #128] -ldr q2, [x17, #+288] -ldr q10, [x17, #+304] -ldr q28, [x0, #352] -sqrdmulh v0.4S, v28.4S, v10.s[0] -sub v17.4s, v19.4s, v21.4s -mul v28.4S, v28.4S,v2.s[0] -str q17, [x0, #176] -ldr q17, [x0, #368] -sqrdmulh v13.4S, v17.4S, v10.s[0] -add v19.4s, v19.4s, v21.4s -mul v17.4S, v17.4S,v2.s[0] -str q19, [x0, #160] -ldr q19, [x17, #+320] -ldr q21, [x17, #+336] -mla v22.4S, v5.4S, v31.s[0] -sub v5.4s, v3.4s, v29.4s -sqrdmulh v14.4S, v26.4S, v21.s[0] -str q5, [x0, #208] -ldr q5, [x0, #432] -mla v6.4S, v12.4S, v31.s[0] -add v3.4s, v3.4s, v29.4s -sqrdmulh v29.4S, v5.4S, v21.s[0] -str q3, [x0, #192] -ldr q3, [x17, #+352] -ldr q12, [x17, #+368] -mla v28.4S, v0.4S, v31.s[0] -sub v0.4s, v1.4s, v27.4s -sqrdmulh v4.4S, v30.4S, v12.s[0] -str q0, [x0, #240] -ldr q0, [x0, #496] -mla v17.4S, v13.4S, v31.s[0] -add v1.4s, v1.4s, v27.4s -sqrdmulh v27.4S, v0.4S, v12.s[0] -str q1, [x0, #224] -ldr q1, [x0, #256] -ldr q13, [x0, #384] -mul v26.4S, v26.4S,v19.s[0] -sub v24.4s, v1.4s, v22.4s -ldr q25, [x0, #272] -mul v5.4S, v5.4S,v19.s[0] -add v1.4s, v1.4s, v22.4s -ldr q22, [x0, #400] -mla v26.4S, v14.4S, v31.s[0] -sub v14.4s, v25.4s, v6.4s -ldr q23, [x0, #320] -mla v5.4S, v29.4S, v31.s[0] -add v25.4s, v25.4s, v6.4s -ldr q6, [x0, #448] -mul v30.4S, v30.4S,v3.s[0] -sub v29.4s, v23.4s, v28.4s -ldr q16, [x0, #336] -mul v0.4S, v0.4S,v3.s[0] -add v23.4s, v23.4s, v28.4s -ldr q28, [x0, #464] -mla v30.4S, v4.4S, v31.s[0] -sub v4.4s, v16.4s, v17.4s -mla v0.4S, v27.4S, v31.s[0] -add v16.4s, v16.4s, v17.4s -sqrdmulh v17.4S, v25.4S, v18.s[1] -mul v25.4S, v25.4S,v20.s[1] -sqrdmulh v27.4S, v14.4S, v18.s[2] -sub v7.4s, v13.4s, v26.4s -mul v14.4S, v14.4S,v20.s[2] -add v13.4s, v13.4s, v26.4s -sqrdmulh v18.4S, v16.4S, v10.s[1] -sub v20.4s, v22.4s, v5.4s -mul v16.4S, v16.4S,v2.s[1] -add v22.4s, v22.4s, v5.4s -sqrdmulh v5.4S, v4.4S, v10.s[2] -sub v26.4s, v6.4s, v30.4s -mul v4.4S, v4.4S,v2.s[2] -add v6.4s, v6.4s, v30.4s -mla v25.4S, v17.4S, v31.s[0] -sub v17.4s, v28.4s, v0.4s -ldr q10, [x0, #736] -sqrdmulh v2.4S, v22.4S, v21.s[1] -add v28.4s, v28.4s, v0.4s -mla v14.4S, v27.4S, v31.s[0] -ldr q27, [x0, #672] -sqrdmulh v0.4S, v20.4S, v21.s[2] -sub v30.4s, v1.4s, v25.4s -mla v16.4S, v18.4S, v31.s[0] -ldr q18, [x0, #544] -sqrdmulh v15.4S, v28.4S, v12.s[1] -add v1.4s, v1.4s, v25.4s -str q30, [x0, #272] -mla v4.4S, v5.4S, v31.s[0] -ldr q5, [x17, #+384] -ldr q30, [x17, #+400] -sqrdmulh v25.4S, v17.4S, v12.s[2] -sub v11.4s, v24.4s, v14.4s -str q1, [x0, #256] -mul v22.4S, v22.4S,v19.s[1] -add v24.4s, v24.4s, v14.4s -mul v20.4S, v20.4S,v19.s[2] -str q11, [x0, #304] -mla v22.4S, v2.4S, v31.s[0] -sub v2.4s, v23.4s, v16.4s -mla v20.4S, v0.4S, v31.s[0] -str q24, [x0, #288] -mul v28.4S, v28.4S,v3.s[1] -str q2, [x0, #336] -mul v17.4S, v17.4S,v3.s[2] -add v23.4s, v23.4s, v16.4s -str q23, [x0, #320] -mla v28.4S, v15.4S, v31.s[0] -sub v15.4s, v29.4s, v4.4s -str q15, [x0, #368] -mla v17.4S, v25.4S, v31.s[0] -add v29.4s, v29.4s, v4.4s -str q29, [x0, #352] -sqrdmulh v12.4S, v18.4S, v30.s[0] -sub v3.4s, v13.4s, v22.4s -mul v18.4S, v18.4S,v5.s[0] -str q3, [x0, #400] -ldr q3, [x0, #560] -sqrdmulh v29.4S, v3.4S, v30.s[0] -add v13.4s, v13.4s, v22.4s -mul v3.4S, v3.4S,v5.s[0] -str q13, [x0, #384] -ldr q13, [x17, #+416] -ldr q22, [x17, #+432] -ldr q4, [x0, #608] -sqrdmulh v25.4S, v4.4S, v22.s[0] -sub v15.4s, v7.4s, v20.4s -mul v4.4S, v4.4S,v13.s[0] -str q15, [x0, #432] -ldr q15, [x0, #624] -sqrdmulh v23.4S, v15.4S, v22.s[0] -add v7.4s, v7.4s, v20.4s -mul v15.4S, v15.4S,v13.s[0] -str q7, [x0, #416] -ldr q7, [x17, #+448] -ldr q20, [x17, #+464] -mla v18.4S, v12.4S, v31.s[0] -sub v12.4s, v6.4s, v28.4s -sqrdmulh v16.4S, v27.4S, v20.s[0] -str q12, [x0, #464] -ldr q12, [x0, #688] -mla v3.4S, v29.4S, v31.s[0] -add v6.4s, v6.4s, v28.4s -sqrdmulh v28.4S, v12.4S, v20.s[0] -str q6, [x0, #448] -ldr q6, [x17, #+480] -ldr q29, [x17, #+496] -mla v4.4S, v25.4S, v31.s[0] -sub v25.4s, v26.4s, v17.4s -sqrdmulh v2.4S, v10.4S, v29.s[0] -str q25, [x0, #496] -ldr q25, [x0, #752] -mla v15.4S, v23.4S, v31.s[0] -add v26.4s, v26.4s, v17.4s -sqrdmulh v17.4S, v25.4S, v29.s[0] -str q26, [x0, #480] -ldr q26, [x0, #512] -ldr q23, [x0, #640] -mul v27.4S, v27.4S,v7.s[0] -sub v21.4s, v26.4s, v18.4s -ldr q19, [x0, #528] -mul v12.4S, v12.4S,v7.s[0] -add v26.4s, v26.4s, v18.4s -ldr q18, [x0, #656] -mla v27.4S, v16.4S, v31.s[0] -sub v16.4s, v19.4s, v3.4s -ldr q24, [x0, #576] -mla v12.4S, v28.4S, v31.s[0] -add v19.4s, v19.4s, v3.4s -ldr q3, [x0, #704] -mul v10.4S, v10.4S,v6.s[0] -sub v28.4s, v24.4s, v4.4s -ldr q0, [x0, #592] -mul v25.4S, v25.4S,v6.s[0] -add v24.4s, v24.4s, v4.4s -ldr q4, [x0, #720] -mla v10.4S, v2.4S, v31.s[0] -sub v2.4s, v0.4s, v15.4s -mla v25.4S, v17.4S, v31.s[0] -add v0.4s, v0.4s, v15.4s -sqrdmulh v15.4S, v19.4S, v30.s[1] -mul v19.4S, v19.4S,v5.s[1] -sqrdmulh v17.4S, v16.4S, v30.s[2] -sub v11.4s, v23.4s, v27.4s -mul v16.4S, v16.4S,v5.s[2] -add v23.4s, v23.4s, v27.4s -sqrdmulh v30.4S, v0.4S, v22.s[1] -sub v5.4s, v18.4s, v12.4s -mul v0.4S, v0.4S,v13.s[1] -add v18.4s, v18.4s, v12.4s -sqrdmulh v12.4S, v2.4S, v22.s[2] -sub v27.4s, v3.4s, v10.4s -mul v2.4S, v2.4S,v13.s[2] -add v3.4s, v3.4s, v10.4s -mla v19.4S, v15.4S, v31.s[0] -sub v15.4s, v4.4s, v25.4s -ldr q22, [x0, #992] -sqrdmulh v13.4S, v18.4S, v20.s[1] -add v4.4s, v4.4s, v25.4s -mla v16.4S, v17.4S, v31.s[0] -ldr q17, [x0, #928] -sqrdmulh v25.4S, v5.4S, v20.s[2] -sub v10.4s, v26.4s, v19.4s -mla v0.4S, v30.4S, v31.s[0] -ldr q30, [x0, #800] -sqrdmulh v14.4S, v4.4S, v29.s[1] -add v26.4s, v26.4s, v19.4s -str q10, [x0, #528] -mla v2.4S, v12.4S, v31.s[0] -ldr q12, [x17, #+512] -ldr q10, [x17, #+528] -sqrdmulh v19.4S, v15.4S, v29.s[2] -sub v1.4s, v21.4s, v16.4s -str q26, [x0, #512] -mul v18.4S, v18.4S,v7.s[1] -add v21.4s, v21.4s, v16.4s -mul v5.4S, v5.4S,v7.s[2] -str q1, [x0, #560] -mla v18.4S, v13.4S, v31.s[0] -sub v13.4s, v24.4s, v0.4s -mla v5.4S, v25.4S, v31.s[0] -str q21, [x0, #544] -mul v4.4S, v4.4S,v6.s[1] -str q13, [x0, #592] -mul v15.4S, v15.4S,v6.s[2] -add v24.4s, v24.4s, v0.4s -str q24, [x0, #576] -mla v4.4S, v14.4S, v31.s[0] -sub v14.4s, v28.4s, v2.4s -str q14, [x0, #624] -mla v15.4S, v19.4S, v31.s[0] -add v28.4s, v28.4s, v2.4s -str q28, [x0, #608] -sqrdmulh v29.4S, v30.4S, v10.s[0] -sub v6.4s, v23.4s, v18.4s -mul v30.4S, v30.4S,v12.s[0] -str q6, [x0, #656] -ldr q6, [x0, #816] -sqrdmulh v28.4S, v6.4S, v10.s[0] -add v23.4s, v23.4s, v18.4s -mul v6.4S, v6.4S,v12.s[0] -str q23, [x0, #640] -ldr q23, [x17, #+544] -ldr q18, [x17, #+560] -ldr q2, [x0, #864] -sqrdmulh v19.4S, v2.4S, v18.s[0] -sub v14.4s, v11.4s, v5.4s -mul v2.4S, v2.4S,v23.s[0] -str q14, [x0, #688] -ldr q14, [x0, #880] -sqrdmulh v24.4S, v14.4S, v18.s[0] -add v11.4s, v11.4s, v5.4s -mul v14.4S, v14.4S,v23.s[0] -str q11, [x0, #672] -ldr q11, [x17, #+576] -ldr q5, [x17, #+592] -mla v30.4S, v29.4S, v31.s[0] -sub v29.4s, v3.4s, v4.4s -sqrdmulh v0.4S, v17.4S, v5.s[0] -str q29, [x0, #720] -ldr q29, [x0, #944] -mla v6.4S, v28.4S, v31.s[0] -add v3.4s, v3.4s, v4.4s -sqrdmulh v4.4S, v29.4S, v5.s[0] -str q3, [x0, #704] -ldr q3, [x17, #+608] -ldr q28, [x17, #+624] -mla v2.4S, v19.4S, v31.s[0] -sub v19.4s, v27.4s, v15.4s -sqrdmulh v13.4S, v22.4S, v28.s[0] -str q19, [x0, #752] -ldr q19, [x0, #1008] -mla v14.4S, v24.4S, v31.s[0] -add v27.4s, v27.4s, v15.4s -sqrdmulh v15.4S, v19.4S, v28.s[0] -str q27, [x0, #736] -ldr q27, [x0, #768] -ldr q24, [x0, #896] -mul v17.4S, v17.4S,v11.s[0] -sub v20.4s, v27.4s, v30.4s -ldr q7, [x0, #784] -mul v29.4S, v29.4S,v11.s[0] -add v27.4s, v27.4s, v30.4s -ldr q30, [x0, #912] -mla v17.4S, v0.4S, v31.s[0] -sub v0.4s, v7.4s, v6.4s -ldr q21, [x0, #832] -mla v29.4S, v4.4S, v31.s[0] -add v7.4s, v7.4s, v6.4s -ldr q6, [x0, #960] -mul v22.4S, v22.4S,v3.s[0] -sub v4.4s, v21.4s, v2.4s -ldr q25, [x0, #848] -mul v19.4S, v19.4S,v3.s[0] -add v21.4s, v21.4s, v2.4s -ldr q2, [x0, #976] -mla v22.4S, v13.4S, v31.s[0] -sub v13.4s, v25.4s, v14.4s -mla v19.4S, v15.4S, v31.s[0] -add v25.4s, v25.4s, v14.4s -sqrdmulh v14.4S, v7.4S, v10.s[1] -mul v7.4S, v7.4S,v12.s[1] -sqrdmulh v15.4S, v0.4S, v10.s[2] -sub v1.4s, v24.4s, v17.4s -mul v0.4S, v0.4S,v12.s[2] -add v24.4s, v24.4s, v17.4s -sqrdmulh v10.4S, v25.4S, v18.s[1] -sub v12.4s, v30.4s, v29.4s -mul v25.4S, v25.4S,v23.s[1] -add v30.4s, v30.4s, v29.4s -sqrdmulh v29.4S, v13.4S, v18.s[2] -sub v17.4s, v6.4s, v22.4s -mul v13.4S, v13.4S,v23.s[2] -add v6.4s, v6.4s, v22.4s -mla v7.4S, v14.4S, v31.s[0] -sub v14.4s, v2.4s, v19.4s -sqrdmulh v18.4S, v30.4S, v5.s[1] -add v2.4s, v2.4s, v19.4s -mla v0.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v12.4S, v5.s[2] -sub v19.4s, v27.4s, v7.4s -mla v25.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v2.4S, v28.s[1] -add v27.4s, v27.4s, v7.4s -str q19, [x0, #784] -mla v13.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v14.4S, v28.s[2] -sub v19.4s, v20.4s, v0.4s -str q27, [x0, #768] -mul v30.4S, v30.4S,v11.s[1] -add v20.4s, v20.4s, v0.4s -mul v12.4S, v12.4S,v11.s[2] -str q19, [x0, #816] -mla v30.4S, v18.4S, v31.s[0] -sub v18.4s, v21.4s, v25.4s -mla v12.4S, v15.4S, v31.s[0] -str q20, [x0, #800] -mul v2.4S, v2.4S,v3.s[1] -str q18, [x0, #848] -mul v14.4S, v14.4S,v3.s[2] -add v21.4s, v21.4s, v25.4s -str q21, [x0, #832] -mla v2.4S, v10.4S, v31.s[0] -sub v10.4s, v4.4s, v13.4s -str q10, [x0, #880] -mla v14.4S, v29.4S, v31.s[0] -add v4.4s, v4.4s, v13.4s -str q4, [x0, #864] -sub v28.4s, v24.4s, v30.4s -str q28, [x0, #912] -add v24.4s, v24.4s, v30.4s -str q24, [x0, #896] -sub v24.4s, v1.4s, v12.4s -str q24, [x0, #944] -add v1.4s, v1.4s, v12.4s -str q1, [x0, #928] -sub v1.4s, v6.4s, v2.4s -str q1, [x0, #976] -add v6.4s, v6.4s, v2.4s -str q6, [x0, #960] -sub v6.4s, v17.4s, v14.4s -str q6, [x0, #1008] -add v17.4s, v17.4s, v14.4s -str q17, [x0, #992] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1464 -// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_7.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_7.s deleted file mode 100644 index a96a052..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_7.s +++ /dev/null @@ -1,1494 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_7_z4_7 -.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_7 -ntt_u32_incomplete_neon_asm_var_4_2_7_z4_7: -_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_7: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #928] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #992] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q18, [x0, #800] -sqrdmulh v17.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -ldr q16, [x0, #864] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -mla v22.4S, v21.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -mla v18.4S, v17.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -ldr q3, [x0, #544] -sqrdmulh v17.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -ldr q19, [x0, #608] -sqrdmulh v21.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q2, [x0, #672] -ldr q1, [x0, #416] -sqrdmulh v0.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v15.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -ldr q22, [x0, #736] -ldr q14, [x0, #480] -sqrdmulh v13.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v12.4s, v14.4s, v20.4s -add v14.4s, v14.4s, v20.4s -ldr q20, [x0, #288] -mla v3.4S, v17.4S, v31.s[0] -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v20.4s, v18.4s -mla v2.4S, v0.4S, v31.s[0] -mla v22.4S, v13.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -ldr q18, [x0, #352] -sqrdmulh v13.4S, v1.4S, v29.s[1] -mul v1.4S, v1.4S,v30.s[1] -sub v0.4s, v18.4s, v16.4s -sqrdmulh v17.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -add v18.4s, v18.4s, v16.4s -ldr q16, [x0, #32] -sqrdmulh v11.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v10.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -ldr q3, [x0, #96] -sqrdmulh v9.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v8.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -ldr q19, [x0, #160] -mla v1.4S, v13.4S, v31.s[0] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v19.4s, v2.4s -mla v20.4S, v11.4S, v31.s[0] -mla v18.4S, v9.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -ldr q2, [x0, #224] -sqrdmulh v9.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v11.4s, v2.4s, v22.4s -sqrdmulh v13.4S, v12.4S, v29.s[2] -mul v12.4S, v12.4S,v30.s[2] -add v2.4s, v2.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v7.4s, v19.4s, v1.4s -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v29.s[2] -mul v0.4S, v0.4S,v30.s[2] -sub v6.4s, v2.4s, v14.4s -add v2.4s, v2.4s, v14.4s -mla v15.4S, v9.4S, v31.s[0] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v16.4s, v20.4s -mla v21.4S, v22.4S, v31.s[0] -mla v0.4S, v1.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -sub v1.4s, v3.4s, v18.4s -sqrdmulh v22.4S, v6.4S, v27.s[1] -mul v6.4S, v6.4S,v28.s[1] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v19.4S, v27.s[0] -mul v19.4S, v19.4S,v28.s[0] -sub v9.4s, v17.4s, v15.4s -add v17.4s, v17.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v14.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -mla v7.4S, v20.4S, v31.s[0] -mla v6.4S, v22.4S, v31.s[0] -sub v22.4s, v10.4s, v21.4s -mla v19.4S, v18.4S, v31.s[0] -mla v2.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v15.4s, v8.4s, v0.4s -sqrdmulh v18.4S, v11.4S, v27.s[2] -mul v11.4S, v11.4S,v28.s[2] -add v8.4s, v8.4s, v0.4s -sqrdmulh v0.4S, v9.4S, v27.s[3] -mul v9.4S, v9.4S,v28.s[3] -sub v20.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[3] -mul v14.4S, v14.4S,v28.s[3] -sub v12.4s, v1.4s, v6.4s -add v1.4s, v1.4s, v6.4s -mla v17.4S, v21.4S, v31.s[0] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v16.4s, v19.4s -mla v9.4S, v0.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v16.4s, v16.4s, v19.4s -sqrdmulh v19.4S, v1.4S, v25.s[2] -mul v1.4S, v1.4S,v26.s[2] -sub v7.4s, v3.4s, v2.4s -sqrdmulh v0.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v7.4S, v25.s[1] -mul v7.4S, v7.4S,v26.s[1] -sub v21.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -sqrdmulh v17.4S, v3.4S, v25.s[0] -mul v3.4S, v3.4S,v26.s[0] -sub v6.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -mla v1.4S, v19.4S, v31.s[0] -mla v12.4S, v0.4S, v31.s[0] -sub v0.4s, v22.4s, v9.4s -mla v7.4S, v2.4S, v31.s[0] -mla v3.4S, v17.4S, v31.s[0] -add v22.4s, v22.4s, v9.4s -sqrdmulh v9.4S, v8.4S, v23.s[0] -mul v8.4S, v8.4S,v24.s[0] -sub v17.4s, v15.4s, v14.4s -sqrdmulh v2.4S, v6.4S, v23.s[1] -mul v6.4S, v6.4S,v24.s[1] -add v15.4s, v15.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v23.s[2] -mul v15.4S, v15.4S,v24.s[2] -sub v19.4s, v13.4s, v1.4s -add v13.4s, v13.4s, v1.4s -sqrdmulh v1.4S, v17.4S, v23.s[3] -mul v17.4S, v17.4S,v24.s[3] -sub v11.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -mla v8.4S, v9.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -sub v2.4s, v18.4s, v7.4s -str q13, [x0, #288] -mla v15.4S, v14.4S, v31.s[0] -mla v17.4S, v1.4S, v31.s[0] -add v18.4s, v18.4s, v7.4s -str q19, [x0, #352] -ldr q19, [x0, #944] -sqrdmulh v7.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v1.4s, v16.4s, v3.4s -str q20, [x0, #416] -ldr q20, [x0, #1008] -sqrdmulh v14.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v16.4s, v16.4s, v3.4s -str q11, [x0, #480] -ldr q11, [x0, #816] -sqrdmulh v3.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -sub v13.4s, v10.4s, v8.4s -add v10.4s, v10.4s, v8.4s -ldr q8, [x0, #880] -sqrdmulh v9.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v12.4s, v21.4s, v6.4s -add v21.4s, v21.4s, v6.4s -mla v19.4S, v7.4S, v31.s[0] -mla v20.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v15.4s -str q18, [x0, #160] -mla v11.4S, v3.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -str q2, [x0, #224] -ldr q2, [x0, #560] -sqrdmulh v15.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v9.4s, v0.4s, v17.4s -str q16, [x0, #32] -ldr q16, [x0, #624] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -add v0.4s, v0.4s, v17.4s -str q1, [x0, #96] -ldr q1, [x0, #688] -ldr q17, [x0, #432] -sqrdmulh v18.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -sub v7.4s, v17.4s, v19.4s -add v17.4s, v17.4s, v19.4s -ldr q19, [x0, #752] -ldr q6, [x0, #496] -sqrdmulh v5.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v4.4s, v6.4s, v20.4s -add v6.4s, v6.4s, v20.4s -ldr q20, [x0, #304] -mla v2.4S, v15.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -sub v3.4s, v20.4s, v11.4s -str q10, [x0, #544] -mla v1.4S, v18.4S, v31.s[0] -mla v19.4S, v5.4S, v31.s[0] -add v20.4s, v20.4s, v11.4s -str q13, [x0, #608] -ldr q13, [x0, #368] -sqrdmulh v11.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v5.4s, v13.4s, v8.4s -str q21, [x0, #672] -sqrdmulh v21.4S, v6.4S, v29.s[1] -mul v6.4S, v6.4S,v30.s[1] -add v13.4s, v13.4s, v8.4s -str q12, [x0, #736] -ldr q12, [x0, #48] -sqrdmulh v8.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v18.4s, v12.4s, v2.4s -add v12.4s, v12.4s, v2.4s -ldr q2, [x0, #112] -sqrdmulh v10.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v15.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -ldr q16, [x0, #176] -mla v17.4S, v11.4S, v31.s[0] -mla v6.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v1.4s -str q22, [x0, #800] -mla v20.4S, v8.4S, v31.s[0] -mla v13.4S, v10.4S, v31.s[0] -add v16.4s, v16.4s, v1.4s -str q14, [x0, #864] -ldr q14, [x0, #240] -sqrdmulh v1.4S, v7.4S, v29.s[2] -mul v7.4S, v7.4S,v30.s[2] -sub v10.4s, v14.4s, v19.4s -str q0, [x0, #928] -sqrdmulh v0.4S, v4.4S, v29.s[2] -mul v4.4S, v4.4S,v30.s[2] -add v14.4s, v14.4s, v19.4s -str q9, [x0, #992] -sqrdmulh v9.4S, v3.4S, v29.s[2] -mul v3.4S, v3.4S,v30.s[2] -sub v19.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -sqrdmulh v17.4S, v5.4S, v29.s[2] -mul v5.4S, v5.4S,v30.s[2] -sub v8.4s, v14.4s, v6.4s -add v14.4s, v14.4s, v6.4s -mla v7.4S, v1.4S, v31.s[0] -mla v4.4S, v0.4S, v31.s[0] -sub v0.4s, v12.4s, v20.4s -mla v3.4S, v9.4S, v31.s[0] -mla v5.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -sub v17.4s, v2.4s, v13.4s -sqrdmulh v9.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -add v2.4s, v2.4s, v13.4s -sqrdmulh v13.4S, v16.4S, v27.s[0] -mul v16.4S, v16.4S,v28.s[0] -sub v1.4s, v21.4s, v7.4s -add v21.4s, v21.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v6.4s, v10.4s, v4.4s -add v10.4s, v10.4s, v4.4s -mla v19.4S, v20.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -sub v9.4s, v18.4s, v3.4s -mla v16.4S, v13.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[2] -mul v21.4S, v21.4S,v28.s[2] -sub v7.4s, v15.4s, v5.4s -sqrdmulh v13.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -add v15.4s, v15.4s, v5.4s -sqrdmulh v5.4S, v1.4S, v27.s[3] -mul v1.4S, v1.4S,v28.s[3] -sub v20.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v27.s[3] -mul v6.4S, v6.4S,v28.s[3] -sub v4.4s, v17.4s, v8.4s -add v17.4s, v17.4s, v8.4s -mla v21.4S, v3.4S, v31.s[0] -mla v10.4S, v13.4S, v31.s[0] -sub v13.4s, v12.4s, v16.4s -mla v1.4S, v5.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v25.s[2] -mul v17.4S, v17.4S,v26.s[2] -sub v19.4s, v2.4s, v14.4s -sqrdmulh v5.4S, v4.4S, v25.s[3] -mul v4.4S, v4.4S,v26.s[3] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v3.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v15.4s, v10.4s -add v15.4s, v15.4s, v10.4s -mla v17.4S, v16.4S, v31.s[0] -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v9.4s, v1.4s -mla v19.4S, v14.4S, v31.s[0] -mla v2.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v23.s[0] -mul v15.4S, v15.4S,v24.s[0] -sub v21.4s, v7.4s, v6.4s -sqrdmulh v14.4S, v8.4S, v23.s[1] -mul v8.4S, v8.4S,v24.s[1] -add v7.4s, v7.4s, v6.4s -sqrdmulh v6.4S, v7.4S, v23.s[2] -mul v7.4S, v7.4S,v24.s[2] -sub v16.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v23.s[3] -mul v21.4S, v21.4S,v24.s[3] -sub v10.4s, v20.4s, v4.4s -add v20.4s, v20.4s, v4.4s -mla v15.4S, v1.4S, v31.s[0] -mla v8.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v19.4s -str q0, [x0, #304] -mla v7.4S, v6.4S, v31.s[0] -mla v21.4S, v17.4S, v31.s[0] -add v13.4s, v13.4s, v19.4s -str q16, [x0, #368] -ldr q16, [x0, #896] -sqrdmulh v19.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v17.4s, v12.4s, v2.4s -str q20, [x0, #432] -ldr q20, [x0, #960] -sqrdmulh v6.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v12.4s, v12.4s, v2.4s -str q10, [x0, #496] -ldr q10, [x0, #768] -sqrdmulh v2.4S, v10.4S, v29.s[0] -mul v10.4S, v10.4S,v30.s[0] -sub v0.4s, v18.4s, v15.4s -add v18.4s, v18.4s, v15.4s -ldr q15, [x0, #832] -sqrdmulh v1.4S, v15.4S, v29.s[0] -mul v15.4S, v15.4S,v30.s[0] -sub v4.4s, v3.4s, v8.4s -add v3.4s, v3.4s, v8.4s -mla v16.4S, v19.4S, v31.s[0] -mla v20.4S, v6.4S, v31.s[0] -sub v6.4s, v9.4s, v7.4s -str q13, [x0, #176] -mla v10.4S, v2.4S, v31.s[0] -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v7.4s -str q14, [x0, #240] -ldr q14, [x0, #512] -sqrdmulh v7.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v1.4s, v5.4s, v21.4s -str q12, [x0, #48] -ldr q12, [x0, #576] -sqrdmulh v2.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -add v5.4s, v5.4s, v21.4s -str q17, [x0, #112] -ldr q17, [x0, #640] -ldr q21, [x0, #384] -sqrdmulh v13.4S, v17.4S, v29.s[0] -mul v17.4S, v17.4S,v30.s[0] -sub v19.4s, v21.4s, v16.4s -add v21.4s, v21.4s, v16.4s -ldr q16, [x0, #704] -ldr q8, [x0, #448] -sqrdmulh v22.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v11.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -ldr q20, [x0, #256] -mla v14.4S, v7.4S, v31.s[0] -mla v12.4S, v2.4S, v31.s[0] -sub v2.4s, v20.4s, v10.4s -str q18, [x0, #560] -mla v17.4S, v13.4S, v31.s[0] -mla v16.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -str q0, [x0, #624] -ldr q0, [x0, #320] -sqrdmulh v10.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v22.4s, v0.4s, v15.4s -str q3, [x0, #688] -sqrdmulh v3.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -add v0.4s, v0.4s, v15.4s -str q4, [x0, #752] -ldr q4, [x0, #0] -sqrdmulh v15.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v13.4s, v4.4s, v14.4s -add v4.4s, v4.4s, v14.4s -ldr q14, [x0, #64] -sqrdmulh v18.4S, v0.4S, v29.s[1] -mul v0.4S, v0.4S,v30.s[1] -sub v7.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -ldr q12, [x0, #128] -mla v21.4S, v10.4S, v31.s[0] -mla v8.4S, v3.4S, v31.s[0] -sub v3.4s, v12.4s, v17.4s -str q9, [x0, #816] -mla v20.4S, v15.4S, v31.s[0] -mla v0.4S, v18.4S, v31.s[0] -add v12.4s, v12.4s, v17.4s -str q6, [x0, #880] -ldr q6, [x0, #192] -sqrdmulh v17.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v18.4s, v6.4s, v16.4s -str q5, [x0, #944] -sqrdmulh v5.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -add v6.4s, v6.4s, v16.4s -str q1, [x0, #1008] -sqrdmulh v1.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v16.4s, v12.4s, v21.4s -add v12.4s, v12.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v15.4s, v6.4s, v8.4s -add v6.4s, v6.4s, v8.4s -mla v19.4S, v17.4S, v31.s[0] -mla v11.4S, v5.4S, v31.s[0] -sub v5.4s, v4.4s, v20.4s -mla v2.4S, v1.4S, v31.s[0] -mla v22.4S, v21.4S, v31.s[0] -add v4.4s, v4.4s, v20.4s -sqrdmulh v20.4S, v16.4S, v27.s[1] -mul v16.4S, v16.4S,v28.s[1] -sub v21.4s, v14.4s, v0.4s -sqrdmulh v1.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v17.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v27.s[0] -mul v6.4S, v6.4S,v28.s[0] -sub v8.4s, v18.4s, v11.4s -add v18.4s, v18.4s, v11.4s -mla v16.4S, v20.4S, v31.s[0] -mla v15.4S, v1.4S, v31.s[0] -sub v1.4s, v13.4s, v2.4s -mla v12.4S, v0.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v3.4S, v27.s[2] -mul v3.4S, v3.4S,v28.s[2] -sub v19.4s, v7.4s, v22.4s -sqrdmulh v0.4S, v18.4S, v27.s[2] -mul v18.4S, v18.4S,v28.s[2] -add v7.4s, v7.4s, v22.4s -sqrdmulh v22.4S, v17.4S, v27.s[3] -mul v17.4S, v17.4S,v28.s[3] -sub v20.4s, v5.4s, v16.4s -add v5.4s, v5.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v11.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -mla v3.4S, v2.4S, v31.s[0] -mla v18.4S, v0.4S, v31.s[0] -sub v0.4s, v4.4s, v12.4s -mla v17.4S, v22.4S, v31.s[0] -mla v8.4S, v16.4S, v31.s[0] -add v4.4s, v4.4s, v12.4s -sqrdmulh v12.4S, v21.4S, v25.s[2] -mul v21.4S, v21.4S,v26.s[2] -sub v16.4s, v14.4s, v6.4s -sqrdmulh v22.4S, v11.4S, v25.s[3] -mul v11.4S, v11.4S,v26.s[3] -add v14.4s, v14.4s, v6.4s -sqrdmulh v6.4S, v16.4S, v25.s[1] -mul v16.4S, v16.4S,v26.s[1] -sub v2.4s, v13.4s, v3.4s -add v13.4s, v13.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v25.s[0] -mul v14.4S, v14.4S,v26.s[0] -sub v15.4s, v7.4s, v18.4s -add v7.4s, v7.4s, v18.4s -mla v21.4S, v12.4S, v31.s[0] -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v1.4s, v17.4s -mla v16.4S, v6.4S, v31.s[0] -mla v14.4S, v3.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -sqrdmulh v17.4S, v7.4S, v23.s[0] -mul v7.4S, v7.4S,v24.s[0] -sub v3.4s, v19.4s, v8.4s -sqrdmulh v6.4S, v15.4S, v23.s[1] -mul v15.4S, v15.4S,v24.s[1] -add v19.4s, v19.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v23.s[2] -mul v19.4S, v19.4S,v24.s[2] -sub v12.4s, v5.4s, v21.4s -add v5.4s, v5.4s, v21.4s -sqrdmulh v21.4S, v3.4S, v23.s[3] -mul v3.4S, v3.4S,v24.s[3] -sub v18.4s, v20.4s, v11.4s -add v20.4s, v20.4s, v11.4s -mla v7.4S, v17.4S, v31.s[0] -mla v15.4S, v6.4S, v31.s[0] -sub v6.4s, v0.4s, v16.4s -str q5, [x0, #256] -mla v19.4S, v8.4S, v31.s[0] -mla v3.4S, v21.4S, v31.s[0] -add v0.4s, v0.4s, v16.4s -str q12, [x0, #320] -ldr q12, [x0, #912] -sqrdmulh v16.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -sub v21.4s, v4.4s, v14.4s -str q20, [x0, #384] -ldr q20, [x0, #976] -sqrdmulh v8.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v4.4s, v4.4s, v14.4s -str q18, [x0, #448] -ldr q18, [x0, #784] -sqrdmulh v14.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -sub v5.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -ldr q7, [x0, #848] -sqrdmulh v17.4S, v7.4S, v29.s[0] -mul v7.4S, v7.4S,v30.s[0] -sub v11.4s, v2.4s, v15.4s -add v2.4s, v2.4s, v15.4s -mla v12.4S, v16.4S, v31.s[0] -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v1.4s, v19.4s -str q0, [x0, #128] -mla v18.4S, v14.4S, v31.s[0] -mla v7.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v19.4s -str q6, [x0, #192] -ldr q6, [x0, #528] -sqrdmulh v19.4S, v6.4S, v29.s[0] -mul v6.4S, v6.4S,v30.s[0] -sub v17.4s, v22.4s, v3.4s -str q4, [x0, #0] -ldr q4, [x0, #592] -sqrdmulh v14.4S, v4.4S, v29.s[0] -mul v4.4S, v4.4S,v30.s[0] -add v22.4s, v22.4s, v3.4s -str q21, [x0, #64] -ldr q21, [x0, #656] -ldr q3, [x0, #400] -sqrdmulh v0.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -sub v16.4s, v3.4s, v12.4s -add v3.4s, v3.4s, v12.4s -ldr q12, [x0, #720] -ldr q15, [x0, #464] -sqrdmulh v9.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -sub v10.4s, v15.4s, v20.4s -add v15.4s, v15.4s, v20.4s -ldr q20, [x0, #272] -mla v6.4S, v19.4S, v31.s[0] -mla v4.4S, v14.4S, v31.s[0] -sub v14.4s, v20.4s, v18.4s -str q13, [x0, #512] -mla v21.4S, v0.4S, v31.s[0] -mla v12.4S, v9.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -str q5, [x0, #576] -ldr q5, [x0, #336] -sqrdmulh v18.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v9.4s, v5.4s, v7.4s -str q2, [x0, #640] -sqrdmulh v2.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -add v5.4s, v5.4s, v7.4s -str q11, [x0, #704] -ldr q11, [x0, #16] -sqrdmulh v7.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v0.4s, v11.4s, v6.4s -add v11.4s, v11.4s, v6.4s -ldr q6, [x0, #80] -sqrdmulh v13.4S, v5.4S, v29.s[1] -mul v5.4S, v5.4S,v30.s[1] -sub v19.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -ldr q4, [x0, #144] -mla v3.4S, v18.4S, v31.s[0] -mla v15.4S, v2.4S, v31.s[0] -sub v2.4s, v4.4s, v21.4s -str q1, [x0, #768] -mla v20.4S, v7.4S, v31.s[0] -mla v5.4S, v13.4S, v31.s[0] -add v4.4s, v4.4s, v21.4s -str q8, [x0, #832] -ldr q8, [x0, #208] -sqrdmulh v21.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -sub v13.4s, v8.4s, v12.4s -str q22, [x0, #896] -sqrdmulh v22.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -add v8.4s, v8.4s, v12.4s -str q17, [x0, #960] -sqrdmulh v17.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v12.4s, v4.4s, v3.4s -add v4.4s, v4.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v7.4s, v8.4s, v15.4s -add v8.4s, v8.4s, v15.4s -mla v16.4S, v21.4S, v31.s[0] -mla v10.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v20.4s -mla v14.4S, v17.4S, v31.s[0] -mla v9.4S, v3.4S, v31.s[0] -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v12.4S, v27.s[1] -mul v12.4S, v12.4S,v28.s[1] -sub v3.4s, v6.4s, v5.4s -sqrdmulh v17.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v4.4S, v27.s[0] -mul v4.4S, v4.4S,v28.s[0] -sub v21.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v27.s[0] -mul v8.4S, v8.4S,v28.s[0] -sub v15.4s, v13.4s, v10.4s -add v13.4s, v13.4s, v10.4s -mla v12.4S, v20.4S, v31.s[0] -mla v7.4S, v17.4S, v31.s[0] -sub v17.4s, v0.4s, v14.4s -mla v4.4S, v5.4S, v31.s[0] -mla v8.4S, v16.4S, v31.s[0] -add v0.4s, v0.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v16.4s, v19.4s, v9.4s -sqrdmulh v5.4S, v13.4S, v27.s[2] -mul v13.4S, v13.4S,v28.s[2] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v10.4s, v3.4s, v7.4s -add v3.4s, v3.4s, v7.4s -mla v2.4S, v14.4S, v31.s[0] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v11.4s, v4.4s -mla v21.4S, v9.4S, v31.s[0] -mla v15.4S, v12.4S, v31.s[0] -add v11.4s, v11.4s, v4.4s -sqrdmulh v4.4S, v3.4S, v25.s[2] -mul v3.4S, v3.4S,v26.s[2] -sub v12.4s, v6.4s, v8.4s -sqrdmulh v9.4S, v10.4S, v25.s[3] -mul v10.4S, v10.4S,v26.s[3] -add v6.4s, v6.4s, v8.4s -sqrdmulh v8.4S, v12.4S, v25.s[1] -mul v12.4S, v12.4S,v26.s[1] -sub v14.4s, v0.4s, v2.4s -add v0.4s, v0.4s, v2.4s -sqrdmulh v2.4S, v6.4S, v25.s[0] -mul v6.4S, v6.4S,v26.s[0] -sub v7.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -mla v3.4S, v4.4S, v31.s[0] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v17.4s, v21.4s -mla v12.4S, v8.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v19.4S, v23.s[0] -mul v19.4S, v19.4S,v24.s[0] -sub v2.4s, v16.4s, v15.4s -sqrdmulh v8.4S, v7.4S, v23.s[1] -mul v7.4S, v7.4S,v24.s[1] -add v16.4s, v16.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v23.s[2] -mul v16.4S, v16.4S,v24.s[2] -sub v4.4s, v22.4s, v3.4s -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v2.4S, v23.s[3] -mul v2.4S, v2.4S,v24.s[3] -sub v13.4s, v20.4s, v10.4s -add v20.4s, v20.4s, v10.4s -mla v19.4S, v21.4S, v31.s[0] -mla v7.4S, v8.4S, v31.s[0] -sub v8.4s, v5.4s, v12.4s -str q22, [x0, #272] -mla v16.4S, v15.4S, v31.s[0] -mla v2.4S, v3.4S, v31.s[0] -add v5.4s, v5.4s, v12.4s -str q4, [x0, #336] -sub v23.4s, v11.4s, v6.4s -str q20, [x0, #400] -add v11.4s, v11.4s, v6.4s -str q13, [x0, #464] -sub v13.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sub v19.4s, v14.4s, v7.4s -add v14.4s, v14.4s, v7.4s -sub v7.4s, v17.4s, v16.4s -str q5, [x0, #144] -add v17.4s, v17.4s, v16.4s -str q8, [x0, #208] -sub v8.4s, v9.4s, v2.4s -str q11, [x0, #16] -add v9.4s, v9.4s, v2.4s -str q23, [x0, #80] -str q0, [x0, #528] -str q13, [x0, #592] -str q14, [x0, #656] -str q19, [x0, #720] -str q17, [x0, #784] -str q7, [x0, #848] -str q9, [x0, #912] -str q8, [x0, #976] -ldr q18, [x0, #224] -ldr q1, [x0, #160] -ldr q10, [x0, #32] -ldr q21, [x17, #+128] -ldr q22, [x17, #+144] -sqrdmulh v15.4S, v10.4S, v22.s[0] -mul v10.4S, v10.4S,v21.s[0] -ldr q3, [x0, #48] -sqrdmulh v12.4S, v3.4S, v22.s[0] -mul v3.4S, v3.4S,v21.s[0] -ldr q4, [x17, #+160] -ldr q30, [x17, #+176] -ldr q29, [x0, #96] -sqrdmulh v28.4S, v29.4S, v30.s[0] -mul v29.4S, v29.4S,v4.s[0] -ldr q27, [x0, #112] -sqrdmulh v26.4S, v27.4S, v30.s[0] -mul v27.4S, v27.4S,v4.s[0] -ldr q25, [x17, #+192] -ldr q24, [x17, #+208] -mla v10.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v1.4S, v24.s[0] -ldr q20, [x0, #176] -mla v3.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v20.4S, v24.s[0] -ldr q6, [x17, #+224] -ldr q5, [x17, #+240] -mla v29.4S, v28.4S, v31.s[0] -sqrdmulh v28.4S, v18.4S, v5.s[0] -ldr q16, [x0, #240] -mla v27.4S, v26.4S, v31.s[0] -sqrdmulh v26.4S, v16.4S, v5.s[0] -ldr q11, [x0, #0] -ldr q2, [x0, #128] -mul v1.4S, v1.4S,v25.s[0] -sub v23.4s, v11.4s, v10.4s -ldr q0, [x0, #16] -mul v20.4S, v20.4S,v25.s[0] -add v11.4s, v11.4s, v10.4s -ldr q10, [x0, #144] -mla v1.4S, v15.4S, v31.s[0] -sub v15.4s, v0.4s, v3.4s -ldr q13, [x0, #64] -mla v20.4S, v12.4S, v31.s[0] -add v0.4s, v0.4s, v3.4s -ldr q3, [x0, #192] -mul v18.4S, v18.4S,v6.s[0] -sub v12.4s, v13.4s, v29.4s -ldr q14, [x0, #80] -mul v16.4S, v16.4S,v6.s[0] -add v13.4s, v13.4s, v29.4s -ldr q29, [x0, #208] -mla v18.4S, v28.4S, v31.s[0] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v14.4s, v27.4s -sqrdmulh v28.4S, v0.4S, v22.s[1] -add v14.4s, v14.4s, v27.4s -mul v0.4S, v0.4S,v21.s[1] -sqrdmulh v27.4S, v15.4S, v22.s[2] -sub v19.4s, v2.4s, v1.4s -mul v15.4S, v15.4S,v21.s[2] -add v2.4s, v2.4s, v1.4s -sqrdmulh v22.4S, v14.4S, v30.s[1] -sub v21.4s, v10.4s, v20.4s -mul v14.4S, v14.4S,v4.s[1] -add v10.4s, v10.4s, v20.4s -sqrdmulh v20.4S, v26.4S, v30.s[2] -sub v1.4s, v3.4s, v18.4s -mul v26.4S, v26.4S,v4.s[2] -add v3.4s, v3.4s, v18.4s -mla v0.4S, v28.4S, v31.s[0] -sub v28.4s, v29.4s, v16.4s -ldr q30, [x0, #480] -sqrdmulh v4.4S, v10.4S, v24.s[1] -add v29.4s, v29.4s, v16.4s -mla v15.4S, v27.4S, v31.s[0] -ldr q27, [x0, #416] -sqrdmulh v16.4S, v21.4S, v24.s[2] -sub v18.4s, v11.4s, v0.4s -mla v14.4S, v22.4S, v31.s[0] -ldr q22, [x0, #288] -sqrdmulh v17.4S, v29.4S, v5.s[1] -add v11.4s, v11.4s, v0.4s -str q18, [x0, #16] -mla v26.4S, v20.4S, v31.s[0] -ldr q20, [x17, #+256] -ldr q18, [x17, #+272] -sqrdmulh v0.4S, v28.4S, v5.s[2] -sub v7.4s, v23.4s, v15.4s -str q11, [x0, #0] -mul v10.4S, v10.4S,v25.s[1] -add v23.4s, v23.4s, v15.4s -mul v21.4S, v21.4S,v25.s[2] -str q7, [x0, #48] -mla v10.4S, v4.4S, v31.s[0] -sub v4.4s, v13.4s, v14.4s -mla v21.4S, v16.4S, v31.s[0] -str q23, [x0, #32] -mul v29.4S, v29.4S,v6.s[1] -str q4, [x0, #80] -mul v28.4S, v28.4S,v6.s[2] -add v13.4s, v13.4s, v14.4s -str q13, [x0, #64] -mla v29.4S, v17.4S, v31.s[0] -sub v17.4s, v12.4s, v26.4s -str q17, [x0, #112] -mla v28.4S, v0.4S, v31.s[0] -add v12.4s, v12.4s, v26.4s -str q12, [x0, #96] -sqrdmulh v5.4S, v22.4S, v18.s[0] -sub v6.4s, v2.4s, v10.4s -mul v22.4S, v22.4S,v20.s[0] -str q6, [x0, #144] -ldr q6, [x0, #304] -sqrdmulh v12.4S, v6.4S, v18.s[0] -add v2.4s, v2.4s, v10.4s -mul v6.4S, v6.4S,v20.s[0] -str q2, [x0, #128] -ldr q2, [x17, #+288] -ldr q10, [x17, #+304] -ldr q26, [x0, #352] -sqrdmulh v0.4S, v26.4S, v10.s[0] -sub v17.4s, v19.4s, v21.4s -mul v26.4S, v26.4S,v2.s[0] -str q17, [x0, #176] -ldr q17, [x0, #368] -sqrdmulh v13.4S, v17.4S, v10.s[0] -add v19.4s, v19.4s, v21.4s -mul v17.4S, v17.4S,v2.s[0] -str q19, [x0, #160] -ldr q19, [x17, #+320] -ldr q21, [x17, #+336] -mla v22.4S, v5.4S, v31.s[0] -sub v5.4s, v3.4s, v29.4s -sqrdmulh v14.4S, v27.4S, v21.s[0] -str q5, [x0, #208] -ldr q5, [x0, #432] -mla v6.4S, v12.4S, v31.s[0] -add v3.4s, v3.4s, v29.4s -sqrdmulh v29.4S, v5.4S, v21.s[0] -str q3, [x0, #192] -ldr q3, [x17, #+352] -ldr q12, [x17, #+368] -mla v26.4S, v0.4S, v31.s[0] -sub v0.4s, v1.4s, v28.4s -sqrdmulh v4.4S, v30.4S, v12.s[0] -str q0, [x0, #240] -ldr q0, [x0, #496] -mla v17.4S, v13.4S, v31.s[0] -add v1.4s, v1.4s, v28.4s -sqrdmulh v28.4S, v0.4S, v12.s[0] -str q1, [x0, #224] -ldr q1, [x0, #256] -ldr q13, [x0, #384] -mul v27.4S, v27.4S,v19.s[0] -sub v24.4s, v1.4s, v22.4s -ldr q25, [x0, #272] -mul v5.4S, v5.4S,v19.s[0] -add v1.4s, v1.4s, v22.4s -ldr q22, [x0, #400] -mla v27.4S, v14.4S, v31.s[0] -sub v14.4s, v25.4s, v6.4s -ldr q23, [x0, #320] -mla v5.4S, v29.4S, v31.s[0] -add v25.4s, v25.4s, v6.4s -ldr q6, [x0, #448] -mul v30.4S, v30.4S,v3.s[0] -sub v29.4s, v23.4s, v26.4s -ldr q16, [x0, #336] -mul v0.4S, v0.4S,v3.s[0] -add v23.4s, v23.4s, v26.4s -ldr q26, [x0, #464] -mla v30.4S, v4.4S, v31.s[0] -mla v0.4S, v28.4S, v31.s[0] -sub v28.4s, v16.4s, v17.4s -sqrdmulh v4.4S, v25.4S, v18.s[1] -add v16.4s, v16.4s, v17.4s -mul v25.4S, v25.4S,v20.s[1] -sqrdmulh v17.4S, v14.4S, v18.s[2] -sub v7.4s, v13.4s, v27.4s -mul v14.4S, v14.4S,v20.s[2] -add v13.4s, v13.4s, v27.4s -sqrdmulh v18.4S, v16.4S, v10.s[1] -sub v20.4s, v22.4s, v5.4s -mul v16.4S, v16.4S,v2.s[1] -add v22.4s, v22.4s, v5.4s -sqrdmulh v5.4S, v28.4S, v10.s[2] -sub v27.4s, v6.4s, v30.4s -mul v28.4S, v28.4S,v2.s[2] -add v6.4s, v6.4s, v30.4s -mla v25.4S, v4.4S, v31.s[0] -sub v4.4s, v26.4s, v0.4s -ldr q10, [x0, #736] -sqrdmulh v2.4S, v22.4S, v21.s[1] -add v26.4s, v26.4s, v0.4s -mla v14.4S, v17.4S, v31.s[0] -ldr q17, [x0, #672] -sqrdmulh v0.4S, v20.4S, v21.s[2] -sub v30.4s, v1.4s, v25.4s -mla v16.4S, v18.4S, v31.s[0] -ldr q18, [x0, #544] -sqrdmulh v15.4S, v26.4S, v12.s[1] -add v1.4s, v1.4s, v25.4s -str q30, [x0, #272] -mla v28.4S, v5.4S, v31.s[0] -ldr q5, [x17, #+384] -ldr q30, [x17, #+400] -sqrdmulh v25.4S, v4.4S, v12.s[2] -sub v11.4s, v24.4s, v14.4s -str q1, [x0, #256] -mul v22.4S, v22.4S,v19.s[1] -add v24.4s, v24.4s, v14.4s -mul v20.4S, v20.4S,v19.s[2] -str q11, [x0, #304] -mla v22.4S, v2.4S, v31.s[0] -sub v2.4s, v23.4s, v16.4s -mla v20.4S, v0.4S, v31.s[0] -str q24, [x0, #288] -mul v26.4S, v26.4S,v3.s[1] -str q2, [x0, #336] -mul v4.4S, v4.4S,v3.s[2] -add v23.4s, v23.4s, v16.4s -str q23, [x0, #320] -mla v26.4S, v15.4S, v31.s[0] -sub v15.4s, v29.4s, v28.4s -str q15, [x0, #368] -mla v4.4S, v25.4S, v31.s[0] -add v29.4s, v29.4s, v28.4s -str q29, [x0, #352] -sqrdmulh v12.4S, v18.4S, v30.s[0] -sub v3.4s, v13.4s, v22.4s -mul v18.4S, v18.4S,v5.s[0] -str q3, [x0, #400] -ldr q3, [x0, #560] -sqrdmulh v29.4S, v3.4S, v30.s[0] -add v13.4s, v13.4s, v22.4s -mul v3.4S, v3.4S,v5.s[0] -str q13, [x0, #384] -ldr q13, [x17, #+416] -ldr q22, [x17, #+432] -ldr q28, [x0, #608] -sqrdmulh v25.4S, v28.4S, v22.s[0] -sub v15.4s, v7.4s, v20.4s -mul v28.4S, v28.4S,v13.s[0] -str q15, [x0, #432] -ldr q15, [x0, #624] -sqrdmulh v23.4S, v15.4S, v22.s[0] -add v7.4s, v7.4s, v20.4s -mul v15.4S, v15.4S,v13.s[0] -str q7, [x0, #416] -ldr q7, [x17, #+448] -ldr q20, [x17, #+464] -mla v18.4S, v12.4S, v31.s[0] -sub v12.4s, v6.4s, v26.4s -sqrdmulh v16.4S, v17.4S, v20.s[0] -str q12, [x0, #464] -ldr q12, [x0, #688] -mla v3.4S, v29.4S, v31.s[0] -add v6.4s, v6.4s, v26.4s -sqrdmulh v26.4S, v12.4S, v20.s[0] -str q6, [x0, #448] -ldr q6, [x17, #+480] -ldr q29, [x17, #+496] -mla v28.4S, v25.4S, v31.s[0] -sub v25.4s, v27.4s, v4.4s -sqrdmulh v2.4S, v10.4S, v29.s[0] -str q25, [x0, #496] -ldr q25, [x0, #752] -mla v15.4S, v23.4S, v31.s[0] -add v27.4s, v27.4s, v4.4s -sqrdmulh v4.4S, v25.4S, v29.s[0] -str q27, [x0, #480] -ldr q27, [x0, #512] -ldr q23, [x0, #640] -mul v17.4S, v17.4S,v7.s[0] -sub v21.4s, v27.4s, v18.4s -ldr q19, [x0, #528] -mul v12.4S, v12.4S,v7.s[0] -add v27.4s, v27.4s, v18.4s -ldr q18, [x0, #656] -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v19.4s, v3.4s -ldr q24, [x0, #576] -mla v12.4S, v26.4S, v31.s[0] -add v19.4s, v19.4s, v3.4s -ldr q3, [x0, #704] -mul v10.4S, v10.4S,v6.s[0] -sub v26.4s, v24.4s, v28.4s -ldr q0, [x0, #592] -mul v25.4S, v25.4S,v6.s[0] -add v24.4s, v24.4s, v28.4s -ldr q28, [x0, #720] -mla v10.4S, v2.4S, v31.s[0] -mla v25.4S, v4.4S, v31.s[0] -sub v4.4s, v0.4s, v15.4s -sqrdmulh v2.4S, v19.4S, v30.s[1] -add v0.4s, v0.4s, v15.4s -mul v19.4S, v19.4S,v5.s[1] -sqrdmulh v15.4S, v16.4S, v30.s[2] -sub v11.4s, v23.4s, v17.4s -mul v16.4S, v16.4S,v5.s[2] -add v23.4s, v23.4s, v17.4s -sqrdmulh v30.4S, v0.4S, v22.s[1] -sub v5.4s, v18.4s, v12.4s -mul v0.4S, v0.4S,v13.s[1] -add v18.4s, v18.4s, v12.4s -sqrdmulh v12.4S, v4.4S, v22.s[2] -sub v17.4s, v3.4s, v10.4s -mul v4.4S, v4.4S,v13.s[2] -add v3.4s, v3.4s, v10.4s -mla v19.4S, v2.4S, v31.s[0] -sub v2.4s, v28.4s, v25.4s -ldr q22, [x0, #992] -sqrdmulh v13.4S, v18.4S, v20.s[1] -add v28.4s, v28.4s, v25.4s -mla v16.4S, v15.4S, v31.s[0] -ldr q15, [x0, #928] -sqrdmulh v25.4S, v5.4S, v20.s[2] -sub v10.4s, v27.4s, v19.4s -mla v0.4S, v30.4S, v31.s[0] -ldr q30, [x0, #800] -sqrdmulh v14.4S, v28.4S, v29.s[1] -add v27.4s, v27.4s, v19.4s -str q10, [x0, #528] -mla v4.4S, v12.4S, v31.s[0] -ldr q12, [x17, #+512] -ldr q10, [x17, #+528] -sqrdmulh v19.4S, v2.4S, v29.s[2] -sub v1.4s, v21.4s, v16.4s -str q27, [x0, #512] -mul v18.4S, v18.4S,v7.s[1] -add v21.4s, v21.4s, v16.4s -mul v5.4S, v5.4S,v7.s[2] -str q1, [x0, #560] -mla v18.4S, v13.4S, v31.s[0] -sub v13.4s, v24.4s, v0.4s -mla v5.4S, v25.4S, v31.s[0] -str q21, [x0, #544] -mul v28.4S, v28.4S,v6.s[1] -str q13, [x0, #592] -mul v2.4S, v2.4S,v6.s[2] -add v24.4s, v24.4s, v0.4s -str q24, [x0, #576] -mla v28.4S, v14.4S, v31.s[0] -sub v14.4s, v26.4s, v4.4s -str q14, [x0, #624] -mla v2.4S, v19.4S, v31.s[0] -add v26.4s, v26.4s, v4.4s -str q26, [x0, #608] -sqrdmulh v29.4S, v30.4S, v10.s[0] -sub v6.4s, v23.4s, v18.4s -mul v30.4S, v30.4S,v12.s[0] -str q6, [x0, #656] -ldr q6, [x0, #816] -sqrdmulh v26.4S, v6.4S, v10.s[0] -add v23.4s, v23.4s, v18.4s -mul v6.4S, v6.4S,v12.s[0] -str q23, [x0, #640] -ldr q23, [x17, #+544] -ldr q18, [x17, #+560] -ldr q4, [x0, #864] -sqrdmulh v19.4S, v4.4S, v18.s[0] -sub v14.4s, v11.4s, v5.4s -mul v4.4S, v4.4S,v23.s[0] -str q14, [x0, #688] -ldr q14, [x0, #880] -sqrdmulh v24.4S, v14.4S, v18.s[0] -add v11.4s, v11.4s, v5.4s -mul v14.4S, v14.4S,v23.s[0] -str q11, [x0, #672] -ldr q11, [x17, #+576] -ldr q5, [x17, #+592] -mla v30.4S, v29.4S, v31.s[0] -sub v29.4s, v3.4s, v28.4s -sqrdmulh v0.4S, v15.4S, v5.s[0] -str q29, [x0, #720] -ldr q29, [x0, #944] -mla v6.4S, v26.4S, v31.s[0] -add v3.4s, v3.4s, v28.4s -sqrdmulh v28.4S, v29.4S, v5.s[0] -str q3, [x0, #704] -ldr q3, [x17, #+608] -ldr q26, [x17, #+624] -mla v4.4S, v19.4S, v31.s[0] -sub v19.4s, v17.4s, v2.4s -sqrdmulh v13.4S, v22.4S, v26.s[0] -str q19, [x0, #752] -ldr q19, [x0, #1008] -mla v14.4S, v24.4S, v31.s[0] -add v17.4s, v17.4s, v2.4s -sqrdmulh v2.4S, v19.4S, v26.s[0] -str q17, [x0, #736] -ldr q17, [x0, #768] -ldr q24, [x0, #896] -mul v15.4S, v15.4S,v11.s[0] -sub v20.4s, v17.4s, v30.4s -ldr q7, [x0, #784] -mul v29.4S, v29.4S,v11.s[0] -add v17.4s, v17.4s, v30.4s -ldr q30, [x0, #912] -mla v15.4S, v0.4S, v31.s[0] -sub v0.4s, v7.4s, v6.4s -ldr q21, [x0, #832] -mla v29.4S, v28.4S, v31.s[0] -add v7.4s, v7.4s, v6.4s -ldr q6, [x0, #960] -mul v22.4S, v22.4S,v3.s[0] -sub v28.4s, v21.4s, v4.4s -ldr q25, [x0, #848] -mul v19.4S, v19.4S,v3.s[0] -add v21.4s, v21.4s, v4.4s -ldr q4, [x0, #976] -mla v22.4S, v13.4S, v31.s[0] -mla v19.4S, v2.4S, v31.s[0] -sub v2.4s, v25.4s, v14.4s -sqrdmulh v13.4S, v7.4S, v10.s[1] -add v25.4s, v25.4s, v14.4s -mul v7.4S, v7.4S,v12.s[1] -sqrdmulh v14.4S, v0.4S, v10.s[2] -sub v1.4s, v24.4s, v15.4s -mul v0.4S, v0.4S,v12.s[2] -add v24.4s, v24.4s, v15.4s -sqrdmulh v10.4S, v25.4S, v18.s[1] -sub v12.4s, v30.4s, v29.4s -mul v25.4S, v25.4S,v23.s[1] -add v30.4s, v30.4s, v29.4s -sqrdmulh v29.4S, v2.4S, v18.s[2] -sub v15.4s, v6.4s, v22.4s -mul v2.4S, v2.4S,v23.s[2] -add v6.4s, v6.4s, v22.4s -mla v7.4S, v13.4S, v31.s[0] -sub v13.4s, v4.4s, v19.4s -sqrdmulh v18.4S, v30.4S, v5.s[1] -add v4.4s, v4.4s, v19.4s -mla v0.4S, v14.4S, v31.s[0] -sqrdmulh v14.4S, v12.4S, v5.s[2] -sub v19.4s, v17.4s, v7.4s -mla v25.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v4.4S, v26.s[1] -add v17.4s, v17.4s, v7.4s -str q19, [x0, #784] -mla v2.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v13.4S, v26.s[2] -sub v19.4s, v20.4s, v0.4s -str q17, [x0, #768] -mul v30.4S, v30.4S,v11.s[1] -add v20.4s, v20.4s, v0.4s -mul v12.4S, v12.4S,v11.s[2] -str q19, [x0, #816] -mla v30.4S, v18.4S, v31.s[0] -sub v18.4s, v21.4s, v25.4s -mla v12.4S, v14.4S, v31.s[0] -str q20, [x0, #800] -mul v4.4S, v4.4S,v3.s[1] -str q18, [x0, #848] -mul v13.4S, v13.4S,v3.s[2] -add v21.4s, v21.4s, v25.4s -str q21, [x0, #832] -mla v4.4S, v10.4S, v31.s[0] -sub v10.4s, v28.4s, v2.4s -str q10, [x0, #880] -mla v13.4S, v29.4S, v31.s[0] -add v28.4s, v28.4s, v2.4s -str q28, [x0, #864] -sub v26.4s, v24.4s, v30.4s -str q26, [x0, #912] -add v24.4s, v24.4s, v30.4s -str q24, [x0, #896] -sub v24.4s, v1.4s, v12.4s -str q24, [x0, #944] -add v1.4s, v1.4s, v12.4s -str q1, [x0, #928] -sub v1.4s, v6.4s, v4.4s -str q1, [x0, #976] -add v6.4s, v6.4s, v4.4s -str q6, [x0, #960] -sub v6.4s, v15.4s, v13.4s -str q6, [x0, #1008] -add v15.4s, v15.4s, v13.4s -str q15, [x0, #992] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1464 -// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_8.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_8.s deleted file mode 100644 index 70d520f..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_8.s +++ /dev/null @@ -1,1494 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_7_z4_8 -.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_8 -ntt_u32_incomplete_neon_asm_var_4_2_7_z4_8: -_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_8: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #928] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #992] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q18, [x0, #800] -sqrdmulh v17.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -ldr q16, [x0, #864] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -mla v22.4S, v21.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -mla v18.4S, v17.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -ldr q3, [x0, #544] -sqrdmulh v17.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -ldr q19, [x0, #608] -sqrdmulh v21.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q2, [x0, #672] -ldr q1, [x0, #416] -sqrdmulh v0.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v15.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -ldr q22, [x0, #736] -ldr q14, [x0, #480] -sqrdmulh v13.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v12.4s, v14.4s, v20.4s -add v14.4s, v14.4s, v20.4s -ldr q20, [x0, #288] -mla v3.4S, v17.4S, v31.s[0] -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v20.4s, v18.4s -mla v2.4S, v0.4S, v31.s[0] -mla v22.4S, v13.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -ldr q18, [x0, #352] -sqrdmulh v13.4S, v1.4S, v29.s[1] -mul v1.4S, v1.4S,v30.s[1] -sub v0.4s, v18.4s, v16.4s -sqrdmulh v17.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -add v18.4s, v18.4s, v16.4s -ldr q16, [x0, #32] -sqrdmulh v11.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v10.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -ldr q3, [x0, #96] -sqrdmulh v9.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v8.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -ldr q19, [x0, #160] -mla v1.4S, v13.4S, v31.s[0] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v19.4s, v2.4s -mla v20.4S, v11.4S, v31.s[0] -mla v18.4S, v9.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -ldr q2, [x0, #224] -sqrdmulh v9.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v11.4s, v2.4s, v22.4s -sqrdmulh v13.4S, v12.4S, v29.s[2] -mul v12.4S, v12.4S,v30.s[2] -add v2.4s, v2.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v7.4s, v19.4s, v1.4s -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v29.s[2] -mul v0.4S, v0.4S,v30.s[2] -sub v6.4s, v2.4s, v14.4s -add v2.4s, v2.4s, v14.4s -mla v15.4S, v9.4S, v31.s[0] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v16.4s, v20.4s -mla v21.4S, v22.4S, v31.s[0] -mla v0.4S, v1.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -sub v1.4s, v3.4s, v18.4s -sqrdmulh v22.4S, v6.4S, v27.s[1] -mul v6.4S, v6.4S,v28.s[1] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v19.4S, v27.s[0] -mul v19.4S, v19.4S,v28.s[0] -sub v9.4s, v17.4s, v15.4s -add v17.4s, v17.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v14.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -mla v7.4S, v20.4S, v31.s[0] -mla v6.4S, v22.4S, v31.s[0] -sub v22.4s, v10.4s, v21.4s -mla v19.4S, v18.4S, v31.s[0] -mla v2.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v15.4s, v8.4s, v0.4s -sqrdmulh v18.4S, v11.4S, v27.s[2] -mul v11.4S, v11.4S,v28.s[2] -add v8.4s, v8.4s, v0.4s -sqrdmulh v0.4S, v9.4S, v27.s[3] -mul v9.4S, v9.4S,v28.s[3] -sub v20.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[3] -mul v14.4S, v14.4S,v28.s[3] -sub v12.4s, v1.4s, v6.4s -add v1.4s, v1.4s, v6.4s -mla v17.4S, v21.4S, v31.s[0] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v16.4s, v19.4s -mla v9.4S, v0.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v16.4s, v16.4s, v19.4s -sqrdmulh v19.4S, v1.4S, v25.s[2] -mul v1.4S, v1.4S,v26.s[2] -sub v7.4s, v3.4s, v2.4s -sqrdmulh v0.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v7.4S, v25.s[1] -mul v7.4S, v7.4S,v26.s[1] -sub v21.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -sqrdmulh v17.4S, v3.4S, v25.s[0] -mul v3.4S, v3.4S,v26.s[0] -sub v6.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -mla v1.4S, v19.4S, v31.s[0] -mla v12.4S, v0.4S, v31.s[0] -sub v0.4s, v22.4s, v9.4s -mla v7.4S, v2.4S, v31.s[0] -mla v3.4S, v17.4S, v31.s[0] -add v22.4s, v22.4s, v9.4s -sqrdmulh v9.4S, v8.4S, v23.s[0] -mul v8.4S, v8.4S,v24.s[0] -sub v17.4s, v15.4s, v14.4s -sqrdmulh v2.4S, v6.4S, v23.s[1] -mul v6.4S, v6.4S,v24.s[1] -add v15.4s, v15.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v23.s[2] -mul v15.4S, v15.4S,v24.s[2] -sub v19.4s, v13.4s, v1.4s -add v13.4s, v13.4s, v1.4s -sqrdmulh v1.4S, v17.4S, v23.s[3] -mul v17.4S, v17.4S,v24.s[3] -sub v11.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -mla v8.4S, v9.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -sub v2.4s, v18.4s, v7.4s -str q13, [x0, #288] -mla v15.4S, v14.4S, v31.s[0] -mla v17.4S, v1.4S, v31.s[0] -add v18.4s, v18.4s, v7.4s -str q19, [x0, #352] -ldr q19, [x0, #944] -sqrdmulh v7.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v1.4s, v16.4s, v3.4s -str q20, [x0, #416] -ldr q20, [x0, #1008] -sqrdmulh v14.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v16.4s, v16.4s, v3.4s -str q11, [x0, #480] -ldr q11, [x0, #816] -sqrdmulh v3.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -sub v13.4s, v10.4s, v8.4s -add v10.4s, v10.4s, v8.4s -ldr q8, [x0, #880] -sqrdmulh v9.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v12.4s, v21.4s, v6.4s -add v21.4s, v21.4s, v6.4s -mla v19.4S, v7.4S, v31.s[0] -mla v20.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v15.4s -str q18, [x0, #160] -mla v11.4S, v3.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -str q2, [x0, #224] -ldr q2, [x0, #560] -sqrdmulh v15.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v9.4s, v0.4s, v17.4s -str q16, [x0, #32] -ldr q16, [x0, #624] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -add v0.4s, v0.4s, v17.4s -str q1, [x0, #96] -ldr q1, [x0, #688] -ldr q17, [x0, #432] -sqrdmulh v18.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -sub v7.4s, v17.4s, v19.4s -add v17.4s, v17.4s, v19.4s -ldr q19, [x0, #752] -ldr q6, [x0, #496] -sqrdmulh v5.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v4.4s, v6.4s, v20.4s -add v6.4s, v6.4s, v20.4s -ldr q20, [x0, #304] -mla v2.4S, v15.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -sub v3.4s, v20.4s, v11.4s -str q10, [x0, #544] -mla v1.4S, v18.4S, v31.s[0] -mla v19.4S, v5.4S, v31.s[0] -add v20.4s, v20.4s, v11.4s -str q13, [x0, #608] -ldr q13, [x0, #368] -sqrdmulh v11.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v5.4s, v13.4s, v8.4s -str q21, [x0, #672] -sqrdmulh v21.4S, v6.4S, v29.s[1] -mul v6.4S, v6.4S,v30.s[1] -add v13.4s, v13.4s, v8.4s -str q12, [x0, #736] -ldr q12, [x0, #48] -sqrdmulh v8.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v18.4s, v12.4s, v2.4s -add v12.4s, v12.4s, v2.4s -ldr q2, [x0, #112] -sqrdmulh v10.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v15.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -ldr q16, [x0, #176] -mla v17.4S, v11.4S, v31.s[0] -mla v6.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v1.4s -str q22, [x0, #800] -mla v20.4S, v8.4S, v31.s[0] -mla v13.4S, v10.4S, v31.s[0] -add v16.4s, v16.4s, v1.4s -str q14, [x0, #864] -ldr q14, [x0, #240] -sqrdmulh v1.4S, v7.4S, v29.s[2] -mul v7.4S, v7.4S,v30.s[2] -sub v10.4s, v14.4s, v19.4s -str q0, [x0, #928] -sqrdmulh v0.4S, v4.4S, v29.s[2] -mul v4.4S, v4.4S,v30.s[2] -add v14.4s, v14.4s, v19.4s -str q9, [x0, #992] -sqrdmulh v9.4S, v3.4S, v29.s[2] -mul v3.4S, v3.4S,v30.s[2] -sub v19.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -sqrdmulh v17.4S, v5.4S, v29.s[2] -mul v5.4S, v5.4S,v30.s[2] -sub v8.4s, v14.4s, v6.4s -add v14.4s, v14.4s, v6.4s -mla v7.4S, v1.4S, v31.s[0] -mla v4.4S, v0.4S, v31.s[0] -sub v0.4s, v12.4s, v20.4s -mla v3.4S, v9.4S, v31.s[0] -mla v5.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -sub v17.4s, v2.4s, v13.4s -sqrdmulh v9.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -add v2.4s, v2.4s, v13.4s -sqrdmulh v13.4S, v16.4S, v27.s[0] -mul v16.4S, v16.4S,v28.s[0] -sub v1.4s, v21.4s, v7.4s -add v21.4s, v21.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v6.4s, v10.4s, v4.4s -add v10.4s, v10.4s, v4.4s -mla v19.4S, v20.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -sub v9.4s, v18.4s, v3.4s -mla v16.4S, v13.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[2] -mul v21.4S, v21.4S,v28.s[2] -sub v7.4s, v15.4s, v5.4s -sqrdmulh v13.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -add v15.4s, v15.4s, v5.4s -sqrdmulh v5.4S, v1.4S, v27.s[3] -mul v1.4S, v1.4S,v28.s[3] -sub v20.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v27.s[3] -mul v6.4S, v6.4S,v28.s[3] -sub v4.4s, v17.4s, v8.4s -add v17.4s, v17.4s, v8.4s -mla v21.4S, v3.4S, v31.s[0] -mla v10.4S, v13.4S, v31.s[0] -sub v13.4s, v12.4s, v16.4s -mla v1.4S, v5.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v25.s[2] -mul v17.4S, v17.4S,v26.s[2] -sub v19.4s, v2.4s, v14.4s -sqrdmulh v5.4S, v4.4S, v25.s[3] -mul v4.4S, v4.4S,v26.s[3] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v3.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v15.4s, v10.4s -add v15.4s, v15.4s, v10.4s -mla v17.4S, v16.4S, v31.s[0] -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v9.4s, v1.4s -mla v19.4S, v14.4S, v31.s[0] -mla v2.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v23.s[0] -mul v15.4S, v15.4S,v24.s[0] -sub v21.4s, v7.4s, v6.4s -sqrdmulh v14.4S, v8.4S, v23.s[1] -mul v8.4S, v8.4S,v24.s[1] -add v7.4s, v7.4s, v6.4s -sqrdmulh v6.4S, v7.4S, v23.s[2] -mul v7.4S, v7.4S,v24.s[2] -sub v16.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v23.s[3] -mul v21.4S, v21.4S,v24.s[3] -sub v10.4s, v20.4s, v4.4s -add v20.4s, v20.4s, v4.4s -mla v15.4S, v1.4S, v31.s[0] -mla v8.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v19.4s -str q0, [x0, #304] -mla v7.4S, v6.4S, v31.s[0] -mla v21.4S, v17.4S, v31.s[0] -add v13.4s, v13.4s, v19.4s -str q16, [x0, #368] -ldr q16, [x0, #896] -sqrdmulh v19.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v17.4s, v12.4s, v2.4s -str q20, [x0, #432] -ldr q20, [x0, #960] -sqrdmulh v6.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v12.4s, v12.4s, v2.4s -str q10, [x0, #496] -ldr q10, [x0, #768] -sqrdmulh v2.4S, v10.4S, v29.s[0] -mul v10.4S, v10.4S,v30.s[0] -sub v0.4s, v18.4s, v15.4s -add v18.4s, v18.4s, v15.4s -ldr q15, [x0, #832] -sqrdmulh v1.4S, v15.4S, v29.s[0] -mul v15.4S, v15.4S,v30.s[0] -sub v4.4s, v3.4s, v8.4s -add v3.4s, v3.4s, v8.4s -mla v16.4S, v19.4S, v31.s[0] -mla v20.4S, v6.4S, v31.s[0] -sub v6.4s, v9.4s, v7.4s -str q13, [x0, #176] -mla v10.4S, v2.4S, v31.s[0] -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v7.4s -str q14, [x0, #240] -ldr q14, [x0, #512] -sqrdmulh v7.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v1.4s, v5.4s, v21.4s -str q12, [x0, #48] -ldr q12, [x0, #576] -sqrdmulh v2.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -add v5.4s, v5.4s, v21.4s -str q17, [x0, #112] -ldr q17, [x0, #640] -ldr q21, [x0, #384] -sqrdmulh v13.4S, v17.4S, v29.s[0] -mul v17.4S, v17.4S,v30.s[0] -sub v19.4s, v21.4s, v16.4s -add v21.4s, v21.4s, v16.4s -ldr q16, [x0, #704] -ldr q8, [x0, #448] -sqrdmulh v22.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v11.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -ldr q20, [x0, #256] -mla v14.4S, v7.4S, v31.s[0] -mla v12.4S, v2.4S, v31.s[0] -sub v2.4s, v20.4s, v10.4s -str q18, [x0, #560] -mla v17.4S, v13.4S, v31.s[0] -mla v16.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -str q0, [x0, #624] -ldr q0, [x0, #320] -sqrdmulh v10.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v22.4s, v0.4s, v15.4s -str q3, [x0, #688] -sqrdmulh v3.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -add v0.4s, v0.4s, v15.4s -str q4, [x0, #752] -ldr q4, [x0, #0] -sqrdmulh v15.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v13.4s, v4.4s, v14.4s -add v4.4s, v4.4s, v14.4s -ldr q14, [x0, #64] -sqrdmulh v18.4S, v0.4S, v29.s[1] -mul v0.4S, v0.4S,v30.s[1] -sub v7.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -ldr q12, [x0, #128] -mla v21.4S, v10.4S, v31.s[0] -mla v8.4S, v3.4S, v31.s[0] -sub v3.4s, v12.4s, v17.4s -str q9, [x0, #816] -mla v20.4S, v15.4S, v31.s[0] -mla v0.4S, v18.4S, v31.s[0] -add v12.4s, v12.4s, v17.4s -str q6, [x0, #880] -ldr q6, [x0, #192] -sqrdmulh v17.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v18.4s, v6.4s, v16.4s -str q5, [x0, #944] -sqrdmulh v5.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -add v6.4s, v6.4s, v16.4s -str q1, [x0, #1008] -sqrdmulh v1.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v16.4s, v12.4s, v21.4s -add v12.4s, v12.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v15.4s, v6.4s, v8.4s -add v6.4s, v6.4s, v8.4s -mla v19.4S, v17.4S, v31.s[0] -mla v11.4S, v5.4S, v31.s[0] -sub v5.4s, v4.4s, v20.4s -mla v2.4S, v1.4S, v31.s[0] -mla v22.4S, v21.4S, v31.s[0] -add v4.4s, v4.4s, v20.4s -sqrdmulh v20.4S, v16.4S, v27.s[1] -mul v16.4S, v16.4S,v28.s[1] -sub v21.4s, v14.4s, v0.4s -sqrdmulh v1.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v17.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v27.s[0] -mul v6.4S, v6.4S,v28.s[0] -sub v8.4s, v18.4s, v11.4s -add v18.4s, v18.4s, v11.4s -mla v16.4S, v20.4S, v31.s[0] -mla v15.4S, v1.4S, v31.s[0] -sub v1.4s, v13.4s, v2.4s -mla v12.4S, v0.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v3.4S, v27.s[2] -mul v3.4S, v3.4S,v28.s[2] -sub v19.4s, v7.4s, v22.4s -sqrdmulh v0.4S, v18.4S, v27.s[2] -mul v18.4S, v18.4S,v28.s[2] -add v7.4s, v7.4s, v22.4s -sqrdmulh v22.4S, v17.4S, v27.s[3] -mul v17.4S, v17.4S,v28.s[3] -sub v20.4s, v5.4s, v16.4s -add v5.4s, v5.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v11.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -mla v3.4S, v2.4S, v31.s[0] -mla v18.4S, v0.4S, v31.s[0] -sub v0.4s, v4.4s, v12.4s -mla v17.4S, v22.4S, v31.s[0] -mla v8.4S, v16.4S, v31.s[0] -add v4.4s, v4.4s, v12.4s -sqrdmulh v12.4S, v21.4S, v25.s[2] -mul v21.4S, v21.4S,v26.s[2] -sub v16.4s, v14.4s, v6.4s -sqrdmulh v22.4S, v11.4S, v25.s[3] -mul v11.4S, v11.4S,v26.s[3] -add v14.4s, v14.4s, v6.4s -sqrdmulh v6.4S, v16.4S, v25.s[1] -mul v16.4S, v16.4S,v26.s[1] -sub v2.4s, v13.4s, v3.4s -add v13.4s, v13.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v25.s[0] -mul v14.4S, v14.4S,v26.s[0] -sub v15.4s, v7.4s, v18.4s -add v7.4s, v7.4s, v18.4s -mla v21.4S, v12.4S, v31.s[0] -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v1.4s, v17.4s -mla v16.4S, v6.4S, v31.s[0] -mla v14.4S, v3.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -sqrdmulh v17.4S, v7.4S, v23.s[0] -mul v7.4S, v7.4S,v24.s[0] -sub v3.4s, v19.4s, v8.4s -sqrdmulh v6.4S, v15.4S, v23.s[1] -mul v15.4S, v15.4S,v24.s[1] -add v19.4s, v19.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v23.s[2] -mul v19.4S, v19.4S,v24.s[2] -sub v12.4s, v5.4s, v21.4s -add v5.4s, v5.4s, v21.4s -sqrdmulh v21.4S, v3.4S, v23.s[3] -mul v3.4S, v3.4S,v24.s[3] -sub v18.4s, v20.4s, v11.4s -add v20.4s, v20.4s, v11.4s -mla v7.4S, v17.4S, v31.s[0] -mla v15.4S, v6.4S, v31.s[0] -sub v6.4s, v0.4s, v16.4s -str q5, [x0, #256] -mla v19.4S, v8.4S, v31.s[0] -mla v3.4S, v21.4S, v31.s[0] -add v0.4s, v0.4s, v16.4s -str q12, [x0, #320] -ldr q12, [x0, #912] -sqrdmulh v16.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -sub v21.4s, v4.4s, v14.4s -str q20, [x0, #384] -ldr q20, [x0, #976] -sqrdmulh v8.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v4.4s, v4.4s, v14.4s -str q18, [x0, #448] -ldr q18, [x0, #784] -sqrdmulh v14.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -sub v5.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -ldr q7, [x0, #848] -sqrdmulh v17.4S, v7.4S, v29.s[0] -mul v7.4S, v7.4S,v30.s[0] -sub v11.4s, v2.4s, v15.4s -add v2.4s, v2.4s, v15.4s -mla v12.4S, v16.4S, v31.s[0] -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v1.4s, v19.4s -str q0, [x0, #128] -mla v18.4S, v14.4S, v31.s[0] -mla v7.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v19.4s -str q6, [x0, #192] -ldr q6, [x0, #528] -sqrdmulh v19.4S, v6.4S, v29.s[0] -mul v6.4S, v6.4S,v30.s[0] -sub v17.4s, v22.4s, v3.4s -str q4, [x0, #0] -ldr q4, [x0, #592] -sqrdmulh v14.4S, v4.4S, v29.s[0] -mul v4.4S, v4.4S,v30.s[0] -add v22.4s, v22.4s, v3.4s -str q21, [x0, #64] -ldr q21, [x0, #656] -ldr q3, [x0, #400] -sqrdmulh v0.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -sub v16.4s, v3.4s, v12.4s -add v3.4s, v3.4s, v12.4s -ldr q12, [x0, #720] -ldr q15, [x0, #464] -sqrdmulh v9.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -sub v10.4s, v15.4s, v20.4s -add v15.4s, v15.4s, v20.4s -ldr q20, [x0, #272] -mla v6.4S, v19.4S, v31.s[0] -mla v4.4S, v14.4S, v31.s[0] -sub v14.4s, v20.4s, v18.4s -str q13, [x0, #512] -mla v21.4S, v0.4S, v31.s[0] -mla v12.4S, v9.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -str q5, [x0, #576] -ldr q5, [x0, #336] -sqrdmulh v18.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v9.4s, v5.4s, v7.4s -str q2, [x0, #640] -sqrdmulh v2.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -add v5.4s, v5.4s, v7.4s -str q11, [x0, #704] -ldr q11, [x0, #16] -sqrdmulh v7.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v0.4s, v11.4s, v6.4s -add v11.4s, v11.4s, v6.4s -ldr q6, [x0, #80] -sqrdmulh v13.4S, v5.4S, v29.s[1] -mul v5.4S, v5.4S,v30.s[1] -sub v19.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -ldr q4, [x0, #144] -mla v3.4S, v18.4S, v31.s[0] -mla v15.4S, v2.4S, v31.s[0] -sub v2.4s, v4.4s, v21.4s -str q1, [x0, #768] -mla v20.4S, v7.4S, v31.s[0] -mla v5.4S, v13.4S, v31.s[0] -add v4.4s, v4.4s, v21.4s -str q8, [x0, #832] -ldr q8, [x0, #208] -sqrdmulh v21.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -sub v13.4s, v8.4s, v12.4s -str q22, [x0, #896] -sqrdmulh v22.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -add v8.4s, v8.4s, v12.4s -str q17, [x0, #960] -sqrdmulh v17.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v12.4s, v4.4s, v3.4s -add v4.4s, v4.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v7.4s, v8.4s, v15.4s -add v8.4s, v8.4s, v15.4s -mla v16.4S, v21.4S, v31.s[0] -mla v10.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v20.4s -mla v14.4S, v17.4S, v31.s[0] -mla v9.4S, v3.4S, v31.s[0] -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v12.4S, v27.s[1] -mul v12.4S, v12.4S,v28.s[1] -sub v3.4s, v6.4s, v5.4s -sqrdmulh v17.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v4.4S, v27.s[0] -mul v4.4S, v4.4S,v28.s[0] -sub v21.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v27.s[0] -mul v8.4S, v8.4S,v28.s[0] -sub v15.4s, v13.4s, v10.4s -add v13.4s, v13.4s, v10.4s -mla v12.4S, v20.4S, v31.s[0] -mla v7.4S, v17.4S, v31.s[0] -sub v17.4s, v0.4s, v14.4s -mla v4.4S, v5.4S, v31.s[0] -mla v8.4S, v16.4S, v31.s[0] -add v0.4s, v0.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v16.4s, v19.4s, v9.4s -sqrdmulh v5.4S, v13.4S, v27.s[2] -mul v13.4S, v13.4S,v28.s[2] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v10.4s, v3.4s, v7.4s -add v3.4s, v3.4s, v7.4s -mla v2.4S, v14.4S, v31.s[0] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v11.4s, v4.4s -mla v21.4S, v9.4S, v31.s[0] -mla v15.4S, v12.4S, v31.s[0] -add v11.4s, v11.4s, v4.4s -sqrdmulh v4.4S, v3.4S, v25.s[2] -mul v3.4S, v3.4S,v26.s[2] -sub v12.4s, v6.4s, v8.4s -sqrdmulh v9.4S, v10.4S, v25.s[3] -mul v10.4S, v10.4S,v26.s[3] -add v6.4s, v6.4s, v8.4s -sqrdmulh v8.4S, v12.4S, v25.s[1] -mul v12.4S, v12.4S,v26.s[1] -sub v14.4s, v0.4s, v2.4s -add v0.4s, v0.4s, v2.4s -sqrdmulh v2.4S, v6.4S, v25.s[0] -mul v6.4S, v6.4S,v26.s[0] -sub v7.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -mla v3.4S, v4.4S, v31.s[0] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v17.4s, v21.4s -mla v12.4S, v8.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v19.4S, v23.s[0] -mul v19.4S, v19.4S,v24.s[0] -sub v2.4s, v16.4s, v15.4s -sqrdmulh v8.4S, v7.4S, v23.s[1] -mul v7.4S, v7.4S,v24.s[1] -add v16.4s, v16.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v23.s[2] -mul v16.4S, v16.4S,v24.s[2] -sub v4.4s, v22.4s, v3.4s -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v2.4S, v23.s[3] -mul v2.4S, v2.4S,v24.s[3] -sub v13.4s, v20.4s, v10.4s -add v20.4s, v20.4s, v10.4s -mla v19.4S, v21.4S, v31.s[0] -mla v7.4S, v8.4S, v31.s[0] -sub v8.4s, v5.4s, v12.4s -str q22, [x0, #272] -mla v16.4S, v15.4S, v31.s[0] -mla v2.4S, v3.4S, v31.s[0] -add v5.4s, v5.4s, v12.4s -str q4, [x0, #336] -sub v23.4s, v11.4s, v6.4s -str q20, [x0, #400] -add v11.4s, v11.4s, v6.4s -str q13, [x0, #464] -sub v13.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sub v19.4s, v14.4s, v7.4s -add v14.4s, v14.4s, v7.4s -sub v7.4s, v17.4s, v16.4s -str q5, [x0, #144] -add v17.4s, v17.4s, v16.4s -str q8, [x0, #208] -sub v8.4s, v9.4s, v2.4s -str q11, [x0, #16] -add v9.4s, v9.4s, v2.4s -str q23, [x0, #80] -str q0, [x0, #528] -str q13, [x0, #592] -str q14, [x0, #656] -str q19, [x0, #720] -str q17, [x0, #784] -str q7, [x0, #848] -str q9, [x0, #912] -str q8, [x0, #976] -ldr q18, [x0, #224] -ldr q1, [x0, #160] -ldr q10, [x0, #32] -ldr q21, [x17, #+128] -ldr q22, [x17, #+144] -sqrdmulh v15.4S, v10.4S, v22.s[0] -mul v10.4S, v10.4S,v21.s[0] -ldr q3, [x0, #48] -sqrdmulh v12.4S, v3.4S, v22.s[0] -ldr q4, [x17, #+160] -mul v3.4S, v3.4S,v21.s[0] -ldr q30, [x17, #+176] -ldr q29, [x0, #96] -sqrdmulh v28.4S, v29.4S, v30.s[0] -mul v29.4S, v29.4S,v4.s[0] -ldr q27, [x0, #112] -sqrdmulh v26.4S, v27.4S, v30.s[0] -mul v27.4S, v27.4S,v4.s[0] -ldr q25, [x17, #+192] -mla v10.4S, v15.4S, v31.s[0] -ldr q15, [x17, #+208] -sqrdmulh v24.4S, v1.4S, v15.s[0] -ldr q20, [x0, #176] -mla v3.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v20.4S, v15.s[0] -ldr q6, [x17, #+224] -mla v29.4S, v28.4S, v31.s[0] -ldr q28, [x17, #+240] -sqrdmulh v5.4S, v18.4S, v28.s[0] -ldr q16, [x0, #240] -mla v27.4S, v26.4S, v31.s[0] -sqrdmulh v26.4S, v16.4S, v28.s[0] -ldr q11, [x0, #0] -ldr q2, [x0, #128] -mul v1.4S, v1.4S,v25.s[0] -sub v23.4s, v11.4s, v10.4s -ldr q0, [x0, #16] -mul v20.4S, v20.4S,v25.s[0] -add v11.4s, v11.4s, v10.4s -ldr q10, [x0, #144] -mla v1.4S, v24.4S, v31.s[0] -sub v24.4s, v0.4s, v3.4s -ldr q13, [x0, #64] -mla v20.4S, v12.4S, v31.s[0] -add v0.4s, v0.4s, v3.4s -ldr q3, [x0, #192] -mul v18.4S, v18.4S,v6.s[0] -sub v12.4s, v13.4s, v29.4s -ldr q14, [x0, #80] -mul v16.4S, v16.4S,v6.s[0] -add v13.4s, v13.4s, v29.4s -ldr q29, [x0, #208] -mla v18.4S, v5.4S, v31.s[0] -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v14.4s, v27.4s -sqrdmulh v5.4S, v0.4S, v22.s[1] -add v14.4s, v14.4s, v27.4s -mul v0.4S, v0.4S,v21.s[1] -sqrdmulh v27.4S, v24.4S, v22.s[2] -sub v19.4s, v2.4s, v1.4s -mul v24.4S, v24.4S,v21.s[2] -add v2.4s, v2.4s, v1.4s -sqrdmulh v22.4S, v14.4S, v30.s[1] -sub v21.4s, v10.4s, v20.4s -mul v14.4S, v14.4S,v4.s[1] -add v10.4s, v10.4s, v20.4s -sqrdmulh v20.4S, v26.4S, v30.s[2] -sub v1.4s, v3.4s, v18.4s -mul v26.4S, v26.4S,v4.s[2] -add v3.4s, v3.4s, v18.4s -mla v0.4S, v5.4S, v31.s[0] -sub v5.4s, v29.4s, v16.4s -ldr q30, [x0, #480] -sqrdmulh v4.4S, v10.4S, v15.s[1] -add v29.4s, v29.4s, v16.4s -mla v24.4S, v27.4S, v31.s[0] -ldr q27, [x0, #416] -sqrdmulh v16.4S, v21.4S, v15.s[2] -sub v18.4s, v11.4s, v0.4s -mla v14.4S, v22.4S, v31.s[0] -ldr q22, [x0, #288] -sqrdmulh v17.4S, v29.4S, v28.s[1] -add v11.4s, v11.4s, v0.4s -str q18, [x0, #16] -mla v26.4S, v20.4S, v31.s[0] -ldr q20, [x17, #+256] -sqrdmulh v18.4S, v5.4S, v28.s[2] -sub v0.4s, v23.4s, v24.4s -str q11, [x0, #0] -mul v10.4S, v10.4S,v25.s[1] -add v23.4s, v23.4s, v24.4s -ldr q24, [x17, #+272] -mul v21.4S, v21.4S,v25.s[2] -str q0, [x0, #48] -mla v10.4S, v4.4S, v31.s[0] -sub v4.4s, v13.4s, v14.4s -mla v21.4S, v16.4S, v31.s[0] -str q23, [x0, #32] -mul v29.4S, v29.4S,v6.s[1] -str q4, [x0, #80] -mul v5.4S, v5.4S,v6.s[2] -add v13.4s, v13.4s, v14.4s -str q13, [x0, #64] -mla v29.4S, v17.4S, v31.s[0] -sub v17.4s, v12.4s, v26.4s -str q17, [x0, #112] -mla v5.4S, v18.4S, v31.s[0] -add v12.4s, v12.4s, v26.4s -str q12, [x0, #96] -sqrdmulh v28.4S, v22.4S, v24.s[0] -sub v6.4s, v2.4s, v10.4s -mul v22.4S, v22.4S,v20.s[0] -str q6, [x0, #144] -ldr q6, [x0, #304] -sqrdmulh v12.4S, v6.4S, v24.s[0] -add v2.4s, v2.4s, v10.4s -ldr q10, [x17, #+288] -mul v6.4S, v6.4S,v20.s[0] -str q2, [x0, #128] -ldr q2, [x17, #+304] -ldr q26, [x0, #352] -sqrdmulh v18.4S, v26.4S, v2.s[0] -sub v17.4s, v19.4s, v21.4s -mul v26.4S, v26.4S,v10.s[0] -str q17, [x0, #176] -ldr q17, [x0, #368] -sqrdmulh v13.4S, v17.4S, v2.s[0] -add v19.4s, v19.4s, v21.4s -mul v17.4S, v17.4S,v10.s[0] -str q19, [x0, #160] -ldr q19, [x17, #+320] -mla v22.4S, v28.4S, v31.s[0] -sub v28.4s, v3.4s, v29.4s -ldr q21, [x17, #+336] -sqrdmulh v14.4S, v27.4S, v21.s[0] -str q28, [x0, #208] -ldr q28, [x0, #432] -mla v6.4S, v12.4S, v31.s[0] -add v3.4s, v3.4s, v29.4s -sqrdmulh v29.4S, v28.4S, v21.s[0] -str q3, [x0, #192] -ldr q3, [x17, #+352] -mla v26.4S, v18.4S, v31.s[0] -sub v18.4s, v1.4s, v5.4s -ldr q12, [x17, #+368] -sqrdmulh v4.4S, v30.4S, v12.s[0] -str q18, [x0, #240] -ldr q18, [x0, #496] -mla v17.4S, v13.4S, v31.s[0] -add v1.4s, v1.4s, v5.4s -sqrdmulh v5.4S, v18.4S, v12.s[0] -str q1, [x0, #224] -ldr q1, [x0, #256] -ldr q13, [x0, #384] -mul v27.4S, v27.4S,v19.s[0] -sub v15.4s, v1.4s, v22.4s -ldr q25, [x0, #272] -mul v28.4S, v28.4S,v19.s[0] -add v1.4s, v1.4s, v22.4s -ldr q22, [x0, #400] -mla v27.4S, v14.4S, v31.s[0] -sub v14.4s, v25.4s, v6.4s -ldr q23, [x0, #320] -mla v28.4S, v29.4S, v31.s[0] -add v25.4s, v25.4s, v6.4s -ldr q6, [x0, #448] -mul v30.4S, v30.4S,v3.s[0] -sub v29.4s, v23.4s, v26.4s -ldr q16, [x0, #336] -mul v18.4S, v18.4S,v3.s[0] -add v23.4s, v23.4s, v26.4s -ldr q26, [x0, #464] -mla v30.4S, v4.4S, v31.s[0] -mla v18.4S, v5.4S, v31.s[0] -sub v5.4s, v16.4s, v17.4s -sqrdmulh v4.4S, v25.4S, v24.s[1] -add v16.4s, v16.4s, v17.4s -mul v25.4S, v25.4S,v20.s[1] -sqrdmulh v17.4S, v14.4S, v24.s[2] -sub v0.4s, v13.4s, v27.4s -mul v14.4S, v14.4S,v20.s[2] -add v13.4s, v13.4s, v27.4s -sqrdmulh v24.4S, v16.4S, v2.s[1] -sub v20.4s, v22.4s, v28.4s -mul v16.4S, v16.4S,v10.s[1] -add v22.4s, v22.4s, v28.4s -sqrdmulh v28.4S, v5.4S, v2.s[2] -sub v27.4s, v6.4s, v30.4s -mul v5.4S, v5.4S,v10.s[2] -add v6.4s, v6.4s, v30.4s -mla v25.4S, v4.4S, v31.s[0] -sub v4.4s, v26.4s, v18.4s -ldr q2, [x0, #736] -sqrdmulh v10.4S, v22.4S, v21.s[1] -add v26.4s, v26.4s, v18.4s -mla v14.4S, v17.4S, v31.s[0] -ldr q17, [x0, #672] -sqrdmulh v18.4S, v20.4S, v21.s[2] -sub v30.4s, v1.4s, v25.4s -mla v16.4S, v24.4S, v31.s[0] -ldr q24, [x0, #544] -sqrdmulh v11.4S, v26.4S, v12.s[1] -add v1.4s, v1.4s, v25.4s -str q30, [x0, #272] -mla v5.4S, v28.4S, v31.s[0] -ldr q28, [x17, #+384] -sqrdmulh v30.4S, v4.4S, v12.s[2] -sub v25.4s, v15.4s, v14.4s -str q1, [x0, #256] -mul v22.4S, v22.4S,v19.s[1] -add v15.4s, v15.4s, v14.4s -ldr q14, [x17, #+400] -mul v20.4S, v20.4S,v19.s[2] -str q25, [x0, #304] -mla v22.4S, v10.4S, v31.s[0] -sub v10.4s, v23.4s, v16.4s -mla v20.4S, v18.4S, v31.s[0] -str q15, [x0, #288] -mul v26.4S, v26.4S,v3.s[1] -str q10, [x0, #336] -mul v4.4S, v4.4S,v3.s[2] -add v23.4s, v23.4s, v16.4s -str q23, [x0, #320] -mla v26.4S, v11.4S, v31.s[0] -sub v11.4s, v29.4s, v5.4s -str q11, [x0, #368] -mla v4.4S, v30.4S, v31.s[0] -add v29.4s, v29.4s, v5.4s -str q29, [x0, #352] -sqrdmulh v12.4S, v24.4S, v14.s[0] -sub v3.4s, v13.4s, v22.4s -mul v24.4S, v24.4S,v28.s[0] -str q3, [x0, #400] -ldr q3, [x0, #560] -sqrdmulh v29.4S, v3.4S, v14.s[0] -add v13.4s, v13.4s, v22.4s -ldr q22, [x17, #+416] -mul v3.4S, v3.4S,v28.s[0] -str q13, [x0, #384] -ldr q13, [x17, #+432] -ldr q5, [x0, #608] -sqrdmulh v30.4S, v5.4S, v13.s[0] -sub v11.4s, v0.4s, v20.4s -mul v5.4S, v5.4S,v22.s[0] -str q11, [x0, #432] -ldr q11, [x0, #624] -sqrdmulh v23.4S, v11.4S, v13.s[0] -add v0.4s, v0.4s, v20.4s -mul v11.4S, v11.4S,v22.s[0] -str q0, [x0, #416] -ldr q0, [x17, #+448] -mla v24.4S, v12.4S, v31.s[0] -sub v12.4s, v6.4s, v26.4s -ldr q20, [x17, #+464] -sqrdmulh v16.4S, v17.4S, v20.s[0] -str q12, [x0, #464] -ldr q12, [x0, #688] -mla v3.4S, v29.4S, v31.s[0] -add v6.4s, v6.4s, v26.4s -sqrdmulh v26.4S, v12.4S, v20.s[0] -str q6, [x0, #448] -ldr q6, [x17, #+480] -mla v5.4S, v30.4S, v31.s[0] -sub v30.4s, v27.4s, v4.4s -ldr q29, [x17, #+496] -sqrdmulh v10.4S, v2.4S, v29.s[0] -str q30, [x0, #496] -ldr q30, [x0, #752] -mla v11.4S, v23.4S, v31.s[0] -add v27.4s, v27.4s, v4.4s -sqrdmulh v4.4S, v30.4S, v29.s[0] -str q27, [x0, #480] -ldr q27, [x0, #512] -ldr q23, [x0, #640] -mul v17.4S, v17.4S,v0.s[0] -sub v21.4s, v27.4s, v24.4s -ldr q19, [x0, #528] -mul v12.4S, v12.4S,v0.s[0] -add v27.4s, v27.4s, v24.4s -ldr q24, [x0, #656] -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v19.4s, v3.4s -ldr q15, [x0, #576] -mla v12.4S, v26.4S, v31.s[0] -add v19.4s, v19.4s, v3.4s -ldr q3, [x0, #704] -mul v2.4S, v2.4S,v6.s[0] -sub v26.4s, v15.4s, v5.4s -ldr q18, [x0, #592] -mul v30.4S, v30.4S,v6.s[0] -add v15.4s, v15.4s, v5.4s -ldr q5, [x0, #720] -mla v2.4S, v10.4S, v31.s[0] -mla v30.4S, v4.4S, v31.s[0] -sub v4.4s, v18.4s, v11.4s -sqrdmulh v10.4S, v19.4S, v14.s[1] -add v18.4s, v18.4s, v11.4s -mul v19.4S, v19.4S,v28.s[1] -sqrdmulh v11.4S, v16.4S, v14.s[2] -sub v25.4s, v23.4s, v17.4s -mul v16.4S, v16.4S,v28.s[2] -add v23.4s, v23.4s, v17.4s -sqrdmulh v14.4S, v18.4S, v13.s[1] -sub v28.4s, v24.4s, v12.4s -mul v18.4S, v18.4S,v22.s[1] -add v24.4s, v24.4s, v12.4s -sqrdmulh v12.4S, v4.4S, v13.s[2] -sub v17.4s, v3.4s, v2.4s -mul v4.4S, v4.4S,v22.s[2] -add v3.4s, v3.4s, v2.4s -mla v19.4S, v10.4S, v31.s[0] -sub v10.4s, v5.4s, v30.4s -ldr q13, [x0, #992] -sqrdmulh v22.4S, v24.4S, v20.s[1] -add v5.4s, v5.4s, v30.4s -mla v16.4S, v11.4S, v31.s[0] -ldr q11, [x0, #928] -sqrdmulh v30.4S, v28.4S, v20.s[2] -sub v2.4s, v27.4s, v19.4s -mla v18.4S, v14.4S, v31.s[0] -ldr q14, [x0, #800] -sqrdmulh v1.4S, v5.4S, v29.s[1] -add v27.4s, v27.4s, v19.4s -str q2, [x0, #528] -mla v4.4S, v12.4S, v31.s[0] -ldr q12, [x17, #+512] -sqrdmulh v2.4S, v10.4S, v29.s[2] -sub v19.4s, v21.4s, v16.4s -str q27, [x0, #512] -mul v24.4S, v24.4S,v0.s[1] -add v21.4s, v21.4s, v16.4s -ldr q16, [x17, #+528] -mul v28.4S, v28.4S,v0.s[2] -str q19, [x0, #560] -mla v24.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v18.4s -mla v28.4S, v30.4S, v31.s[0] -str q21, [x0, #544] -mul v5.4S, v5.4S,v6.s[1] -str q22, [x0, #592] -mul v10.4S, v10.4S,v6.s[2] -add v15.4s, v15.4s, v18.4s -str q15, [x0, #576] -mla v5.4S, v1.4S, v31.s[0] -sub v1.4s, v26.4s, v4.4s -str q1, [x0, #624] -mla v10.4S, v2.4S, v31.s[0] -add v26.4s, v26.4s, v4.4s -str q26, [x0, #608] -sqrdmulh v29.4S, v14.4S, v16.s[0] -sub v6.4s, v23.4s, v24.4s -mul v14.4S, v14.4S,v12.s[0] -str q6, [x0, #656] -ldr q6, [x0, #816] -sqrdmulh v26.4S, v6.4S, v16.s[0] -add v23.4s, v23.4s, v24.4s -ldr q24, [x17, #+544] -mul v6.4S, v6.4S,v12.s[0] -str q23, [x0, #640] -ldr q23, [x17, #+560] -ldr q4, [x0, #864] -sqrdmulh v2.4S, v4.4S, v23.s[0] -sub v1.4s, v25.4s, v28.4s -mul v4.4S, v4.4S,v24.s[0] -str q1, [x0, #688] -ldr q1, [x0, #880] -sqrdmulh v15.4S, v1.4S, v23.s[0] -add v25.4s, v25.4s, v28.4s -mul v1.4S, v1.4S,v24.s[0] -str q25, [x0, #672] -ldr q25, [x17, #+576] -mla v14.4S, v29.4S, v31.s[0] -sub v29.4s, v3.4s, v5.4s -ldr q28, [x17, #+592] -sqrdmulh v18.4S, v11.4S, v28.s[0] -str q29, [x0, #720] -ldr q29, [x0, #944] -mla v6.4S, v26.4S, v31.s[0] -add v3.4s, v3.4s, v5.4s -sqrdmulh v5.4S, v29.4S, v28.s[0] -str q3, [x0, #704] -ldr q3, [x17, #+608] -mla v4.4S, v2.4S, v31.s[0] -sub v2.4s, v17.4s, v10.4s -ldr q26, [x17, #+624] -sqrdmulh v22.4S, v13.4S, v26.s[0] -str q2, [x0, #752] -ldr q2, [x0, #1008] -mla v1.4S, v15.4S, v31.s[0] -add v17.4s, v17.4s, v10.4s -sqrdmulh v10.4S, v2.4S, v26.s[0] -str q17, [x0, #736] -ldr q17, [x0, #768] -ldr q15, [x0, #896] -mul v11.4S, v11.4S,v25.s[0] -sub v20.4s, v17.4s, v14.4s -ldr q0, [x0, #784] -mul v29.4S, v29.4S,v25.s[0] -add v17.4s, v17.4s, v14.4s -ldr q14, [x0, #912] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v0.4s, v6.4s -ldr q21, [x0, #832] -mla v29.4S, v5.4S, v31.s[0] -add v0.4s, v0.4s, v6.4s -ldr q6, [x0, #960] -mul v13.4S, v13.4S,v3.s[0] -sub v5.4s, v21.4s, v4.4s -ldr q30, [x0, #848] -mul v2.4S, v2.4S,v3.s[0] -add v21.4s, v21.4s, v4.4s -ldr q4, [x0, #976] -mla v13.4S, v22.4S, v31.s[0] -mla v2.4S, v10.4S, v31.s[0] -sub v10.4s, v30.4s, v1.4s -sqrdmulh v22.4S, v0.4S, v16.s[1] -add v30.4s, v30.4s, v1.4s -mul v0.4S, v0.4S,v12.s[1] -sqrdmulh v1.4S, v18.4S, v16.s[2] -sub v19.4s, v15.4s, v11.4s -mul v18.4S, v18.4S,v12.s[2] -add v15.4s, v15.4s, v11.4s -sqrdmulh v16.4S, v30.4S, v23.s[1] -sub v12.4s, v14.4s, v29.4s -mul v30.4S, v30.4S,v24.s[1] -add v14.4s, v14.4s, v29.4s -sqrdmulh v29.4S, v10.4S, v23.s[2] -sub v11.4s, v6.4s, v13.4s -mul v10.4S, v10.4S,v24.s[2] -add v6.4s, v6.4s, v13.4s -mla v0.4S, v22.4S, v31.s[0] -sub v22.4s, v4.4s, v2.4s -sqrdmulh v23.4S, v14.4S, v28.s[1] -add v4.4s, v4.4s, v2.4s -mla v18.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v12.4S, v28.s[2] -sub v2.4s, v17.4s, v0.4s -mla v30.4S, v16.4S, v31.s[0] -sqrdmulh v16.4S, v4.4S, v26.s[1] -add v17.4s, v17.4s, v0.4s -str q2, [x0, #784] -mla v10.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v22.4S, v26.s[2] -sub v2.4s, v20.4s, v18.4s -str q17, [x0, #768] -mul v14.4S, v14.4S,v25.s[1] -add v20.4s, v20.4s, v18.4s -mul v12.4S, v12.4S,v25.s[2] -str q2, [x0, #816] -mla v14.4S, v23.4S, v31.s[0] -sub v23.4s, v21.4s, v30.4s -mla v12.4S, v1.4S, v31.s[0] -str q20, [x0, #800] -mul v4.4S, v4.4S,v3.s[1] -str q23, [x0, #848] -mul v22.4S, v22.4S,v3.s[2] -add v21.4s, v21.4s, v30.4s -str q21, [x0, #832] -mla v4.4S, v16.4S, v31.s[0] -sub v16.4s, v5.4s, v10.4s -str q16, [x0, #880] -mla v22.4S, v29.4S, v31.s[0] -add v5.4s, v5.4s, v10.4s -str q5, [x0, #864] -sub v26.4s, v15.4s, v14.4s -str q26, [x0, #912] -add v15.4s, v15.4s, v14.4s -str q15, [x0, #896] -sub v15.4s, v19.4s, v12.4s -str q15, [x0, #944] -add v19.4s, v19.4s, v12.4s -str q19, [x0, #928] -sub v19.4s, v6.4s, v4.4s -str q19, [x0, #976] -add v6.4s, v6.4s, v4.4s -str q6, [x0, #960] -sub v6.4s, v11.4s, v22.4s -str q6, [x0, #1008] -add v11.4s, v11.4s, v22.4s -str q11, [x0, #992] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1464 -// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_9.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_9.s deleted file mode 100644 index 24f5a8a..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_7_z4_9.s +++ /dev/null @@ -1,1502 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_7_z4_9 -.global _ntt_u32_incomplete_neon_asm_var_4_2_7_z4_9 -ntt_u32_incomplete_neon_asm_var_4_2_7_z4_9: -_ntt_u32_incomplete_neon_asm_var_4_2_7_z4_9: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #928] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #992] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q18, [x0, #800] -sqrdmulh v17.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -ldr q16, [x0, #864] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -mla v22.4S, v21.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -mla v18.4S, v17.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -ldr q3, [x0, #544] -sqrdmulh v17.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -ldr q19, [x0, #608] -sqrdmulh v21.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q2, [x0, #672] -ldr q1, [x0, #416] -sqrdmulh v0.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v15.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -ldr q22, [x0, #736] -ldr q14, [x0, #480] -sqrdmulh v13.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v12.4s, v14.4s, v20.4s -add v14.4s, v14.4s, v20.4s -ldr q20, [x0, #288] -mla v3.4S, v17.4S, v31.s[0] -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v20.4s, v18.4s -mla v2.4S, v0.4S, v31.s[0] -mla v22.4S, v13.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -ldr q18, [x0, #352] -sqrdmulh v13.4S, v1.4S, v29.s[1] -mul v1.4S, v1.4S,v30.s[1] -sub v0.4s, v18.4s, v16.4s -sqrdmulh v17.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -add v18.4s, v18.4s, v16.4s -ldr q16, [x0, #32] -sqrdmulh v11.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v10.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -ldr q3, [x0, #96] -sqrdmulh v9.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v8.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -ldr q19, [x0, #160] -mla v1.4S, v13.4S, v31.s[0] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v19.4s, v2.4s -mla v20.4S, v11.4S, v31.s[0] -mla v18.4S, v9.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -ldr q2, [x0, #224] -sqrdmulh v9.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v11.4s, v2.4s, v22.4s -sqrdmulh v13.4S, v12.4S, v29.s[2] -mul v12.4S, v12.4S,v30.s[2] -add v2.4s, v2.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v7.4s, v19.4s, v1.4s -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v29.s[2] -mul v0.4S, v0.4S,v30.s[2] -sub v6.4s, v2.4s, v14.4s -add v2.4s, v2.4s, v14.4s -mla v15.4S, v9.4S, v31.s[0] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v16.4s, v20.4s -mla v21.4S, v22.4S, v31.s[0] -mla v0.4S, v1.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -sub v1.4s, v3.4s, v18.4s -sqrdmulh v22.4S, v6.4S, v27.s[1] -mul v6.4S, v6.4S,v28.s[1] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v19.4S, v27.s[0] -mul v19.4S, v19.4S,v28.s[0] -sub v9.4s, v17.4s, v15.4s -add v17.4s, v17.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v14.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -mla v7.4S, v20.4S, v31.s[0] -mla v6.4S, v22.4S, v31.s[0] -sub v22.4s, v10.4s, v21.4s -mla v19.4S, v18.4S, v31.s[0] -mla v2.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v15.4s, v8.4s, v0.4s -sqrdmulh v18.4S, v11.4S, v27.s[2] -mul v11.4S, v11.4S,v28.s[2] -add v8.4s, v8.4s, v0.4s -sqrdmulh v0.4S, v9.4S, v27.s[3] -mul v9.4S, v9.4S,v28.s[3] -sub v20.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[3] -mul v14.4S, v14.4S,v28.s[3] -sub v12.4s, v1.4s, v6.4s -add v1.4s, v1.4s, v6.4s -mla v17.4S, v21.4S, v31.s[0] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v16.4s, v19.4s -mla v9.4S, v0.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v16.4s, v16.4s, v19.4s -sqrdmulh v19.4S, v1.4S, v25.s[2] -mul v1.4S, v1.4S,v26.s[2] -sub v7.4s, v3.4s, v2.4s -sqrdmulh v0.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v7.4S, v25.s[1] -mul v7.4S, v7.4S,v26.s[1] -sub v21.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -sqrdmulh v17.4S, v3.4S, v25.s[0] -mul v3.4S, v3.4S,v26.s[0] -sub v6.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -mla v1.4S, v19.4S, v31.s[0] -mla v12.4S, v0.4S, v31.s[0] -sub v0.4s, v22.4s, v9.4s -mla v7.4S, v2.4S, v31.s[0] -mla v3.4S, v17.4S, v31.s[0] -add v22.4s, v22.4s, v9.4s -sqrdmulh v9.4S, v8.4S, v23.s[0] -mul v8.4S, v8.4S,v24.s[0] -sub v17.4s, v15.4s, v14.4s -sqrdmulh v2.4S, v6.4S, v23.s[1] -mul v6.4S, v6.4S,v24.s[1] -add v15.4s, v15.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v23.s[2] -mul v15.4S, v15.4S,v24.s[2] -sub v19.4s, v13.4s, v1.4s -add v13.4s, v13.4s, v1.4s -sqrdmulh v1.4S, v17.4S, v23.s[3] -mul v17.4S, v17.4S,v24.s[3] -sub v11.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -mla v8.4S, v9.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -sub v2.4s, v18.4s, v7.4s -str q13, [x0, #288] -mla v15.4S, v14.4S, v31.s[0] -mla v17.4S, v1.4S, v31.s[0] -add v18.4s, v18.4s, v7.4s -str q19, [x0, #352] -ldr q19, [x0, #944] -sqrdmulh v7.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v1.4s, v16.4s, v3.4s -str q20, [x0, #416] -ldr q20, [x0, #1008] -sqrdmulh v14.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v16.4s, v16.4s, v3.4s -str q11, [x0, #480] -ldr q11, [x0, #816] -sqrdmulh v3.4S, v11.4S, v29.s[0] -mul v11.4S, v11.4S,v30.s[0] -sub v13.4s, v10.4s, v8.4s -add v10.4s, v10.4s, v8.4s -ldr q8, [x0, #880] -sqrdmulh v9.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v12.4s, v21.4s, v6.4s -add v21.4s, v21.4s, v6.4s -mla v19.4S, v7.4S, v31.s[0] -mla v20.4S, v14.4S, v31.s[0] -sub v14.4s, v22.4s, v15.4s -str q18, [x0, #160] -mla v11.4S, v3.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -add v22.4s, v22.4s, v15.4s -str q2, [x0, #224] -ldr q2, [x0, #560] -sqrdmulh v15.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v9.4s, v0.4s, v17.4s -str q16, [x0, #32] -ldr q16, [x0, #624] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -add v0.4s, v0.4s, v17.4s -str q1, [x0, #96] -ldr q1, [x0, #688] -ldr q17, [x0, #432] -sqrdmulh v18.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -sub v7.4s, v17.4s, v19.4s -add v17.4s, v17.4s, v19.4s -ldr q19, [x0, #752] -ldr q6, [x0, #496] -sqrdmulh v5.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v4.4s, v6.4s, v20.4s -add v6.4s, v6.4s, v20.4s -ldr q20, [x0, #304] -mla v2.4S, v15.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -sub v3.4s, v20.4s, v11.4s -str q10, [x0, #544] -mla v1.4S, v18.4S, v31.s[0] -mla v19.4S, v5.4S, v31.s[0] -add v20.4s, v20.4s, v11.4s -str q13, [x0, #608] -ldr q13, [x0, #368] -sqrdmulh v11.4S, v17.4S, v29.s[1] -mul v17.4S, v17.4S,v30.s[1] -sub v5.4s, v13.4s, v8.4s -str q21, [x0, #672] -sqrdmulh v21.4S, v6.4S, v29.s[1] -mul v6.4S, v6.4S,v30.s[1] -add v13.4s, v13.4s, v8.4s -str q12, [x0, #736] -ldr q12, [x0, #48] -sqrdmulh v8.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v18.4s, v12.4s, v2.4s -add v12.4s, v12.4s, v2.4s -ldr q2, [x0, #112] -sqrdmulh v10.4S, v13.4S, v29.s[1] -mul v13.4S, v13.4S,v30.s[1] -sub v15.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -ldr q16, [x0, #176] -mla v17.4S, v11.4S, v31.s[0] -mla v6.4S, v21.4S, v31.s[0] -sub v21.4s, v16.4s, v1.4s -str q22, [x0, #800] -mla v20.4S, v8.4S, v31.s[0] -mla v13.4S, v10.4S, v31.s[0] -add v16.4s, v16.4s, v1.4s -str q14, [x0, #864] -ldr q14, [x0, #240] -sqrdmulh v1.4S, v7.4S, v29.s[2] -mul v7.4S, v7.4S,v30.s[2] -sub v10.4s, v14.4s, v19.4s -str q0, [x0, #928] -sqrdmulh v0.4S, v4.4S, v29.s[2] -mul v4.4S, v4.4S,v30.s[2] -add v14.4s, v14.4s, v19.4s -str q9, [x0, #992] -sqrdmulh v9.4S, v3.4S, v29.s[2] -mul v3.4S, v3.4S,v30.s[2] -sub v19.4s, v16.4s, v17.4s -add v16.4s, v16.4s, v17.4s -sqrdmulh v17.4S, v5.4S, v29.s[2] -mul v5.4S, v5.4S,v30.s[2] -sub v8.4s, v14.4s, v6.4s -add v14.4s, v14.4s, v6.4s -mla v7.4S, v1.4S, v31.s[0] -mla v4.4S, v0.4S, v31.s[0] -sub v0.4s, v12.4s, v20.4s -mla v3.4S, v9.4S, v31.s[0] -mla v5.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v20.4s -sqrdmulh v20.4S, v19.4S, v27.s[1] -mul v19.4S, v19.4S,v28.s[1] -sub v17.4s, v2.4s, v13.4s -sqrdmulh v9.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -add v2.4s, v2.4s, v13.4s -sqrdmulh v13.4S, v16.4S, v27.s[0] -mul v16.4S, v16.4S,v28.s[0] -sub v1.4s, v21.4s, v7.4s -add v21.4s, v21.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[0] -mul v14.4S, v14.4S,v28.s[0] -sub v6.4s, v10.4s, v4.4s -add v10.4s, v10.4s, v4.4s -mla v19.4S, v20.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -sub v9.4s, v18.4s, v3.4s -mla v16.4S, v13.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v18.4s, v18.4s, v3.4s -sqrdmulh v3.4S, v21.4S, v27.s[2] -mul v21.4S, v21.4S,v28.s[2] -sub v7.4s, v15.4s, v5.4s -sqrdmulh v13.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -add v15.4s, v15.4s, v5.4s -sqrdmulh v5.4S, v1.4S, v27.s[3] -mul v1.4S, v1.4S,v28.s[3] -sub v20.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v27.s[3] -mul v6.4S, v6.4S,v28.s[3] -sub v4.4s, v17.4s, v8.4s -add v17.4s, v17.4s, v8.4s -mla v21.4S, v3.4S, v31.s[0] -mla v10.4S, v13.4S, v31.s[0] -sub v13.4s, v12.4s, v16.4s -mla v1.4S, v5.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -sqrdmulh v16.4S, v17.4S, v25.s[2] -mul v17.4S, v17.4S,v26.s[2] -sub v19.4s, v2.4s, v14.4s -sqrdmulh v5.4S, v4.4S, v25.s[3] -mul v4.4S, v4.4S,v26.s[3] -add v2.4s, v2.4s, v14.4s -sqrdmulh v14.4S, v19.4S, v25.s[1] -mul v19.4S, v19.4S,v26.s[1] -sub v3.4s, v18.4s, v21.4s -add v18.4s, v18.4s, v21.4s -sqrdmulh v21.4S, v2.4S, v25.s[0] -mul v2.4S, v2.4S,v26.s[0] -sub v8.4s, v15.4s, v10.4s -add v15.4s, v15.4s, v10.4s -mla v17.4S, v16.4S, v31.s[0] -mla v4.4S, v5.4S, v31.s[0] -sub v5.4s, v9.4s, v1.4s -mla v19.4S, v14.4S, v31.s[0] -mla v2.4S, v21.4S, v31.s[0] -add v9.4s, v9.4s, v1.4s -sqrdmulh v1.4S, v15.4S, v23.s[0] -mul v15.4S, v15.4S,v24.s[0] -sub v21.4s, v7.4s, v6.4s -sqrdmulh v14.4S, v8.4S, v23.s[1] -mul v8.4S, v8.4S,v24.s[1] -add v7.4s, v7.4s, v6.4s -sqrdmulh v6.4S, v7.4S, v23.s[2] -mul v7.4S, v7.4S,v24.s[2] -sub v16.4s, v0.4s, v17.4s -add v0.4s, v0.4s, v17.4s -sqrdmulh v17.4S, v21.4S, v23.s[3] -mul v21.4S, v21.4S,v24.s[3] -sub v10.4s, v20.4s, v4.4s -add v20.4s, v20.4s, v4.4s -mla v15.4S, v1.4S, v31.s[0] -mla v8.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v19.4s -str q0, [x0, #304] -mla v7.4S, v6.4S, v31.s[0] -mla v21.4S, v17.4S, v31.s[0] -add v13.4s, v13.4s, v19.4s -str q16, [x0, #368] -ldr q16, [x0, #896] -sqrdmulh v19.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v17.4s, v12.4s, v2.4s -str q20, [x0, #432] -ldr q20, [x0, #960] -sqrdmulh v6.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v12.4s, v12.4s, v2.4s -str q10, [x0, #496] -ldr q10, [x0, #768] -sqrdmulh v2.4S, v10.4S, v29.s[0] -mul v10.4S, v10.4S,v30.s[0] -sub v0.4s, v18.4s, v15.4s -add v18.4s, v18.4s, v15.4s -ldr q15, [x0, #832] -sqrdmulh v1.4S, v15.4S, v29.s[0] -mul v15.4S, v15.4S,v30.s[0] -sub v4.4s, v3.4s, v8.4s -add v3.4s, v3.4s, v8.4s -mla v16.4S, v19.4S, v31.s[0] -mla v20.4S, v6.4S, v31.s[0] -sub v6.4s, v9.4s, v7.4s -str q13, [x0, #176] -mla v10.4S, v2.4S, v31.s[0] -mla v15.4S, v1.4S, v31.s[0] -add v9.4s, v9.4s, v7.4s -str q14, [x0, #240] -ldr q14, [x0, #512] -sqrdmulh v7.4S, v14.4S, v29.s[0] -mul v14.4S, v14.4S,v30.s[0] -sub v1.4s, v5.4s, v21.4s -str q12, [x0, #48] -ldr q12, [x0, #576] -sqrdmulh v2.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -add v5.4s, v5.4s, v21.4s -str q17, [x0, #112] -ldr q17, [x0, #640] -ldr q21, [x0, #384] -sqrdmulh v13.4S, v17.4S, v29.s[0] -mul v17.4S, v17.4S,v30.s[0] -sub v19.4s, v21.4s, v16.4s -add v21.4s, v21.4s, v16.4s -ldr q16, [x0, #704] -ldr q8, [x0, #448] -sqrdmulh v22.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -sub v11.4s, v8.4s, v20.4s -add v8.4s, v8.4s, v20.4s -ldr q20, [x0, #256] -mla v14.4S, v7.4S, v31.s[0] -mla v12.4S, v2.4S, v31.s[0] -sub v2.4s, v20.4s, v10.4s -str q18, [x0, #560] -mla v17.4S, v13.4S, v31.s[0] -mla v16.4S, v22.4S, v31.s[0] -add v20.4s, v20.4s, v10.4s -str q0, [x0, #624] -ldr q0, [x0, #320] -sqrdmulh v10.4S, v21.4S, v29.s[1] -mul v21.4S, v21.4S,v30.s[1] -sub v22.4s, v0.4s, v15.4s -str q3, [x0, #688] -sqrdmulh v3.4S, v8.4S, v29.s[1] -mul v8.4S, v8.4S,v30.s[1] -add v0.4s, v0.4s, v15.4s -str q4, [x0, #752] -ldr q4, [x0, #0] -sqrdmulh v15.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v13.4s, v4.4s, v14.4s -add v4.4s, v4.4s, v14.4s -ldr q14, [x0, #64] -sqrdmulh v18.4S, v0.4S, v29.s[1] -mul v0.4S, v0.4S,v30.s[1] -sub v7.4s, v14.4s, v12.4s -add v14.4s, v14.4s, v12.4s -ldr q12, [x0, #128] -mla v21.4S, v10.4S, v31.s[0] -mla v8.4S, v3.4S, v31.s[0] -sub v3.4s, v12.4s, v17.4s -str q9, [x0, #816] -mla v20.4S, v15.4S, v31.s[0] -mla v0.4S, v18.4S, v31.s[0] -add v12.4s, v12.4s, v17.4s -str q6, [x0, #880] -ldr q6, [x0, #192] -sqrdmulh v17.4S, v19.4S, v29.s[2] -mul v19.4S, v19.4S,v30.s[2] -sub v18.4s, v6.4s, v16.4s -str q5, [x0, #944] -sqrdmulh v5.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -add v6.4s, v6.4s, v16.4s -str q1, [x0, #1008] -sqrdmulh v1.4S, v2.4S, v29.s[2] -mul v2.4S, v2.4S,v30.s[2] -sub v16.4s, v12.4s, v21.4s -add v12.4s, v12.4s, v21.4s -sqrdmulh v21.4S, v22.4S, v29.s[2] -mul v22.4S, v22.4S,v30.s[2] -sub v15.4s, v6.4s, v8.4s -add v6.4s, v6.4s, v8.4s -mla v19.4S, v17.4S, v31.s[0] -mla v11.4S, v5.4S, v31.s[0] -sub v5.4s, v4.4s, v20.4s -mla v2.4S, v1.4S, v31.s[0] -mla v22.4S, v21.4S, v31.s[0] -add v4.4s, v4.4s, v20.4s -sqrdmulh v20.4S, v16.4S, v27.s[1] -mul v16.4S, v16.4S,v28.s[1] -sub v21.4s, v14.4s, v0.4s -sqrdmulh v1.4S, v15.4S, v27.s[1] -mul v15.4S, v15.4S,v28.s[1] -add v14.4s, v14.4s, v0.4s -sqrdmulh v0.4S, v12.4S, v27.s[0] -mul v12.4S, v12.4S,v28.s[0] -sub v17.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -sqrdmulh v19.4S, v6.4S, v27.s[0] -mul v6.4S, v6.4S,v28.s[0] -sub v8.4s, v18.4s, v11.4s -add v18.4s, v18.4s, v11.4s -mla v16.4S, v20.4S, v31.s[0] -mla v15.4S, v1.4S, v31.s[0] -sub v1.4s, v13.4s, v2.4s -mla v12.4S, v0.4S, v31.s[0] -mla v6.4S, v19.4S, v31.s[0] -add v13.4s, v13.4s, v2.4s -sqrdmulh v2.4S, v3.4S, v27.s[2] -mul v3.4S, v3.4S,v28.s[2] -sub v19.4s, v7.4s, v22.4s -sqrdmulh v0.4S, v18.4S, v27.s[2] -mul v18.4S, v18.4S,v28.s[2] -add v7.4s, v7.4s, v22.4s -sqrdmulh v22.4S, v17.4S, v27.s[3] -mul v17.4S, v17.4S,v28.s[3] -sub v20.4s, v5.4s, v16.4s -add v5.4s, v5.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v27.s[3] -mul v8.4S, v8.4S,v28.s[3] -sub v11.4s, v21.4s, v15.4s -add v21.4s, v21.4s, v15.4s -mla v3.4S, v2.4S, v31.s[0] -mla v18.4S, v0.4S, v31.s[0] -sub v0.4s, v4.4s, v12.4s -mla v17.4S, v22.4S, v31.s[0] -mla v8.4S, v16.4S, v31.s[0] -add v4.4s, v4.4s, v12.4s -sqrdmulh v12.4S, v21.4S, v25.s[2] -mul v21.4S, v21.4S,v26.s[2] -sub v16.4s, v14.4s, v6.4s -sqrdmulh v22.4S, v11.4S, v25.s[3] -mul v11.4S, v11.4S,v26.s[3] -add v14.4s, v14.4s, v6.4s -sqrdmulh v6.4S, v16.4S, v25.s[1] -mul v16.4S, v16.4S,v26.s[1] -sub v2.4s, v13.4s, v3.4s -add v13.4s, v13.4s, v3.4s -sqrdmulh v3.4S, v14.4S, v25.s[0] -mul v14.4S, v14.4S,v26.s[0] -sub v15.4s, v7.4s, v18.4s -add v7.4s, v7.4s, v18.4s -mla v21.4S, v12.4S, v31.s[0] -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v1.4s, v17.4s -mla v16.4S, v6.4S, v31.s[0] -mla v14.4S, v3.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -sqrdmulh v17.4S, v7.4S, v23.s[0] -mul v7.4S, v7.4S,v24.s[0] -sub v3.4s, v19.4s, v8.4s -sqrdmulh v6.4S, v15.4S, v23.s[1] -mul v15.4S, v15.4S,v24.s[1] -add v19.4s, v19.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v23.s[2] -mul v19.4S, v19.4S,v24.s[2] -sub v12.4s, v5.4s, v21.4s -add v5.4s, v5.4s, v21.4s -sqrdmulh v21.4S, v3.4S, v23.s[3] -mul v3.4S, v3.4S,v24.s[3] -sub v18.4s, v20.4s, v11.4s -add v20.4s, v20.4s, v11.4s -mla v7.4S, v17.4S, v31.s[0] -mla v15.4S, v6.4S, v31.s[0] -sub v6.4s, v0.4s, v16.4s -str q5, [x0, #256] -mla v19.4S, v8.4S, v31.s[0] -mla v3.4S, v21.4S, v31.s[0] -add v0.4s, v0.4s, v16.4s -str q12, [x0, #320] -ldr q12, [x0, #912] -sqrdmulh v16.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -sub v21.4s, v4.4s, v14.4s -str q20, [x0, #384] -ldr q20, [x0, #976] -sqrdmulh v8.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -add v4.4s, v4.4s, v14.4s -str q18, [x0, #448] -ldr q18, [x0, #784] -sqrdmulh v14.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -sub v5.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -ldr q7, [x0, #848] -sqrdmulh v17.4S, v7.4S, v29.s[0] -mul v7.4S, v7.4S,v30.s[0] -sub v11.4s, v2.4s, v15.4s -add v2.4s, v2.4s, v15.4s -mla v12.4S, v16.4S, v31.s[0] -mla v20.4S, v8.4S, v31.s[0] -sub v8.4s, v1.4s, v19.4s -str q0, [x0, #128] -mla v18.4S, v14.4S, v31.s[0] -mla v7.4S, v17.4S, v31.s[0] -add v1.4s, v1.4s, v19.4s -str q6, [x0, #192] -ldr q6, [x0, #528] -sqrdmulh v19.4S, v6.4S, v29.s[0] -mul v6.4S, v6.4S,v30.s[0] -sub v17.4s, v22.4s, v3.4s -str q4, [x0, #0] -ldr q4, [x0, #592] -sqrdmulh v14.4S, v4.4S, v29.s[0] -mul v4.4S, v4.4S,v30.s[0] -add v22.4s, v22.4s, v3.4s -str q21, [x0, #64] -ldr q21, [x0, #656] -ldr q3, [x0, #400] -sqrdmulh v0.4S, v21.4S, v29.s[0] -mul v21.4S, v21.4S,v30.s[0] -sub v16.4s, v3.4s, v12.4s -add v3.4s, v3.4s, v12.4s -ldr q12, [x0, #720] -ldr q15, [x0, #464] -sqrdmulh v9.4S, v12.4S, v29.s[0] -mul v12.4S, v12.4S,v30.s[0] -sub v10.4s, v15.4s, v20.4s -add v15.4s, v15.4s, v20.4s -ldr q20, [x0, #272] -mla v6.4S, v19.4S, v31.s[0] -mla v4.4S, v14.4S, v31.s[0] -sub v14.4s, v20.4s, v18.4s -str q13, [x0, #512] -mla v21.4S, v0.4S, v31.s[0] -mla v12.4S, v9.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -str q5, [x0, #576] -ldr q5, [x0, #336] -sqrdmulh v18.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v9.4s, v5.4s, v7.4s -str q2, [x0, #640] -sqrdmulh v2.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -add v5.4s, v5.4s, v7.4s -str q11, [x0, #704] -ldr q11, [x0, #16] -sqrdmulh v7.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v0.4s, v11.4s, v6.4s -add v11.4s, v11.4s, v6.4s -ldr q6, [x0, #80] -sqrdmulh v13.4S, v5.4S, v29.s[1] -mul v5.4S, v5.4S,v30.s[1] -sub v19.4s, v6.4s, v4.4s -add v6.4s, v6.4s, v4.4s -ldr q4, [x0, #144] -mla v3.4S, v18.4S, v31.s[0] -mla v15.4S, v2.4S, v31.s[0] -sub v2.4s, v4.4s, v21.4s -str q1, [x0, #768] -mla v20.4S, v7.4S, v31.s[0] -mla v5.4S, v13.4S, v31.s[0] -add v4.4s, v4.4s, v21.4s -str q8, [x0, #832] -ldr q8, [x0, #208] -sqrdmulh v21.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -sub v13.4s, v8.4s, v12.4s -str q22, [x0, #896] -sqrdmulh v22.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -add v8.4s, v8.4s, v12.4s -str q17, [x0, #960] -sqrdmulh v17.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v12.4s, v4.4s, v3.4s -add v4.4s, v4.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v29.s[2] -mul v9.4S, v9.4S,v30.s[2] -sub v7.4s, v8.4s, v15.4s -add v8.4s, v8.4s, v15.4s -mla v16.4S, v21.4S, v31.s[0] -mla v10.4S, v22.4S, v31.s[0] -sub v22.4s, v11.4s, v20.4s -mla v14.4S, v17.4S, v31.s[0] -mla v9.4S, v3.4S, v31.s[0] -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v12.4S, v27.s[1] -mul v12.4S, v12.4S,v28.s[1] -sub v3.4s, v6.4s, v5.4s -sqrdmulh v17.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -add v6.4s, v6.4s, v5.4s -sqrdmulh v5.4S, v4.4S, v27.s[0] -mul v4.4S, v4.4S,v28.s[0] -sub v21.4s, v2.4s, v16.4s -add v2.4s, v2.4s, v16.4s -sqrdmulh v16.4S, v8.4S, v27.s[0] -mul v8.4S, v8.4S,v28.s[0] -sub v15.4s, v13.4s, v10.4s -add v13.4s, v13.4s, v10.4s -mla v12.4S, v20.4S, v31.s[0] -mla v7.4S, v17.4S, v31.s[0] -sub v17.4s, v0.4s, v14.4s -mla v4.4S, v5.4S, v31.s[0] -mla v8.4S, v16.4S, v31.s[0] -add v0.4s, v0.4s, v14.4s -sqrdmulh v14.4S, v2.4S, v27.s[2] -mul v2.4S, v2.4S,v28.s[2] -sub v16.4s, v19.4s, v9.4s -sqrdmulh v5.4S, v13.4S, v27.s[2] -mul v13.4S, v13.4S,v28.s[2] -add v19.4s, v19.4s, v9.4s -sqrdmulh v9.4S, v21.4S, v27.s[3] -mul v21.4S, v21.4S,v28.s[3] -sub v20.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -sqrdmulh v12.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v10.4s, v3.4s, v7.4s -add v3.4s, v3.4s, v7.4s -mla v2.4S, v14.4S, v31.s[0] -mla v13.4S, v5.4S, v31.s[0] -sub v5.4s, v11.4s, v4.4s -mla v21.4S, v9.4S, v31.s[0] -mla v15.4S, v12.4S, v31.s[0] -add v11.4s, v11.4s, v4.4s -sqrdmulh v4.4S, v3.4S, v25.s[2] -mul v3.4S, v3.4S,v26.s[2] -sub v12.4s, v6.4s, v8.4s -sqrdmulh v9.4S, v10.4S, v25.s[3] -mul v10.4S, v10.4S,v26.s[3] -add v6.4s, v6.4s, v8.4s -sqrdmulh v8.4S, v12.4S, v25.s[1] -mul v12.4S, v12.4S,v26.s[1] -sub v14.4s, v0.4s, v2.4s -add v0.4s, v0.4s, v2.4s -sqrdmulh v2.4S, v6.4S, v25.s[0] -mul v6.4S, v6.4S,v26.s[0] -sub v7.4s, v19.4s, v13.4s -add v19.4s, v19.4s, v13.4s -mla v3.4S, v4.4S, v31.s[0] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v17.4s, v21.4s -mla v12.4S, v8.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v21.4s -sqrdmulh v21.4S, v19.4S, v23.s[0] -mul v19.4S, v19.4S,v24.s[0] -sub v2.4s, v16.4s, v15.4s -sqrdmulh v8.4S, v7.4S, v23.s[1] -mul v7.4S, v7.4S,v24.s[1] -add v16.4s, v16.4s, v15.4s -sqrdmulh v15.4S, v16.4S, v23.s[2] -mul v16.4S, v16.4S,v24.s[2] -sub v4.4s, v22.4s, v3.4s -add v22.4s, v22.4s, v3.4s -sqrdmulh v3.4S, v2.4S, v23.s[3] -mul v2.4S, v2.4S,v24.s[3] -sub v13.4s, v20.4s, v10.4s -add v20.4s, v20.4s, v10.4s -mla v19.4S, v21.4S, v31.s[0] -mla v7.4S, v8.4S, v31.s[0] -sub v8.4s, v5.4s, v12.4s -str q22, [x0, #272] -mla v16.4S, v15.4S, v31.s[0] -mla v2.4S, v3.4S, v31.s[0] -add v5.4s, v5.4s, v12.4s -str q4, [x0, #336] -sub v23.4s, v11.4s, v6.4s -str q20, [x0, #400] -add v11.4s, v11.4s, v6.4s -str q13, [x0, #464] -sub v13.4s, v0.4s, v19.4s -add v0.4s, v0.4s, v19.4s -sub v19.4s, v14.4s, v7.4s -add v14.4s, v14.4s, v7.4s -sub v7.4s, v17.4s, v16.4s -str q5, [x0, #144] -add v17.4s, v17.4s, v16.4s -str q8, [x0, #208] -sub v8.4s, v9.4s, v2.4s -str q11, [x0, #16] -add v9.4s, v9.4s, v2.4s -str q23, [x0, #80] -str q0, [x0, #528] -str q13, [x0, #592] -str q14, [x0, #656] -str q19, [x0, #720] -str q17, [x0, #784] -str q7, [x0, #848] -str q9, [x0, #912] -str q8, [x0, #976] -ldr q18, [x0, #224] -ldr q1, [x0, #160] -ldr q10, [x0, #32] -ldr q21, [x17, #+128] -ldr q22, [x17, #+144] -sqrdmulh v15.4S, v10.4S, v22.s[0] -mul v10.4S, v10.4S,v21.s[0] -ldr q3, [x0, #48] -sqrdmulh v12.4S, v3.4S, v22.s[0] -mul v3.4S, v3.4S,v21.s[0] -ldr q4, [x17, #+160] -ldr q30, [x17, #+176] -ldr q29, [x0, #96] -sqrdmulh v28.4S, v29.4S, v30.s[0] -mul v29.4S, v29.4S,v4.s[0] -ldr q27, [x0, #112] -sqrdmulh v26.4S, v27.4S, v30.s[0] -mul v27.4S, v27.4S,v4.s[0] -ldr q25, [x17, #+192] -ldr q24, [x17, #+208] -mla v10.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v1.4S, v24.s[0] -ldr q20, [x0, #176] -mla v3.4S, v12.4S, v31.s[0] -sqrdmulh v12.4S, v20.4S, v24.s[0] -ldr q6, [x17, #+224] -ldr q5, [x17, #+240] -mla v29.4S, v28.4S, v31.s[0] -sqrdmulh v28.4S, v18.4S, v5.s[0] -ldr q16, [x0, #240] -mla v27.4S, v26.4S, v31.s[0] -sqrdmulh v26.4S, v16.4S, v5.s[0] -ldr q11, [x0, #0] -ldr q2, [x0, #128] -mul v1.4S, v1.4S,v25.s[0] -sub v23.4s, v11.4s, v10.4s -ldr q0, [x0, #16] -mul v20.4S, v20.4S,v25.s[0] -add v11.4s, v11.4s, v10.4s -ldr q10, [x0, #144] -mla v1.4S, v15.4S, v31.s[0] -sub v15.4s, v0.4s, v3.4s -ldr q13, [x0, #64] -mla v20.4S, v12.4S, v31.s[0] -add v0.4s, v0.4s, v3.4s -ldr q3, [x0, #192] -mul v18.4S, v18.4S,v6.s[0] -sub v12.4s, v13.4s, v29.4s -ldr q14, [x0, #80] -mul v16.4S, v16.4S,v6.s[0] -add v13.4s, v13.4s, v29.4s -ldr q29, [x0, #208] -mla v18.4S, v28.4S, v31.s[0] -nop -mla v16.4S, v26.4S, v31.s[0] -sub v26.4s, v14.4s, v27.4s -sqrdmulh v28.4S, v0.4S, v22.s[1] -add v14.4s, v14.4s, v27.4s -mul v0.4S, v0.4S,v21.s[1] -nop -sqrdmulh v27.4S, v15.4S, v22.s[2] -sub v19.4s, v2.4s, v1.4s -mul v15.4S, v15.4S,v21.s[2] -add v2.4s, v2.4s, v1.4s -sqrdmulh v22.4S, v14.4S, v30.s[1] -sub v21.4s, v10.4s, v20.4s -mul v14.4S, v14.4S,v4.s[1] -add v10.4s, v10.4s, v20.4s -sqrdmulh v20.4S, v26.4S, v30.s[2] -sub v1.4s, v3.4s, v18.4s -mul v26.4S, v26.4S,v4.s[2] -add v3.4s, v3.4s, v18.4s -mla v0.4S, v28.4S, v31.s[0] -sub v28.4s, v29.4s, v16.4s -ldr q30, [x0, #480] -sqrdmulh v4.4S, v10.4S, v24.s[1] -add v29.4s, v29.4s, v16.4s -mla v15.4S, v27.4S, v31.s[0] -ldr q27, [x0, #416] -sqrdmulh v16.4S, v21.4S, v24.s[2] -sub v18.4s, v11.4s, v0.4s -mla v14.4S, v22.4S, v31.s[0] -ldr q22, [x0, #288] -sqrdmulh v17.4S, v29.4S, v5.s[1] -add v11.4s, v11.4s, v0.4s -str q18, [x0, #16] -mla v26.4S, v20.4S, v31.s[0] -ldr q20, [x17, #+256] -ldr q18, [x17, #+272] -sqrdmulh v0.4S, v28.4S, v5.s[2] -sub v7.4s, v23.4s, v15.4s -str q11, [x0, #0] -mul v10.4S, v10.4S,v25.s[1] -add v23.4s, v23.4s, v15.4s -mul v21.4S, v21.4S,v25.s[2] -str q7, [x0, #48] -mla v10.4S, v4.4S, v31.s[0] -sub v4.4s, v13.4s, v14.4s -mla v21.4S, v16.4S, v31.s[0] -str q23, [x0, #32] -mul v29.4S, v29.4S,v6.s[1] -str q4, [x0, #80] -mul v28.4S, v28.4S,v6.s[2] -add v13.4s, v13.4s, v14.4s -str q13, [x0, #64] -mla v29.4S, v17.4S, v31.s[0] -sub v17.4s, v12.4s, v26.4s -str q17, [x0, #112] -mla v28.4S, v0.4S, v31.s[0] -add v12.4s, v12.4s, v26.4s -str q12, [x0, #96] -sqrdmulh v5.4S, v22.4S, v18.s[0] -sub v6.4s, v2.4s, v10.4s -mul v22.4S, v22.4S,v20.s[0] -str q6, [x0, #144] -ldr q6, [x0, #304] -sqrdmulh v12.4S, v6.4S, v18.s[0] -add v2.4s, v2.4s, v10.4s -mul v6.4S, v6.4S,v20.s[0] -str q2, [x0, #128] -ldr q2, [x17, #+288] -ldr q10, [x17, #+304] -ldr q26, [x0, #352] -sqrdmulh v0.4S, v26.4S, v10.s[0] -sub v17.4s, v19.4s, v21.4s -mul v26.4S, v26.4S,v2.s[0] -str q17, [x0, #176] -ldr q17, [x0, #368] -sqrdmulh v13.4S, v17.4S, v10.s[0] -add v19.4s, v19.4s, v21.4s -mul v17.4S, v17.4S,v2.s[0] -str q19, [x0, #160] -ldr q19, [x17, #+320] -ldr q21, [x17, #+336] -mla v22.4S, v5.4S, v31.s[0] -sub v5.4s, v3.4s, v29.4s -sqrdmulh v14.4S, v27.4S, v21.s[0] -str q5, [x0, #208] -ldr q5, [x0, #432] -mla v6.4S, v12.4S, v31.s[0] -add v3.4s, v3.4s, v29.4s -sqrdmulh v29.4S, v5.4S, v21.s[0] -str q3, [x0, #192] -ldr q3, [x17, #+352] -ldr q12, [x17, #+368] -mla v26.4S, v0.4S, v31.s[0] -sub v0.4s, v1.4s, v28.4s -sqrdmulh v4.4S, v30.4S, v12.s[0] -str q0, [x0, #240] -ldr q0, [x0, #496] -mla v17.4S, v13.4S, v31.s[0] -add v1.4s, v1.4s, v28.4s -sqrdmulh v28.4S, v0.4S, v12.s[0] -str q1, [x0, #224] -ldr q1, [x0, #256] -ldr q13, [x0, #384] -mul v27.4S, v27.4S,v19.s[0] -sub v24.4s, v1.4s, v22.4s -ldr q25, [x0, #272] -mul v5.4S, v5.4S,v19.s[0] -add v1.4s, v1.4s, v22.4s -ldr q22, [x0, #400] -mla v27.4S, v14.4S, v31.s[0] -sub v14.4s, v25.4s, v6.4s -ldr q23, [x0, #320] -mla v5.4S, v29.4S, v31.s[0] -add v25.4s, v25.4s, v6.4s -ldr q6, [x0, #448] -mul v30.4S, v30.4S,v3.s[0] -sub v29.4s, v23.4s, v26.4s -ldr q16, [x0, #336] -mul v0.4S, v0.4S,v3.s[0] -add v23.4s, v23.4s, v26.4s -ldr q26, [x0, #464] -mla v30.4S, v4.4S, v31.s[0] -nop -mla v0.4S, v28.4S, v31.s[0] -sub v28.4s, v16.4s, v17.4s -sqrdmulh v4.4S, v25.4S, v18.s[1] -add v16.4s, v16.4s, v17.4s -mul v25.4S, v25.4S,v20.s[1] -nop -sqrdmulh v17.4S, v14.4S, v18.s[2] -sub v7.4s, v13.4s, v27.4s -mul v14.4S, v14.4S,v20.s[2] -add v13.4s, v13.4s, v27.4s -sqrdmulh v18.4S, v16.4S, v10.s[1] -sub v20.4s, v22.4s, v5.4s -mul v16.4S, v16.4S,v2.s[1] -add v22.4s, v22.4s, v5.4s -sqrdmulh v5.4S, v28.4S, v10.s[2] -sub v27.4s, v6.4s, v30.4s -mul v28.4S, v28.4S,v2.s[2] -add v6.4s, v6.4s, v30.4s -mla v25.4S, v4.4S, v31.s[0] -sub v4.4s, v26.4s, v0.4s -ldr q10, [x0, #736] -sqrdmulh v2.4S, v22.4S, v21.s[1] -add v26.4s, v26.4s, v0.4s -mla v14.4S, v17.4S, v31.s[0] -ldr q17, [x0, #672] -sqrdmulh v0.4S, v20.4S, v21.s[2] -sub v30.4s, v1.4s, v25.4s -mla v16.4S, v18.4S, v31.s[0] -ldr q18, [x0, #544] -sqrdmulh v15.4S, v26.4S, v12.s[1] -add v1.4s, v1.4s, v25.4s -str q30, [x0, #272] -mla v28.4S, v5.4S, v31.s[0] -ldr q5, [x17, #+384] -ldr q30, [x17, #+400] -sqrdmulh v25.4S, v4.4S, v12.s[2] -sub v11.4s, v24.4s, v14.4s -str q1, [x0, #256] -mul v22.4S, v22.4S,v19.s[1] -add v24.4s, v24.4s, v14.4s -mul v20.4S, v20.4S,v19.s[2] -str q11, [x0, #304] -mla v22.4S, v2.4S, v31.s[0] -sub v2.4s, v23.4s, v16.4s -mla v20.4S, v0.4S, v31.s[0] -str q24, [x0, #288] -mul v26.4S, v26.4S,v3.s[1] -str q2, [x0, #336] -mul v4.4S, v4.4S,v3.s[2] -add v23.4s, v23.4s, v16.4s -str q23, [x0, #320] -mla v26.4S, v15.4S, v31.s[0] -sub v15.4s, v29.4s, v28.4s -str q15, [x0, #368] -mla v4.4S, v25.4S, v31.s[0] -add v29.4s, v29.4s, v28.4s -str q29, [x0, #352] -sqrdmulh v12.4S, v18.4S, v30.s[0] -sub v3.4s, v13.4s, v22.4s -mul v18.4S, v18.4S,v5.s[0] -str q3, [x0, #400] -ldr q3, [x0, #560] -sqrdmulh v29.4S, v3.4S, v30.s[0] -add v13.4s, v13.4s, v22.4s -mul v3.4S, v3.4S,v5.s[0] -str q13, [x0, #384] -ldr q13, [x17, #+416] -ldr q22, [x17, #+432] -ldr q28, [x0, #608] -sqrdmulh v25.4S, v28.4S, v22.s[0] -sub v15.4s, v7.4s, v20.4s -mul v28.4S, v28.4S,v13.s[0] -str q15, [x0, #432] -ldr q15, [x0, #624] -sqrdmulh v23.4S, v15.4S, v22.s[0] -add v7.4s, v7.4s, v20.4s -mul v15.4S, v15.4S,v13.s[0] -str q7, [x0, #416] -ldr q7, [x17, #+448] -ldr q20, [x17, #+464] -mla v18.4S, v12.4S, v31.s[0] -sub v12.4s, v6.4s, v26.4s -sqrdmulh v16.4S, v17.4S, v20.s[0] -str q12, [x0, #464] -ldr q12, [x0, #688] -mla v3.4S, v29.4S, v31.s[0] -add v6.4s, v6.4s, v26.4s -sqrdmulh v26.4S, v12.4S, v20.s[0] -str q6, [x0, #448] -ldr q6, [x17, #+480] -ldr q29, [x17, #+496] -mla v28.4S, v25.4S, v31.s[0] -sub v25.4s, v27.4s, v4.4s -sqrdmulh v2.4S, v10.4S, v29.s[0] -str q25, [x0, #496] -ldr q25, [x0, #752] -mla v15.4S, v23.4S, v31.s[0] -add v27.4s, v27.4s, v4.4s -sqrdmulh v4.4S, v25.4S, v29.s[0] -str q27, [x0, #480] -ldr q27, [x0, #512] -ldr q23, [x0, #640] -mul v17.4S, v17.4S,v7.s[0] -sub v21.4s, v27.4s, v18.4s -ldr q19, [x0, #528] -mul v12.4S, v12.4S,v7.s[0] -add v27.4s, v27.4s, v18.4s -ldr q18, [x0, #656] -mla v17.4S, v16.4S, v31.s[0] -sub v16.4s, v19.4s, v3.4s -ldr q24, [x0, #576] -mla v12.4S, v26.4S, v31.s[0] -add v19.4s, v19.4s, v3.4s -ldr q3, [x0, #704] -mul v10.4S, v10.4S,v6.s[0] -sub v26.4s, v24.4s, v28.4s -ldr q0, [x0, #592] -mul v25.4S, v25.4S,v6.s[0] -add v24.4s, v24.4s, v28.4s -ldr q28, [x0, #720] -mla v10.4S, v2.4S, v31.s[0] -nop -mla v25.4S, v4.4S, v31.s[0] -sub v4.4s, v0.4s, v15.4s -sqrdmulh v2.4S, v19.4S, v30.s[1] -add v0.4s, v0.4s, v15.4s -mul v19.4S, v19.4S,v5.s[1] -nop -sqrdmulh v15.4S, v16.4S, v30.s[2] -sub v11.4s, v23.4s, v17.4s -mul v16.4S, v16.4S,v5.s[2] -add v23.4s, v23.4s, v17.4s -sqrdmulh v30.4S, v0.4S, v22.s[1] -sub v5.4s, v18.4s, v12.4s -mul v0.4S, v0.4S,v13.s[1] -add v18.4s, v18.4s, v12.4s -sqrdmulh v12.4S, v4.4S, v22.s[2] -sub v17.4s, v3.4s, v10.4s -mul v4.4S, v4.4S,v13.s[2] -add v3.4s, v3.4s, v10.4s -mla v19.4S, v2.4S, v31.s[0] -sub v2.4s, v28.4s, v25.4s -ldr q22, [x0, #992] -sqrdmulh v13.4S, v18.4S, v20.s[1] -add v28.4s, v28.4s, v25.4s -mla v16.4S, v15.4S, v31.s[0] -ldr q15, [x0, #928] -sqrdmulh v25.4S, v5.4S, v20.s[2] -sub v10.4s, v27.4s, v19.4s -mla v0.4S, v30.4S, v31.s[0] -ldr q30, [x0, #800] -sqrdmulh v14.4S, v28.4S, v29.s[1] -add v27.4s, v27.4s, v19.4s -str q10, [x0, #528] -mla v4.4S, v12.4S, v31.s[0] -ldr q12, [x17, #+512] -ldr q10, [x17, #+528] -sqrdmulh v19.4S, v2.4S, v29.s[2] -sub v1.4s, v21.4s, v16.4s -str q27, [x0, #512] -mul v18.4S, v18.4S,v7.s[1] -add v21.4s, v21.4s, v16.4s -mul v5.4S, v5.4S,v7.s[2] -str q1, [x0, #560] -mla v18.4S, v13.4S, v31.s[0] -sub v13.4s, v24.4s, v0.4s -mla v5.4S, v25.4S, v31.s[0] -str q21, [x0, #544] -mul v28.4S, v28.4S,v6.s[1] -str q13, [x0, #592] -mul v2.4S, v2.4S,v6.s[2] -add v24.4s, v24.4s, v0.4s -str q24, [x0, #576] -mla v28.4S, v14.4S, v31.s[0] -sub v14.4s, v26.4s, v4.4s -str q14, [x0, #624] -mla v2.4S, v19.4S, v31.s[0] -add v26.4s, v26.4s, v4.4s -str q26, [x0, #608] -sqrdmulh v29.4S, v30.4S, v10.s[0] -sub v6.4s, v23.4s, v18.4s -mul v30.4S, v30.4S,v12.s[0] -str q6, [x0, #656] -ldr q6, [x0, #816] -sqrdmulh v26.4S, v6.4S, v10.s[0] -add v23.4s, v23.4s, v18.4s -mul v6.4S, v6.4S,v12.s[0] -str q23, [x0, #640] -ldr q23, [x17, #+544] -ldr q18, [x17, #+560] -ldr q4, [x0, #864] -sqrdmulh v19.4S, v4.4S, v18.s[0] -sub v14.4s, v11.4s, v5.4s -mul v4.4S, v4.4S,v23.s[0] -str q14, [x0, #688] -ldr q14, [x0, #880] -sqrdmulh v24.4S, v14.4S, v18.s[0] -add v11.4s, v11.4s, v5.4s -mul v14.4S, v14.4S,v23.s[0] -str q11, [x0, #672] -ldr q11, [x17, #+576] -ldr q5, [x17, #+592] -mla v30.4S, v29.4S, v31.s[0] -sub v29.4s, v3.4s, v28.4s -sqrdmulh v0.4S, v15.4S, v5.s[0] -str q29, [x0, #720] -ldr q29, [x0, #944] -mla v6.4S, v26.4S, v31.s[0] -add v3.4s, v3.4s, v28.4s -sqrdmulh v28.4S, v29.4S, v5.s[0] -str q3, [x0, #704] -ldr q3, [x17, #+608] -ldr q26, [x17, #+624] -mla v4.4S, v19.4S, v31.s[0] -sub v19.4s, v17.4s, v2.4s -sqrdmulh v13.4S, v22.4S, v26.s[0] -str q19, [x0, #752] -ldr q19, [x0, #1008] -mla v14.4S, v24.4S, v31.s[0] -add v17.4s, v17.4s, v2.4s -sqrdmulh v2.4S, v19.4S, v26.s[0] -str q17, [x0, #736] -ldr q17, [x0, #768] -ldr q24, [x0, #896] -mul v15.4S, v15.4S,v11.s[0] -sub v20.4s, v17.4s, v30.4s -ldr q7, [x0, #784] -mul v29.4S, v29.4S,v11.s[0] -add v17.4s, v17.4s, v30.4s -ldr q30, [x0, #912] -mla v15.4S, v0.4S, v31.s[0] -sub v0.4s, v7.4s, v6.4s -ldr q21, [x0, #832] -mla v29.4S, v28.4S, v31.s[0] -add v7.4s, v7.4s, v6.4s -ldr q6, [x0, #960] -mul v22.4S, v22.4S,v3.s[0] -sub v28.4s, v21.4s, v4.4s -ldr q25, [x0, #848] -mul v19.4S, v19.4S,v3.s[0] -add v21.4s, v21.4s, v4.4s -ldr q4, [x0, #976] -mla v22.4S, v13.4S, v31.s[0] -nop -mla v19.4S, v2.4S, v31.s[0] -sub v2.4s, v25.4s, v14.4s -sqrdmulh v13.4S, v7.4S, v10.s[1] -add v25.4s, v25.4s, v14.4s -mul v7.4S, v7.4S,v12.s[1] -nop -sqrdmulh v14.4S, v0.4S, v10.s[2] -sub v1.4s, v24.4s, v15.4s -mul v0.4S, v0.4S,v12.s[2] -add v24.4s, v24.4s, v15.4s -sqrdmulh v10.4S, v25.4S, v18.s[1] -sub v12.4s, v30.4s, v29.4s -mul v25.4S, v25.4S,v23.s[1] -add v30.4s, v30.4s, v29.4s -sqrdmulh v29.4S, v2.4S, v18.s[2] -sub v15.4s, v6.4s, v22.4s -mul v2.4S, v2.4S,v23.s[2] -add v6.4s, v6.4s, v22.4s -mla v7.4S, v13.4S, v31.s[0] -sub v13.4s, v4.4s, v19.4s -sqrdmulh v18.4S, v30.4S, v5.s[1] -add v4.4s, v4.4s, v19.4s -mla v0.4S, v14.4S, v31.s[0] -sqrdmulh v14.4S, v12.4S, v5.s[2] -sub v19.4s, v17.4s, v7.4s -mla v25.4S, v10.4S, v31.s[0] -sqrdmulh v10.4S, v4.4S, v26.s[1] -add v17.4s, v17.4s, v7.4s -str q19, [x0, #784] -mla v2.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v13.4S, v26.s[2] -sub v19.4s, v20.4s, v0.4s -str q17, [x0, #768] -mul v30.4S, v30.4S,v11.s[1] -add v20.4s, v20.4s, v0.4s -mul v12.4S, v12.4S,v11.s[2] -str q19, [x0, #816] -mla v30.4S, v18.4S, v31.s[0] -sub v18.4s, v21.4s, v25.4s -mla v12.4S, v14.4S, v31.s[0] -str q20, [x0, #800] -mul v4.4S, v4.4S,v3.s[1] -str q18, [x0, #848] -mul v13.4S, v13.4S,v3.s[2] -add v21.4s, v21.4s, v25.4s -str q21, [x0, #832] -mla v4.4S, v10.4S, v31.s[0] -sub v10.4s, v28.4s, v2.4s -str q10, [x0, #880] -mla v13.4S, v29.4S, v31.s[0] -add v28.4s, v28.4s, v2.4s -str q28, [x0, #864] -sub v26.4s, v24.4s, v30.4s -str q26, [x0, #912] -add v24.4s, v24.4s, v30.4s -str q24, [x0, #896] -sub v24.4s, v1.4s, v12.4s -str q24, [x0, #944] -add v1.4s, v1.4s, v12.4s -str q1, [x0, #928] -sub v1.4s, v6.4s, v4.4s -str q1, [x0, #976] -add v6.4s, v6.4s, v4.4s -str q6, [x0, #960] -sub v6.4s, v15.4s, v13.4s -str q6, [x0, #1008] -add v15.4s, v15.4s, v13.4s -str q15, [x0, #992] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1472 -// Instruction count: 1468 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_8_z4_7.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_8_z4_7.s deleted file mode 100644 index 5cdf1c8..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_8_z4_7.s +++ /dev/null @@ -1,1494 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_8_z4_7 -.global _ntt_u32_incomplete_neon_asm_var_4_2_8_z4_7 -ntt_u32_incomplete_neon_asm_var_4_2_8_z4_7: -_ntt_u32_incomplete_neon_asm_var_4_2_8_z4_7: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #928] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #992] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q18, [x0, #800] -sqrdmulh v17.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -ldr q16, [x0, #864] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -mla v22.4S, v21.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -mla v18.4S, v17.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -ldr q3, [x0, #544] -sqrdmulh v17.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -ldr q19, [x0, #608] -sqrdmulh v21.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q2, [x0, #672] -ldr q1, [x0, #416] -sqrdmulh v0.4S, v2.4S, v29.s[0] -sub v15.4s, v1.4s, v22.4s -mul v2.4S, v2.4S,v30.s[0] -add v1.4s, v1.4s, v22.4s -ldr q22, [x0, #736] -ldr q14, [x0, #480] -sqrdmulh v13.4S, v22.4S, v29.s[0] -sub v12.4s, v14.4s, v20.4s -mul v22.4S, v22.4S,v30.s[0] -add v14.4s, v14.4s, v20.4s -ldr q20, [x0, #288] -mla v3.4S, v17.4S, v31.s[0] -sub v17.4s, v20.4s, v18.4s -mla v19.4S, v21.4S, v31.s[0] -mla v2.4S, v0.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -mla v22.4S, v13.4S, v31.s[0] -ldr q13, [x0, #352] -sqrdmulh v18.4S, v1.4S, v29.s[1] -sub v0.4s, v13.4s, v16.4s -mul v1.4S, v1.4S,v30.s[1] -sqrdmulh v21.4S, v14.4S, v29.s[1] -add v13.4s, v13.4s, v16.4s -mul v14.4S, v14.4S,v30.s[1] -ldr q16, [x0, #32] -sqrdmulh v11.4S, v20.4S, v29.s[1] -sub v10.4s, v16.4s, v3.4s -mul v20.4S, v20.4S,v30.s[1] -add v16.4s, v16.4s, v3.4s -ldr q3, [x0, #96] -sqrdmulh v9.4S, v13.4S, v29.s[1] -sub v8.4s, v3.4s, v19.4s -mul v13.4S, v13.4S,v30.s[1] -add v3.4s, v3.4s, v19.4s -ldr q19, [x0, #160] -mla v1.4S, v18.4S, v31.s[0] -sub v18.4s, v19.4s, v2.4s -mla v14.4S, v21.4S, v31.s[0] -mla v20.4S, v11.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -mla v13.4S, v9.4S, v31.s[0] -ldr q9, [x0, #224] -sqrdmulh v2.4S, v15.4S, v29.s[2] -sub v11.4s, v9.4s, v22.4s -mul v15.4S, v15.4S,v30.s[2] -sqrdmulh v21.4S, v12.4S, v29.s[2] -add v9.4s, v9.4s, v22.4s -mul v12.4S, v12.4S,v30.s[2] -sqrdmulh v22.4S, v17.4S, v29.s[2] -sub v7.4s, v19.4s, v1.4s -mul v17.4S, v17.4S,v30.s[2] -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v29.s[2] -sub v6.4s, v9.4s, v14.4s -mul v0.4S, v0.4S,v30.s[2] -add v9.4s, v9.4s, v14.4s -mla v15.4S, v2.4S, v31.s[0] -sub v2.4s, v16.4s, v20.4s -mla v12.4S, v21.4S, v31.s[0] -mla v17.4S, v22.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -mla v0.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v7.4S, v27.s[1] -sub v20.4s, v3.4s, v13.4s -mul v7.4S, v7.4S,v28.s[1] -sqrdmulh v22.4S, v6.4S, v27.s[1] -add v3.4s, v3.4s, v13.4s -mul v6.4S, v6.4S,v28.s[1] -sqrdmulh v13.4S, v19.4S, v27.s[0] -sub v21.4s, v18.4s, v15.4s -mul v19.4S, v19.4S,v28.s[0] -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v9.4S, v27.s[0] -sub v14.4s, v11.4s, v12.4s -mul v9.4S, v9.4S,v28.s[0] -add v11.4s, v11.4s, v12.4s -mla v7.4S, v1.4S, v31.s[0] -sub v1.4s, v10.4s, v17.4s -mla v6.4S, v22.4S, v31.s[0] -mla v19.4S, v13.4S, v31.s[0] -add v10.4s, v10.4s, v17.4s -mla v9.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v18.4S, v27.s[2] -sub v17.4s, v8.4s, v0.4s -mul v18.4S, v18.4S,v28.s[2] -sqrdmulh v13.4S, v11.4S, v27.s[2] -add v8.4s, v8.4s, v0.4s -mul v11.4S, v11.4S,v28.s[2] -sqrdmulh v0.4S, v21.4S, v27.s[3] -sub v22.4s, v2.4s, v7.4s -mul v21.4S, v21.4S,v28.s[3] -add v2.4s, v2.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[3] -sub v12.4s, v20.4s, v6.4s -mul v14.4S, v14.4S,v28.s[3] -add v20.4s, v20.4s, v6.4s -mla v18.4S, v15.4S, v31.s[0] -sub v15.4s, v16.4s, v19.4s -mla v11.4S, v13.4S, v31.s[0] -mla v21.4S, v0.4S, v31.s[0] -add v16.4s, v16.4s, v19.4s -mla v14.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v20.4S, v25.s[2] -sub v19.4s, v3.4s, v9.4s -mul v20.4S, v20.4S,v26.s[2] -sqrdmulh v0.4S, v12.4S, v25.s[3] -add v3.4s, v3.4s, v9.4s -mul v12.4S, v12.4S,v26.s[3] -sqrdmulh v9.4S, v19.4S, v25.s[1] -sub v13.4s, v10.4s, v18.4s -mul v19.4S, v19.4S,v26.s[1] -add v10.4s, v10.4s, v18.4s -sqrdmulh v18.4S, v3.4S, v25.s[0] -sub v6.4s, v8.4s, v11.4s -mul v3.4S, v3.4S,v26.s[0] -add v8.4s, v8.4s, v11.4s -mla v20.4S, v7.4S, v31.s[0] -sub v7.4s, v1.4s, v21.4s -mla v12.4S, v0.4S, v31.s[0] -mla v19.4S, v9.4S, v31.s[0] -add v1.4s, v1.4s, v21.4s -mla v3.4S, v18.4S, v31.s[0] -sqrdmulh v18.4S, v8.4S, v23.s[0] -sub v21.4s, v17.4s, v14.4s -mul v8.4S, v8.4S,v24.s[0] -sqrdmulh v9.4S, v6.4S, v23.s[1] -add v17.4s, v17.4s, v14.4s -mul v6.4S, v6.4S,v24.s[1] -sqrdmulh v14.4S, v17.4S, v23.s[2] -sub v0.4s, v2.4s, v20.4s -mul v17.4S, v17.4S,v24.s[2] -add v2.4s, v2.4s, v20.4s -sqrdmulh v20.4S, v21.4S, v23.s[3] -sub v11.4s, v22.4s, v12.4s -mul v21.4S, v21.4S,v24.s[3] -add v22.4s, v22.4s, v12.4s -mla v8.4S, v18.4S, v31.s[0] -sub v18.4s, v15.4s, v19.4s -mla v6.4S, v9.4S, v31.s[0] -str q2, [x0, #288] -mla v17.4S, v14.4S, v31.s[0] -add v15.4s, v15.4s, v19.4s -mla v21.4S, v20.4S, v31.s[0] -str q0, [x0, #352] -ldr q0, [x0, #944] -sqrdmulh v20.4S, v0.4S, v29.s[0] -sub v19.4s, v16.4s, v3.4s -mul v0.4S, v0.4S,v30.s[0] -str q22, [x0, #416] -ldr q22, [x0, #1008] -sqrdmulh v14.4S, v22.4S, v29.s[0] -add v16.4s, v16.4s, v3.4s -mul v22.4S, v22.4S,v30.s[0] -str q11, [x0, #480] -ldr q11, [x0, #816] -sqrdmulh v3.4S, v11.4S, v29.s[0] -sub v2.4s, v10.4s, v8.4s -mul v11.4S, v11.4S,v30.s[0] -add v10.4s, v10.4s, v8.4s -ldr q8, [x0, #880] -sqrdmulh v9.4S, v8.4S, v29.s[0] -sub v12.4s, v13.4s, v6.4s -mul v8.4S, v8.4S,v30.s[0] -add v13.4s, v13.4s, v6.4s -mla v0.4S, v20.4S, v31.s[0] -sub v20.4s, v1.4s, v17.4s -mla v22.4S, v14.4S, v31.s[0] -str q15, [x0, #160] -mla v11.4S, v3.4S, v31.s[0] -add v1.4s, v1.4s, v17.4s -mla v8.4S, v9.4S, v31.s[0] -str q18, [x0, #224] -ldr q18, [x0, #560] -sqrdmulh v9.4S, v18.4S, v29.s[0] -sub v17.4s, v7.4s, v21.4s -mul v18.4S, v18.4S,v30.s[0] -str q16, [x0, #32] -ldr q16, [x0, #624] -sqrdmulh v3.4S, v16.4S, v29.s[0] -add v7.4s, v7.4s, v21.4s -mul v16.4S, v16.4S,v30.s[0] -str q19, [x0, #96] -ldr q19, [x0, #688] -ldr q21, [x0, #432] -sqrdmulh v15.4S, v19.4S, v29.s[0] -sub v14.4s, v21.4s, v0.4s -mul v19.4S, v19.4S,v30.s[0] -add v21.4s, v21.4s, v0.4s -ldr q0, [x0, #752] -ldr q6, [x0, #496] -sqrdmulh v5.4S, v0.4S, v29.s[0] -sub v4.4s, v6.4s, v22.4s -mul v0.4S, v0.4S,v30.s[0] -add v6.4s, v6.4s, v22.4s -ldr q22, [x0, #304] -mla v18.4S, v9.4S, v31.s[0] -sub v9.4s, v22.4s, v11.4s -mla v16.4S, v3.4S, v31.s[0] -str q10, [x0, #544] -mla v19.4S, v15.4S, v31.s[0] -add v22.4s, v22.4s, v11.4s -mla v0.4S, v5.4S, v31.s[0] -str q2, [x0, #608] -ldr q2, [x0, #368] -sqrdmulh v5.4S, v21.4S, v29.s[1] -sub v11.4s, v2.4s, v8.4s -mul v21.4S, v21.4S,v30.s[1] -str q13, [x0, #672] -sqrdmulh v13.4S, v6.4S, v29.s[1] -add v2.4s, v2.4s, v8.4s -mul v6.4S, v6.4S,v30.s[1] -str q12, [x0, #736] -ldr q12, [x0, #48] -sqrdmulh v8.4S, v22.4S, v29.s[1] -sub v15.4s, v12.4s, v18.4s -mul v22.4S, v22.4S,v30.s[1] -add v12.4s, v12.4s, v18.4s -ldr q18, [x0, #112] -sqrdmulh v10.4S, v2.4S, v29.s[1] -sub v3.4s, v18.4s, v16.4s -mul v2.4S, v2.4S,v30.s[1] -add v18.4s, v18.4s, v16.4s -ldr q16, [x0, #176] -mla v21.4S, v5.4S, v31.s[0] -sub v5.4s, v16.4s, v19.4s -mla v6.4S, v13.4S, v31.s[0] -str q1, [x0, #800] -mla v22.4S, v8.4S, v31.s[0] -add v16.4s, v16.4s, v19.4s -mla v2.4S, v10.4S, v31.s[0] -str q20, [x0, #864] -ldr q20, [x0, #240] -sqrdmulh v10.4S, v14.4S, v29.s[2] -sub v19.4s, v20.4s, v0.4s -mul v14.4S, v14.4S,v30.s[2] -str q7, [x0, #928] -sqrdmulh v7.4S, v4.4S, v29.s[2] -add v20.4s, v20.4s, v0.4s -mul v4.4S, v4.4S,v30.s[2] -str q17, [x0, #992] -sqrdmulh v17.4S, v9.4S, v29.s[2] -sub v0.4s, v16.4s, v21.4s -mul v9.4S, v9.4S,v30.s[2] -add v16.4s, v16.4s, v21.4s -sqrdmulh v21.4S, v11.4S, v29.s[2] -sub v8.4s, v20.4s, v6.4s -mul v11.4S, v11.4S,v30.s[2] -add v20.4s, v20.4s, v6.4s -mla v14.4S, v10.4S, v31.s[0] -sub v10.4s, v12.4s, v22.4s -mla v4.4S, v7.4S, v31.s[0] -mla v9.4S, v17.4S, v31.s[0] -add v12.4s, v12.4s, v22.4s -mla v11.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v0.4S, v27.s[1] -sub v22.4s, v18.4s, v2.4s -mul v0.4S, v0.4S,v28.s[1] -sqrdmulh v17.4S, v8.4S, v27.s[1] -add v18.4s, v18.4s, v2.4s -mul v8.4S, v8.4S,v28.s[1] -sqrdmulh v2.4S, v16.4S, v27.s[0] -sub v7.4s, v5.4s, v14.4s -mul v16.4S, v16.4S,v28.s[0] -add v5.4s, v5.4s, v14.4s -sqrdmulh v14.4S, v20.4S, v27.s[0] -sub v6.4s, v19.4s, v4.4s -mul v20.4S, v20.4S,v28.s[0] -add v19.4s, v19.4s, v4.4s -mla v0.4S, v21.4S, v31.s[0] -sub v21.4s, v15.4s, v9.4s -mla v8.4S, v17.4S, v31.s[0] -mla v16.4S, v2.4S, v31.s[0] -add v15.4s, v15.4s, v9.4s -mla v20.4S, v14.4S, v31.s[0] -sqrdmulh v14.4S, v5.4S, v27.s[2] -sub v9.4s, v3.4s, v11.4s -mul v5.4S, v5.4S,v28.s[2] -sqrdmulh v2.4S, v19.4S, v27.s[2] -add v3.4s, v3.4s, v11.4s -mul v19.4S, v19.4S,v28.s[2] -sqrdmulh v11.4S, v7.4S, v27.s[3] -sub v17.4s, v10.4s, v0.4s -mul v7.4S, v7.4S,v28.s[3] -add v10.4s, v10.4s, v0.4s -sqrdmulh v0.4S, v6.4S, v27.s[3] -sub v4.4s, v22.4s, v8.4s -mul v6.4S, v6.4S,v28.s[3] -add v22.4s, v22.4s, v8.4s -mla v5.4S, v14.4S, v31.s[0] -sub v14.4s, v12.4s, v16.4s -mla v19.4S, v2.4S, v31.s[0] -mla v7.4S, v11.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -mla v6.4S, v0.4S, v31.s[0] -sqrdmulh v0.4S, v22.4S, v25.s[2] -sub v16.4s, v18.4s, v20.4s -mul v22.4S, v22.4S,v26.s[2] -sqrdmulh v11.4S, v4.4S, v25.s[3] -add v18.4s, v18.4s, v20.4s -mul v4.4S, v4.4S,v26.s[3] -sqrdmulh v20.4S, v16.4S, v25.s[1] -sub v2.4s, v15.4s, v5.4s -mul v16.4S, v16.4S,v26.s[1] -add v15.4s, v15.4s, v5.4s -sqrdmulh v5.4S, v18.4S, v25.s[0] -sub v8.4s, v3.4s, v19.4s -mul v18.4S, v18.4S,v26.s[0] -add v3.4s, v3.4s, v19.4s -mla v22.4S, v0.4S, v31.s[0] -sub v0.4s, v21.4s, v7.4s -mla v4.4S, v11.4S, v31.s[0] -mla v16.4S, v20.4S, v31.s[0] -add v21.4s, v21.4s, v7.4s -mla v18.4S, v5.4S, v31.s[0] -sqrdmulh v5.4S, v3.4S, v23.s[0] -sub v7.4s, v9.4s, v6.4s -mul v3.4S, v3.4S,v24.s[0] -sqrdmulh v20.4S, v8.4S, v23.s[1] -add v9.4s, v9.4s, v6.4s -mul v8.4S, v8.4S,v24.s[1] -sqrdmulh v6.4S, v9.4S, v23.s[2] -sub v11.4s, v10.4s, v22.4s -mul v9.4S, v9.4S,v24.s[2] -add v10.4s, v10.4s, v22.4s -sqrdmulh v22.4S, v7.4S, v23.s[3] -sub v19.4s, v17.4s, v4.4s -mul v7.4S, v7.4S,v24.s[3] -add v17.4s, v17.4s, v4.4s -mla v3.4S, v5.4S, v31.s[0] -sub v5.4s, v14.4s, v16.4s -mla v8.4S, v20.4S, v31.s[0] -str q10, [x0, #304] -mla v9.4S, v6.4S, v31.s[0] -add v14.4s, v14.4s, v16.4s -mla v7.4S, v22.4S, v31.s[0] -str q11, [x0, #368] -ldr q11, [x0, #896] -sqrdmulh v22.4S, v11.4S, v29.s[0] -sub v16.4s, v12.4s, v18.4s -mul v11.4S, v11.4S,v30.s[0] -str q17, [x0, #432] -ldr q17, [x0, #960] -sqrdmulh v6.4S, v17.4S, v29.s[0] -add v12.4s, v12.4s, v18.4s -mul v17.4S, v17.4S,v30.s[0] -str q19, [x0, #496] -ldr q19, [x0, #768] -sqrdmulh v18.4S, v19.4S, v29.s[0] -sub v10.4s, v15.4s, v3.4s -mul v19.4S, v19.4S,v30.s[0] -add v15.4s, v15.4s, v3.4s -ldr q3, [x0, #832] -sqrdmulh v20.4S, v3.4S, v29.s[0] -sub v4.4s, v2.4s, v8.4s -mul v3.4S, v3.4S,v30.s[0] -add v2.4s, v2.4s, v8.4s -mla v11.4S, v22.4S, v31.s[0] -sub v22.4s, v21.4s, v9.4s -mla v17.4S, v6.4S, v31.s[0] -str q14, [x0, #176] -mla v19.4S, v18.4S, v31.s[0] -add v21.4s, v21.4s, v9.4s -mla v3.4S, v20.4S, v31.s[0] -str q5, [x0, #240] -ldr q5, [x0, #512] -sqrdmulh v20.4S, v5.4S, v29.s[0] -sub v9.4s, v0.4s, v7.4s -mul v5.4S, v5.4S,v30.s[0] -str q12, [x0, #48] -ldr q12, [x0, #576] -sqrdmulh v18.4S, v12.4S, v29.s[0] -add v0.4s, v0.4s, v7.4s -mul v12.4S, v12.4S,v30.s[0] -str q16, [x0, #112] -ldr q16, [x0, #640] -ldr q7, [x0, #384] -sqrdmulh v14.4S, v16.4S, v29.s[0] -sub v6.4s, v7.4s, v11.4s -mul v16.4S, v16.4S,v30.s[0] -add v7.4s, v7.4s, v11.4s -ldr q11, [x0, #704] -ldr q8, [x0, #448] -sqrdmulh v1.4S, v11.4S, v29.s[0] -sub v13.4s, v8.4s, v17.4s -mul v11.4S, v11.4S,v30.s[0] -add v8.4s, v8.4s, v17.4s -ldr q17, [x0, #256] -mla v5.4S, v20.4S, v31.s[0] -sub v20.4s, v17.4s, v19.4s -mla v12.4S, v18.4S, v31.s[0] -str q15, [x0, #560] -mla v16.4S, v14.4S, v31.s[0] -add v17.4s, v17.4s, v19.4s -mla v11.4S, v1.4S, v31.s[0] -str q10, [x0, #624] -ldr q10, [x0, #320] -sqrdmulh v1.4S, v7.4S, v29.s[1] -sub v19.4s, v10.4s, v3.4s -mul v7.4S, v7.4S,v30.s[1] -str q2, [x0, #688] -sqrdmulh v2.4S, v8.4S, v29.s[1] -add v10.4s, v10.4s, v3.4s -mul v8.4S, v8.4S,v30.s[1] -str q4, [x0, #752] -ldr q4, [x0, #0] -sqrdmulh v3.4S, v17.4S, v29.s[1] -sub v14.4s, v4.4s, v5.4s -mul v17.4S, v17.4S,v30.s[1] -add v4.4s, v4.4s, v5.4s -ldr q5, [x0, #64] -sqrdmulh v15.4S, v10.4S, v29.s[1] -sub v18.4s, v5.4s, v12.4s -mul v10.4S, v10.4S,v30.s[1] -add v5.4s, v5.4s, v12.4s -ldr q12, [x0, #128] -mla v7.4S, v1.4S, v31.s[0] -sub v1.4s, v12.4s, v16.4s -mla v8.4S, v2.4S, v31.s[0] -str q21, [x0, #816] -mla v17.4S, v3.4S, v31.s[0] -add v12.4s, v12.4s, v16.4s -mla v10.4S, v15.4S, v31.s[0] -str q22, [x0, #880] -ldr q22, [x0, #192] -sqrdmulh v15.4S, v6.4S, v29.s[2] -sub v16.4s, v22.4s, v11.4s -mul v6.4S, v6.4S,v30.s[2] -str q0, [x0, #944] -sqrdmulh v0.4S, v13.4S, v29.s[2] -add v22.4s, v22.4s, v11.4s -mul v13.4S, v13.4S,v30.s[2] -str q9, [x0, #1008] -sqrdmulh v9.4S, v20.4S, v29.s[2] -sub v11.4s, v12.4s, v7.4s -mul v20.4S, v20.4S,v30.s[2] -add v12.4s, v12.4s, v7.4s -sqrdmulh v7.4S, v19.4S, v29.s[2] -sub v3.4s, v22.4s, v8.4s -mul v19.4S, v19.4S,v30.s[2] -add v22.4s, v22.4s, v8.4s -mla v6.4S, v15.4S, v31.s[0] -sub v15.4s, v4.4s, v17.4s -mla v13.4S, v0.4S, v31.s[0] -mla v20.4S, v9.4S, v31.s[0] -add v4.4s, v4.4s, v17.4s -mla v19.4S, v7.4S, v31.s[0] -sqrdmulh v7.4S, v11.4S, v27.s[1] -sub v17.4s, v5.4s, v10.4s -mul v11.4S, v11.4S,v28.s[1] -sqrdmulh v9.4S, v3.4S, v27.s[1] -add v5.4s, v5.4s, v10.4s -mul v3.4S, v3.4S,v28.s[1] -sqrdmulh v10.4S, v12.4S, v27.s[0] -sub v0.4s, v1.4s, v6.4s -mul v12.4S, v12.4S,v28.s[0] -add v1.4s, v1.4s, v6.4s -sqrdmulh v6.4S, v22.4S, v27.s[0] -sub v8.4s, v16.4s, v13.4s -mul v22.4S, v22.4S,v28.s[0] -add v16.4s, v16.4s, v13.4s -mla v11.4S, v7.4S, v31.s[0] -sub v7.4s, v14.4s, v20.4s -mla v3.4S, v9.4S, v31.s[0] -mla v12.4S, v10.4S, v31.s[0] -add v14.4s, v14.4s, v20.4s -mla v22.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v1.4S, v27.s[2] -sub v20.4s, v18.4s, v19.4s -mul v1.4S, v1.4S,v28.s[2] -sqrdmulh v10.4S, v16.4S, v27.s[2] -add v18.4s, v18.4s, v19.4s -mul v16.4S, v16.4S,v28.s[2] -sqrdmulh v19.4S, v0.4S, v27.s[3] -sub v9.4s, v15.4s, v11.4s -mul v0.4S, v0.4S,v28.s[3] -add v15.4s, v15.4s, v11.4s -sqrdmulh v11.4S, v8.4S, v27.s[3] -sub v13.4s, v17.4s, v3.4s -mul v8.4S, v8.4S,v28.s[3] -add v17.4s, v17.4s, v3.4s -mla v1.4S, v6.4S, v31.s[0] -sub v6.4s, v4.4s, v12.4s -mla v16.4S, v10.4S, v31.s[0] -mla v0.4S, v19.4S, v31.s[0] -add v4.4s, v4.4s, v12.4s -mla v8.4S, v11.4S, v31.s[0] -sqrdmulh v11.4S, v17.4S, v25.s[2] -sub v12.4s, v5.4s, v22.4s -mul v17.4S, v17.4S,v26.s[2] -sqrdmulh v19.4S, v13.4S, v25.s[3] -add v5.4s, v5.4s, v22.4s -mul v13.4S, v13.4S,v26.s[3] -sqrdmulh v22.4S, v12.4S, v25.s[1] -sub v10.4s, v14.4s, v1.4s -mul v12.4S, v12.4S,v26.s[1] -add v14.4s, v14.4s, v1.4s -sqrdmulh v1.4S, v5.4S, v25.s[0] -sub v3.4s, v18.4s, v16.4s -mul v5.4S, v5.4S,v26.s[0] -add v18.4s, v18.4s, v16.4s -mla v17.4S, v11.4S, v31.s[0] -sub v11.4s, v7.4s, v0.4s -mla v13.4S, v19.4S, v31.s[0] -mla v12.4S, v22.4S, v31.s[0] -add v7.4s, v7.4s, v0.4s -mla v5.4S, v1.4S, v31.s[0] -sqrdmulh v1.4S, v18.4S, v23.s[0] -sub v0.4s, v20.4s, v8.4s -mul v18.4S, v18.4S,v24.s[0] -sqrdmulh v22.4S, v3.4S, v23.s[1] -add v20.4s, v20.4s, v8.4s -mul v3.4S, v3.4S,v24.s[1] -sqrdmulh v8.4S, v20.4S, v23.s[2] -sub v19.4s, v15.4s, v17.4s -mul v20.4S, v20.4S,v24.s[2] -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v0.4S, v23.s[3] -sub v16.4s, v9.4s, v13.4s -mul v0.4S, v0.4S,v24.s[3] -add v9.4s, v9.4s, v13.4s -mla v18.4S, v1.4S, v31.s[0] -sub v1.4s, v6.4s, v12.4s -mla v3.4S, v22.4S, v31.s[0] -str q15, [x0, #256] -mla v20.4S, v8.4S, v31.s[0] -add v6.4s, v6.4s, v12.4s -mla v0.4S, v17.4S, v31.s[0] -str q19, [x0, #320] -ldr q19, [x0, #912] -sqrdmulh v17.4S, v19.4S, v29.s[0] -sub v12.4s, v4.4s, v5.4s -mul v19.4S, v19.4S,v30.s[0] -str q9, [x0, #384] -ldr q9, [x0, #976] -sqrdmulh v8.4S, v9.4S, v29.s[0] -add v4.4s, v4.4s, v5.4s -mul v9.4S, v9.4S,v30.s[0] -str q16, [x0, #448] -ldr q16, [x0, #784] -sqrdmulh v5.4S, v16.4S, v29.s[0] -sub v15.4s, v14.4s, v18.4s -mul v16.4S, v16.4S,v30.s[0] -add v14.4s, v14.4s, v18.4s -ldr q18, [x0, #848] -sqrdmulh v22.4S, v18.4S, v29.s[0] -sub v13.4s, v10.4s, v3.4s -mul v18.4S, v18.4S,v30.s[0] -add v10.4s, v10.4s, v3.4s -mla v19.4S, v17.4S, v31.s[0] -sub v17.4s, v7.4s, v20.4s -mla v9.4S, v8.4S, v31.s[0] -str q6, [x0, #128] -mla v16.4S, v5.4S, v31.s[0] -add v7.4s, v7.4s, v20.4s -mla v18.4S, v22.4S, v31.s[0] -str q1, [x0, #192] -ldr q1, [x0, #528] -sqrdmulh v22.4S, v1.4S, v29.s[0] -sub v20.4s, v11.4s, v0.4s -mul v1.4S, v1.4S,v30.s[0] -str q4, [x0, #0] -ldr q4, [x0, #592] -sqrdmulh v5.4S, v4.4S, v29.s[0] -add v11.4s, v11.4s, v0.4s -mul v4.4S, v4.4S,v30.s[0] -str q12, [x0, #64] -ldr q12, [x0, #656] -ldr q0, [x0, #400] -sqrdmulh v6.4S, v12.4S, v29.s[0] -sub v8.4s, v0.4s, v19.4s -mul v12.4S, v12.4S,v30.s[0] -add v0.4s, v0.4s, v19.4s -ldr q19, [x0, #720] -ldr q3, [x0, #464] -sqrdmulh v21.4S, v19.4S, v29.s[0] -sub v2.4s, v3.4s, v9.4s -mul v19.4S, v19.4S,v30.s[0] -add v3.4s, v3.4s, v9.4s -ldr q9, [x0, #272] -mla v1.4S, v22.4S, v31.s[0] -sub v22.4s, v9.4s, v16.4s -mla v4.4S, v5.4S, v31.s[0] -str q14, [x0, #512] -mla v12.4S, v6.4S, v31.s[0] -add v9.4s, v9.4s, v16.4s -mla v19.4S, v21.4S, v31.s[0] -str q15, [x0, #576] -ldr q15, [x0, #336] -sqrdmulh v21.4S, v0.4S, v29.s[1] -sub v16.4s, v15.4s, v18.4s -mul v0.4S, v0.4S,v30.s[1] -str q10, [x0, #640] -sqrdmulh v10.4S, v3.4S, v29.s[1] -add v15.4s, v15.4s, v18.4s -mul v3.4S, v3.4S,v30.s[1] -str q13, [x0, #704] -ldr q13, [x0, #16] -sqrdmulh v18.4S, v9.4S, v29.s[1] -sub v6.4s, v13.4s, v1.4s -mul v9.4S, v9.4S,v30.s[1] -add v13.4s, v13.4s, v1.4s -ldr q1, [x0, #80] -sqrdmulh v14.4S, v15.4S, v29.s[1] -sub v5.4s, v1.4s, v4.4s -mul v15.4S, v15.4S,v30.s[1] -add v1.4s, v1.4s, v4.4s -ldr q4, [x0, #144] -mla v0.4S, v21.4S, v31.s[0] -sub v21.4s, v4.4s, v12.4s -mla v3.4S, v10.4S, v31.s[0] -str q7, [x0, #768] -mla v9.4S, v18.4S, v31.s[0] -add v4.4s, v4.4s, v12.4s -mla v15.4S, v14.4S, v31.s[0] -str q17, [x0, #832] -ldr q17, [x0, #208] -sqrdmulh v14.4S, v8.4S, v29.s[2] -sub v12.4s, v17.4s, v19.4s -mul v8.4S, v8.4S,v30.s[2] -str q11, [x0, #896] -sqrdmulh v11.4S, v2.4S, v29.s[2] -add v17.4s, v17.4s, v19.4s -mul v2.4S, v2.4S,v30.s[2] -str q20, [x0, #960] -sqrdmulh v20.4S, v22.4S, v29.s[2] -sub v19.4s, v4.4s, v0.4s -mul v22.4S, v22.4S,v30.s[2] -add v4.4s, v4.4s, v0.4s -sqrdmulh v0.4S, v16.4S, v29.s[2] -sub v18.4s, v17.4s, v3.4s -mul v16.4S, v16.4S,v30.s[2] -add v17.4s, v17.4s, v3.4s -mla v8.4S, v14.4S, v31.s[0] -sub v14.4s, v13.4s, v9.4s -mla v2.4S, v11.4S, v31.s[0] -mla v22.4S, v20.4S, v31.s[0] -add v13.4s, v13.4s, v9.4s -mla v16.4S, v0.4S, v31.s[0] -sqrdmulh v0.4S, v19.4S, v27.s[1] -sub v9.4s, v1.4s, v15.4s -mul v19.4S, v19.4S,v28.s[1] -sqrdmulh v20.4S, v18.4S, v27.s[1] -add v1.4s, v1.4s, v15.4s -mul v18.4S, v18.4S,v28.s[1] -sqrdmulh v15.4S, v4.4S, v27.s[0] -sub v11.4s, v21.4s, v8.4s -mul v4.4S, v4.4S,v28.s[0] -add v21.4s, v21.4s, v8.4s -sqrdmulh v8.4S, v17.4S, v27.s[0] -sub v3.4s, v12.4s, v2.4s -mul v17.4S, v17.4S,v28.s[0] -add v12.4s, v12.4s, v2.4s -mla v19.4S, v0.4S, v31.s[0] -sub v0.4s, v6.4s, v22.4s -mla v18.4S, v20.4S, v31.s[0] -mla v4.4S, v15.4S, v31.s[0] -add v6.4s, v6.4s, v22.4s -mla v17.4S, v8.4S, v31.s[0] -sqrdmulh v8.4S, v21.4S, v27.s[2] -sub v22.4s, v5.4s, v16.4s -mul v21.4S, v21.4S,v28.s[2] -sqrdmulh v15.4S, v12.4S, v27.s[2] -add v5.4s, v5.4s, v16.4s -mul v12.4S, v12.4S,v28.s[2] -sqrdmulh v16.4S, v11.4S, v27.s[3] -sub v20.4s, v14.4s, v19.4s -mul v11.4S, v11.4S,v28.s[3] -add v14.4s, v14.4s, v19.4s -sqrdmulh v19.4S, v3.4S, v27.s[3] -sub v2.4s, v9.4s, v18.4s -mul v3.4S, v3.4S,v28.s[3] -add v9.4s, v9.4s, v18.4s -mla v21.4S, v8.4S, v31.s[0] -sub v8.4s, v13.4s, v4.4s -mla v12.4S, v15.4S, v31.s[0] -mla v11.4S, v16.4S, v31.s[0] -add v13.4s, v13.4s, v4.4s -mla v3.4S, v19.4S, v31.s[0] -sqrdmulh v19.4S, v9.4S, v25.s[2] -sub v4.4s, v1.4s, v17.4s -mul v9.4S, v9.4S,v26.s[2] -sqrdmulh v16.4S, v2.4S, v25.s[3] -add v1.4s, v1.4s, v17.4s -mul v2.4S, v2.4S,v26.s[3] -sqrdmulh v17.4S, v4.4S, v25.s[1] -sub v15.4s, v6.4s, v21.4s -mul v4.4S, v4.4S,v26.s[1] -add v6.4s, v6.4s, v21.4s -sqrdmulh v21.4S, v1.4S, v25.s[0] -sub v18.4s, v5.4s, v12.4s -mul v1.4S, v1.4S,v26.s[0] -add v5.4s, v5.4s, v12.4s -mla v9.4S, v19.4S, v31.s[0] -sub v19.4s, v0.4s, v11.4s -mla v2.4S, v16.4S, v31.s[0] -mla v4.4S, v17.4S, v31.s[0] -add v0.4s, v0.4s, v11.4s -mla v1.4S, v21.4S, v31.s[0] -sqrdmulh v21.4S, v5.4S, v23.s[0] -sub v11.4s, v22.4s, v3.4s -mul v5.4S, v5.4S,v24.s[0] -sqrdmulh v17.4S, v18.4S, v23.s[1] -add v22.4s, v22.4s, v3.4s -mul v18.4S, v18.4S,v24.s[1] -sqrdmulh v3.4S, v22.4S, v23.s[2] -sub v16.4s, v14.4s, v9.4s -mul v22.4S, v22.4S,v24.s[2] -add v14.4s, v14.4s, v9.4s -sqrdmulh v9.4S, v11.4S, v23.s[3] -sub v12.4s, v20.4s, v2.4s -mul v11.4S, v11.4S,v24.s[3] -add v20.4s, v20.4s, v2.4s -mla v5.4S, v21.4S, v31.s[0] -sub v21.4s, v8.4s, v4.4s -mla v18.4S, v17.4S, v31.s[0] -str q14, [x0, #272] -mla v22.4S, v3.4S, v31.s[0] -add v8.4s, v8.4s, v4.4s -mla v11.4S, v9.4S, v31.s[0] -str q16, [x0, #336] -sub v23.4s, v13.4s, v1.4s -str q20, [x0, #400] -add v13.4s, v13.4s, v1.4s -str q12, [x0, #464] -sub v12.4s, v6.4s, v5.4s -add v6.4s, v6.4s, v5.4s -sub v5.4s, v15.4s, v18.4s -add v15.4s, v15.4s, v18.4s -sub v18.4s, v0.4s, v22.4s -str q8, [x0, #144] -add v0.4s, v0.4s, v22.4s -str q21, [x0, #208] -sub v21.4s, v19.4s, v11.4s -str q13, [x0, #16] -add v19.4s, v19.4s, v11.4s -str q23, [x0, #80] -str q6, [x0, #528] -str q12, [x0, #592] -str q15, [x0, #656] -str q5, [x0, #720] -str q0, [x0, #784] -str q18, [x0, #848] -str q19, [x0, #912] -str q21, [x0, #976] -ldr q10, [x0, #224] -ldr q7, [x0, #160] -ldr q2, [x0, #32] -ldr q17, [x17, #+128] -ldr q14, [x17, #+144] -sqrdmulh v3.4S, v2.4S, v14.s[0] -mul v2.4S, v2.4S,v17.s[0] -ldr q4, [x0, #48] -sqrdmulh v9.4S, v4.4S, v14.s[0] -mul v4.4S, v4.4S,v17.s[0] -ldr q16, [x17, #+160] -ldr q30, [x17, #+176] -ldr q29, [x0, #96] -sqrdmulh v28.4S, v29.4S, v30.s[0] -mul v29.4S, v29.4S,v16.s[0] -ldr q27, [x0, #112] -sqrdmulh v26.4S, v27.4S, v30.s[0] -mul v27.4S, v27.4S,v16.s[0] -ldr q25, [x17, #+192] -ldr q24, [x17, #+208] -mla v2.4S, v3.4S, v31.s[0] -sqrdmulh v3.4S, v7.4S, v24.s[0] -ldr q20, [x0, #176] -mla v4.4S, v9.4S, v31.s[0] -sqrdmulh v9.4S, v20.4S, v24.s[0] -ldr q1, [x17, #+224] -ldr q8, [x17, #+240] -mla v29.4S, v28.4S, v31.s[0] -sqrdmulh v28.4S, v10.4S, v8.s[0] -ldr q22, [x0, #240] -mla v27.4S, v26.4S, v31.s[0] -sqrdmulh v26.4S, v22.4S, v8.s[0] -ldr q13, [x0, #0] -ldr q11, [x0, #128] -mul v7.4S, v7.4S,v25.s[0] -sub v23.4s, v13.4s, v2.4s -ldr q6, [x0, #16] -mul v20.4S, v20.4S,v25.s[0] -add v13.4s, v13.4s, v2.4s -ldr q2, [x0, #144] -mla v7.4S, v3.4S, v31.s[0] -sub v3.4s, v6.4s, v4.4s -ldr q12, [x0, #64] -mla v20.4S, v9.4S, v31.s[0] -add v6.4s, v6.4s, v4.4s -ldr q4, [x0, #192] -mul v10.4S, v10.4S,v1.s[0] -sub v9.4s, v12.4s, v29.4s -ldr q15, [x0, #80] -mul v22.4S, v22.4S,v1.s[0] -add v12.4s, v12.4s, v29.4s -ldr q29, [x0, #208] -mla v10.4S, v28.4S, v31.s[0] -mla v22.4S, v26.4S, v31.s[0] -sub v26.4s, v15.4s, v27.4s -sqrdmulh v28.4S, v6.4S, v14.s[1] -add v15.4s, v15.4s, v27.4s -mul v6.4S, v6.4S,v17.s[1] -sqrdmulh v27.4S, v3.4S, v14.s[2] -sub v5.4s, v11.4s, v7.4s -mul v3.4S, v3.4S,v17.s[2] -add v11.4s, v11.4s, v7.4s -sqrdmulh v14.4S, v15.4S, v30.s[1] -sub v17.4s, v2.4s, v20.4s -mul v15.4S, v15.4S,v16.s[1] -add v2.4s, v2.4s, v20.4s -sqrdmulh v20.4S, v26.4S, v30.s[2] -sub v7.4s, v4.4s, v10.4s -mul v26.4S, v26.4S,v16.s[2] -add v4.4s, v4.4s, v10.4s -mla v6.4S, v28.4S, v31.s[0] -sub v28.4s, v29.4s, v22.4s -ldr q30, [x0, #480] -sqrdmulh v16.4S, v2.4S, v24.s[1] -add v29.4s, v29.4s, v22.4s -mla v3.4S, v27.4S, v31.s[0] -ldr q27, [x0, #416] -sqrdmulh v22.4S, v17.4S, v24.s[2] -sub v10.4s, v13.4s, v6.4s -mla v15.4S, v14.4S, v31.s[0] -ldr q14, [x0, #288] -sqrdmulh v0.4S, v29.4S, v8.s[1] -add v13.4s, v13.4s, v6.4s -str q10, [x0, #16] -mla v26.4S, v20.4S, v31.s[0] -ldr q20, [x17, #+256] -ldr q10, [x17, #+272] -sqrdmulh v6.4S, v28.4S, v8.s[2] -sub v18.4s, v23.4s, v3.4s -str q13, [x0, #0] -mul v2.4S, v2.4S,v25.s[1] -add v23.4s, v23.4s, v3.4s -mul v17.4S, v17.4S,v25.s[2] -str q18, [x0, #48] -mla v2.4S, v16.4S, v31.s[0] -sub v16.4s, v12.4s, v15.4s -mla v17.4S, v22.4S, v31.s[0] -str q23, [x0, #32] -mul v29.4S, v29.4S,v1.s[1] -str q16, [x0, #80] -mul v28.4S, v28.4S,v1.s[2] -add v12.4s, v12.4s, v15.4s -str q12, [x0, #64] -mla v29.4S, v0.4S, v31.s[0] -sub v0.4s, v9.4s, v26.4s -str q0, [x0, #112] -mla v28.4S, v6.4S, v31.s[0] -add v9.4s, v9.4s, v26.4s -str q9, [x0, #96] -sqrdmulh v8.4S, v14.4S, v10.s[0] -sub v1.4s, v11.4s, v2.4s -mul v14.4S, v14.4S,v20.s[0] -str q1, [x0, #144] -ldr q1, [x0, #304] -sqrdmulh v9.4S, v1.4S, v10.s[0] -add v11.4s, v11.4s, v2.4s -mul v1.4S, v1.4S,v20.s[0] -str q11, [x0, #128] -ldr q11, [x17, #+288] -ldr q2, [x17, #+304] -ldr q26, [x0, #352] -sqrdmulh v6.4S, v26.4S, v2.s[0] -sub v0.4s, v5.4s, v17.4s -mul v26.4S, v26.4S,v11.s[0] -str q0, [x0, #176] -ldr q0, [x0, #368] -sqrdmulh v12.4S, v0.4S, v2.s[0] -add v5.4s, v5.4s, v17.4s -mul v0.4S, v0.4S,v11.s[0] -str q5, [x0, #160] -ldr q5, [x17, #+320] -ldr q17, [x17, #+336] -mla v14.4S, v8.4S, v31.s[0] -sub v8.4s, v4.4s, v29.4s -sqrdmulh v15.4S, v27.4S, v17.s[0] -str q8, [x0, #208] -ldr q8, [x0, #432] -mla v1.4S, v9.4S, v31.s[0] -add v4.4s, v4.4s, v29.4s -sqrdmulh v29.4S, v8.4S, v17.s[0] -str q4, [x0, #192] -ldr q4, [x17, #+352] -ldr q9, [x17, #+368] -mla v26.4S, v6.4S, v31.s[0] -sub v6.4s, v7.4s, v28.4s -sqrdmulh v16.4S, v30.4S, v9.s[0] -str q6, [x0, #240] -ldr q6, [x0, #496] -mla v0.4S, v12.4S, v31.s[0] -add v7.4s, v7.4s, v28.4s -sqrdmulh v28.4S, v6.4S, v9.s[0] -str q7, [x0, #224] -ldr q7, [x0, #256] -ldr q12, [x0, #384] -mul v27.4S, v27.4S,v5.s[0] -sub v24.4s, v7.4s, v14.4s -ldr q25, [x0, #272] -mul v8.4S, v8.4S,v5.s[0] -add v7.4s, v7.4s, v14.4s -ldr q14, [x0, #400] -mla v27.4S, v15.4S, v31.s[0] -sub v15.4s, v25.4s, v1.4s -ldr q23, [x0, #320] -mla v8.4S, v29.4S, v31.s[0] -add v25.4s, v25.4s, v1.4s -ldr q1, [x0, #448] -mul v30.4S, v30.4S,v4.s[0] -sub v29.4s, v23.4s, v26.4s -ldr q22, [x0, #336] -mul v6.4S, v6.4S,v4.s[0] -add v23.4s, v23.4s, v26.4s -ldr q26, [x0, #464] -mla v30.4S, v16.4S, v31.s[0] -mla v6.4S, v28.4S, v31.s[0] -sub v28.4s, v22.4s, v0.4s -sqrdmulh v16.4S, v25.4S, v10.s[1] -add v22.4s, v22.4s, v0.4s -mul v25.4S, v25.4S,v20.s[1] -sqrdmulh v0.4S, v15.4S, v10.s[2] -sub v18.4s, v12.4s, v27.4s -mul v15.4S, v15.4S,v20.s[2] -add v12.4s, v12.4s, v27.4s -sqrdmulh v10.4S, v22.4S, v2.s[1] -sub v20.4s, v14.4s, v8.4s -mul v22.4S, v22.4S,v11.s[1] -add v14.4s, v14.4s, v8.4s -sqrdmulh v8.4S, v28.4S, v2.s[2] -sub v27.4s, v1.4s, v30.4s -mul v28.4S, v28.4S,v11.s[2] -add v1.4s, v1.4s, v30.4s -mla v25.4S, v16.4S, v31.s[0] -sub v16.4s, v26.4s, v6.4s -ldr q2, [x0, #736] -sqrdmulh v11.4S, v14.4S, v17.s[1] -add v26.4s, v26.4s, v6.4s -mla v15.4S, v0.4S, v31.s[0] -ldr q0, [x0, #672] -sqrdmulh v6.4S, v20.4S, v17.s[2] -sub v30.4s, v7.4s, v25.4s -mla v22.4S, v10.4S, v31.s[0] -ldr q10, [x0, #544] -sqrdmulh v3.4S, v26.4S, v9.s[1] -add v7.4s, v7.4s, v25.4s -str q30, [x0, #272] -mla v28.4S, v8.4S, v31.s[0] -ldr q8, [x17, #+384] -ldr q30, [x17, #+400] -sqrdmulh v25.4S, v16.4S, v9.s[2] -sub v13.4s, v24.4s, v15.4s -str q7, [x0, #256] -mul v14.4S, v14.4S,v5.s[1] -add v24.4s, v24.4s, v15.4s -mul v20.4S, v20.4S,v5.s[2] -str q13, [x0, #304] -mla v14.4S, v11.4S, v31.s[0] -sub v11.4s, v23.4s, v22.4s -mla v20.4S, v6.4S, v31.s[0] -str q24, [x0, #288] -mul v26.4S, v26.4S,v4.s[1] -str q11, [x0, #336] -mul v16.4S, v16.4S,v4.s[2] -add v23.4s, v23.4s, v22.4s -str q23, [x0, #320] -mla v26.4S, v3.4S, v31.s[0] -sub v3.4s, v29.4s, v28.4s -str q3, [x0, #368] -mla v16.4S, v25.4S, v31.s[0] -add v29.4s, v29.4s, v28.4s -str q29, [x0, #352] -sqrdmulh v9.4S, v10.4S, v30.s[0] -sub v4.4s, v12.4s, v14.4s -mul v10.4S, v10.4S,v8.s[0] -str q4, [x0, #400] -ldr q4, [x0, #560] -sqrdmulh v29.4S, v4.4S, v30.s[0] -add v12.4s, v12.4s, v14.4s -mul v4.4S, v4.4S,v8.s[0] -str q12, [x0, #384] -ldr q12, [x17, #+416] -ldr q14, [x17, #+432] -ldr q28, [x0, #608] -sqrdmulh v25.4S, v28.4S, v14.s[0] -sub v3.4s, v18.4s, v20.4s -mul v28.4S, v28.4S,v12.s[0] -str q3, [x0, #432] -ldr q3, [x0, #624] -sqrdmulh v23.4S, v3.4S, v14.s[0] -add v18.4s, v18.4s, v20.4s -mul v3.4S, v3.4S,v12.s[0] -str q18, [x0, #416] -ldr q18, [x17, #+448] -ldr q20, [x17, #+464] -mla v10.4S, v9.4S, v31.s[0] -sub v9.4s, v1.4s, v26.4s -sqrdmulh v22.4S, v0.4S, v20.s[0] -str q9, [x0, #464] -ldr q9, [x0, #688] -mla v4.4S, v29.4S, v31.s[0] -add v1.4s, v1.4s, v26.4s -sqrdmulh v26.4S, v9.4S, v20.s[0] -str q1, [x0, #448] -ldr q1, [x17, #+480] -ldr q29, [x17, #+496] -mla v28.4S, v25.4S, v31.s[0] -sub v25.4s, v27.4s, v16.4s -sqrdmulh v11.4S, v2.4S, v29.s[0] -str q25, [x0, #496] -ldr q25, [x0, #752] -mla v3.4S, v23.4S, v31.s[0] -add v27.4s, v27.4s, v16.4s -sqrdmulh v16.4S, v25.4S, v29.s[0] -str q27, [x0, #480] -ldr q27, [x0, #512] -ldr q23, [x0, #640] -mul v0.4S, v0.4S,v18.s[0] -sub v17.4s, v27.4s, v10.4s -ldr q5, [x0, #528] -mul v9.4S, v9.4S,v18.s[0] -add v27.4s, v27.4s, v10.4s -ldr q10, [x0, #656] -mla v0.4S, v22.4S, v31.s[0] -sub v22.4s, v5.4s, v4.4s -ldr q24, [x0, #576] -mla v9.4S, v26.4S, v31.s[0] -add v5.4s, v5.4s, v4.4s -ldr q4, [x0, #704] -mul v2.4S, v2.4S,v1.s[0] -sub v26.4s, v24.4s, v28.4s -ldr q6, [x0, #592] -mul v25.4S, v25.4S,v1.s[0] -add v24.4s, v24.4s, v28.4s -ldr q28, [x0, #720] -mla v2.4S, v11.4S, v31.s[0] -mla v25.4S, v16.4S, v31.s[0] -sub v16.4s, v6.4s, v3.4s -sqrdmulh v11.4S, v5.4S, v30.s[1] -add v6.4s, v6.4s, v3.4s -mul v5.4S, v5.4S,v8.s[1] -sqrdmulh v3.4S, v22.4S, v30.s[2] -sub v13.4s, v23.4s, v0.4s -mul v22.4S, v22.4S,v8.s[2] -add v23.4s, v23.4s, v0.4s -sqrdmulh v30.4S, v6.4S, v14.s[1] -sub v8.4s, v10.4s, v9.4s -mul v6.4S, v6.4S,v12.s[1] -add v10.4s, v10.4s, v9.4s -sqrdmulh v9.4S, v16.4S, v14.s[2] -sub v0.4s, v4.4s, v2.4s -mul v16.4S, v16.4S,v12.s[2] -add v4.4s, v4.4s, v2.4s -mla v5.4S, v11.4S, v31.s[0] -sub v11.4s, v28.4s, v25.4s -ldr q14, [x0, #992] -sqrdmulh v12.4S, v10.4S, v20.s[1] -add v28.4s, v28.4s, v25.4s -mla v22.4S, v3.4S, v31.s[0] -ldr q3, [x0, #928] -sqrdmulh v25.4S, v8.4S, v20.s[2] -sub v2.4s, v27.4s, v5.4s -mla v6.4S, v30.4S, v31.s[0] -ldr q30, [x0, #800] -sqrdmulh v15.4S, v28.4S, v29.s[1] -add v27.4s, v27.4s, v5.4s -str q2, [x0, #528] -mla v16.4S, v9.4S, v31.s[0] -ldr q9, [x17, #+512] -ldr q2, [x17, #+528] -sqrdmulh v5.4S, v11.4S, v29.s[2] -sub v7.4s, v17.4s, v22.4s -str q27, [x0, #512] -mul v10.4S, v10.4S,v18.s[1] -add v17.4s, v17.4s, v22.4s -mul v8.4S, v8.4S,v18.s[2] -str q7, [x0, #560] -mla v10.4S, v12.4S, v31.s[0] -sub v12.4s, v24.4s, v6.4s -mla v8.4S, v25.4S, v31.s[0] -str q17, [x0, #544] -mul v28.4S, v28.4S,v1.s[1] -str q12, [x0, #592] -mul v11.4S, v11.4S,v1.s[2] -add v24.4s, v24.4s, v6.4s -str q24, [x0, #576] -mla v28.4S, v15.4S, v31.s[0] -sub v15.4s, v26.4s, v16.4s -str q15, [x0, #624] -mla v11.4S, v5.4S, v31.s[0] -add v26.4s, v26.4s, v16.4s -str q26, [x0, #608] -sqrdmulh v29.4S, v30.4S, v2.s[0] -sub v1.4s, v23.4s, v10.4s -mul v30.4S, v30.4S,v9.s[0] -str q1, [x0, #656] -ldr q1, [x0, #816] -sqrdmulh v26.4S, v1.4S, v2.s[0] -add v23.4s, v23.4s, v10.4s -mul v1.4S, v1.4S,v9.s[0] -str q23, [x0, #640] -ldr q23, [x17, #+544] -ldr q10, [x17, #+560] -ldr q16, [x0, #864] -sqrdmulh v5.4S, v16.4S, v10.s[0] -sub v15.4s, v13.4s, v8.4s -mul v16.4S, v16.4S,v23.s[0] -str q15, [x0, #688] -ldr q15, [x0, #880] -sqrdmulh v24.4S, v15.4S, v10.s[0] -add v13.4s, v13.4s, v8.4s -mul v15.4S, v15.4S,v23.s[0] -str q13, [x0, #672] -ldr q13, [x17, #+576] -ldr q8, [x17, #+592] -mla v30.4S, v29.4S, v31.s[0] -sub v29.4s, v4.4s, v28.4s -sqrdmulh v6.4S, v3.4S, v8.s[0] -str q29, [x0, #720] -ldr q29, [x0, #944] -mla v1.4S, v26.4S, v31.s[0] -add v4.4s, v4.4s, v28.4s -sqrdmulh v28.4S, v29.4S, v8.s[0] -str q4, [x0, #704] -ldr q4, [x17, #+608] -ldr q26, [x17, #+624] -mla v16.4S, v5.4S, v31.s[0] -sub v5.4s, v0.4s, v11.4s -sqrdmulh v12.4S, v14.4S, v26.s[0] -str q5, [x0, #752] -ldr q5, [x0, #1008] -mla v15.4S, v24.4S, v31.s[0] -add v0.4s, v0.4s, v11.4s -sqrdmulh v11.4S, v5.4S, v26.s[0] -str q0, [x0, #736] -ldr q0, [x0, #768] -ldr q24, [x0, #896] -mul v3.4S, v3.4S,v13.s[0] -sub v20.4s, v0.4s, v30.4s -ldr q18, [x0, #784] -mul v29.4S, v29.4S,v13.s[0] -add v0.4s, v0.4s, v30.4s -ldr q30, [x0, #912] -mla v3.4S, v6.4S, v31.s[0] -sub v6.4s, v18.4s, v1.4s -ldr q17, [x0, #832] -mla v29.4S, v28.4S, v31.s[0] -add v18.4s, v18.4s, v1.4s -ldr q1, [x0, #960] -mul v14.4S, v14.4S,v4.s[0] -sub v28.4s, v17.4s, v16.4s -ldr q25, [x0, #848] -mul v5.4S, v5.4S,v4.s[0] -add v17.4s, v17.4s, v16.4s -ldr q16, [x0, #976] -mla v14.4S, v12.4S, v31.s[0] -mla v5.4S, v11.4S, v31.s[0] -sub v11.4s, v25.4s, v15.4s -sqrdmulh v12.4S, v18.4S, v2.s[1] -add v25.4s, v25.4s, v15.4s -mul v18.4S, v18.4S,v9.s[1] -sqrdmulh v15.4S, v6.4S, v2.s[2] -sub v7.4s, v24.4s, v3.4s -mul v6.4S, v6.4S,v9.s[2] -add v24.4s, v24.4s, v3.4s -sqrdmulh v2.4S, v25.4S, v10.s[1] -sub v9.4s, v30.4s, v29.4s -mul v25.4S, v25.4S,v23.s[1] -add v30.4s, v30.4s, v29.4s -sqrdmulh v29.4S, v11.4S, v10.s[2] -sub v3.4s, v1.4s, v14.4s -mul v11.4S, v11.4S,v23.s[2] -add v1.4s, v1.4s, v14.4s -mla v18.4S, v12.4S, v31.s[0] -sub v12.4s, v16.4s, v5.4s -sqrdmulh v10.4S, v30.4S, v8.s[1] -add v16.4s, v16.4s, v5.4s -mla v6.4S, v15.4S, v31.s[0] -sqrdmulh v15.4S, v9.4S, v8.s[2] -sub v5.4s, v0.4s, v18.4s -mla v25.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v16.4S, v26.s[1] -add v0.4s, v0.4s, v18.4s -str q5, [x0, #784] -mla v11.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v12.4S, v26.s[2] -sub v5.4s, v20.4s, v6.4s -str q0, [x0, #768] -mul v30.4S, v30.4S,v13.s[1] -add v20.4s, v20.4s, v6.4s -mul v9.4S, v9.4S,v13.s[2] -str q5, [x0, #816] -mla v30.4S, v10.4S, v31.s[0] -sub v10.4s, v17.4s, v25.4s -mla v9.4S, v15.4S, v31.s[0] -str q20, [x0, #800] -mul v16.4S, v16.4S,v4.s[1] -str q10, [x0, #848] -mul v12.4S, v12.4S,v4.s[2] -add v17.4s, v17.4s, v25.4s -str q17, [x0, #832] -mla v16.4S, v2.4S, v31.s[0] -sub v2.4s, v28.4s, v11.4s -str q2, [x0, #880] -mla v12.4S, v29.4S, v31.s[0] -add v28.4s, v28.4s, v11.4s -str q28, [x0, #864] -sub v26.4s, v24.4s, v30.4s -str q26, [x0, #912] -add v24.4s, v24.4s, v30.4s -str q24, [x0, #896] -sub v24.4s, v7.4s, v9.4s -str q24, [x0, #944] -add v7.4s, v7.4s, v9.4s -str q7, [x0, #928] -sub v7.4s, v1.4s, v16.4s -str q7, [x0, #976] -add v1.4s, v1.4s, v16.4s -str q1, [x0, #960] -sub v1.4s, v3.4s, v12.4s -str q1, [x0, #1008] -add v3.4s, v3.4s, v12.4s -str q3, [x0, #992] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1464 -// Instruction count: 1460 \ No newline at end of file diff --git a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_9_z4_7.s b/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_9_z4_7.s deleted file mode 100644 index c98a11f..0000000 --- a/tests/ntt_neon/auto/ntt_u32_incomplete_33556993_28678040_var_4_2_9_z4_7.s +++ /dev/null @@ -1,1494 +0,0 @@ - -/// -/// Copyright (c) 2021 Arm Limited -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE - - -/// -/// This assembly code has been auto-generated. -/// Don't modify it directly. -/// - -#include -modulus: -.word -33556993 -.word 0 -.word 0 -.word 0 -.align 6 -roots_merged: -.word 17702291 // Layer 0, block 0 -.word 3260327 // Layer 1, block 0 -.word 14579576 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 1132860160 // Layer 0, block 0 -.word 208645003 // Layer 1, block 0 -.word 933021652 // Layer 1, block 1 -.word 0 // Layer None, block None -.word 6733847 // Layer 2, block 0 -.word 12909577 // Layer 2, block 1 -.word 14745691 // Layer 2, block 2 -.word 13512548 // Layer 2, block 3 -.word 430933318 // Layer 2, block 0 -.word 826149873 // Layer 2, block 1 -.word 943652201 // Layer 2, block 2 -.word 864737072 // Layer 2, block 3 -.word 20428075 // Layer 3, block 0 -.word 14626653 // Layer 3, block 1 -.word 29737761 // Layer 3, block 2 -.word 30285189 // Layer 3, block 3 -.word 1307297022 // Layer 3, block 0 -.word 936034350 // Layer 3, block 1 -.word 1903071454 // Layer 3, block 2 -.word 1938104173 // Layer 3, block 3 -.word 21289485 // Layer 3, block 4 -.word 9914896 // Layer 3, block 5 -.word 22603682 // Layer 3, block 6 -.word 16204162 // Layer 3, block 7 -.word 1362423055 // Layer 3, block 4 -.word 634504916 // Layer 3, block 5 -.word 1446525244 // Layer 3, block 6 -.word 1036987221 // Layer 3, block 7 -.word 23825509 // Layer 4, block 0 -.word 9010590 // Layer 5, block 0 -.word 20699126 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 1524716204 // Layer 4, block 0 -.word 576633749 // Layer 5, block 0 -.word 1324642962 // Layer 5, block 1 -.word 0 // Layer None, block None -.word 27028662 // Layer 4, block 1 -.word 341080 // Layer 5, block 2 -.word 21220783 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 1729702351 // Layer 4, block 1 -.word 21827454 // Layer 5, block 2 -.word 1358026462 // Layer 5, block 3 -.word 0 // Layer None, block None -.word 14833295 // Layer 4, block 2 -.word 25331745 // Layer 5, block 4 -.word 5289426 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 949258429 // Layer 4, block 2 -.word 1621107951 // Layer 5, block 4 -.word 338497429 // Layer 5, block 5 -.word 0 // Layer None, block None -.word 2138810 // Layer 4, block 3 -.word 5705868 // Layer 5, block 6 -.word 17686665 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 136873393 // Layer 4, block 3 -.word 365147683 // Layer 5, block 6 -.word 1131860172 // Layer 5, block 7 -.word 0 // Layer None, block None -.word 6490403 // Layer 4, block 4 -.word 9106105 // Layer 5, block 8 -.word 18817700 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 415354091 // Layer 4, block 4 -.word 582746243 // Layer 5, block 8 -.word 1204240888 // Layer 5, block 9 -.word 0 // Layer None, block None -.word 19648405 // Layer 4, block 5 -.word 1579445 // Layer 5, block 10 -.word 7769916 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 1257401950 // Layer 4, block 5 -.word 101076765 // Layer 5, block 10 -.word 497236673 // Layer 5, block 11 -.word 0 // Layer None, block None -.word 31254932 // Layer 4, block 6 -.word 21843119 // Layer 5, block 12 -.word 11828796 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 2000162988 // Layer 4, block 6 -.word 1397852927 // Layer 5, block 12 -.word 756985168 // Layer 5, block 13 -.word 0 // Layer None, block None -.word 26362414 // Layer 4, block 7 -.word 19828530 // Layer 5, block 14 -.word 33201112 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 1687065733 // Layer 4, block 7 -.word 1268929071 // Layer 5, block 14 -.word 2124709002 // Layer 5, block 15 -.word 0 // Layer None, block None -.word 572895 // Layer 4, block 8 -.word 23713020 // Layer 5, block 16 -.word 19537976 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 36662482 // Layer 4, block 8 -.word 1517517457 // Layer 5, block 16 -.word 1250335034 // Layer 5, block 17 -.word 0 // Layer None, block None -.word 26691971 // Layer 4, block 9 -.word 8285889 // Layer 5, block 18 -.word 24690028 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 1708155771 // Layer 4, block 9 -.word 530256425 // Layer 5, block 18 -.word 1580041197 // Layer 5, block 19 -.word 0 // Layer None, block None -.word 9249292 // Layer 4, block 10 -.word 4778209 // Layer 5, block 20 -.word 13113327 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 591909511 // Layer 4, block 10 -.word 305782038 // Layer 5, block 20 -.word 839188878 // Layer 5, block 21 -.word 0 // Layer None, block None -.word 29292862 // Layer 4, block 11 -.word 25384023 // Layer 5, block 22 -.word 10905370 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 1874600091 // Layer 4, block 11 -.word 1624453488 // Layer 5, block 22 -.word 697890414 // Layer 5, block 23 -.word 0 // Layer None, block None -.word 8247799 // Layer 4, block 12 -.word 16167867 // Layer 5, block 24 -.word 22046437 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 527818851 // Layer 4, block 12 -.word 1034664519 // Layer 5, block 24 -.word 1410864286 // Layer 5, block 25 -.word 0 // Layer None, block None -.word 5086187 // Layer 4, block 13 -.word 656361 // Layer 5, block 26 -.word 18153794 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 325491125 // Layer 4, block 13 -.word 42003898 // Layer 5, block 26 -.word 1161754147 // Layer 5, block 27 -.word 0 // Layer None, block None -.word 28113639 // Layer 4, block 14 -.word 3732072 // Layer 5, block 28 -.word 22126384 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 1799135579 // Layer 4, block 14 -.word 238834379 // Layer 5, block 28 -.word 1415980503 // Layer 5, block 29 -.word 0 // Layer None, block None -.word 8471290 // Layer 4, block 15 -.word 9445744 // Layer 5, block 30 -.word 794839 // Layer 5, block 31 -.word 0 // Layer None, block None -.word 542121183 // Layer 4, block 15 -.word 604481480 // Layer 5, block 30 -.word 50865814 // Layer 5, block 31 -.word 0 // Layer None, block None -.text -.global ntt_u32_incomplete_neon_asm_var_4_2_9_z4_7 -.global _ntt_u32_incomplete_neon_asm_var_4_2_9_z4_7 -ntt_u32_incomplete_neon_asm_var_4_2_9_z4_7: -_ntt_u32_incomplete_neon_asm_var_4_2_9_z4_7: -// Save GPRs -sub sp, sp, #(16*5+16) -stp x19, x20, [sp, #16*0] -stp x19, x20, [sp, #16*0] -stp x21, x22, [sp, #16*1] -stp x23, x24, [sp, #16*2] -stp x25, x26, [sp, #16*3] -stp x27, x28, [sp, #16*4] -str x29, [sp, #16*5] -// Save NEON vector registers -sub sp, sp, #(16*4) -stp d8, d9, [sp, #16*0] -stp d10, d11, [sp, #16*1] -stp d12, d13, [sp, #16*2] -stp d14, d15, [sp, #16*3] -ASM_LOAD (x17, modulus) -ldr q31, [x17] -ASM_LOAD(x17, roots_merged) -ldr q30, [x17, #+0] -ldr q29, [x17, #+16] -ldr q28, [x17, #+32] -ldr q27, [x17, #+48] -ldr q26, [x17, #+64] -ldr q25, [x17, #+80] -ldr q24, [x17, #+96] -ldr q23, [x17, #+112] -ldr q22, [x0, #928] -sqrdmulh v21.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -ldr q20, [x0, #992] -sqrdmulh v19.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -ldr q18, [x0, #800] -sqrdmulh v17.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -ldr q16, [x0, #864] -sqrdmulh v3.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -mla v22.4S, v21.4S, v31.s[0] -mla v20.4S, v19.4S, v31.s[0] -mla v18.4S, v17.4S, v31.s[0] -mla v16.4S, v3.4S, v31.s[0] -ldr q3, [x0, #544] -sqrdmulh v17.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -ldr q19, [x0, #608] -sqrdmulh v21.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -ldr q2, [x0, #672] -ldr q1, [x0, #416] -sqrdmulh v0.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v15.4s, v1.4s, v22.4s -add v1.4s, v1.4s, v22.4s -ldr q22, [x0, #736] -ldr q14, [x0, #480] -sqrdmulh v13.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v12.4s, v14.4s, v20.4s -add v14.4s, v14.4s, v20.4s -ldr q20, [x0, #288] -mla v3.4S, v17.4S, v31.s[0] -mla v19.4S, v21.4S, v31.s[0] -sub v21.4s, v20.4s, v18.4s -mla v2.4S, v0.4S, v31.s[0] -mla v22.4S, v13.4S, v31.s[0] -add v20.4s, v20.4s, v18.4s -ldr q18, [x0, #352] -sqrdmulh v13.4S, v1.4S, v29.s[1] -mul v1.4S, v1.4S,v30.s[1] -sub v0.4s, v18.4s, v16.4s -sqrdmulh v17.4S, v14.4S, v29.s[1] -mul v14.4S, v14.4S,v30.s[1] -add v18.4s, v18.4s, v16.4s -ldr q16, [x0, #32] -sqrdmulh v11.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v10.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -ldr q3, [x0, #96] -sqrdmulh v9.4S, v18.4S, v29.s[1] -mul v18.4S, v18.4S,v30.s[1] -sub v8.4s, v3.4s, v19.4s -add v3.4s, v3.4s, v19.4s -ldr q19, [x0, #160] -mla v1.4S, v13.4S, v31.s[0] -mla v14.4S, v17.4S, v31.s[0] -sub v17.4s, v19.4s, v2.4s -mla v20.4S, v11.4S, v31.s[0] -mla v18.4S, v9.4S, v31.s[0] -add v19.4s, v19.4s, v2.4s -ldr q2, [x0, #224] -sqrdmulh v9.4S, v15.4S, v29.s[2] -mul v15.4S, v15.4S,v30.s[2] -sub v11.4s, v2.4s, v22.4s -sqrdmulh v13.4S, v12.4S, v29.s[2] -mul v12.4S, v12.4S,v30.s[2] -add v2.4s, v2.4s, v22.4s -sqrdmulh v22.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v7.4s, v19.4s, v1.4s -add v19.4s, v19.4s, v1.4s -sqrdmulh v1.4S, v0.4S, v29.s[2] -mul v0.4S, v0.4S,v30.s[2] -sub v6.4s, v2.4s, v14.4s -add v2.4s, v2.4s, v14.4s -mla v15.4S, v9.4S, v31.s[0] -mla v12.4S, v13.4S, v31.s[0] -sub v13.4s, v16.4s, v20.4s -mla v21.4S, v22.4S, v31.s[0] -mla v0.4S, v1.4S, v31.s[0] -add v16.4s, v16.4s, v20.4s -sqrdmulh v20.4S, v7.4S, v27.s[1] -mul v7.4S, v7.4S,v28.s[1] -sub v1.4s, v3.4s, v18.4s -sqrdmulh v22.4S, v6.4S, v27.s[1] -mul v6.4S, v6.4S,v28.s[1] -add v3.4s, v3.4s, v18.4s -sqrdmulh v18.4S, v19.4S, v27.s[0] -mul v19.4S, v19.4S,v28.s[0] -sub v9.4s, v17.4s, v15.4s -add v17.4s, v17.4s, v15.4s -sqrdmulh v15.4S, v2.4S, v27.s[0] -mul v2.4S, v2.4S,v28.s[0] -sub v14.4s, v11.4s, v12.4s -add v11.4s, v11.4s, v12.4s -mla v7.4S, v20.4S, v31.s[0] -mla v6.4S, v22.4S, v31.s[0] -sub v22.4s, v10.4s, v21.4s -mla v19.4S, v18.4S, v31.s[0] -mla v2.4S, v15.4S, v31.s[0] -add v10.4s, v10.4s, v21.4s -sqrdmulh v21.4S, v17.4S, v27.s[2] -mul v17.4S, v17.4S,v28.s[2] -sub v15.4s, v8.4s, v0.4s -sqrdmulh v18.4S, v11.4S, v27.s[2] -mul v11.4S, v11.4S,v28.s[2] -add v8.4s, v8.4s, v0.4s -sqrdmulh v0.4S, v9.4S, v27.s[3] -mul v9.4S, v9.4S,v28.s[3] -sub v20.4s, v13.4s, v7.4s -add v13.4s, v13.4s, v7.4s -sqrdmulh v7.4S, v14.4S, v27.s[3] -mul v14.4S, v14.4S,v28.s[3] -sub v12.4s, v1.4s, v6.4s -add v1.4s, v1.4s, v6.4s -mla v17.4S, v21.4S, v31.s[0] -mla v11.4S, v18.4S, v31.s[0] -sub v18.4s, v16.4s, v19.4s -mla v9.4S, v0.4S, v31.s[0] -mla v14.4S, v7.4S, v31.s[0] -add v16.4s, v16.4s, v19.4s -sqrdmulh v19.4S, v1.4S, v25.s[2] -mul v1.4S, v1.4S,v26.s[2] -sub v7.4s, v3.4s, v2.4s -sqrdmulh v0.4S, v12.4S, v25.s[3] -mul v12.4S, v12.4S,v26.s[3] -add v3.4s, v3.4s, v2.4s -sqrdmulh v2.4S, v7.4S, v25.s[1] -mul v7.4S, v7.4S,v26.s[1] -sub v21.4s, v10.4s, v17.4s -add v10.4s, v10.4s, v17.4s -sqrdmulh v17.4S, v3.4S, v25.s[0] -mul v3.4S, v3.4S,v26.s[0] -sub v6.4s, v8.4s, v11.4s -add v8.4s, v8.4s, v11.4s -mla v1.4S, v19.4S, v31.s[0] -mla v12.4S, v0.4S, v31.s[0] -sub v0.4s, v22.4s, v9.4s -mla v7.4S, v2.4S, v31.s[0] -mla v3.4S, v17.4S, v31.s[0] -add v22.4s, v22.4s, v9.4s -sqrdmulh v9.4S, v8.4S, v23.s[0] -mul v8.4S, v8.4S,v24.s[0] -sub v17.4s, v15.4s, v14.4s -sqrdmulh v2.4S, v6.4S, v23.s[1] -mul v6.4S, v6.4S,v24.s[1] -add v15.4s, v15.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v23.s[2] -mul v15.4S, v15.4S,v24.s[2] -sub v19.4s, v13.4s, v1.4s -add v13.4s, v13.4s, v1.4s -sqrdmulh v1.4S, v17.4S, v23.s[3] -mul v17.4S, v17.4S,v24.s[3] -sub v11.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -mla v8.4S, v9.4S, v31.s[0] -mla v6.4S, v2.4S, v31.s[0] -sub v2.4s, v18.4s, v7.4s -str q13, [x0, #288] -mla v15.4S, v14.4S, v31.s[0] -mla v17.4S, v1.4S, v31.s[0] -add v18.4s, v18.4s, v7.4s -str q19, [x0, #352] -ldr q19, [x0, #944] -sqrdmulh v7.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -str q20, [x0, #416] -sub v20.4s, v16.4s, v3.4s -ldr q1, [x0, #1008] -sqrdmulh v14.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -str q11, [x0, #480] -add v16.4s, v16.4s, v3.4s -ldr q3, [x0, #816] -sqrdmulh v11.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -sub v13.4s, v10.4s, v8.4s -add v10.4s, v10.4s, v8.4s -ldr q8, [x0, #880] -sqrdmulh v9.4S, v8.4S, v29.s[0] -mul v8.4S, v8.4S,v30.s[0] -sub v12.4s, v21.4s, v6.4s -add v21.4s, v21.4s, v6.4s -mla v19.4S, v7.4S, v31.s[0] -mla v1.4S, v14.4S, v31.s[0] -str q18, [x0, #160] -sub v18.4s, v22.4s, v15.4s -mla v3.4S, v11.4S, v31.s[0] -mla v8.4S, v9.4S, v31.s[0] -str q2, [x0, #224] -add v22.4s, v22.4s, v15.4s -ldr q15, [x0, #560] -sqrdmulh v2.4S, v15.4S, v29.s[0] -mul v15.4S, v15.4S,v30.s[0] -str q16, [x0, #32] -sub v16.4s, v0.4s, v17.4s -ldr q9, [x0, #624] -sqrdmulh v11.4S, v9.4S, v29.s[0] -mul v9.4S, v9.4S,v30.s[0] -str q20, [x0, #96] -add v0.4s, v0.4s, v17.4s -ldr q17, [x0, #688] -ldr q20, [x0, #432] -sqrdmulh v14.4S, v17.4S, v29.s[0] -mul v17.4S, v17.4S,v30.s[0] -sub v7.4s, v20.4s, v19.4s -add v20.4s, v20.4s, v19.4s -ldr q19, [x0, #752] -ldr q6, [x0, #496] -sqrdmulh v5.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v4.4s, v6.4s, v1.4s -add v6.4s, v6.4s, v1.4s -ldr q1, [x0, #304] -mla v15.4S, v2.4S, v31.s[0] -mla v9.4S, v11.4S, v31.s[0] -str q10, [x0, #544] -sub v10.4s, v1.4s, v3.4s -mla v17.4S, v14.4S, v31.s[0] -mla v19.4S, v5.4S, v31.s[0] -str q13, [x0, #608] -add v1.4s, v1.4s, v3.4s -ldr q3, [x0, #368] -sqrdmulh v13.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -str q21, [x0, #672] -sub v21.4s, v3.4s, v8.4s -sqrdmulh v5.4S, v6.4S, v29.s[1] -mul v6.4S, v6.4S,v30.s[1] -str q12, [x0, #736] -add v3.4s, v3.4s, v8.4s -ldr q8, [x0, #48] -sqrdmulh v12.4S, v1.4S, v29.s[1] -mul v1.4S, v1.4S,v30.s[1] -sub v14.4s, v8.4s, v15.4s -add v8.4s, v8.4s, v15.4s -ldr q15, [x0, #112] -sqrdmulh v11.4S, v3.4S, v29.s[1] -mul v3.4S, v3.4S,v30.s[1] -sub v2.4s, v15.4s, v9.4s -add v15.4s, v15.4s, v9.4s -ldr q9, [x0, #176] -mla v20.4S, v13.4S, v31.s[0] -mla v6.4S, v5.4S, v31.s[0] -str q22, [x0, #800] -sub v22.4s, v9.4s, v17.4s -mla v1.4S, v12.4S, v31.s[0] -mla v3.4S, v11.4S, v31.s[0] -str q18, [x0, #864] -add v9.4s, v9.4s, v17.4s -ldr q17, [x0, #240] -sqrdmulh v18.4S, v7.4S, v29.s[2] -mul v7.4S, v7.4S,v30.s[2] -str q0, [x0, #928] -sub v0.4s, v17.4s, v19.4s -sqrdmulh v11.4S, v4.4S, v29.s[2] -mul v4.4S, v4.4S,v30.s[2] -str q16, [x0, #992] -add v17.4s, v17.4s, v19.4s -sqrdmulh v19.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v16.4s, v9.4s, v20.4s -add v9.4s, v9.4s, v20.4s -sqrdmulh v20.4S, v21.4S, v29.s[2] -mul v21.4S, v21.4S,v30.s[2] -sub v12.4s, v17.4s, v6.4s -add v17.4s, v17.4s, v6.4s -mla v7.4S, v18.4S, v31.s[0] -mla v4.4S, v11.4S, v31.s[0] -sub v11.4s, v8.4s, v1.4s -mla v10.4S, v19.4S, v31.s[0] -mla v21.4S, v20.4S, v31.s[0] -add v8.4s, v8.4s, v1.4s -sqrdmulh v1.4S, v16.4S, v27.s[1] -mul v16.4S, v16.4S,v28.s[1] -sub v20.4s, v15.4s, v3.4s -sqrdmulh v19.4S, v12.4S, v27.s[1] -mul v12.4S, v12.4S,v28.s[1] -add v15.4s, v15.4s, v3.4s -sqrdmulh v3.4S, v9.4S, v27.s[0] -mul v9.4S, v9.4S,v28.s[0] -sub v18.4s, v22.4s, v7.4s -add v22.4s, v22.4s, v7.4s -sqrdmulh v7.4S, v17.4S, v27.s[0] -mul v17.4S, v17.4S,v28.s[0] -sub v6.4s, v0.4s, v4.4s -add v0.4s, v0.4s, v4.4s -mla v16.4S, v1.4S, v31.s[0] -mla v12.4S, v19.4S, v31.s[0] -sub v19.4s, v14.4s, v10.4s -mla v9.4S, v3.4S, v31.s[0] -mla v17.4S, v7.4S, v31.s[0] -add v14.4s, v14.4s, v10.4s -sqrdmulh v10.4S, v22.4S, v27.s[2] -mul v22.4S, v22.4S,v28.s[2] -sub v7.4s, v2.4s, v21.4s -sqrdmulh v3.4S, v0.4S, v27.s[2] -mul v0.4S, v0.4S,v28.s[2] -add v2.4s, v2.4s, v21.4s -sqrdmulh v21.4S, v18.4S, v27.s[3] -mul v18.4S, v18.4S,v28.s[3] -sub v1.4s, v11.4s, v16.4s -add v11.4s, v11.4s, v16.4s -sqrdmulh v16.4S, v6.4S, v27.s[3] -mul v6.4S, v6.4S,v28.s[3] -sub v4.4s, v20.4s, v12.4s -add v20.4s, v20.4s, v12.4s -mla v22.4S, v10.4S, v31.s[0] -mla v0.4S, v3.4S, v31.s[0] -sub v3.4s, v8.4s, v9.4s -mla v18.4S, v21.4S, v31.s[0] -mla v6.4S, v16.4S, v31.s[0] -add v8.4s, v8.4s, v9.4s -sqrdmulh v9.4S, v20.4S, v25.s[2] -mul v20.4S, v20.4S,v26.s[2] -sub v16.4s, v15.4s, v17.4s -sqrdmulh v21.4S, v4.4S, v25.s[3] -mul v4.4S, v4.4S,v26.s[3] -add v15.4s, v15.4s, v17.4s -sqrdmulh v17.4S, v16.4S, v25.s[1] -mul v16.4S, v16.4S,v26.s[1] -sub v10.4s, v14.4s, v22.4s -add v14.4s, v14.4s, v22.4s -sqrdmulh v22.4S, v15.4S, v25.s[0] -mul v15.4S, v15.4S,v26.s[0] -sub v12.4s, v2.4s, v0.4s -add v2.4s, v2.4s, v0.4s -mla v20.4S, v9.4S, v31.s[0] -mla v4.4S, v21.4S, v31.s[0] -sub v21.4s, v19.4s, v18.4s -mla v16.4S, v17.4S, v31.s[0] -mla v15.4S, v22.4S, v31.s[0] -add v19.4s, v19.4s, v18.4s -sqrdmulh v18.4S, v2.4S, v23.s[0] -mul v2.4S, v2.4S,v24.s[0] -sub v22.4s, v7.4s, v6.4s -sqrdmulh v17.4S, v12.4S, v23.s[1] -mul v12.4S, v12.4S,v24.s[1] -add v7.4s, v7.4s, v6.4s -sqrdmulh v6.4S, v7.4S, v23.s[2] -mul v7.4S, v7.4S,v24.s[2] -sub v9.4s, v11.4s, v20.4s -add v11.4s, v11.4s, v20.4s -sqrdmulh v20.4S, v22.4S, v23.s[3] -mul v22.4S, v22.4S,v24.s[3] -sub v0.4s, v1.4s, v4.4s -add v1.4s, v1.4s, v4.4s -mla v2.4S, v18.4S, v31.s[0] -mla v12.4S, v17.4S, v31.s[0] -sub v17.4s, v3.4s, v16.4s -str q11, [x0, #304] -mla v7.4S, v6.4S, v31.s[0] -mla v22.4S, v20.4S, v31.s[0] -add v3.4s, v3.4s, v16.4s -str q9, [x0, #368] -ldr q9, [x0, #896] -sqrdmulh v16.4S, v9.4S, v29.s[0] -mul v9.4S, v9.4S,v30.s[0] -str q1, [x0, #432] -sub v1.4s, v8.4s, v15.4s -ldr q20, [x0, #960] -sqrdmulh v6.4S, v20.4S, v29.s[0] -mul v20.4S, v20.4S,v30.s[0] -str q0, [x0, #496] -add v8.4s, v8.4s, v15.4s -ldr q15, [x0, #768] -sqrdmulh v0.4S, v15.4S, v29.s[0] -mul v15.4S, v15.4S,v30.s[0] -sub v11.4s, v14.4s, v2.4s -add v14.4s, v14.4s, v2.4s -ldr q2, [x0, #832] -sqrdmulh v18.4S, v2.4S, v29.s[0] -mul v2.4S, v2.4S,v30.s[0] -sub v4.4s, v10.4s, v12.4s -add v10.4s, v10.4s, v12.4s -mla v9.4S, v16.4S, v31.s[0] -mla v20.4S, v6.4S, v31.s[0] -str q3, [x0, #176] -sub v3.4s, v19.4s, v7.4s -mla v15.4S, v0.4S, v31.s[0] -mla v2.4S, v18.4S, v31.s[0] -str q17, [x0, #240] -add v19.4s, v19.4s, v7.4s -ldr q7, [x0, #512] -sqrdmulh v17.4S, v7.4S, v29.s[0] -mul v7.4S, v7.4S,v30.s[0] -str q8, [x0, #48] -sub v8.4s, v21.4s, v22.4s -ldr q18, [x0, #576] -sqrdmulh v0.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -str q1, [x0, #112] -add v21.4s, v21.4s, v22.4s -ldr q22, [x0, #640] -ldr q1, [x0, #384] -sqrdmulh v6.4S, v22.4S, v29.s[0] -mul v22.4S, v22.4S,v30.s[0] -sub v16.4s, v1.4s, v9.4s -add v1.4s, v1.4s, v9.4s -ldr q9, [x0, #704] -ldr q12, [x0, #448] -sqrdmulh v5.4S, v9.4S, v29.s[0] -mul v9.4S, v9.4S,v30.s[0] -sub v13.4s, v12.4s, v20.4s -add v12.4s, v12.4s, v20.4s -ldr q20, [x0, #256] -mla v7.4S, v17.4S, v31.s[0] -mla v18.4S, v0.4S, v31.s[0] -str q14, [x0, #560] -sub v14.4s, v20.4s, v15.4s -mla v22.4S, v6.4S, v31.s[0] -mla v9.4S, v5.4S, v31.s[0] -str q11, [x0, #624] -add v20.4s, v20.4s, v15.4s -ldr q15, [x0, #320] -sqrdmulh v11.4S, v1.4S, v29.s[1] -mul v1.4S, v1.4S,v30.s[1] -str q10, [x0, #688] -sub v10.4s, v15.4s, v2.4s -sqrdmulh v5.4S, v12.4S, v29.s[1] -mul v12.4S, v12.4S,v30.s[1] -str q4, [x0, #752] -add v15.4s, v15.4s, v2.4s -ldr q2, [x0, #0] -sqrdmulh v4.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -sub v6.4s, v2.4s, v7.4s -add v2.4s, v2.4s, v7.4s -ldr q7, [x0, #64] -sqrdmulh v0.4S, v15.4S, v29.s[1] -mul v15.4S, v15.4S,v30.s[1] -sub v17.4s, v7.4s, v18.4s -add v7.4s, v7.4s, v18.4s -ldr q18, [x0, #128] -mla v1.4S, v11.4S, v31.s[0] -mla v12.4S, v5.4S, v31.s[0] -str q19, [x0, #816] -sub v19.4s, v18.4s, v22.4s -mla v20.4S, v4.4S, v31.s[0] -mla v15.4S, v0.4S, v31.s[0] -str q3, [x0, #880] -add v18.4s, v18.4s, v22.4s -ldr q22, [x0, #192] -sqrdmulh v3.4S, v16.4S, v29.s[2] -mul v16.4S, v16.4S,v30.s[2] -str q21, [x0, #944] -sub v21.4s, v22.4s, v9.4s -sqrdmulh v0.4S, v13.4S, v29.s[2] -mul v13.4S, v13.4S,v30.s[2] -str q8, [x0, #1008] -add v22.4s, v22.4s, v9.4s -sqrdmulh v9.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v8.4s, v18.4s, v1.4s -add v18.4s, v18.4s, v1.4s -sqrdmulh v1.4S, v10.4S, v29.s[2] -mul v10.4S, v10.4S,v30.s[2] -sub v4.4s, v22.4s, v12.4s -add v22.4s, v22.4s, v12.4s -mla v16.4S, v3.4S, v31.s[0] -mla v13.4S, v0.4S, v31.s[0] -sub v0.4s, v2.4s, v20.4s -mla v14.4S, v9.4S, v31.s[0] -mla v10.4S, v1.4S, v31.s[0] -add v2.4s, v2.4s, v20.4s -sqrdmulh v20.4S, v8.4S, v27.s[1] -mul v8.4S, v8.4S,v28.s[1] -sub v1.4s, v7.4s, v15.4s -sqrdmulh v9.4S, v4.4S, v27.s[1] -mul v4.4S, v4.4S,v28.s[1] -add v7.4s, v7.4s, v15.4s -sqrdmulh v15.4S, v18.4S, v27.s[0] -mul v18.4S, v18.4S,v28.s[0] -sub v3.4s, v19.4s, v16.4s -add v19.4s, v19.4s, v16.4s -sqrdmulh v16.4S, v22.4S, v27.s[0] -mul v22.4S, v22.4S,v28.s[0] -sub v12.4s, v21.4s, v13.4s -add v21.4s, v21.4s, v13.4s -mla v8.4S, v20.4S, v31.s[0] -mla v4.4S, v9.4S, v31.s[0] -sub v9.4s, v6.4s, v14.4s -mla v18.4S, v15.4S, v31.s[0] -mla v22.4S, v16.4S, v31.s[0] -add v6.4s, v6.4s, v14.4s -sqrdmulh v14.4S, v19.4S, v27.s[2] -mul v19.4S, v19.4S,v28.s[2] -sub v16.4s, v17.4s, v10.4s -sqrdmulh v15.4S, v21.4S, v27.s[2] -mul v21.4S, v21.4S,v28.s[2] -add v17.4s, v17.4s, v10.4s -sqrdmulh v10.4S, v3.4S, v27.s[3] -mul v3.4S, v3.4S,v28.s[3] -sub v20.4s, v0.4s, v8.4s -add v0.4s, v0.4s, v8.4s -sqrdmulh v8.4S, v12.4S, v27.s[3] -mul v12.4S, v12.4S,v28.s[3] -sub v13.4s, v1.4s, v4.4s -add v1.4s, v1.4s, v4.4s -mla v19.4S, v14.4S, v31.s[0] -mla v21.4S, v15.4S, v31.s[0] -sub v15.4s, v2.4s, v18.4s -mla v3.4S, v10.4S, v31.s[0] -mla v12.4S, v8.4S, v31.s[0] -add v2.4s, v2.4s, v18.4s -sqrdmulh v18.4S, v1.4S, v25.s[2] -mul v1.4S, v1.4S,v26.s[2] -sub v8.4s, v7.4s, v22.4s -sqrdmulh v10.4S, v13.4S, v25.s[3] -mul v13.4S, v13.4S,v26.s[3] -add v7.4s, v7.4s, v22.4s -sqrdmulh v22.4S, v8.4S, v25.s[1] -mul v8.4S, v8.4S,v26.s[1] -sub v14.4s, v6.4s, v19.4s -add v6.4s, v6.4s, v19.4s -sqrdmulh v19.4S, v7.4S, v25.s[0] -mul v7.4S, v7.4S,v26.s[0] -sub v4.4s, v17.4s, v21.4s -add v17.4s, v17.4s, v21.4s -mla v1.4S, v18.4S, v31.s[0] -mla v13.4S, v10.4S, v31.s[0] -sub v10.4s, v9.4s, v3.4s -mla v8.4S, v22.4S, v31.s[0] -mla v7.4S, v19.4S, v31.s[0] -add v9.4s, v9.4s, v3.4s -sqrdmulh v3.4S, v17.4S, v23.s[0] -mul v17.4S, v17.4S,v24.s[0] -sub v19.4s, v16.4s, v12.4s -sqrdmulh v22.4S, v4.4S, v23.s[1] -mul v4.4S, v4.4S,v24.s[1] -add v16.4s, v16.4s, v12.4s -sqrdmulh v12.4S, v16.4S, v23.s[2] -mul v16.4S, v16.4S,v24.s[2] -sub v18.4s, v0.4s, v1.4s -add v0.4s, v0.4s, v1.4s -sqrdmulh v1.4S, v19.4S, v23.s[3] -mul v19.4S, v19.4S,v24.s[3] -sub v21.4s, v20.4s, v13.4s -add v20.4s, v20.4s, v13.4s -mla v17.4S, v3.4S, v31.s[0] -mla v4.4S, v22.4S, v31.s[0] -sub v22.4s, v15.4s, v8.4s -str q0, [x0, #256] -mla v16.4S, v12.4S, v31.s[0] -mla v19.4S, v1.4S, v31.s[0] -add v15.4s, v15.4s, v8.4s -str q18, [x0, #320] -ldr q18, [x0, #912] -sqrdmulh v8.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -str q20, [x0, #384] -sub v20.4s, v2.4s, v7.4s -ldr q1, [x0, #976] -sqrdmulh v12.4S, v1.4S, v29.s[0] -mul v1.4S, v1.4S,v30.s[0] -str q21, [x0, #448] -add v2.4s, v2.4s, v7.4s -ldr q7, [x0, #784] -sqrdmulh v21.4S, v7.4S, v29.s[0] -mul v7.4S, v7.4S,v30.s[0] -sub v0.4s, v6.4s, v17.4s -add v6.4s, v6.4s, v17.4s -ldr q17, [x0, #848] -sqrdmulh v3.4S, v17.4S, v29.s[0] -mul v17.4S, v17.4S,v30.s[0] -sub v13.4s, v14.4s, v4.4s -add v14.4s, v14.4s, v4.4s -mla v18.4S, v8.4S, v31.s[0] -mla v1.4S, v12.4S, v31.s[0] -str q15, [x0, #128] -sub v15.4s, v9.4s, v16.4s -mla v7.4S, v21.4S, v31.s[0] -mla v17.4S, v3.4S, v31.s[0] -str q22, [x0, #192] -add v9.4s, v9.4s, v16.4s -ldr q16, [x0, #528] -sqrdmulh v22.4S, v16.4S, v29.s[0] -mul v16.4S, v16.4S,v30.s[0] -str q2, [x0, #0] -sub v2.4s, v10.4s, v19.4s -ldr q3, [x0, #592] -sqrdmulh v21.4S, v3.4S, v29.s[0] -mul v3.4S, v3.4S,v30.s[0] -str q20, [x0, #64] -add v10.4s, v10.4s, v19.4s -ldr q19, [x0, #656] -ldr q20, [x0, #400] -sqrdmulh v12.4S, v19.4S, v29.s[0] -mul v19.4S, v19.4S,v30.s[0] -sub v8.4s, v20.4s, v18.4s -add v20.4s, v20.4s, v18.4s -ldr q18, [x0, #720] -ldr q4, [x0, #464] -sqrdmulh v5.4S, v18.4S, v29.s[0] -mul v18.4S, v18.4S,v30.s[0] -sub v11.4s, v4.4s, v1.4s -add v4.4s, v4.4s, v1.4s -ldr q1, [x0, #272] -mla v16.4S, v22.4S, v31.s[0] -mla v3.4S, v21.4S, v31.s[0] -str q6, [x0, #512] -sub v6.4s, v1.4s, v7.4s -mla v19.4S, v12.4S, v31.s[0] -mla v18.4S, v5.4S, v31.s[0] -str q0, [x0, #576] -add v1.4s, v1.4s, v7.4s -ldr q7, [x0, #336] -sqrdmulh v0.4S, v20.4S, v29.s[1] -mul v20.4S, v20.4S,v30.s[1] -str q14, [x0, #640] -sub v14.4s, v7.4s, v17.4s -sqrdmulh v5.4S, v4.4S, v29.s[1] -mul v4.4S, v4.4S,v30.s[1] -str q13, [x0, #704] -add v7.4s, v7.4s, v17.4s -ldr q17, [x0, #16] -sqrdmulh v13.4S, v1.4S, v29.s[1] -mul v1.4S, v1.4S,v30.s[1] -sub v12.4s, v17.4s, v16.4s -add v17.4s, v17.4s, v16.4s -ldr q16, [x0, #80] -sqrdmulh v21.4S, v7.4S, v29.s[1] -mul v7.4S, v7.4S,v30.s[1] -sub v22.4s, v16.4s, v3.4s -add v16.4s, v16.4s, v3.4s -ldr q3, [x0, #144] -mla v20.4S, v0.4S, v31.s[0] -mla v4.4S, v5.4S, v31.s[0] -str q9, [x0, #768] -sub v9.4s, v3.4s, v19.4s -mla v1.4S, v13.4S, v31.s[0] -mla v7.4S, v21.4S, v31.s[0] -str q15, [x0, #832] -add v3.4s, v3.4s, v19.4s -ldr q19, [x0, #208] -sqrdmulh v15.4S, v8.4S, v29.s[2] -mul v8.4S, v8.4S,v30.s[2] -str q10, [x0, #896] -sub v10.4s, v19.4s, v18.4s -sqrdmulh v21.4S, v11.4S, v29.s[2] -mul v11.4S, v11.4S,v30.s[2] -str q2, [x0, #960] -add v19.4s, v19.4s, v18.4s -sqrdmulh v18.4S, v6.4S, v29.s[2] -mul v6.4S, v6.4S,v30.s[2] -sub v2.4s, v3.4s, v20.4s -add v3.4s, v3.4s, v20.4s -sqrdmulh v20.4S, v14.4S, v29.s[2] -mul v14.4S, v14.4S,v30.s[2] -sub v13.4s, v19.4s, v4.4s -add v19.4s, v19.4s, v4.4s -mla v8.4S, v15.4S, v31.s[0] -mla v11.4S, v21.4S, v31.s[0] -sub v21.4s, v17.4s, v1.4s -mla v6.4S, v18.4S, v31.s[0] -mla v14.4S, v20.4S, v31.s[0] -add v17.4s, v17.4s, v1.4s -sqrdmulh v1.4S, v2.4S, v27.s[1] -mul v2.4S, v2.4S,v28.s[1] -sub v20.4s, v16.4s, v7.4s -sqrdmulh v18.4S, v13.4S, v27.s[1] -mul v13.4S, v13.4S,v28.s[1] -add v16.4s, v16.4s, v7.4s -sqrdmulh v7.4S, v3.4S, v27.s[0] -mul v3.4S, v3.4S,v28.s[0] -sub v15.4s, v9.4s, v8.4s -add v9.4s, v9.4s, v8.4s -sqrdmulh v8.4S, v19.4S, v27.s[0] -mul v19.4S, v19.4S,v28.s[0] -sub v4.4s, v10.4s, v11.4s -add v10.4s, v10.4s, v11.4s -mla v2.4S, v1.4S, v31.s[0] -mla v13.4S, v18.4S, v31.s[0] -sub v18.4s, v12.4s, v6.4s -mla v3.4S, v7.4S, v31.s[0] -mla v19.4S, v8.4S, v31.s[0] -add v12.4s, v12.4s, v6.4s -sqrdmulh v6.4S, v9.4S, v27.s[2] -mul v9.4S, v9.4S,v28.s[2] -sub v8.4s, v22.4s, v14.4s -sqrdmulh v7.4S, v10.4S, v27.s[2] -mul v10.4S, v10.4S,v28.s[2] -add v22.4s, v22.4s, v14.4s -sqrdmulh v14.4S, v15.4S, v27.s[3] -mul v15.4S, v15.4S,v28.s[3] -sub v1.4s, v21.4s, v2.4s -add v21.4s, v21.4s, v2.4s -sqrdmulh v2.4S, v4.4S, v27.s[3] -mul v4.4S, v4.4S,v28.s[3] -sub v11.4s, v20.4s, v13.4s -add v20.4s, v20.4s, v13.4s -mla v9.4S, v6.4S, v31.s[0] -mla v10.4S, v7.4S, v31.s[0] -sub v7.4s, v17.4s, v3.4s -mla v15.4S, v14.4S, v31.s[0] -mla v4.4S, v2.4S, v31.s[0] -add v17.4s, v17.4s, v3.4s -sqrdmulh v3.4S, v20.4S, v25.s[2] -mul v20.4S, v20.4S,v26.s[2] -sub v2.4s, v16.4s, v19.4s -sqrdmulh v14.4S, v11.4S, v25.s[3] -mul v11.4S, v11.4S,v26.s[3] -add v16.4s, v16.4s, v19.4s -sqrdmulh v19.4S, v2.4S, v25.s[1] -mul v2.4S, v2.4S,v26.s[1] -sub v6.4s, v12.4s, v9.4s -add v12.4s, v12.4s, v9.4s -sqrdmulh v9.4S, v16.4S, v25.s[0] -mul v16.4S, v16.4S,v26.s[0] -sub v13.4s, v22.4s, v10.4s -add v22.4s, v22.4s, v10.4s -mla v20.4S, v3.4S, v31.s[0] -mla v11.4S, v14.4S, v31.s[0] -sub v14.4s, v18.4s, v15.4s -mla v2.4S, v19.4S, v31.s[0] -mla v16.4S, v9.4S, v31.s[0] -add v18.4s, v18.4s, v15.4s -sqrdmulh v15.4S, v22.4S, v23.s[0] -mul v22.4S, v22.4S,v24.s[0] -sub v9.4s, v8.4s, v4.4s -sqrdmulh v19.4S, v13.4S, v23.s[1] -mul v13.4S, v13.4S,v24.s[1] -add v8.4s, v8.4s, v4.4s -sqrdmulh v4.4S, v8.4S, v23.s[2] -mul v8.4S, v8.4S,v24.s[2] -sub v3.4s, v21.4s, v20.4s -add v21.4s, v21.4s, v20.4s -sqrdmulh v20.4S, v9.4S, v23.s[3] -mul v9.4S, v9.4S,v24.s[3] -sub v10.4s, v1.4s, v11.4s -add v1.4s, v1.4s, v11.4s -mla v22.4S, v15.4S, v31.s[0] -mla v13.4S, v19.4S, v31.s[0] -sub v19.4s, v7.4s, v2.4s -str q21, [x0, #272] -mla v8.4S, v4.4S, v31.s[0] -mla v9.4S, v20.4S, v31.s[0] -add v7.4s, v7.4s, v2.4s -str q3, [x0, #336] -str q1, [x0, #400] -sub v1.4s, v17.4s, v16.4s -str q10, [x0, #464] -add v17.4s, v17.4s, v16.4s -sub v16.4s, v12.4s, v22.4s -add v12.4s, v12.4s, v22.4s -sub v22.4s, v6.4s, v13.4s -add v6.4s, v6.4s, v13.4s -str q7, [x0, #144] -sub v7.4s, v18.4s, v8.4s -str q19, [x0, #208] -add v18.4s, v18.4s, v8.4s -str q17, [x0, #16] -sub v17.4s, v14.4s, v9.4s -str q1, [x0, #80] -add v14.4s, v14.4s, v9.4s -str q12, [x0, #528] -str q16, [x0, #592] -str q6, [x0, #656] -str q22, [x0, #720] -str q18, [x0, #784] -str q7, [x0, #848] -str q14, [x0, #912] -str q17, [x0, #976] -ldr q0, [x0, #224] -ldr q5, [x0, #160] -ldr q11, [x0, #32] -ldr q15, [x17, #+128] -ldr q21, [x17, #+144] -sqrdmulh v4.4S, v11.4S, v21.s[0] -mul v11.4S, v11.4S,v15.s[0] -ldr q20, [x0, #48] -sqrdmulh v2.4S, v20.4S, v21.s[0] -mul v20.4S, v20.4S,v15.s[0] -ldr q3, [x17, #+160] -ldr q30, [x17, #+176] -ldr q29, [x0, #96] -sqrdmulh v28.4S, v29.4S, v30.s[0] -mul v29.4S, v29.4S,v3.s[0] -ldr q27, [x0, #112] -sqrdmulh v26.4S, v27.4S, v30.s[0] -mul v27.4S, v27.4S,v3.s[0] -ldr q25, [x17, #+192] -ldr q24, [x17, #+208] -mla v11.4S, v4.4S, v31.s[0] -sqrdmulh v4.4S, v5.4S, v24.s[0] -ldr q23, [x0, #176] -mla v20.4S, v2.4S, v31.s[0] -sqrdmulh v2.4S, v23.4S, v24.s[0] -ldr q10, [x17, #+224] -ldr q13, [x17, #+240] -mla v29.4S, v28.4S, v31.s[0] -sqrdmulh v28.4S, v0.4S, v13.s[0] -ldr q19, [x0, #240] -mla v27.4S, v26.4S, v31.s[0] -sqrdmulh v26.4S, v19.4S, v13.s[0] -ldr q8, [x0, #0] -ldr q1, [x0, #128] -mul v5.4S, v5.4S,v25.s[0] -sub v9.4s, v8.4s, v11.4s -ldr q12, [x0, #16] -mul v23.4S, v23.4S,v25.s[0] -add v8.4s, v8.4s, v11.4s -ldr q11, [x0, #144] -mla v5.4S, v4.4S, v31.s[0] -sub v4.4s, v12.4s, v20.4s -ldr q16, [x0, #64] -mla v23.4S, v2.4S, v31.s[0] -add v12.4s, v12.4s, v20.4s -ldr q20, [x0, #192] -mul v0.4S, v0.4S,v10.s[0] -sub v2.4s, v16.4s, v29.4s -ldr q6, [x0, #80] -mul v19.4S, v19.4S,v10.s[0] -add v16.4s, v16.4s, v29.4s -ldr q29, [x0, #208] -mla v0.4S, v28.4S, v31.s[0] -mla v19.4S, v26.4S, v31.s[0] -sub v26.4s, v6.4s, v27.4s -sqrdmulh v28.4S, v12.4S, v21.s[1] -add v6.4s, v6.4s, v27.4s -mul v12.4S, v12.4S,v15.s[1] -sqrdmulh v27.4S, v4.4S, v21.s[2] -sub v22.4s, v1.4s, v5.4s -mul v4.4S, v4.4S,v15.s[2] -add v1.4s, v1.4s, v5.4s -sqrdmulh v21.4S, v6.4S, v30.s[1] -sub v15.4s, v11.4s, v23.4s -mul v6.4S, v6.4S,v3.s[1] -add v11.4s, v11.4s, v23.4s -sqrdmulh v23.4S, v26.4S, v30.s[2] -sub v5.4s, v20.4s, v0.4s -mul v26.4S, v26.4S,v3.s[2] -add v20.4s, v20.4s, v0.4s -mla v12.4S, v28.4S, v31.s[0] -sub v28.4s, v29.4s, v19.4s -ldr q30, [x0, #480] -sqrdmulh v3.4S, v11.4S, v24.s[1] -add v29.4s, v29.4s, v19.4s -mla v4.4S, v27.4S, v31.s[0] -ldr q27, [x0, #416] -sqrdmulh v19.4S, v15.4S, v24.s[2] -sub v0.4s, v8.4s, v12.4s -mla v6.4S, v21.4S, v31.s[0] -ldr q21, [x0, #288] -sqrdmulh v18.4S, v29.4S, v13.s[1] -add v8.4s, v8.4s, v12.4s -str q0, [x0, #16] -mla v26.4S, v23.4S, v31.s[0] -ldr q23, [x17, #+256] -ldr q0, [x17, #+272] -sqrdmulh v12.4S, v28.4S, v13.s[2] -sub v7.4s, v9.4s, v4.4s -str q8, [x0, #0] -mul v11.4S, v11.4S,v25.s[1] -add v9.4s, v9.4s, v4.4s -mul v15.4S, v15.4S,v25.s[2] -str q7, [x0, #48] -mla v11.4S, v3.4S, v31.s[0] -sub v3.4s, v16.4s, v6.4s -mla v15.4S, v19.4S, v31.s[0] -str q9, [x0, #32] -mul v29.4S, v29.4S,v10.s[1] -str q3, [x0, #80] -mul v28.4S, v28.4S,v10.s[2] -add v16.4s, v16.4s, v6.4s -str q16, [x0, #64] -mla v29.4S, v18.4S, v31.s[0] -sub v18.4s, v2.4s, v26.4s -str q18, [x0, #112] -mla v28.4S, v12.4S, v31.s[0] -add v2.4s, v2.4s, v26.4s -str q2, [x0, #96] -sqrdmulh v13.4S, v21.4S, v0.s[0] -sub v10.4s, v1.4s, v11.4s -mul v21.4S, v21.4S,v23.s[0] -str q10, [x0, #144] -ldr q10, [x0, #304] -sqrdmulh v2.4S, v10.4S, v0.s[0] -add v1.4s, v1.4s, v11.4s -mul v10.4S, v10.4S,v23.s[0] -str q1, [x0, #128] -ldr q1, [x17, #+288] -ldr q11, [x17, #+304] -ldr q26, [x0, #352] -sqrdmulh v12.4S, v26.4S, v11.s[0] -sub v18.4s, v22.4s, v15.4s -mul v26.4S, v26.4S,v1.s[0] -str q18, [x0, #176] -ldr q18, [x0, #368] -sqrdmulh v16.4S, v18.4S, v11.s[0] -add v22.4s, v22.4s, v15.4s -mul v18.4S, v18.4S,v1.s[0] -str q22, [x0, #160] -ldr q22, [x17, #+320] -ldr q15, [x17, #+336] -mla v21.4S, v13.4S, v31.s[0] -sub v13.4s, v20.4s, v29.4s -sqrdmulh v6.4S, v27.4S, v15.s[0] -str q13, [x0, #208] -ldr q13, [x0, #432] -mla v10.4S, v2.4S, v31.s[0] -add v20.4s, v20.4s, v29.4s -sqrdmulh v29.4S, v13.4S, v15.s[0] -str q20, [x0, #192] -ldr q20, [x17, #+352] -ldr q2, [x17, #+368] -mla v26.4S, v12.4S, v31.s[0] -sub v12.4s, v5.4s, v28.4s -sqrdmulh v3.4S, v30.4S, v2.s[0] -str q12, [x0, #240] -ldr q12, [x0, #496] -mla v18.4S, v16.4S, v31.s[0] -add v5.4s, v5.4s, v28.4s -sqrdmulh v28.4S, v12.4S, v2.s[0] -str q5, [x0, #224] -ldr q5, [x0, #256] -ldr q16, [x0, #384] -mul v27.4S, v27.4S,v22.s[0] -sub v24.4s, v5.4s, v21.4s -ldr q25, [x0, #272] -mul v13.4S, v13.4S,v22.s[0] -add v5.4s, v5.4s, v21.4s -ldr q21, [x0, #400] -mla v27.4S, v6.4S, v31.s[0] -sub v6.4s, v25.4s, v10.4s -ldr q9, [x0, #320] -mla v13.4S, v29.4S, v31.s[0] -add v25.4s, v25.4s, v10.4s -ldr q10, [x0, #448] -mul v30.4S, v30.4S,v20.s[0] -sub v29.4s, v9.4s, v26.4s -ldr q19, [x0, #336] -mul v12.4S, v12.4S,v20.s[0] -add v9.4s, v9.4s, v26.4s -ldr q26, [x0, #464] -mla v30.4S, v3.4S, v31.s[0] -mla v12.4S, v28.4S, v31.s[0] -sub v28.4s, v19.4s, v18.4s -sqrdmulh v3.4S, v25.4S, v0.s[1] -add v19.4s, v19.4s, v18.4s -mul v25.4S, v25.4S,v23.s[1] -sqrdmulh v18.4S, v6.4S, v0.s[2] -sub v7.4s, v16.4s, v27.4s -mul v6.4S, v6.4S,v23.s[2] -add v16.4s, v16.4s, v27.4s -sqrdmulh v0.4S, v19.4S, v11.s[1] -sub v23.4s, v21.4s, v13.4s -mul v19.4S, v19.4S,v1.s[1] -add v21.4s, v21.4s, v13.4s -sqrdmulh v13.4S, v28.4S, v11.s[2] -sub v27.4s, v10.4s, v30.4s -mul v28.4S, v28.4S,v1.s[2] -add v10.4s, v10.4s, v30.4s -mla v25.4S, v3.4S, v31.s[0] -sub v3.4s, v26.4s, v12.4s -ldr q11, [x0, #736] -sqrdmulh v1.4S, v21.4S, v15.s[1] -add v26.4s, v26.4s, v12.4s -mla v6.4S, v18.4S, v31.s[0] -ldr q18, [x0, #672] -sqrdmulh v12.4S, v23.4S, v15.s[2] -sub v30.4s, v5.4s, v25.4s -mla v19.4S, v0.4S, v31.s[0] -ldr q0, [x0, #544] -sqrdmulh v4.4S, v26.4S, v2.s[1] -add v5.4s, v5.4s, v25.4s -str q30, [x0, #272] -mla v28.4S, v13.4S, v31.s[0] -ldr q13, [x17, #+384] -ldr q30, [x17, #+400] -sqrdmulh v25.4S, v3.4S, v2.s[2] -sub v8.4s, v24.4s, v6.4s -str q5, [x0, #256] -mul v21.4S, v21.4S,v22.s[1] -add v24.4s, v24.4s, v6.4s -mul v23.4S, v23.4S,v22.s[2] -str q8, [x0, #304] -mla v21.4S, v1.4S, v31.s[0] -sub v1.4s, v9.4s, v19.4s -mla v23.4S, v12.4S, v31.s[0] -str q24, [x0, #288] -mul v26.4S, v26.4S,v20.s[1] -str q1, [x0, #336] -mul v3.4S, v3.4S,v20.s[2] -add v9.4s, v9.4s, v19.4s -str q9, [x0, #320] -mla v26.4S, v4.4S, v31.s[0] -sub v4.4s, v29.4s, v28.4s -str q4, [x0, #368] -mla v3.4S, v25.4S, v31.s[0] -add v29.4s, v29.4s, v28.4s -str q29, [x0, #352] -sqrdmulh v2.4S, v0.4S, v30.s[0] -sub v20.4s, v16.4s, v21.4s -mul v0.4S, v0.4S,v13.s[0] -str q20, [x0, #400] -ldr q20, [x0, #560] -sqrdmulh v29.4S, v20.4S, v30.s[0] -add v16.4s, v16.4s, v21.4s -mul v20.4S, v20.4S,v13.s[0] -str q16, [x0, #384] -ldr q16, [x17, #+416] -ldr q21, [x17, #+432] -ldr q28, [x0, #608] -sqrdmulh v25.4S, v28.4S, v21.s[0] -sub v4.4s, v7.4s, v23.4s -mul v28.4S, v28.4S,v16.s[0] -str q4, [x0, #432] -ldr q4, [x0, #624] -sqrdmulh v9.4S, v4.4S, v21.s[0] -add v7.4s, v7.4s, v23.4s -mul v4.4S, v4.4S,v16.s[0] -str q7, [x0, #416] -ldr q7, [x17, #+448] -ldr q23, [x17, #+464] -mla v0.4S, v2.4S, v31.s[0] -sub v2.4s, v10.4s, v26.4s -sqrdmulh v19.4S, v18.4S, v23.s[0] -str q2, [x0, #464] -ldr q2, [x0, #688] -mla v20.4S, v29.4S, v31.s[0] -add v10.4s, v10.4s, v26.4s -sqrdmulh v26.4S, v2.4S, v23.s[0] -str q10, [x0, #448] -ldr q10, [x17, #+480] -ldr q29, [x17, #+496] -mla v28.4S, v25.4S, v31.s[0] -sub v25.4s, v27.4s, v3.4s -sqrdmulh v1.4S, v11.4S, v29.s[0] -str q25, [x0, #496] -ldr q25, [x0, #752] -mla v4.4S, v9.4S, v31.s[0] -add v27.4s, v27.4s, v3.4s -sqrdmulh v3.4S, v25.4S, v29.s[0] -str q27, [x0, #480] -ldr q27, [x0, #512] -ldr q9, [x0, #640] -mul v18.4S, v18.4S,v7.s[0] -sub v15.4s, v27.4s, v0.4s -ldr q22, [x0, #528] -mul v2.4S, v2.4S,v7.s[0] -add v27.4s, v27.4s, v0.4s -ldr q0, [x0, #656] -mla v18.4S, v19.4S, v31.s[0] -sub v19.4s, v22.4s, v20.4s -ldr q24, [x0, #576] -mla v2.4S, v26.4S, v31.s[0] -add v22.4s, v22.4s, v20.4s -ldr q20, [x0, #704] -mul v11.4S, v11.4S,v10.s[0] -sub v26.4s, v24.4s, v28.4s -ldr q12, [x0, #592] -mul v25.4S, v25.4S,v10.s[0] -add v24.4s, v24.4s, v28.4s -ldr q28, [x0, #720] -mla v11.4S, v1.4S, v31.s[0] -mla v25.4S, v3.4S, v31.s[0] -sub v3.4s, v12.4s, v4.4s -sqrdmulh v1.4S, v22.4S, v30.s[1] -add v12.4s, v12.4s, v4.4s -mul v22.4S, v22.4S,v13.s[1] -sqrdmulh v4.4S, v19.4S, v30.s[2] -sub v8.4s, v9.4s, v18.4s -mul v19.4S, v19.4S,v13.s[2] -add v9.4s, v9.4s, v18.4s -sqrdmulh v30.4S, v12.4S, v21.s[1] -sub v13.4s, v0.4s, v2.4s -mul v12.4S, v12.4S,v16.s[1] -add v0.4s, v0.4s, v2.4s -sqrdmulh v2.4S, v3.4S, v21.s[2] -sub v18.4s, v20.4s, v11.4s -mul v3.4S, v3.4S,v16.s[2] -add v20.4s, v20.4s, v11.4s -mla v22.4S, v1.4S, v31.s[0] -sub v1.4s, v28.4s, v25.4s -ldr q21, [x0, #992] -sqrdmulh v16.4S, v0.4S, v23.s[1] -add v28.4s, v28.4s, v25.4s -mla v19.4S, v4.4S, v31.s[0] -ldr q4, [x0, #928] -sqrdmulh v25.4S, v13.4S, v23.s[2] -sub v11.4s, v27.4s, v22.4s -mla v12.4S, v30.4S, v31.s[0] -ldr q30, [x0, #800] -sqrdmulh v6.4S, v28.4S, v29.s[1] -add v27.4s, v27.4s, v22.4s -str q11, [x0, #528] -mla v3.4S, v2.4S, v31.s[0] -ldr q2, [x17, #+512] -ldr q11, [x17, #+528] -sqrdmulh v22.4S, v1.4S, v29.s[2] -sub v5.4s, v15.4s, v19.4s -str q27, [x0, #512] -mul v0.4S, v0.4S,v7.s[1] -add v15.4s, v15.4s, v19.4s -mul v13.4S, v13.4S,v7.s[2] -str q5, [x0, #560] -mla v0.4S, v16.4S, v31.s[0] -sub v16.4s, v24.4s, v12.4s -mla v13.4S, v25.4S, v31.s[0] -str q15, [x0, #544] -mul v28.4S, v28.4S,v10.s[1] -str q16, [x0, #592] -mul v1.4S, v1.4S,v10.s[2] -add v24.4s, v24.4s, v12.4s -str q24, [x0, #576] -mla v28.4S, v6.4S, v31.s[0] -sub v6.4s, v26.4s, v3.4s -str q6, [x0, #624] -mla v1.4S, v22.4S, v31.s[0] -add v26.4s, v26.4s, v3.4s -str q26, [x0, #608] -sqrdmulh v29.4S, v30.4S, v11.s[0] -sub v10.4s, v9.4s, v0.4s -mul v30.4S, v30.4S,v2.s[0] -str q10, [x0, #656] -ldr q10, [x0, #816] -sqrdmulh v26.4S, v10.4S, v11.s[0] -add v9.4s, v9.4s, v0.4s -mul v10.4S, v10.4S,v2.s[0] -str q9, [x0, #640] -ldr q9, [x17, #+544] -ldr q0, [x17, #+560] -ldr q3, [x0, #864] -sqrdmulh v22.4S, v3.4S, v0.s[0] -sub v6.4s, v8.4s, v13.4s -mul v3.4S, v3.4S,v9.s[0] -str q6, [x0, #688] -ldr q6, [x0, #880] -sqrdmulh v24.4S, v6.4S, v0.s[0] -add v8.4s, v8.4s, v13.4s -mul v6.4S, v6.4S,v9.s[0] -str q8, [x0, #672] -ldr q8, [x17, #+576] -ldr q13, [x17, #+592] -mla v30.4S, v29.4S, v31.s[0] -sub v29.4s, v20.4s, v28.4s -sqrdmulh v12.4S, v4.4S, v13.s[0] -str q29, [x0, #720] -ldr q29, [x0, #944] -mla v10.4S, v26.4S, v31.s[0] -add v20.4s, v20.4s, v28.4s -sqrdmulh v28.4S, v29.4S, v13.s[0] -str q20, [x0, #704] -ldr q20, [x17, #+608] -ldr q26, [x17, #+624] -mla v3.4S, v22.4S, v31.s[0] -sub v22.4s, v18.4s, v1.4s -sqrdmulh v16.4S, v21.4S, v26.s[0] -str q22, [x0, #752] -ldr q22, [x0, #1008] -mla v6.4S, v24.4S, v31.s[0] -add v18.4s, v18.4s, v1.4s -sqrdmulh v1.4S, v22.4S, v26.s[0] -str q18, [x0, #736] -ldr q18, [x0, #768] -ldr q24, [x0, #896] -mul v4.4S, v4.4S,v8.s[0] -sub v23.4s, v18.4s, v30.4s -ldr q7, [x0, #784] -mul v29.4S, v29.4S,v8.s[0] -add v18.4s, v18.4s, v30.4s -ldr q30, [x0, #912] -mla v4.4S, v12.4S, v31.s[0] -sub v12.4s, v7.4s, v10.4s -ldr q15, [x0, #832] -mla v29.4S, v28.4S, v31.s[0] -add v7.4s, v7.4s, v10.4s -ldr q10, [x0, #960] -mul v21.4S, v21.4S,v20.s[0] -sub v28.4s, v15.4s, v3.4s -ldr q25, [x0, #848] -mul v22.4S, v22.4S,v20.s[0] -add v15.4s, v15.4s, v3.4s -ldr q3, [x0, #976] -mla v21.4S, v16.4S, v31.s[0] -mla v22.4S, v1.4S, v31.s[0] -sub v1.4s, v25.4s, v6.4s -sqrdmulh v16.4S, v7.4S, v11.s[1] -add v25.4s, v25.4s, v6.4s -mul v7.4S, v7.4S,v2.s[1] -sqrdmulh v6.4S, v12.4S, v11.s[2] -sub v5.4s, v24.4s, v4.4s -mul v12.4S, v12.4S,v2.s[2] -add v24.4s, v24.4s, v4.4s -sqrdmulh v11.4S, v25.4S, v0.s[1] -sub v2.4s, v30.4s, v29.4s -mul v25.4S, v25.4S,v9.s[1] -add v30.4s, v30.4s, v29.4s -sqrdmulh v29.4S, v1.4S, v0.s[2] -sub v4.4s, v10.4s, v21.4s -mul v1.4S, v1.4S,v9.s[2] -add v10.4s, v10.4s, v21.4s -mla v7.4S, v16.4S, v31.s[0] -sub v16.4s, v3.4s, v22.4s -sqrdmulh v0.4S, v30.4S, v13.s[1] -add v3.4s, v3.4s, v22.4s -mla v12.4S, v6.4S, v31.s[0] -sqrdmulh v6.4S, v2.4S, v13.s[2] -sub v22.4s, v18.4s, v7.4s -mla v25.4S, v11.4S, v31.s[0] -sqrdmulh v11.4S, v3.4S, v26.s[1] -add v18.4s, v18.4s, v7.4s -str q22, [x0, #784] -mla v1.4S, v29.4S, v31.s[0] -sqrdmulh v29.4S, v16.4S, v26.s[2] -sub v22.4s, v23.4s, v12.4s -str q18, [x0, #768] -mul v30.4S, v30.4S,v8.s[1] -add v23.4s, v23.4s, v12.4s -mul v2.4S, v2.4S,v8.s[2] -str q22, [x0, #816] -mla v30.4S, v0.4S, v31.s[0] -sub v0.4s, v15.4s, v25.4s -mla v2.4S, v6.4S, v31.s[0] -str q23, [x0, #800] -mul v3.4S, v3.4S,v20.s[1] -str q0, [x0, #848] -mul v16.4S, v16.4S,v20.s[2] -add v15.4s, v15.4s, v25.4s -str q15, [x0, #832] -mla v3.4S, v11.4S, v31.s[0] -sub v11.4s, v28.4s, v1.4s -str q11, [x0, #880] -mla v16.4S, v29.4S, v31.s[0] -add v28.4s, v28.4s, v1.4s -str q28, [x0, #864] -sub v26.4s, v24.4s, v30.4s -str q26, [x0, #912] -add v24.4s, v24.4s, v30.4s -str q24, [x0, #896] -sub v24.4s, v5.4s, v2.4s -str q24, [x0, #944] -add v5.4s, v5.4s, v2.4s -str q5, [x0, #928] -sub v5.4s, v10.4s, v3.4s -str q5, [x0, #976] -add v10.4s, v10.4s, v3.4s -str q10, [x0, #960] -sub v10.4s, v4.4s, v16.4s -str q10, [x0, #1008] -add v4.4s, v4.4s, v16.4s -str q4, [x0, #992] -// Restore NEON vector registers -ldp d8, d9, [sp, #16*0] -ldp d10, d11, [sp, #16*1] -ldp d12, d13, [sp, #16*2] -ldp d14, d15, [sp, #16*3] -add sp, sp, #(16*4) -// Restore GPRs -ldp x19, x20, [sp, #16*0] -ldp x21, x22, [sp, #16*1] -ldp x23, x24, [sp, #16*2] -ldp x25, x26, [sp, #16*3] -ldp x27, x28, [sp, #16*4] -ldr x29, [sp, #16*5] -add sp, sp, #(16*5+16) -ret - -// Line count: 1464 -// Instruction count: 1460 \ No newline at end of file